diff --git a/CMakeLists.txt b/CMakeLists.txt index 7f48ace6883a..96c4f6420d0c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -327,6 +327,7 @@ set(LLVM_ALL_TARGETS ARM BPF Hexagon + JSBackend # @LOCALMOD Lanai Mips MSP430 diff --git a/emscripten-version.txt b/emscripten-version.txt new file mode 100644 index 000000000000..2a788d314bae --- /dev/null +++ b/emscripten-version.txt @@ -0,0 +1 @@ +"1.37.10" diff --git a/include/llvm/ADT/Triple.h b/include/llvm/ADT/Triple.h index d4130e1e85ae..eb9e42770b56 100644 --- a/include/llvm/ADT/Triple.h +++ b/include/llvm/ADT/Triple.h @@ -81,6 +81,7 @@ class Triple { nvptx64, // NVPTX: 64-bit le32, // le32: generic little-endian 32-bit CPU (PNaCl) le64, // le64: generic little-endian 64-bit CPU (PNaCl) + asmjs, // asm.js JavaScript subset @LOCALMOD Emscripten amdil, // AMDIL amdil64, // AMDIL with 64-bit pointers hsail, // AMD HSAIL @@ -161,6 +162,7 @@ class Triple { Haiku, Minix, RTEMS, + Emscripten, // Emscripten JavaScript runtime @LOCALMOD Emscripten NaCl, // Native Client CNK, // BG/P Compute-Node Kernel Bitrig, @@ -546,6 +548,13 @@ class Triple { return getOS() == Triple::NaCl; } + // @LOCALMOD-START Emscripten + /// Tests whether the OS is Emscripten. + bool isOSEmscripten() const { + return getOS() == Triple::Emscripten; + } + // @LOCALMOD-END Emscripten + /// Tests whether the OS is Linux. bool isOSLinux() const { return getOS() == Triple::Linux; diff --git a/include/llvm/Analysis/NaCl.h b/include/llvm/Analysis/NaCl.h new file mode 100644 index 000000000000..eb894ef9b64a --- /dev/null +++ b/include/llvm/Analysis/NaCl.h @@ -0,0 +1,74 @@ +//===-- NaCl.h - NaCl Analysis ---------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_NACL_H +#define LLVM_ANALYSIS_NACL_H + +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include + +namespace llvm { + +class FunctionPass; +class ModulePass; +extern cl::opt PNaClABIAllowDebugMetadata; + +class PNaClABIErrorReporter { + PNaClABIErrorReporter(const PNaClABIErrorReporter&) = delete; + void operator=(const PNaClABIErrorReporter&) = delete; + public: + PNaClABIErrorReporter() : ErrorCount(0), Errors(ErrorString), + UseFatalErrors(true) {} + ~PNaClABIErrorReporter() {} + // Return the number of verification errors from the last run. + int getErrorCount() const { return ErrorCount; } + // Print the error messages to O + void printErrors(llvm::raw_ostream &O) { + Errors.flush(); + O << ErrorString; + } + // Increments the error count and returns an ostream to which the error + // message can be streamed. + raw_ostream &addError() { + ErrorCount++; + return Errors; + } + // Reset the error count and error messages. 
+ void reset() { + ErrorCount = 0; + Errors.flush(); + ErrorString.clear(); + } + void setNonFatal() { + UseFatalErrors = false; + } + void checkForFatalErrors() { + if (UseFatalErrors && ErrorCount != 0) { + printErrors(errs()); + report_fatal_error("PNaCl ABI verification failed"); + } + } + private: + int ErrorCount; + std::string ErrorString; + raw_string_ostream Errors; + bool UseFatalErrors; +}; + +FunctionPass *createPNaClABIVerifyFunctionsPass( + PNaClABIErrorReporter *Reporter); +ModulePass *createPNaClABIVerifyModulePass(PNaClABIErrorReporter *Reporter, + bool StreamingMode = false); + +} + + +#endif diff --git a/include/llvm/IR/Intrinsics.td b/include/llvm/IR/Intrinsics.td index 89ae94270888..f84fec6181ac 100644 --- a/include/llvm/IR/Intrinsics.td +++ b/include/llvm/IR/Intrinsics.td @@ -688,6 +688,51 @@ def int_convert_from_fp16 : Intrinsic<[llvm_anyfloat_ty], [llvm_i16_ty]>; def int_clear_cache : Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty], [], "llvm.clear_cache">; +// @LOCALMOD-BEGIN +//===----------------------- Native Client Intrinsics ---------------------===// +// NaCl-specific setjmp/longjmp intrinsics. +// See https://code.google.com/p/nativeclient/issues/detail?id=3429 +def int_nacl_setjmp : Intrinsic<[llvm_i32_ty], [llvm_ptr_ty]>; +def int_nacl_longjmp : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty], + [IntrNoReturn]>; + +// Fast built-in version of NaCl's tls_get() IRT interface. +def int_nacl_read_tp : Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>; + +// Atomic intrinsics. +// +// Volatiles and atomics are encoded through these intrinsics to make +// them platform-independent, remove some of LLVM's legacy, and isolate +// PNaCl from future changes to IR. The intrinsics allow user code to +// use `__sync_*` builtins as well as C11/C++11 atomics. +// +// These are further documented in docs/PNaClLangRef.rst. +// +// Note that IntrReadWriteArgMem is used in all cases to prevent +// reordering. +def int_nacl_atomic_load : Intrinsic<[llvm_anyint_ty], + [LLVMPointerType>, llvm_i32_ty], + [IntrArgMemOnly]>; +def int_nacl_atomic_store : Intrinsic<[], + [llvm_anyint_ty, LLVMPointerType>, llvm_i32_ty], + [IntrArgMemOnly]>; +def int_nacl_atomic_rmw : Intrinsic<[llvm_anyint_ty], + [llvm_i32_ty, LLVMPointerType>, LLVMMatchType<0>, + llvm_i32_ty], + [IntrArgMemOnly]>; +def int_nacl_atomic_cmpxchg : Intrinsic<[llvm_anyint_ty], + [LLVMPointerType>, LLVMMatchType<0>, LLVMMatchType<0>, + llvm_i32_ty, llvm_i32_ty], + [IntrArgMemOnly]>; +def int_nacl_atomic_fence : Intrinsic<[], [llvm_i32_ty], + [IntrArgMemOnly]>; +def int_nacl_atomic_fence_all : Intrinsic<[], [], + [IntrArgMemOnly]>; +def int_nacl_atomic_is_lock_free : Intrinsic<[llvm_i1_ty], + [llvm_i32_ty, llvm_ptr_ty], [IntrNoMem]>, + GCCBuiltin<"__nacl_atomic_is_lock_free">; +// @LOCALMOD-END + //===-------------------------- Masked Intrinsics -------------------------===// // def int_masked_store : Intrinsic<[], [llvm_anyvector_ty, diff --git a/include/llvm/IR/NaClAtomicIntrinsics.h b/include/llvm/IR/NaClAtomicIntrinsics.h new file mode 100644 index 000000000000..e820b9df504d --- /dev/null +++ b/include/llvm/IR/NaClAtomicIntrinsics.h @@ -0,0 +1,110 @@ +//===-- llvm/IR/NaClAtomicIntrinsics.h - NaCl Atomic Intrinsics -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file describes atomic intrinsic functions that are specific to NaCl. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_IR_NACL_ATOMIC_INTRINSICS_H +#define LLVM_IR_NACL_ATOMIC_INTRINSICS_H + +#include "llvm/IR/Intrinsics.h" +#include "llvm/Support/Compiler.h" +#include + +namespace llvm { + +namespace NaCl { + +static const size_t NumAtomicIntrinsics = 6; +static const size_t NumAtomicIntrinsicOverloadTypes = 4; +static const size_t MaxAtomicIntrinsicsParameters = 5; + +/// Describe all the atomic intrinsics and their type signature. Most +/// can be overloaded on a type. +class AtomicIntrinsics { +public: + enum ParamType { + NoP, /// No parameter. + Int, /// Overloaded. + Ptr, /// Overloaded. + RMW, /// Atomic RMW operation type. + Mem /// Memory order. + }; + + struct AtomicIntrinsic { + Type *OverloadedType; + Intrinsic::ID ID; + uint8_t Overloaded : 1; + uint8_t NumParams : 7; + uint8_t ParamType[MaxAtomicIntrinsicsParameters]; + + Function *getDeclaration(Module *M) const { + // The atomic intrinsic can be overloaded on zero or one type, + // which is needed to create the function's declaration. + return Intrinsic::getDeclaration( + M, ID, ArrayRef(&OverloadedType, Overloaded ? 1 : 0)); + } + }; + + AtomicIntrinsics(LLVMContext &C); + ~AtomicIntrinsics() {} + + typedef ArrayRef View; + + /// Access all atomic intrinsics, which can then be iterated over. + View allIntrinsicsAndOverloads() const; + /// Access a particular atomic intrinsic. + /// \returns 0 if no intrinsic was found. + const AtomicIntrinsic *find(Intrinsic::ID ID, Type *OverloadedType) const; + +private: + AtomicIntrinsic I[NumAtomicIntrinsics][NumAtomicIntrinsicOverloadTypes]; + + AtomicIntrinsics() = delete; + AtomicIntrinsics(const AtomicIntrinsics &) = delete; + AtomicIntrinsics &operator=(const AtomicIntrinsics &) = delete; +}; + +/// Operations that can be represented by the @llvm.nacl.atomic.rmw +/// intrinsic. +/// +/// Do not reorder these values: their order offers forward +/// compatibility of bitcode targeted to NaCl. +enum AtomicRMWOperation { + AtomicInvalid = 0, // Invalid, keep first. + AtomicAdd, + AtomicSub, + AtomicOr, + AtomicAnd, + AtomicXor, + AtomicExchange, + AtomicNum // Invalid, keep last. +}; + +/// Memory orderings supported by C11/C++11. +/// +/// Do not reorder these values: their order offers forward +/// compatibility of bitcode targeted to NaCl. +enum MemoryOrder { + MemoryOrderInvalid = 0, // Invalid, keep first. + MemoryOrderRelaxed, + MemoryOrderConsume, + MemoryOrderAcquire, + MemoryOrderRelease, + MemoryOrderAcquireRelease, + MemoryOrderSequentiallyConsistent, + MemoryOrderNum // Invalid, keep last. 
+}; + +} // End NaCl namespace + +} // End llvm namespace + +#endif diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h index a34ebaf18a03..d5c98cddab4d 100644 --- a/include/llvm/InitializePasses.h +++ b/include/llvm/InitializePasses.h @@ -354,6 +354,69 @@ void initializeVirtRegMapPass(PassRegistry&); void initializeVirtRegRewriterPass(PassRegistry&); void initializeWholeProgramDevirtPass(PassRegistry &); void initializeWinEHPreparePass(PassRegistry&); + +// @LOCALMOD-BEGIN +void initializeAddPNaClExternalDeclsPass(PassRegistry&); +void initializeAllocateDataSegmentPass(PassRegistry&); +void initializeBackendCanonicalizePass(PassRegistry&); +void initializeCanonicalizeMemIntrinsicsPass(PassRegistry&); +void initializeCleanupUsedGlobalsMetadataPass(PassRegistry&); +void initializeConstantInsertExtractElementIndexPass(PassRegistry&); +void initializeExpandAllocasPass(PassRegistry&); +void initializeExpandArithWithOverflowPass(PassRegistry&); +void initializeExpandByValPass(PassRegistry&); +void initializeExpandConstantExprPass(PassRegistry&); +void initializeExpandCtorsPass(PassRegistry&); +void initializeExpandGetElementPtrPass(PassRegistry&); +void initializeExpandIndirectBrPass(PassRegistry&); +void initializeExpandLargeIntegersPass(PassRegistry&); +void initializeExpandShuffleVectorPass(PassRegistry&); +void initializeExpandSmallArgumentsPass(PassRegistry&); +void initializeExpandStructRegsPass(PassRegistry&); +void initializeExpandTlsConstantExprPass(PassRegistry&); +void initializeExpandTlsPass(PassRegistry&); +void initializeExpandVarArgsPass(PassRegistry&); +void initializeFixVectorLoadStoreAlignmentPass(PassRegistry&); +void initializeFlattenGlobalsPass(PassRegistry&); +void initializeGlobalCleanupPass(PassRegistry&); +void initializeGlobalizeConstantVectorsPass(PassRegistry&); +void initializeInsertDivideCheckPass(PassRegistry&); +void initializeInternalizeUsedGlobalsPass(PassRegistry&); +void initializeNaClCcRewritePass(PassRegistry&); +void initializeNormalizeAlignmentPass(PassRegistry&); +void initializePNaClABIVerifyFunctionsPass(PassRegistry&); +void initializePNaClABIVerifyModulePass(PassRegistry&); +void initializePNaClSjLjEHPass(PassRegistry&); +void initializePromoteI1OpsPass(PassRegistry&); +void initializePromoteIntegersPass(PassRegistry&); +void initializeRemoveAsmMemoryPass(PassRegistry&); +void initializeRenameEntryPointPass(PassRegistry&); +void initializeReplacePtrsWithIntsPass(PassRegistry&); +void initializeResolveAliasesPass(PassRegistry&); +void initializeResolvePNaClIntrinsicsPass(PassRegistry&); +void initializeRewriteAtomicsPass(PassRegistry&); +void initializeRewriteLLVMIntrinsicsPass(PassRegistry&); +void initializeRewritePNaClLibraryCallsPass(PassRegistry&); +void initializeSandboxIndirectCallsPass(PassRegistry&); +void initializeSandboxMemoryAccessesPass(PassRegistry&); +void initializeSimplifyAllocasPass(PassRegistry&); +void initializeSimplifyStructRegSignaturesPass(PassRegistry&); +void initializeStripAttributesPass(PassRegistry&); +void initializeStripMetadataPass(PassRegistry&); +void initializeStripModuleFlagsPass(PassRegistry&); +void initializeStripDanglingDISubprogramsPass(PassRegistry&); +void initializeStripTlsPass(PassRegistry&); +void initializeSubstituteUndefsPass(PassRegistry&); +// Emscripten passes: +void initializeExpandI64Pass(PassRegistry&); +void initializeExpandInsertExtractElementPass(PassRegistry&); +void initializeLowerEmAsyncifyPass(PassRegistry&); +void initializeLowerEmExceptionsPass(PassRegistry&); 
+void initializeLowerEmSetjmpPass(PassRegistry&); +void initializeNoExitRuntimePass(PassRegistry&); +// Emscripten passes end. +// @LOCALMOD-END + void initializeWriteBitcodePassPass(PassRegistry &); void initializeWriteThinLTOBitcodePass(PassRegistry &); void initializeXRayInstrumentationPass(PassRegistry &); diff --git a/include/llvm/Transforms/NaCl.h b/include/llvm/Transforms/NaCl.h new file mode 100644 index 000000000000..56884e16a43f --- /dev/null +++ b/include/llvm/Transforms/NaCl.h @@ -0,0 +1,109 @@ +//===-- NaCl.h - NaCl Transformations ---------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_NACL_H +#define LLVM_TRANSFORMS_NACL_H + +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" + +namespace llvm { + +class BasicBlockPass; +class Function; +class FunctionPass; +class FunctionType; +class Instruction; +class ModulePass; +class Triple; +class Use; +class Value; + +BasicBlockPass *createConstantInsertExtractElementIndexPass(); +BasicBlockPass *createExpandGetElementPtrPass(); +BasicBlockPass *createExpandShuffleVectorPass(); +BasicBlockPass *createFixVectorLoadStoreAlignmentPass(); +BasicBlockPass *createPromoteI1OpsPass(); +BasicBlockPass *createSimplifyAllocasPass(); +FunctionPass *createBackendCanonicalizePass(); +FunctionPass *createExpandConstantExprPass(); +FunctionPass *createExpandLargeIntegersPass(); +FunctionPass *createExpandStructRegsPass(); +FunctionPass *createInsertDivideCheckPass(); +FunctionPass *createNormalizeAlignmentPass(); +FunctionPass *createRemoveAsmMemoryPass(); +FunctionPass *createResolvePNaClIntrinsicsPass(); +ModulePass *createAddPNaClExternalDeclsPass(); +ModulePass *createCanonicalizeMemIntrinsicsPass(); +ModulePass *createCleanupUsedGlobalsMetadataPass(); +ModulePass *createExpandArithWithOverflowPass(); +ModulePass *createExpandByValPass(); +ModulePass *createExpandCtorsPass(); +ModulePass *createExpandIndirectBrPass(); +ModulePass *createExpandSmallArgumentsPass(); +ModulePass *createExpandTlsConstantExprPass(); +ModulePass *createExpandTlsPass(); +ModulePass *createExpandVarArgsPass(); +ModulePass *createFlattenGlobalsPass(); +ModulePass *createGlobalCleanupPass(); +ModulePass *createGlobalizeConstantVectorsPass(); +ModulePass *createInternalizeUsedGlobalsPass(); +ModulePass *createPNaClSjLjEHPass(); +ModulePass *createPromoteIntegersPass(); +ModulePass *createReplacePtrsWithIntsPass(); +ModulePass *createResolveAliasesPass(); +ModulePass *createRewriteAtomicsPass(); +ModulePass *createRewriteLLVMIntrinsicsPass(); +ModulePass *createRewritePNaClLibraryCallsPass(); +ModulePass *createSimplifyStructRegSignaturesPass(); +ModulePass *createStripAttributesPass(); +ModulePass *createStripMetadataPass(); +ModulePass *createStripModuleFlagsPass(); +ModulePass *createStripDanglingDISubprogramsPass(); + +// Emscripten passes: +FunctionPass *createExpandInsertExtractElementPass(); +ModulePass *createExpandI64Pass(); +ModulePass *createLowerEmAsyncifyPass(); +ModulePass *createLowerEmExceptionsPass(); +ModulePass *createLowerEmSetjmpPass(); +ModulePass *createNoExitRuntimePass(); +// Emscripten passes end. 
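Note on the create* declarations above: they are plain constructor functions for legacy passes, and the pipeline that strings them together lives elsewhere in this patch set. As a minimal sketch only — pass selection and ordering here are illustrative, not the pipeline pnacl/emscripten actually builds — they would be driven through LLVM's legacy PassManager roughly like this:

    #include "llvm/IR/LegacyPassManager.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Transforms/NaCl.h"

    // Illustrative only: run a few of the Emscripten lowering passes declared
    // above over a module. Real drivers choose and order passes differently.
    static void runEmscriptenLoweringSketch(llvm::Module &M) {
      llvm::legacy::PassManager PM;
      PM.add(llvm::createLowerEmExceptionsPass()); // rewrite invoke/landingpad for Emscripten's JS exception support
      PM.add(llvm::createLowerEmSetjmpPass());     // lower setjmp/longjmp to Emscripten helpers
      PM.add(llvm::createExpandI64Pass());         // split i64 operations into i32 pairs
      PM.run(M);
    }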
+ +//void PNaClABISimplifyAddPreOptPasses(Triple *T, PassManagerBase &PM); +//void PNaClABISimplifyAddPostOptPasses(Triple *T, PassManagerBase &PM); + +Instruction *PhiSafeInsertPt(Use *U); +void PhiSafeReplaceUses(Use *U, Value *NewVal); + +// Copy debug information from Original to New, and return New. +template T *CopyDebug(T *New, Instruction *Original) { + New->setDebugLoc(Original->getDebugLoc()); + return New; +} + +template +static void CopyLoadOrStoreAttrs(InstType *Dest, InstType *Src) { + Dest->setVolatile(Src->isVolatile()); + Dest->setAlignment(Src->getAlignment()); + Dest->setOrdering(Src->getOrdering()); + Dest->setSynchScope(Src->getSynchScope()); +} + +// In order to change a function's type, the function must be +// recreated. RecreateFunction() recreates Func with type NewType. +// It copies or moves across everything except the argument values, +// which the caller must update because the argument types might be +// different. +Function *RecreateFunction(Function *Func, FunctionType *NewType); + +} + +#endif diff --git a/lib/IR/CMakeLists.txt b/lib/IR/CMakeLists.txt index 11259cbe1815..c14b88780df2 100644 --- a/lib/IR/CMakeLists.txt +++ b/lib/IR/CMakeLists.txt @@ -37,6 +37,7 @@ add_llvm_library(LLVMCore Mangler.cpp Metadata.cpp Module.cpp + NaClAtomicIntrinsics.cpp ModuleSummaryIndex.cpp Operator.cpp OptBisect.cpp diff --git a/lib/IR/NaClAtomicIntrinsics.cpp b/lib/IR/NaClAtomicIntrinsics.cpp new file mode 100644 index 000000000000..8cd18a225b66 --- /dev/null +++ b/lib/IR/NaClAtomicIntrinsics.cpp @@ -0,0 +1,76 @@ +//=== llvm/IR/NaClAtomicIntrinsics.cpp - NaCl Atomic Intrinsics -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes atomic intrinsic functions that are specific to NaCl. +// +//===----------------------------------------------------------------------===// + +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/NaClAtomicIntrinsics.h" +#include "llvm/IR/Type.h" + +namespace llvm { + +namespace NaCl { + +AtomicIntrinsics::AtomicIntrinsics(LLVMContext &C) { + Type *IT[NumAtomicIntrinsicOverloadTypes] = { Type::getInt8Ty(C), + Type::getInt16Ty(C), + Type::getInt32Ty(C), + Type::getInt64Ty(C) }; + size_t CurIntrin = 0; + + // Initialize each of the atomic intrinsics and their overloads. They + // have up to 5 parameters, the following macro will take care of + // overloading. 
+#define INIT(P0, P1, P2, P3, P4, INTRIN) \ + do { \ + for (size_t CurType = 0; CurType != NumAtomicIntrinsicOverloadTypes; \ + ++CurType) { \ + size_t Param = 0; \ + I[CurIntrin][CurType].OverloadedType = IT[CurType]; \ + I[CurIntrin][CurType].ID = Intrinsic::nacl_atomic_##INTRIN; \ + I[CurIntrin][CurType].Overloaded = \ + P0 == Int || P0 == Ptr || P1 == Int || P1 == Ptr || P2 == Int || \ + P2 == Ptr || P3 == Int || P3 == Ptr || P4 == Int || P4 == Ptr; \ + I[CurIntrin][CurType].NumParams = \ + (P0 != NoP) + (P1 != NoP) + (P2 != NoP) + (P3 != NoP) + (P4 != NoP); \ + I[CurIntrin][CurType].ParamType[Param++] = P0; \ + I[CurIntrin][CurType].ParamType[Param++] = P1; \ + I[CurIntrin][CurType].ParamType[Param++] = P2; \ + I[CurIntrin][CurType].ParamType[Param++] = P3; \ + I[CurIntrin][CurType].ParamType[Param++] = P4; \ + } \ + ++CurIntrin; \ + } while (0) + + INIT(Ptr, Mem, NoP, NoP, NoP, load); + INIT(Ptr, Int, Mem, NoP, NoP, store); + INIT(RMW, Ptr, Int, Mem, NoP, rmw); + INIT(Ptr, Int, Int, Mem, Mem, cmpxchg); + INIT(Mem, NoP, NoP, NoP, NoP, fence); + INIT(NoP, NoP, NoP, NoP, NoP, fence_all); +} + +AtomicIntrinsics::View AtomicIntrinsics::allIntrinsicsAndOverloads() const { + return View(&I[0][0], NumAtomicIntrinsics * NumAtomicIntrinsicOverloadTypes); +} + +const AtomicIntrinsics::AtomicIntrinsic * +AtomicIntrinsics::find(Intrinsic::ID ID, Type *OverloadedType) const { + View R = allIntrinsicsAndOverloads(); + for (const AtomicIntrinsic *AI = R.begin(), *E = R.end(); AI != E; ++AI) + if (AI->ID == ID && AI->OverloadedType == OverloadedType) + return AI; + return 0; +} + +} // End NaCl namespace + +} // End llvm namespace diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp index 5855059a189c..b13908ff994e 100644 --- a/lib/IR/Verifier.cpp +++ b/lib/IR/Verifier.cpp @@ -655,10 +655,10 @@ void Verifier::visitGlobalVariable(const GlobalVariable &GV) { AssertDI(false, "!dbg attachment of global variable must be a DIGlobalVariableExpression"); } - if (!GV.hasInitializer()) { + //if (!GV.hasInitializer()) { // XXX EMSCRIPTEN - do not do extra verification below, 40x slower linking on some big projects visitGlobalValue(GV); return; - } + //} // Walk any aggregate initializers looking for bitcasts between address spaces visitConstantExprsRecursively(GV.getInitializer()); diff --git a/lib/Support/Triple.cpp b/lib/Support/Triple.cpp index 9cf2d5d3fcd4..a2d94c7b8eb3 100644 --- a/lib/Support/Triple.cpp +++ b/lib/Support/Triple.cpp @@ -56,6 +56,7 @@ StringRef Triple::getArchTypeName(ArchType Kind) { case nvptx64: return "nvptx64"; case le32: return "le32"; case le64: return "le64"; + case asmjs: return "asmjs"; // @LOCALMOD Emscripten case amdil: return "amdil"; case amdil64: return "amdil64"; case hsail: return "hsail"; @@ -124,6 +125,8 @@ StringRef Triple::getArchTypePrefix(ArchType Kind) { case le32: return "le32"; case le64: return "le64"; + case asmjs: return "asmjs"; // @LOCALMOD Emscripten + case amdil: case amdil64: return "amdil"; @@ -187,6 +190,7 @@ StringRef Triple::getOSTypeName(OSType Kind) { case Haiku: return "haiku"; case Minix: return "minix"; case RTEMS: return "rtems"; + case Emscripten: return "emscripten"; // @LOCALMOD Emscripten case NaCl: return "nacl"; case CNK: return "cnk"; case Bitrig: return "bitrig"; @@ -285,6 +289,7 @@ Triple::ArchType Triple::getArchTypeForLLVMName(StringRef Name) { .Case("nvptx64", nvptx64) .Case("le32", le32) .Case("le64", le64) + .Case("asmjs", asmjs) // @LOCALMOD Emscripten .Case("amdil", amdil) .Case("amdil64", amdil64) .Case("hsail", hsail) @@ -399,6 
+404,7 @@ static Triple::ArchType parseArch(StringRef ArchName) { .Case("nvptx64", Triple::nvptx64) .Case("le32", Triple::le32) .Case("le64", Triple::le64) + .Case("asmjs", Triple::asmjs) // @LOCALMOD Emscripten .Case("amdil", Triple::amdil) .Case("amdil64", Triple::amdil64) .Case("hsail", Triple::hsail) @@ -466,6 +472,7 @@ static Triple::OSType parseOS(StringRef OSName) { .StartsWith("haiku", Triple::Haiku) .StartsWith("minix", Triple::Minix) .StartsWith("rtems", Triple::RTEMS) + .StartsWith("emscripten", Triple::Emscripten) // @LOCALMOD Emscripten .StartsWith("nacl", Triple::NaCl) .StartsWith("cnk", Triple::CNK) .StartsWith("bitrig", Triple::Bitrig) @@ -604,6 +611,7 @@ static Triple::ObjectFormatType getDefaultFormat(const Triple &T) { case Triple::amdil: case Triple::amdil64: case Triple::armeb: + case Triple::asmjs: // @LOCALMOD Emscripten case Triple::avr: case Triple::bpfeb: case Triple::bpfel: @@ -1150,6 +1158,7 @@ static unsigned getArchPointerBitWidth(llvm::Triple::ArchType Arch) { case llvm::Triple::armeb: case llvm::Triple::hexagon: case llvm::Triple::le32: + case llvm::Triple::asmjs: // @LOCALMOD Emscripten case llvm::Triple::mips: case llvm::Triple::mipsel: case llvm::Triple::nvptx: @@ -1233,6 +1242,7 @@ Triple Triple::get32BitArchVariant() const { case Triple::hexagon: case Triple::kalimba: case Triple::le32: + case Triple::asmjs: // @LOCALMOD Emscripten case Triple::mips: case Triple::mipsel: case Triple::nvptx: @@ -1286,6 +1296,7 @@ Triple Triple::get64BitArchVariant() const { case Triple::tce: case Triple::tcele: case Triple::xcore: + case Triple::asmjs: // @LOCALMOD Emscripten case Triple::sparcel: case Triple::shave: T.setArch(UnknownArch); @@ -1345,6 +1356,7 @@ Triple Triple::getBigEndianArchVariant() const { case Triple::amdgcn: case Triple::amdil64: case Triple::amdil: + case Triple::asmjs: case Triple::avr: case Triple::hexagon: case Triple::hsail64: @@ -1428,6 +1440,7 @@ bool Triple::isLittleEndian() const { case Triple::amdil64: case Triple::amdil: case Triple::arm: + case Triple::asmjs: case Triple::avr: case Triple::bpfel: case Triple::hexagon: diff --git a/lib/Target/JSBackend/AllocaManager.cpp b/lib/Target/JSBackend/AllocaManager.cpp new file mode 100644 index 000000000000..4d4f2609bc25 --- /dev/null +++ b/lib/Target/JSBackend/AllocaManager.cpp @@ -0,0 +1,594 @@ +//===-- AllocaManager.cpp -------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the AllocaManager class. +// +// The AllocaManager computes a frame layout, assigning every static alloca an +// offset. It does alloca liveness analysis in order to reuse stack memory, +// using lifetime intrinsics. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "allocamanager" +#include "AllocaManager.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/CFG.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/Timer.h" +#include "llvm/ADT/Statistic.h" +using namespace llvm; + +STATISTIC(NumAllocas, "Number of allocas eliminated"); + +static const char *TimerGroupName = "AllocaManager"; +static const char *TimerGroupDesc = "Alloca manager"; + +// Return the size of the given alloca. +uint64_t AllocaManager::getSize(const AllocaInst *AI) { + assert(AI->isStaticAlloca()); + return DL->getTypeAllocSize(AI->getAllocatedType()) * + cast(AI->getArraySize())->getValue().getZExtValue(); +} + +// Return the alignment of the given alloca. +unsigned AllocaManager::getAlignment(const AllocaInst *AI) { + assert(AI->isStaticAlloca()); + unsigned Alignment = std::max(AI->getAlignment(), + DL->getABITypeAlignment(AI->getAllocatedType())); + MaxAlignment = std::max(Alignment, MaxAlignment); + return Alignment; +} + +AllocaManager::AllocaInfo AllocaManager::getInfo(const AllocaInst *AI, unsigned Index) { + assert(AI->isStaticAlloca()); + return AllocaInfo(AI, getSize(AI), getAlignment(AI), Index); +} + +// Given a lifetime_start or lifetime_end intrinsic, determine if it's +// describing a single pointer suitable for our analysis. If so, +// return the pointer, otherwise return NULL. +const Value * +AllocaManager::getPointerFromIntrinsic(const CallInst *CI) { + const IntrinsicInst *II = cast(CI); + assert(II->getIntrinsicID() == Intrinsic::lifetime_start || + II->getIntrinsicID() == Intrinsic::lifetime_end); + + // Lifetime intrinsics have a size as their first argument and a pointer as + // their second argument. + const Value *Size = II->getArgOperand(0); + const Value *Ptr = II->getArgOperand(1); + + // Check to see if we can convert the size to a host integer. If we can't, + // it's probably not worth worrying about. + const ConstantInt *SizeCon = dyn_cast(Size); + if (!SizeCon) return NULL; + const APInt &SizeAP = SizeCon->getValue(); + if (SizeAP.getActiveBits() > 64) return NULL; + uint64_t MarkedSize = SizeAP.getZExtValue(); + + // Test whether the pointer operand is an alloca. This ought to be pretty + // simple, but e.g. PRE can decide to PRE bitcasts and no-op geps and + // split critical edges and insert phis for them, even though it's all + // just no-ops, so we have to dig through phis to see whether all the + // inputs are in fact the same pointer after stripping away casts. + const Value *Result = NULL; + SmallPtrSet VisitedPhis; + SmallVector Worklist; + Worklist.push_back(Ptr); + do { + const Value *P = Worklist.pop_back_val()->stripPointerCasts(); + + if (const PHINode *Phi = dyn_cast(P)) { + if (!VisitedPhis.insert(Phi).second) + continue; + for (unsigned i = 0, e = Phi->getNumOperands(); i < e; ++i) + Worklist.push_back(Phi->getOperand(i)); + continue; + } + if (const SelectInst *Select = dyn_cast(P)) { + Worklist.push_back(Select->getTrueValue()); + Worklist.push_back(Select->getFalseValue()); + continue; + } + + if (Result == NULL) + Result = P; + else if (Result != P) + return NULL; + } while (!Worklist.empty()); + + // If it's a static Alloca, make sure the size is suitable. 
We test this here + // because if this fails, we need to be as conservative as if we don't know + // what the pointer is. + if (const AllocaInst *AI = dyn_cast(Result)) { + if (AI->isStaticAlloca() && MarkedSize < getSize(AI)) + return NULL; + } else if (isa(Result)) { + // And if it's any other kind of non-object/argument, we have to be + // similarly conservative, because we may be dealing with an escaped alloca + // that we can't see. + return NULL; + } + + // Yay, it's all just one Value! + return Result; +} + +// Test whether the given value is an alloca which we have a hope of +const AllocaInst *AllocaManager::isFavorableAlloca(const Value *V) { + const AllocaInst *AI = dyn_cast(V); + if (!AI) return NULL; + + if (!AI->isStaticAlloca()) return NULL; + + return AI; +} + +int AllocaManager::AllocaSort(const AllocaInfo *li, const AllocaInfo *ri) { + // Sort by alignment to minimize padding. + if (li->getAlignment() > ri->getAlignment()) return -1; + if (li->getAlignment() < ri->getAlignment()) return 1; + + // Ensure a stable sort by comparing an index value which we've kept for + // this purpose. + if (li->getIndex() > ri->getIndex()) return -1; + if (li->getIndex() < ri->getIndex()) return 1; + + return 0; +} + +// Collect allocas +void AllocaManager::collectMarkedAllocas() { + NamedRegionTimer Timer("collect-marked-allocas", "Collect Marked Allocas", + TimerGroupName, TimerGroupDesc, TimePassesIsEnabled); + + // Weird semantics: If an alloca *ever* appears in a lifetime start or end + // within the same function, its lifetime begins only at the explicit lifetime + // starts and ends only at the explicit lifetime ends and function exit + // points. Otherwise, its lifetime begins in the entry block and it is live + // everywhere. + // + // And so, instead of just walking the entry block to find all the static + // allocas, we walk the whole body to find the intrinsics so we can find the + // set of static allocas referenced in the intrinsics. + for (Function::const_iterator FI = F->begin(), FE = F->end(); + FI != FE; ++FI) { + for (BasicBlock::const_iterator BI = FI->begin(), BE = FI->end(); + BI != BE; ++BI) { + const CallInst *CI = dyn_cast(BI); + if (!CI) continue; + + const Value *Callee = CI->getCalledValue(); + if (Callee == LifetimeStart || Callee == LifetimeEnd) { + if (const Value *Ptr = getPointerFromIntrinsic(CI)) { + if (const AllocaInst *AI = isFavorableAlloca(Ptr)) + Allocas.insert(std::make_pair(AI, 0)); + } else if (isa(CI->getArgOperand(1)->stripPointerCasts())) { + // Oh noes, There's a lifetime intrinsics with something that + // doesn't appear to resolve to an alloca. This means that it's + // possible that it may be declaring a lifetime for some escaping + // alloca. Look out! + Allocas.clear(); + assert(AllocasByIndex.empty()); + return; + } + } + } + } + + // All that said, we still want the intrinsics in the order they appear in the + // block, so that we can represent later ones with earlier ones and skip + // worrying about dominance, so run through the entry block and index those + // allocas which we identified above. 
+ AllocasByIndex.reserve(Allocas.size()); + const BasicBlock *EntryBB = &F->getEntryBlock(); + for (BasicBlock::const_iterator BI = EntryBB->begin(), BE = EntryBB->end(); + BI != BE; ++BI) { + const AllocaInst *AI = dyn_cast(BI); + if (!AI || !AI->isStaticAlloca()) continue; + + AllocaMap::iterator I = Allocas.find(AI); + if (I != Allocas.end()) { + I->second = AllocasByIndex.size(); + AllocasByIndex.push_back(getInfo(AI, AllocasByIndex.size())); + } + } + assert(AllocasByIndex.size() == Allocas.size()); +} + +// Calculate the starting point from which inter-block liveness will be +// computed. +void AllocaManager::collectBlocks() { + NamedRegionTimer Timer("collect-blocks", "Collect Blocks", TimerGroupName, + TimerGroupDesc, TimePassesIsEnabled); + + size_t AllocaCount = AllocasByIndex.size(); + + BitVector Seen(AllocaCount); + + for (Function::const_iterator I = F->begin(), E = F->end(); I != E; ++I) { + const BasicBlock *BB = &*I; + + BlockLifetimeInfo &BLI = BlockLiveness[BB]; + BLI.Start.resize(AllocaCount); + BLI.End.resize(AllocaCount); + + // Track which allocas we've seen. This is used because if a lifetime start + // is the first lifetime marker for an alloca in a block, the alloca is + // live-in. + Seen.reset(); + + // Walk the instructions and compute the Start and End sets. + for (BasicBlock::const_iterator BI = BB->begin(), BE = BB->end(); + BI != BE; ++BI) { + const CallInst *CI = dyn_cast(BI); + if (!CI) continue; + + const Value *Callee = CI->getCalledValue(); + if (Callee == LifetimeStart) { + if (const Value *Ptr = getPointerFromIntrinsic(CI)) { + if (const AllocaInst *AI = isFavorableAlloca(Ptr)) { + AllocaMap::const_iterator MI = Allocas.find(AI); + if (MI != Allocas.end()) { + size_t AllocaIndex = MI->second; + if (!Seen.test(AllocaIndex)) { + BLI.Start.set(AllocaIndex); + } + BLI.End.reset(AllocaIndex); + Seen.set(AllocaIndex); + } + } + } + } else if (Callee == LifetimeEnd) { + if (const Value *Ptr = getPointerFromIntrinsic(CI)) { + if (const AllocaInst *AI = isFavorableAlloca(Ptr)) { + AllocaMap::const_iterator MI = Allocas.find(AI); + if (MI != Allocas.end()) { + size_t AllocaIndex = MI->second; + BLI.End.set(AllocaIndex); + Seen.set(AllocaIndex); + } + } + } + } + } + + // Lifetimes that start in this block and do not end here are live-out. + BLI.LiveOut = BLI.Start; + BLI.LiveOut.reset(BLI.End); + if (BLI.LiveOut.any()) { + for (succ_const_iterator SI = succ_begin(BB), SE = succ_end(BB); + SI != SE; ++SI) { + InterBlockTopDownWorklist.insert(*SI); + } + } + + // Lifetimes that end in this block and do not start here are live-in. + // TODO: Is this actually true? What are the semantics of a standalone + // lifetime end? See also the code in computeInterBlockLiveness. + BLI.LiveIn = BLI.End; + BLI.LiveIn.reset(BLI.Start); + if (BLI.LiveIn.any()) { + for (const_pred_iterator PI = pred_begin(BB), PE = pred_end(BB); + PI != PE; ++PI) { + InterBlockBottomUpWorklist.insert(*PI); + } + } + } +} + +// Compute the LiveIn and LiveOut sets for each block in F. +void AllocaManager::computeInterBlockLiveness() { + NamedRegionTimer Timer("compute-inter-block-liveness", "Compute inter-block liveness", + TimerGroupName, TimerGroupDesc, TimePassesIsEnabled); + + size_t AllocaCount = AllocasByIndex.size(); + + BitVector Temp(AllocaCount); + + // Proporgate liveness backwards. + while (!InterBlockBottomUpWorklist.empty()) { + const BasicBlock *BB = InterBlockBottomUpWorklist.pop_back_val(); + BlockLifetimeInfo &BLI = BlockLiveness[BB]; + + // Compute the new live-out set. 
+ for (succ_const_iterator SI = succ_begin(BB), SE = succ_end(BB); + SI != SE; ++SI) { + Temp |= BlockLiveness[*SI].LiveIn; + } + + // If it contains new live blocks, prepare to propagate them. + // TODO: As above, what are the semantics of a standalone lifetime end? + Temp.reset(BLI.Start); + if (Temp.test(BLI.LiveIn)) { + BLI.LiveIn |= Temp; + for (const_pred_iterator PI = pred_begin(BB), PE = pred_end(BB); + PI != PE; ++PI) { + InterBlockBottomUpWorklist.insert(*PI); + } + } + Temp.reset(); + } + + // Proporgate liveness forwards. + while (!InterBlockTopDownWorklist.empty()) { + const BasicBlock *BB = InterBlockTopDownWorklist.pop_back_val(); + BlockLifetimeInfo &BLI = BlockLiveness[BB]; + + // Compute the new live-in set. + for (const_pred_iterator PI = pred_begin(BB), PE = pred_end(BB); + PI != PE; ++PI) { + Temp |= BlockLiveness[*PI].LiveOut; + } + + // Also record the live-in values. + BLI.LiveIn |= Temp; + + // If it contains new live blocks, prepare to propagate them. + Temp.reset(BLI.End); + if (Temp.test(BLI.LiveOut)) { + BLI.LiveOut |= Temp; + for (succ_const_iterator SI = succ_begin(BB), SE = succ_end(BB); + SI != SE; ++SI) { + InterBlockTopDownWorklist.insert(*SI); + } + } + Temp.reset(); + } +} + +// Determine overlapping liveranges within blocks. +void AllocaManager::computeIntraBlockLiveness() { + NamedRegionTimer Timer("compute-intra-block-liveness", "Compute intra-block liveness", + TimerGroupName, TimerGroupDesc, TimePassesIsEnabled); + + size_t AllocaCount = AllocasByIndex.size(); + + BitVector Current(AllocaCount); + + AllocaCompatibility.resize(AllocaCount, BitVector(AllocaCount, true)); + + for (Function::const_iterator I = F->begin(), E = F->end(); I != E; ++I) { + const BasicBlock *BB = &*I; + const BlockLifetimeInfo &BLI = BlockLiveness[BB]; + + Current = BLI.LiveIn; + + for (int i = Current.find_first(); i >= 0; i = Current.find_next(i)) { + AllocaCompatibility[i].reset(Current); + } + + for (BasicBlock::const_iterator BI = BB->begin(), BE = BB->end(); + BI != BE; ++BI) { + const CallInst *CI = dyn_cast(BI); + if (!CI) continue; + + const Value *Callee = CI->getCalledValue(); + if (Callee == LifetimeStart) { + if (const Value *Ptr = getPointerFromIntrinsic(CI)) { + if (const AllocaInst *AI = isFavorableAlloca(Ptr)) { + size_t AIndex = Allocas[AI]; + // We conflict with everything else that's currently live. + AllocaCompatibility[AIndex].reset(Current); + // Everything else that's currently live conflicts with us. + for (int i = Current.find_first(); i >= 0; i = Current.find_next(i)) { + AllocaCompatibility[i].reset(AIndex); + } + // We're now live. + Current.set(AIndex); + } + } + } else if (Callee == LifetimeEnd) { + if (const Value *Ptr = getPointerFromIntrinsic(CI)) { + if (const AllocaInst *AI = isFavorableAlloca(Ptr)) { + size_t AIndex = Allocas[AI]; + // We're no longer live. + Current.reset(AIndex); + } + } + } + } + } +} + +// Decide which allocas will represent which other allocas, and if so what their +// size and alignment will need to be. +void AllocaManager::computeRepresentatives() { + NamedRegionTimer Timer("compute-representatives", "Compute Representatives", + TimerGroupName, TimerGroupDesc, TimePassesIsEnabled); + + for (size_t i = 0, e = AllocasByIndex.size(); i != e; ++i) { + // If we've already represented this alloca with another, don't visit it. + if (AllocasByIndex[i].isForwarded()) continue; + if (i > size_t(INT_MAX)) continue; + + // Find compatible allocas. This is a simple greedy algorithm. 
+ for (int j = int(i); ; ) { + assert(j >= int(i)); + j = AllocaCompatibility[i].find_next(j); + assert(j != int(i)); + if (j < 0) break; + if (!AllocaCompatibility[j][i]) continue; + + DEBUG(dbgs() << "Allocas: " + "Representing " + << AllocasByIndex[j].getInst()->getName() << " " + "with " + << AllocasByIndex[i].getInst()->getName() << "\n"); + ++NumAllocas; + + assert(!AllocasByIndex[j].isForwarded()); + + AllocasByIndex[i].mergeSize(AllocasByIndex[j].getSize()); + AllocasByIndex[i].mergeAlignment(AllocasByIndex[j].getAlignment()); + AllocasByIndex[j].forward(i); + + AllocaCompatibility[i] &= AllocaCompatibility[j]; + AllocaCompatibility[j].reset(); + } + } +} + +void AllocaManager::computeFrameOffsets() { + NamedRegionTimer Timer("compute-frame-offsets", "Compute Frame Offsets", + TimerGroupName, TimerGroupDesc, + TimePassesIsEnabled); + + // Walk through the entry block and collect all the allocas, including the + // ones with no lifetime markers that we haven't looked at yet. We walk in + // reverse order so that we can set the representative allocas as those that + // dominate the others as we go. + const BasicBlock *EntryBB = &F->getEntryBlock(); + for (BasicBlock::const_iterator BI = EntryBB->begin(), BE = EntryBB->end(); + BI != BE; ++BI) { + const AllocaInst *AI = dyn_cast(BI); + if (!AI || !AI->isStaticAlloca()) continue; + + AllocaMap::const_iterator I = Allocas.find(AI); + if (I != Allocas.end()) { + // An alloca with lifetime markers. Emit the record we've crafted for it, + // if we've chosen to keep it as a representative. + const AllocaInfo &Info = AllocasByIndex[I->second]; + if (!Info.isForwarded()) { + SortedAllocas.push_back(Info); + } + } else { + // An alloca with no lifetime markers. + SortedAllocas.push_back(getInfo(AI, SortedAllocas.size())); + } + } + + // Sort the allocas to hopefully reduce padding. + array_pod_sort(SortedAllocas.begin(), SortedAllocas.end(), AllocaSort); + + // Assign stack offsets. + uint64_t CurrentOffset = 0; + for (SmallVectorImpl::const_iterator I = SortedAllocas.begin(), + E = SortedAllocas.end(); I != E; ++I) { + const AllocaInfo &Info = *I; + uint64_t NewOffset = alignTo(CurrentOffset, Info.getAlignment()); + + // For backwards compatibility, align every power-of-two multiple alloca to + // its greatest power-of-two factor, up to 8 bytes. In particular, cube2hash + // is known to depend on this. + // TODO: Consider disabling this and making people fix their code. + if (uint64_t Size = Info.getSize()) { + uint64_t P2 = uint64_t(1) << countTrailingZeros(Size); + unsigned CompatAlign = unsigned(std::min(P2, uint64_t(8))); + NewOffset = alignTo(NewOffset, CompatAlign); + } + + const AllocaInst *AI = Info.getInst(); + StaticAllocas[AI] = StaticAllocation(AI, NewOffset); + + CurrentOffset = NewOffset + Info.getSize(); + } + + // Add allocas that were represented by other allocas to the StaticAllocas map + // so that our clients can look them up. + for (unsigned i = 0, e = AllocasByIndex.size(); i != e; ++i) { + const AllocaInfo &Info = AllocasByIndex[i]; + if (!Info.isForwarded()) continue; + size_t j = Info.getForwardedID(); + assert(!AllocasByIndex[j].isForwarded()); + + StaticAllocaMap::const_iterator I = + StaticAllocas.find(AllocasByIndex[j].getInst()); + assert(I != StaticAllocas.end()); + + std::pair Pair = + StaticAllocas.insert(std::make_pair(AllocasByIndex[i].getInst(), + I->second)); + assert(Pair.second); (void)Pair; + } + + // Record the final frame size. Keep the stack pointer 16-byte aligned. 
+ FrameSize = CurrentOffset; + FrameSize = alignTo(FrameSize, 16); + + DEBUG(dbgs() << "Allocas: " + "Statically allocated frame size is " << FrameSize << "\n"); +} + +AllocaManager::AllocaManager() : MaxAlignment(0) { +} + +void AllocaManager::analyze(const Function &Func, const DataLayout &Layout, + bool PerformColoring) { + NamedRegionTimer Timer("analyze", "Analyze", TimerGroupName, TimerGroupDesc, + TimePassesIsEnabled); + + assert(Allocas.empty()); + assert(AllocasByIndex.empty()); + assert(AllocaCompatibility.empty()); + assert(BlockLiveness.empty()); + assert(StaticAllocas.empty()); + assert(SortedAllocas.empty()); + + DL = &Layout; + F = &Func; + + // Get the declarations for the lifetime intrinsics so we can quickly test to + // see if they are used at all, and for use later if they are. + const Module *M = F->getParent(); + LifetimeStart = M->getFunction(Intrinsic::getName(Intrinsic::lifetime_start)); + LifetimeEnd = M->getFunction(Intrinsic::getName(Intrinsic::lifetime_end)); + + // If we are optimizing and the module contains any lifetime intrinsics, run + // the alloca coloring algorithm. + if (PerformColoring && + ((LifetimeStart && !LifetimeStart->use_empty()) || + (LifetimeEnd && !LifetimeEnd->use_empty()))) { + + collectMarkedAllocas(); + + if (!AllocasByIndex.empty()) { + DEBUG(dbgs() << "Allocas: " + << AllocasByIndex.size() << " marked allocas found\n"); + + collectBlocks(); + computeInterBlockLiveness(); + computeIntraBlockLiveness(); + BlockLiveness.clear(); + + computeRepresentatives(); + AllocaCompatibility.clear(); + } + } + + computeFrameOffsets(); + SortedAllocas.clear(); + Allocas.clear(); + AllocasByIndex.clear(); +} + +void AllocaManager::clear() { + StaticAllocas.clear(); +} + +bool +AllocaManager::getFrameOffset(const AllocaInst *AI, uint64_t *Offset) const { + assert(AI->isStaticAlloca()); + StaticAllocaMap::const_iterator I = StaticAllocas.find(AI); + assert(I != StaticAllocas.end()); + *Offset = I->second.Offset; + return AI == I->second.Representative; +} + +const AllocaInst * +AllocaManager::getRepresentative(const AllocaInst *AI) const { + assert(AI->isStaticAlloca()); + StaticAllocaMap::const_iterator I = StaticAllocas.find(AI); + assert(I != StaticAllocas.end()); + return I->second.Representative; +} diff --git a/lib/Target/JSBackend/AllocaManager.h b/lib/Target/JSBackend/AllocaManager.h new file mode 100644 index 000000000000..9aa833b71a6a --- /dev/null +++ b/lib/Target/JSBackend/AllocaManager.h @@ -0,0 +1,182 @@ +//===-- AllocaManager.h ---------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass declares the AllocaManager class. +// +//===----------------------------------------------------------------------===// + +#ifndef JSBACKEND_ALLOCAMANAGER_H +#define JSBACKEND_ALLOCAMANAGER_H + +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/SetVector.h" + +namespace llvm { + +class AllocaInst; +class BasicBlock; +class CallInst; +class DataLayout; +class Function; +class Value; + +/// Compute frame layout for allocas. +class AllocaManager { + const DataLayout *DL; + const Function *LifetimeStart; + const Function *LifetimeEnd; + const Function *F; + + // Per-block lifetime information. 
+ struct BlockLifetimeInfo { + BitVector Start; + BitVector End; + BitVector LiveIn; + BitVector LiveOut; + }; + typedef DenseMap LivenessMap; + LivenessMap BlockLiveness; + + // Worklist for inter-block liveness analysis. + typedef SmallSetVector InterBlockWorklistVec; + InterBlockWorklistVec InterBlockTopDownWorklist; + InterBlockWorklistVec InterBlockBottomUpWorklist; + + // Map allocas to their index in AllocasByIndex. + typedef DenseMap AllocaMap; + AllocaMap Allocas; + + // Information about an alloca. Note that the size and alignment may vary + // from what's in the actual AllocaInst when an alloca is also representing + // another with perhaps greater size and/or alignment needs. + // + // When an alloca is represented by another, its AllocaInfo is marked as + // "forwarded", at which point it no longer holds a size and alignment, but + // the index of the representative AllocaInfo. + class AllocaInfo { + const AllocaInst *Inst; + uint64_t Size; + unsigned Alignment; + unsigned Index; + + public: + AllocaInfo(const AllocaInst *I, uint64_t S, unsigned A, unsigned X) + : Inst(I), Size(S), Alignment(A), Index(X) { + assert(I != NULL); + assert(A != 0); + assert(!isForwarded()); + } + + bool isForwarded() const { return Alignment == 0; } + + size_t getForwardedID() const { + assert(isForwarded()); + return static_cast(Size); + } + + void forward(size_t i) { + assert(!isForwarded()); + Alignment = 0; + Size = i; + assert(isForwarded()); + assert(getForwardedID() == i); + } + + const AllocaInst *getInst() const { return Inst; } + + uint64_t getSize() const { assert(!isForwarded()); return Size; } + unsigned getAlignment() const { assert(!isForwarded()); return Alignment; } + unsigned getIndex() const { return Index; } + + void mergeSize(uint64_t S) { + assert(!isForwarded()); + Size = std::max(Size, S); + assert(!isForwarded()); + } + void mergeAlignment(unsigned A) { + assert(A != 0); + assert(!isForwarded()); + Alignment = std::max(Alignment, A); + assert(!isForwarded()); + } + }; + typedef SmallVector AllocaVec; + AllocaVec AllocasByIndex; + + // For each alloca, which allocas can it safely represent? Allocas are + // identified by AllocasByIndex index. + // TODO: Vector-of-vectors isn't the fastest data structure possible here. + typedef SmallVector AllocaCompatibilityVec; + AllocaCompatibilityVec AllocaCompatibility; + + // This is for allocas that will eventually be sorted. + SmallVector SortedAllocas; + + // Static allocation results. + struct StaticAllocation { + const AllocaInst *Representative; + uint64_t Offset; + StaticAllocation() {} + StaticAllocation(const AllocaInst *A, uint64_t O) + : Representative(A), Offset(O) {} + }; + typedef DenseMap StaticAllocaMap; + StaticAllocaMap StaticAllocas; + uint64_t FrameSize; + + uint64_t getSize(const AllocaInst *AI); + unsigned getAlignment(const AllocaInst *AI); + AllocaInfo getInfo(const AllocaInst *AI, unsigned Index); + const Value *getPointerFromIntrinsic(const CallInst *CI); + const AllocaInst *isFavorableAlloca(const Value *V); + static int AllocaSort(const AllocaInfo *l, const AllocaInfo *r); + + void collectMarkedAllocas(); + void collectBlocks(); + void computeInterBlockLiveness(); + void computeIntraBlockLiveness(); + void computeRepresentatives(); + void computeFrameOffsets(); + + unsigned MaxAlignment; + +public: + AllocaManager(); + + /// Analyze the given function and prepare for getRepresentative queries. 
+ void analyze(const Function &Func, const DataLayout &Layout, + bool PerformColoring); + + /// Reset all stored state. + void clear(); + + /// Return the representative alloca for the given alloca. When allocas are + /// merged, one is chosen as the representative to stand for the rest. + /// References to the alloca should take the form of references to the + /// representative. + const AllocaInst *getRepresentative(const AllocaInst *AI) const; + + /// Set *offset to the frame offset for the given alloca. Return true if the + /// given alloca is representative, meaning that it needs an explicit + /// definition in the function entry. Return false if some other alloca + /// represents this one. + bool getFrameOffset(const AllocaInst *AI, uint64_t *offset) const; + + /// Return the total frame size for all static allocas and associated padding. + uint64_t getFrameSize() const { return FrameSize; } + + /// Return the largest alignment seen. + unsigned getMaxAlignment() const { return MaxAlignment; } +}; + +} // namespace llvm + +#endif diff --git a/lib/Target/JSBackend/CMakeLists.txt b/lib/Target/JSBackend/CMakeLists.txt new file mode 100644 index 000000000000..942b9fef1be1 --- /dev/null +++ b/lib/Target/JSBackend/CMakeLists.txt @@ -0,0 +1,16 @@ +add_llvm_target(JSBackendCodeGen + AllocaManager.cpp + ExpandBigSwitches.cpp + JSBackend.cpp + JSTargetMachine.cpp + JSTargetTransformInfo.cpp + Relooper.cpp + RemoveLLVMAssume.cpp + SimplifyAllocas.cpp + ) + +add_dependencies(LLVMJSBackendCodeGen intrinsics_gen) + +add_subdirectory(TargetInfo) +add_subdirectory(MCTargetDesc) +add_subdirectory(NaCl) diff --git a/lib/Target/JSBackend/CallHandlers.h b/lib/Target/JSBackend/CallHandlers.h new file mode 100644 index 000000000000..e4584541b5ac --- /dev/null +++ b/lib/Target/JSBackend/CallHandlers.h @@ -0,0 +1,2212 @@ +// Call handlers: flexible map of call targets to arbitrary handling code +// +// Each handler needs DEF_CALL_HANDLER and SETUP_CALL_HANDLER +// +// Call handlers emit the code that the call will be replaced by. If that +// emitted code contains calls, it must add the targets to Declares, +// which are reported as declared but not implemented symbols, so that +// JS linking brings them in. + +#include "llvm/Support/ScopedPrinter.h" + +typedef std::string (JSWriter::*CallHandler)(const Instruction*, std::string Name, int NumArgs); +typedef std::map CallHandlerMap; +CallHandlerMap CallHandlers; + +// Definitions + +unsigned getNumArgOperands(const Instruction *I) { + return ImmutableCallSite(I).arg_size(); +} + +const Value *getActuallyCalledValue(const Instruction *I) { + const Value *CV = ImmutableCallSite(I).getCalledValue(); + + // if the called value is a bitcast of a function, then we just call it directly, properly + // for example, extern void x() in C will turn into void x(...) in LLVM IR, then the IR bitcasts + // it to the proper form right before the call. this both causes an unnecessary indirect + // call, and it is done with the wrong type. TODO: don't even put it into the function table + if (const Function *F = dyn_cast(stripPointerCastsWithoutSideEffects(CV))) { + CV = F; + } + return CV; +} + +// We can't and shouldn't try to invoke an LLVM intrinsic which we overload with a call hander - +// it would end up in a function table, which makes no sense. 
+bool canInvoke(const Value *V) { + const Function *F = dyn_cast(V); + if (F && F->isDeclaration() && F->isIntrinsic()) { + auto Intrin = F->getIntrinsicID(); + if (Intrin == Intrinsic::memcpy || Intrin == Intrinsic::memset || Intrin == Intrinsic::memmove) { + return false; + } + } + return true; +} + +#define DEF_CALL_HANDLER(Ident, Code) \ + std::string CH_##Ident(const Instruction *CI, std::string Name, int NumArgs=-1) { Code } + +DEF_CALL_HANDLER(__default__, { + if (!CI) return ""; // we are just called from a handler that was called from getFunctionIndex, only to ensure the handler was run at least once + const Value *CV = getActuallyCalledValue(CI); + bool NeedCasts = true; + FunctionType *FT; + bool Invoke = false; + bool Emulated = false; + if (InvokeState == 1) { + InvokeState = 2; + Invoke = canInvoke(CV); + } + std::string Sig; + bool IsMath = Name.find("Math_") == 0; + bool ForcedNumArgs = NumArgs != -1; + if (!ForcedNumArgs) NumArgs = getNumArgOperands(CI); + + const Function *F = dyn_cast(CV); + if (F) { + NeedCasts = F->isDeclaration(); // if ffi call, need casts + if (IsMath && !NeedCasts) { + // this was renamed to a math function, but the actual function is implemented, presumably from libc; use that + IsMath = false; + Name = getJSName(F); + } + FT = F->getFunctionType(); + } else { + FT = dyn_cast(dyn_cast(CV->getType())->getElementType()); + if (isAbsolute(CV->stripPointerCasts())) { + Name = "abort /* segfault, call an absolute addr */ "; + } else { + // function pointer call + ensureFunctionTable(FT); + if (!Invoke) { + Sig = getFunctionSignature(FT); + if (!EmulatedFunctionPointers) { + Name = std::string("FUNCTION_TABLE_") + Sig + "[" + Name + " & #FM_" + Sig + "#]"; + NeedCasts = false; // function table call, so stays in asm module + } else { + Name = std::string(Relocatable ? 
"mftCall_" : "ftCall_") + Sig + "(" + getCast(Name, Type::getInt32Ty(CI->getContext())); + if (NumArgs > 0) Name += ','; + Emulated = true; + if (WebAssembly) { + // this call uses the wasm Table + NeedCasts = false; + } + } + } + } + } + + if (!FT->isVarArg() && !ForcedNumArgs) { + int TypeNumArgs = FT->getNumParams(); + if (TypeNumArgs != NumArgs) { + if (EmscriptenAssertions) prettyWarning() << "unexpected number of arguments " << utostr(NumArgs) << " in call to '" << F->getName() << "', should be " << utostr(TypeNumArgs) << "\n"; + if (NumArgs > TypeNumArgs) NumArgs = TypeNumArgs; // lop off the extra params that will not be used and just break validation + } + if (EmscriptenAssertions) { + for (int i = 0; i < std::min(TypeNumArgs, NumArgs); i++) { + Type *TypeType = FT->getParamType(i); + Type *ActualType = CI->getOperand(i)->getType(); + if (getFunctionSignatureLetter(TypeType) != getFunctionSignatureLetter(ActualType)) { + prettyWarning() << "unexpected argument type " << *ActualType << " at index " << utostr(i) << " in call to '" << F->getName() << "', should be " << *TypeType << "\n"; + } + } + } + } + if (EmscriptenAssertions) { + Type *TypeType = FT->getReturnType(); + Type *ActualType = CI->getType(); + if (getFunctionSignatureLetter(TypeType) != getFunctionSignatureLetter(ActualType)) { + prettyWarning() << "unexpected return type " << *ActualType << " in call to '" << F->getName() << "', should be " << *TypeType << "\n"; + } + } + + if (Invoke) { + Sig = getFunctionSignature(FT); + Name = "invoke_" + Sig; + NeedCasts = true; + } + std::string text = Name; + if (!Emulated) text += "("; + if (Invoke) { + // add first param + if (F) { + text += relocateFunctionPointer(utostr(getFunctionIndex(F))); // convert to function pointer + } else { + text += getValueAsCastStr(CV); // already a function pointer + } + if (NumArgs > 0) text += ","; + } + // this is an ffi call if we need casts, and it is not a special Math_ builtin or wasm-only intrinsic + bool FFI = NeedCasts; + if (FFI) { + if (IsMath && (Name == "Math_ceil" || Name == "Math_floor" || Name == "Math_min" || Name == "Math_max" || Name == "Math_sqrt" || Name == "Math_abs")) { + // This special Math builtin is optimizable with all types, including floats, so can treat it as non-ffi + FFI = false; + } else if (OnlyWebAssembly && Name == "f32_copysign") { + // f32_copysign doesn't need to use a +() coercion which an ffi would need, it's a simple f32 operation + FFI = false; + } + } + unsigned FFI_OUT = FFI ? ASM_FFI_OUT : 0; + for (int i = 0; i < NumArgs; i++) { + if (!NeedCasts) { + text += getValueAsStr(CI->getOperand(i)); + } else { + text += getValueAsCastParenStr(CI->getOperand(i), ASM_NONSPECIFIC | FFI_OUT); + } + if (i < NumArgs - 1) text += ","; + } + text += ")"; + // handle return value + Type *InstRT = CI->getType(); + Type *ActualRT = FT->getReturnType(); + if (!InstRT->isVoidTy() && ActualRT->isVoidTy()) { + // the function we are calling was cast to something returning a value, but it really + // does not return a value + getAssignIfNeeded(CI); // ensure the variable is defined, but do not emit it here + // it should have 0 uses, but just to be safe + } else if (!ActualRT->isVoidTy()) { + unsigned FFI_IN = FFI ? 
ASM_FFI_IN : 0; + text = getAssignIfNeeded(CI) + "(" + getCast(text, ActualRT, ASM_NONSPECIFIC | FFI_IN) + ")"; + } + return text; +}) + +// exceptions support +DEF_CALL_HANDLER(emscripten_preinvoke, { + // InvokeState is normally 0 here, but might be otherwise if a block was split apart TODO: add a function attribute for this + InvokeState = 1; + return "__THREW__ = 0"; +}) +DEF_CALL_HANDLER(emscripten_postinvoke, { + // InvokeState is normally 2 here, but can be 1 if the call in between was optimized out, or 0 if a block was split apart + InvokeState = 0; + return getAssign(CI) + "__THREW__; __THREW__ = 0"; +}) +DEF_CALL_HANDLER(emscripten_landingpad, { + unsigned Num = getNumArgOperands(CI); + std::string target = "__cxa_find_matching_catch_" + utostr(Num); + Declares.insert(target); + std::string Ret = getAssign(CI) + "_" + target + "("; + for (unsigned i = 1; i < Num-1; i++) { // ignore personality and cleanup XXX - we probably should not be doing that! + if (i > 1) Ret += ","; + Ret += getValueAsCastStr(CI->getOperand(i)); + } + Ret += ")|0"; + return Ret; +}) +DEF_CALL_HANDLER(emscripten_resume, { + Declares.insert("__resumeException"); + return "___resumeException(" + getValueAsCastStr(CI->getOperand(0)) + ")"; +}) + +std::string getTempRet0() { + return Relocatable ? "(getTempRet0() | 0)" : "tempRet0"; +} + +std::string setTempRet0(std::string Value) { + return Relocatable ? "setTempRet0((" + Value + ") | 0)" : "tempRet0 = (" + Value + ")"; +} + +// setjmp support + +DEF_CALL_HANDLER(emscripten_prep_setjmp, { + return getAdHocAssign("_setjmpTableSize", Type::getInt32Ty(CI->getContext())) + "4;" + + getAdHocAssign("_setjmpTable", Type::getInt32Ty(CI->getContext())) + "_malloc(40) | 0;" + + "HEAP32[_setjmpTable>>2]=0"; +}) +DEF_CALL_HANDLER(emscripten_cleanup_setjmp, { + return "_free(_setjmpTable|0)"; +}) +DEF_CALL_HANDLER(emscripten_setjmp, { + // env, label, table + Declares.insert("saveSetjmp"); + return "_setjmpTable = _saveSetjmp(" + getValueAsStr(CI->getOperand(0)) + "," + getValueAsStr(CI->getOperand(1)) + ",_setjmpTable|0,_setjmpTableSize|0)|0;_setjmpTableSize = " + getTempRet0(); +}) +DEF_CALL_HANDLER(emscripten_longjmp, { + Declares.insert("longjmp"); + return CH___default__(CI, "_longjmp"); +}) +DEF_CALL_HANDLER(emscripten_check_longjmp, { + std::string Threw = getValueAsStr(CI->getOperand(0)); + std::string Target = getJSName(CI); + std::string Assign = getAssign(CI); + Declares.insert("testSetjmp"); + Declares.insert("longjmp"); + return "if (((" + Threw + "|0) != 0) & ((threwValue|0) != 0)) { " + + Assign + "_testSetjmp(HEAP32[" + Threw + ">>2]|0, _setjmpTable|0, _setjmpTableSize|0)|0; " + + "if ((" + Target + "|0) == 0) { _longjmp(" + Threw + "|0, threwValue|0); } " + // rethrow + setTempRet0("threwValue") + "; " + + "} else { " + Assign + "-1; }"; +}) +DEF_CALL_HANDLER(emscripten_get_longjmp_result, { + std::string Threw = getValueAsStr(CI->getOperand(0)); + return getAssign(CI) + getTempRet0(); +}) + +// supporting async functions, see `/src/library_async.js` for detail. 
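Before the async handlers that follow, it may help to see how the exception pieces above fit together: emscripten_preinvoke clears __THREW__, CH___default__ routes the protected call through an invoke_<sig> trampoline, emscripten_postinvoke reads the flag back and clears it, and emscripten_landingpad then dispatches to __cxa_find_matching_catch_N. The standalone C++ sketch below only assembles strings of that shape for illustration; InvokeSequence, emitInvokeSequence and the variable names are made up here and are not part of this patch.

// Standalone sketch of the __THREW__ / invoke_<sig> protocol used by the
// handlers above. It builds the same kind of strings the handlers emit, but
// none of these helpers exist in the backend itself.
#include <iostream>
#include <string>
#include <vector>

struct InvokeSequence {
  std::string pre;   // what emscripten_preinvoke emits
  std::string call;  // what CH___default__ emits when Invoke is set
  std::string post;  // what emscripten_postinvoke emits
};

InvokeSequence emitInvokeSequence(const std::string &Sig,
                                  const std::string &FuncPtr,
                                  const std::vector<std::string> &Args,
                                  const std::string &ResultVar,
                                  const std::string &ThrewVar) {
  InvokeSequence S;
  S.pre = "__THREW__ = 0";                                  // reset before the call
  S.call = ResultVar + " = invoke_" + Sig + "(" + FuncPtr;  // trampoline catches the JS exception
  for (const std::string &A : Args) S.call += "," + A;
  S.call += ")|0";
  S.post = ThrewVar + " = __THREW__; __THREW__ = 0";        // read back and clear the flag
  return S;
}

int main() {
  InvokeSequence S = emitInvokeSequence("iii", "12", {"$a|0", "$b|0"}, "$call", "$threw");
  std::cout << S.pre << ";\n" << S.call << ";\n" << S.post << ";\n";
}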
+DEF_CALL_HANDLER(emscripten_alloc_async_context, { + Declares.insert("emscripten_alloc_async_context"); + // insert sp as the 2nd parameter + return getAssign(CI) + "_emscripten_alloc_async_context(" + getValueAsStr(CI->getOperand(0)) + ",sp)|0"; +}) +DEF_CALL_HANDLER(emscripten_check_async, { + return getAssign(CI) + "___async"; +}) +// prevent unwinding the stack +// preserve the return value of the return inst +DEF_CALL_HANDLER(emscripten_do_not_unwind, { + return "sp = STACKTOP"; +}) +// prevent unwinding the async stack +DEF_CALL_HANDLER(emscripten_do_not_unwind_async, { + return "___async_unwind = 0"; +}) +DEF_CALL_HANDLER(emscripten_get_async_return_value_addr, { + return getAssign(CI) + "___async_retval"; +}) + +// emscripten instrinsics +DEF_CALL_HANDLER(emscripten_debugger, { + CantValidate = "emscripten_debugger is used"; + return "debugger"; +}) +DEF_CALL_HANDLER(llvm_debugtrap, { + CantValidate = "llvm.debugtrap is used"; + return "debugger"; +}) + +// i64 support + +DEF_CALL_HANDLER(getHigh32, { + return getAssign(CI) + getTempRet0(); +}) +DEF_CALL_HANDLER(setHigh32, { + return setTempRet0(getValueAsStr(CI->getOperand(0))); +}) +// XXX float handling here is not optimal +#define TO_I(low, high) \ +DEF_CALL_HANDLER(low, { \ + std::string Input = getValueAsStr(CI->getOperand(0)); \ + if (PreciseF32 && CI->getOperand(0)->getType()->isFloatTy()) Input = "+" + Input; \ + return getAssign(CI) + "(~~" + Input + ")>>>0"; \ +}) \ +DEF_CALL_HANDLER(high, { \ + std::string Input = getValueAsStr(CI->getOperand(0)); \ + if (PreciseF32 && CI->getOperand(0)->getType()->isFloatTy()) Input = "+" + Input; \ + return getAssign(CI) + "+Math_abs(" + Input + ") >= +1 ? " + Input + " > +0 ? (~~+Math_min(+Math_floor(" + Input + " / +4294967296), +4294967295)) >>> 0 : ~~+Math_ceil((" + Input + " - +(~~" + Input + " >>> 0)) / +4294967296) >>> 0 : 0"; \ +}) +TO_I(FtoILow, FtoIHigh); +TO_I(DtoILow, DtoIHigh); +DEF_CALL_HANDLER(BDtoILow, { + return "HEAPF64[tempDoublePtr>>3] = " + getValueAsStr(CI->getOperand(0)) + ";" + getAssign(CI) + "HEAP32[tempDoublePtr>>2]|0"; +}) +DEF_CALL_HANDLER(BDtoIHigh, { + return getAssign(CI) + "HEAP32[tempDoublePtr+4>>2]|0"; +}) +DEF_CALL_HANDLER(SItoF, { + std::string Ret = "(+" + getValueAsCastParenStr(CI->getOperand(0), ASM_UNSIGNED) + ") + " + + "(+4294967296*(+" + getValueAsCastParenStr(CI->getOperand(1), ASM_SIGNED) + "))"; + if (PreciseF32 && CI->getType()->isFloatTy()) { + Ret = "Math_fround(" + Ret + ")"; + } + return getAssign(CI) + Ret; +}) +DEF_CALL_HANDLER(UItoF, { + std::string Ret = "(+" + getValueAsCastParenStr(CI->getOperand(0), ASM_UNSIGNED) + ") + " + + "(+4294967296*(+" + getValueAsCastParenStr(CI->getOperand(1), ASM_UNSIGNED) + "))"; + if (PreciseF32 && CI->getType()->isFloatTy()) { + Ret = "Math_fround(" + Ret + ")"; + } + return getAssign(CI) + Ret; +}) +DEF_CALL_HANDLER(SItoD, { + return getAssign(CI) + "(+" + getValueAsCastParenStr(CI->getOperand(0), ASM_UNSIGNED) + ") + " + + "(+4294967296*(+" + getValueAsCastParenStr(CI->getOperand(1), ASM_SIGNED) + "))"; +}) +DEF_CALL_HANDLER(UItoD, { + return getAssign(CI) + "(+" + getValueAsCastParenStr(CI->getOperand(0), ASM_UNSIGNED) + ") + " + + "(+4294967296*(+" + getValueAsCastParenStr(CI->getOperand(1), ASM_UNSIGNED) + "))"; +}) +DEF_CALL_HANDLER(BItoD, { + return "HEAP32[tempDoublePtr>>2] = " + getValueAsStr(CI->getOperand(0)) + ";" + + "HEAP32[tempDoublePtr+4>>2] = " + getValueAsStr(CI->getOperand(1)) + ";" + + getAssign(CI) + "+HEAPF64[tempDoublePtr>>3]"; +}) + +// misc + 
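The i64 handlers above keep 64-bit values as a low 32-bit word plus a high word passed through tempRet0, rebuild doubles as low + 4294967296 * high (SItoD/UItoD), and split them back with the DtoILow/DtoIHigh expressions. A minimal sketch of that arithmetic for the simple in-range, non-negative case follows; rebuildSigned and splitDouble are illustrative names only, and the real DtoIHigh expression additionally clamps and handles negative inputs.

// Illustrates the low/high split used by SItoD/UItoD and DtoILow/DtoIHigh
// above. The backend emits the equivalent arithmetic as asm.js expressions
// rather than calling C++ helpers like these.
#include <cmath>
#include <cstdint>
#include <cstdio>

// Rebuild a double from an unsigned low word and a signed high word (SItoD form).
double rebuildSigned(uint32_t low, int32_t high) {
  return static_cast<double>(low) + 4294967296.0 * static_cast<double>(high);
}

// Split a non-negative, in-range double back into its words (DtoILow/DtoIHigh form).
void splitDouble(double v, uint32_t &low, uint32_t &high) {
  low = static_cast<uint32_t>(std::fmod(v, 4294967296.0));
  high = static_cast<uint32_t>(v / 4294967296.0);
}

int main() {
  uint32_t lo, hi;
  splitDouble(rebuildSigned(0x6789abcdu, 0x00012345), lo, hi);
  std::printf("low=0x%08x high=0x%08x\n", (unsigned)lo, (unsigned)hi);
  // prints: low=0x6789abcd high=0x00012345
}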
+DEF_CALL_HANDLER(llvm_nacl_atomic_store_i32, { + return "HEAP32[" + getValueAsStr(CI->getOperand(0)) + ">>2]=" + getValueAsStr(CI->getOperand(1)); +}) + +#define CMPXCHG_HANDLER(name, HeapName) \ +DEF_CALL_HANDLER(name, { \ + const Value *P = CI->getOperand(0); \ + if (EnablePthreads) { \ + return getAssign(CI) + "(Atomics_compareExchange(" HeapName ", " + getShiftedPtr(CI->getOperand(0), 4) + ", " + getValueAsStr(CI->getOperand(1)) + ", " + getValueAsStr(CI->getOperand(2)) + ")|0)"; \ + } else { \ + return getLoad(CI, P, CI->getType(), 0) + ';' + \ + "if ((" + getCast(getJSName(CI), CI->getType()) + ") == " + getValueAsCastParenStr(CI->getOperand(1)) + ") " + \ + getStore(CI, P, CI->getType(), getValueAsStr(CI->getOperand(2)), 0); \ + } \ +}) + +CMPXCHG_HANDLER(llvm_nacl_atomic_cmpxchg_i8, "HEAP8"); +CMPXCHG_HANDLER(llvm_nacl_atomic_cmpxchg_i16, "HEAP16"); +CMPXCHG_HANDLER(llvm_nacl_atomic_cmpxchg_i32, "HEAP32"); + +#define UNROLL_LOOP_MAX 8 +#define WRITE_LOOP_MAX 128 + +DEF_CALL_HANDLER(llvm_memcpy_p0i8_p0i8_i32, { + if (CI) { + ConstantInt *AlignInt = dyn_cast(CI->getOperand(3)); + if (AlignInt) { + ConstantInt *LenInt = dyn_cast(CI->getOperand(2)); + if (LenInt) { + // we can emit inline code for this + unsigned Len = LenInt->getZExtValue(); + if (Len <= WRITE_LOOP_MAX) { + unsigned Align = AlignInt->getZExtValue(); + if (OnlyWebAssembly) { + // wasm + if (Align > 8) Align = 8; + else if (Align == 0) Align = 1; // align 0 means 1 in memcpy and memset (unlike other places where it means 'default') + unsigned Pos = 0; + std::string Ret; + std::string Dest = getValueAsStr(CI->getOperand(0)); + std::string Src = getValueAsStr(CI->getOperand(1)); + unsigned Size = 8; // start by writing out i64 copies + while (Len > 0) { + // handle as much as we can in the current size + unsigned CurrLen = Size*(Len/Size); + for (unsigned Offset = 0; Offset < CurrLen; Offset += Size) { + unsigned PosOffset = Pos + Offset; + std::string Add = PosOffset == 0 ? "" : ("+" + utostr(PosOffset) + " | 0"); + Ret += "; store" + utostr(Size) + "(" + Dest + Add + + ",load" + utostr(Size) + "(" + Src + Add + "," + utostr(std::min(Align, Size)) + ")" + + "," + utostr(std::min(Align, Size)) + ")"; + } + Pos += CurrLen; + Len -= CurrLen; + Size /= 2; + } + return Ret; + } else { + // asm.js + if (Align > 4) Align = 4; + else if (Align == 0) Align = 1; // align 0 means 1 in memcpy and memset (unlike other places where it means 'default/4') + if (Align == 1 && Len > 1 && WarnOnUnaligned) { + errs() << "emcc: warning: unaligned memcpy in " << CI->getParent()->getParent()->getName() << ":" << *CI << " (compiler's fault?)\n"; + } + unsigned Pos = 0; + std::string Ret; + std::string Dest = getValueAsStr(CI->getOperand(0)); + std::string Src = getValueAsStr(CI->getOperand(1)); + while (Len > 0) { + // handle as much as we can in the current alignment + unsigned CurrLen = Align*(Len/Align); + unsigned Factor = CurrLen/Align; + if (Factor <= UNROLL_LOOP_MAX) { + // unroll + for (unsigned Offset = 0; Offset < CurrLen; Offset += Align) { + unsigned PosOffset = Pos + Offset; + std::string Add = PosOffset == 0 ? "" : ("+" + utostr(PosOffset)); + Ret += ";" + getHeapAccess(Dest + Add, Align) + "=" + getHeapAccess(Src + Add, Align) + "|0"; + } + } else { + // emit a loop + UsedVars["dest"] = UsedVars["src"] = UsedVars["stop"] = Type::getInt32Ty(TheModule->getContext()); + std::string Add = Pos == 0 ? 
"" : ("+" + utostr(Pos) + "|0"); + Ret += "dest=" + Dest + Add + "; src=" + Src + Add + "; stop=dest+" + utostr(CurrLen) + "|0; do { " + getHeapAccess("dest", Align) + "=" + getHeapAccess("src", Align) + "|0; dest=dest+" + utostr(Align) + "|0; src=src+" + utostr(Align) + "|0; } while ((dest|0) < (stop|0))"; + } + Pos += CurrLen; + Len -= CurrLen; + Align /= 2; + } + return Ret; + } + } + } + } + } + Declares.insert("memcpy"); + return CH___default__(CI, "_memcpy", 3) + "|0"; +}) + +DEF_CALL_HANDLER(llvm_memset_p0i8_i32, { + if (CI) { + ConstantInt *AlignInt = dyn_cast(CI->getOperand(3)); + if (AlignInt) { + ConstantInt *LenInt = dyn_cast(CI->getOperand(2)); + if (LenInt) { + ConstantInt *ValInt = dyn_cast(CI->getOperand(1)); + if (ValInt) { + // we can emit inline code for this + unsigned Len = LenInt->getZExtValue(); + if (Len <= WRITE_LOOP_MAX) { + unsigned Align = AlignInt->getZExtValue(); + if (OnlyWebAssembly) { + // wasm + uint64_t Val64 = ValInt->getZExtValue(); + if (Align > 8) Align = 8; + else if (Align == 0) Align = 1; // align 0 means 1 in memcpy and memset (unlike other places where it means 'default') + unsigned Pos = 0; + std::string Ret; + std::string Dest = getValueAsStr(CI->getOperand(0)); + std::string Src = getValueAsStr(CI->getOperand(1)); + unsigned Size = 8; // start by writing out i64 copies + while (Len > 0) { + // handle as much as we can in the current size + unsigned CurrLen = Size*(Len/Size); + uint64_t FullVal = 0; + for (unsigned i = 0; i < Size; i++) { + FullVal <<= 8; + FullVal |= Val64; + } + std::string ValStr = Size < 8 ? utostr(FullVal) : emitI64Const(FullVal); + for (unsigned Offset = 0; Offset < CurrLen; Offset += Size) { + unsigned PosOffset = Pos + Offset; + std::string Add = PosOffset == 0 ? "" : ("+" + utostr(PosOffset) + "|0"); + Ret += "; store" + utostr(Size) + "(" + Dest + Add + "," + ValStr + "," + utostr(std::min(Align, Size)) + ")"; + } + Pos += CurrLen; + Len -= CurrLen; + Size /= 2; + } + return Ret; + } else { + // asm.js + unsigned Val = ValInt->getZExtValue(); + if (Align > 4) Align = 4; + else if (Align == 0) Align = 1; // align 0 means 1 in memcpy and memset (unlike other places where it means 'default/4') + if (Align == 1 && Len > 1 && WarnOnUnaligned) { + errs() << "emcc: warning: unaligned memcpy in " << CI->getParent()->getParent()->getName() << ":" << *CI << " (compiler's fault?)\n"; + } + unsigned Pos = 0; + std::string Ret; + std::string Dest = getValueAsStr(CI->getOperand(0)); + while (Len > 0) { + // handle as much as we can in the current alignment + unsigned CurrLen = Align*(Len/Align); + unsigned FullVal = 0; + for (unsigned i = 0; i < Align; i++) { + FullVal <<= 8; + FullVal |= Val; + } + unsigned Factor = CurrLen/Align; + if (Factor <= UNROLL_LOOP_MAX) { + // unroll + for (unsigned Offset = 0; Offset < CurrLen; Offset += Align) { + unsigned PosOffset = Pos + Offset; + std::string Add = PosOffset == 0 ? "" : ("+" + utostr(PosOffset)); + Ret += ";" + getHeapAccess(Dest + Add, Align) + "=" + utostr(FullVal) + "|0"; + } + } else { + // emit a loop + UsedVars["dest"] = UsedVars["stop"] = Type::getInt32Ty(TheModule->getContext()); + std::string Add = Pos == 0 ? 
"" : ("+" + utostr(Pos) + "|0"); + Ret += "dest=" + Dest + Add + "; stop=dest+" + utostr(CurrLen) + "|0; do { " + getHeapAccess("dest", Align) + "=" + utostr(FullVal) + "|0; dest=dest+" + utostr(Align) + "|0; } while ((dest|0) < (stop|0))"; + } + Pos += CurrLen; + Len -= CurrLen; + Align /= 2; + } + return Ret; + } + } + } + } + } + } + Declares.insert("memset"); + return CH___default__(CI, "_memset", 3) + "|0"; +}) + +DEF_CALL_HANDLER(llvm_memmove_p0i8_p0i8_i32, { + Declares.insert("memmove"); + return CH___default__(CI, "_memmove", 3) + "|0"; +}) + +DEF_CALL_HANDLER(llvm_expect_i32, { + return getAssign(CI) + getValueAsStr(CI->getOperand(0)); +}) +DEF_CALL_HANDLER(llvm_expect_i1, { + return getAssign(CI) + getValueAsStr(CI->getOperand(0)); +}) + +DEF_CALL_HANDLER(llvm_dbg_declare, { + if (!EnableCyberDWARF || !EnableCyberDWARFIntrinsics) + return ""; + + auto VariableOffset = "0"; + auto AssignedValue = cast(CI->getOperand(0))->getMetadata(); + auto const LocalVariableMD = cast(CI->getOperand(1))->getMetadata(); + auto const LocalVariableDI = cast(LocalVariableMD); + auto const LocalVariableType = LocalVariableDI->getRawType(); + auto const DwarfOp = cast(CI->getOperand(2))->getMetadata(); + std::string LocalVariableName = LocalVariableDI->getName().str(); + + auto VarMD = utostr(getIDForMetadata(LocalVariableType)) + + "," + VariableOffset + "," + utostr(getIDForMetadata(DwarfOp)) + + ",\"" + LocalVariableName + "\""; + + + if (auto const *ValAsAssign = dyn_cast(AssignedValue)) { + Declares.insert("metadata_llvm_dbg_value_local"); + auto LocalVarName = getJSName(ValAsAssign->getValue()->stripPointerCasts()); + return "_metadata_llvm_dbg_value_local(" + LocalVarName + "," + VarMD + ")"; + } else if (auto const *ValAsAssign = dyn_cast(AssignedValue)) { + Declares.insert("metadata_llvm_dbg_value_constant"); + return "_metadata_llvm_dbg_value_constant(\"" + getValueAsStr(ValAsAssign->getValue()) + + "," + VarMD + ")"; + } + + return ""; +}) + +DEF_CALL_HANDLER(llvm_dbg_value, { + if (!EnableCyberDWARF || !EnableCyberDWARFIntrinsics) + return ""; + + auto VariableOffset = getValueAsStr(CI->getOperand(1)); + auto AssignedValue = cast(CI->getOperand(0))->getMetadata(); + auto const LocalVariableMD = cast(CI->getOperand(1))->getMetadata(); + auto const LocalVariableDI = cast(LocalVariableMD); + auto const LocalVariableType = LocalVariableDI->getRawType(); + auto const DwarfOp = cast(CI->getOperand(2))->getMetadata(); + std::string LocalVariableName = LocalVariableDI->getName().str(); + + auto VarMD = utostr(getIDForMetadata(LocalVariableType)) + + "," + VariableOffset + "," + utostr(getIDForMetadata(DwarfOp)) + + ",\"" + LocalVariableName + "\""; + + if (auto const *ValAsAssign = dyn_cast(AssignedValue)) { + Declares.insert("metadata_llvm_dbg_value_local"); + auto LocalVarName = getJSName(ValAsAssign->getValue()->stripPointerCasts()); + return "_metadata_llvm_dbg_value_local(" + LocalVarName + "," + VarMD + ")"; + } else if (auto const *ValAsAssign = dyn_cast(AssignedValue)) { + Declares.insert("metadata_llvm_dbg_value_constant"); + return "_metadata_llvm_dbg_value_constant(\"" + getValueAsStr(ValAsAssign->getValue()) + + "," + VarMD + ")"; + } + + return ""; +}) + +DEF_CALL_HANDLER(llvm_lifetime_start, { + return ""; +}) + +DEF_CALL_HANDLER(llvm_lifetime_end, { + return ""; +}) + +DEF_CALL_HANDLER(llvm_invariant_start_p0i8, { + return ""; +}) + +DEF_CALL_HANDLER(llvm_invariant_end_p0i8, { + return ""; +}) + +DEF_CALL_HANDLER(llvm_prefetch, { + return ""; +}) + 
+DEF_CALL_HANDLER(llvm_objectsize_i32_p0i8, { + return getAssign(CI) + ((cast(CI->getOperand(1)))->getZExtValue() == 0 ? "-1" : "0"); +}) + +DEF_CALL_HANDLER(llvm_flt_rounds, { + // FLT_ROUNDS helper. We don't support setting the rounding mode dynamically, + // so it's always round-to-nearest (1). + return getAssign(CI) + "1"; +}) + +DEF_CALL_HANDLER(bitshift64Lshr, { + Declares.insert("bitshift64Lshr"); + return CH___default__(CI, "_bitshift64Lshr", 3); +}) + +DEF_CALL_HANDLER(bitshift64Ashr, { + Declares.insert("bitshift64Ashr"); + return CH___default__(CI, "_bitshift64Ashr", 3); +}) + +DEF_CALL_HANDLER(bitshift64Shl, { + Declares.insert("bitshift64Shl"); + return CH___default__(CI, "_bitshift64Shl", 3); +}) + +DEF_CALL_HANDLER(llvm_ctlz_i32, { + return CH___default__(CI, "Math_clz32", 1); +}) + +DEF_CALL_HANDLER(llvm_cttz_i32, { + if (OnlyWebAssembly) { + return CH___default__(CI, "i32_cttz", 1); + } + Declares.insert("llvm_cttz_i32"); + return CH___default__(CI, "_llvm_cttz_i32", 1); +}) + +DEF_CALL_HANDLER(llvm_ctlz_i64, { + if (OnlyWebAssembly) { + return CH___default__(CI, "i64_ctlz", 1); + } + Declares.insert("llvm_ctlz_i64"); + return CH___default__(CI, "_llvm_ctlz_i64"); +}) + +DEF_CALL_HANDLER(llvm_cttz_i64, { + if (OnlyWebAssembly) { + return CH___default__(CI, "i64_cttz", 1); + } + Declares.insert("llvm_cttz_i64"); + return CH___default__(CI, "_llvm_cttz_i64"); +}) + +DEF_CALL_HANDLER(llvm_ctpop_i32, { + if (OnlyWebAssembly) { + return CH___default__(CI, "i32_ctpop", 1); + } + Declares.insert("llvm_ctpop_i32"); + return CH___default__(CI, "_llvm_ctpop_i32"); +}) + +DEF_CALL_HANDLER(llvm_ctpop_i64, { + if (OnlyWebAssembly) { + return CH___default__(CI, "i64_ctpop", 1); + } + Declares.insert("llvm_ctpop_i64"); + return CH___default__(CI, "_llvm_ctpop_i64"); +}) + +DEF_CALL_HANDLER(llvm_maxnum_f32, { + return CH___default__(CI, "Math_max", 2); +}) + +DEF_CALL_HANDLER(llvm_maxnum_f64, { + return CH___default__(CI, "Math_max", 2); +}) + +DEF_CALL_HANDLER(llvm_copysign_f32, { + if (OnlyWebAssembly) { + return CH___default__(CI, "f32_copysign", 2); + } + Declares.insert("llvm_copysign_f32"); + return CH___default__(CI, "_llvm_copysign_f32", 2); +}) + +DEF_CALL_HANDLER(llvm_copysign_f64, { + if (OnlyWebAssembly) { + return CH___default__(CI, "(f64_copysign)", 2); // XXX add parens as this will be +f64_copysign(...), which triggers +f64 => f64.0. 
TODO fix regex in emscripten.py + } + Declares.insert("llvm_copysign_f64"); + return CH___default__(CI, "_llvm_copysign_f64", 2); +}) + +// EM_ASM support + +std::string handleAsmConst(const Instruction *CI) { + unsigned Num = getNumArgOperands(CI); + std::string Sig; + Sig += getFunctionSignatureLetter(CI->getType()); + for (unsigned i = 1; i < Num; i++) { + Sig += getFunctionSignatureLetter(CI->getOperand(i)->getType()); + } + std::string func = "emscripten_asm_const_" + Sig; + std::string ret = "_" + func + "(" + utostr(getAsmConstId(CI->getOperand(0), Sig)); + for (unsigned i = 1; i < Num; i++) { + ret += ", " + getValueAsCastParenStr(CI->getOperand(i), ASM_NONSPECIFIC); + } + return ret + ")"; +} + +DEF_CALL_HANDLER(emscripten_asm_const, { + Declares.insert("emscripten_asm_const"); + return handleAsmConst(CI); +}) +DEF_CALL_HANDLER(emscripten_asm_const_int, { + Declares.insert("emscripten_asm_const_int"); + return getAssign(CI) + getCast(handleAsmConst(CI), Type::getInt32Ty(CI->getContext())); +}) +DEF_CALL_HANDLER(emscripten_asm_const_double, { + Declares.insert("emscripten_asm_const_double"); + return getAssign(CI) + getCast(handleAsmConst(CI), Type::getDoubleTy(CI->getContext())); +}) + +DEF_CALL_HANDLER(emscripten_atomic_exchange_u8, { + return getAssign(CI) + "(Atomics_exchange(HEAP8, " + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ")|0)"; +}) +DEF_CALL_HANDLER(emscripten_atomic_exchange_u16, { + return getAssign(CI) + "(Atomics_exchange(HEAP16, " + getShiftedPtr(CI->getOperand(0), 2) + ", " + getValueAsStr(CI->getOperand(1)) + ")|0)"; +}) +DEF_CALL_HANDLER(emscripten_atomic_exchange_u32, { + return getAssign(CI) + "(Atomics_exchange(HEAP32, " + getShiftedPtr(CI->getOperand(0), 4) + ", " + getValueAsStr(CI->getOperand(1)) + ")|0)"; +}) + +DEF_CALL_HANDLER(emscripten_atomic_cas_u8, { + return getAssign(CI) + "(Atomics_compareExchange(HEAP8, " + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ", " + getValueAsStr(CI->getOperand(2)) + ")|0)"; +}) +DEF_CALL_HANDLER(emscripten_atomic_cas_u16, { + return getAssign(CI) + "(Atomics_compareExchange(HEAP16, " + getShiftedPtr(CI->getOperand(0), 2) + ", " + getValueAsStr(CI->getOperand(1)) + ", " + getValueAsStr(CI->getOperand(2)) + ")|0)"; +}) +DEF_CALL_HANDLER(emscripten_atomic_cas_u32, { + return getAssign(CI) + "(Atomics_compareExchange(HEAP32, " + getShiftedPtr(CI->getOperand(0), 4) + ", " + getValueAsStr(CI->getOperand(1)) + ", " + getValueAsStr(CI->getOperand(2)) + ")|0)"; +}) + +DEF_CALL_HANDLER(emscripten_atomic_load_u8, { + return getAssign(CI) + "(Atomics_load(HEAP8, " + getValueAsStr(CI->getOperand(0)) + ")|0)"; +}) +DEF_CALL_HANDLER(emscripten_atomic_load_u16, { + return getAssign(CI) + "(Atomics_load(HEAP16, " + getShiftedPtr(CI->getOperand(0), 2) + ")|0)"; +}) +DEF_CALL_HANDLER(emscripten_atomic_load_u32, { + return getAssign(CI) + "(Atomics_load(HEAP32, " + getShiftedPtr(CI->getOperand(0), 4) + ")|0)"; +}) +DEF_CALL_HANDLER(emscripten_atomic_load_f32, { + // TODO: If https://bugzilla.mozilla.org/show_bug.cgi?id=1131613 is implemented, we could use the commented out version. Until then, + // we must emulate manually. + Declares.insert("_Atomics_load_f32_emulated"); + return getAssign(CI) + (PreciseF32 ? "Math_fround(" : "+") + "__Atomics_load_f32_emulated(" + getShiftedPtr(CI->getOperand(0), 4) + (PreciseF32 ? 
"))" : ")"); +// return getAssign(CI) + "Atomics_load(HEAPF32, " + getShiftedPtr(CI->getOperand(0), 4) + ")"; +}) +DEF_CALL_HANDLER(emscripten_atomic_load_f64, { + // TODO: If https://bugzilla.mozilla.org/show_bug.cgi?id=1131624 is implemented, we could use the commented out version. Until then, + // we must emulate manually. + Declares.insert("emscripten_atomic_load_f64"); + return getAssign(CI) + "+_emscripten_atomic_load_f64(" + getShiftedPtr(CI->getOperand(0), 8) + ")"; +// return getAssign(CI) + "Atomics_load(HEAPF64, " + getShiftedPtr(CI->getOperand(0), 8) + ")"; +}) + +DEF_CALL_HANDLER(emscripten_atomic_store_u8, { + return getAssign(CI) + "(Atomics_store(HEAP8, " + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ")|0)"; +}) +DEF_CALL_HANDLER(emscripten_atomic_store_u16, { + return getAssign(CI) + "(Atomics_store(HEAP16, " + getShiftedPtr(CI->getOperand(0), 2) + ", " + getValueAsStr(CI->getOperand(1)) + ")|0)"; +}) +DEF_CALL_HANDLER(emscripten_atomic_store_u32, { + return getAssign(CI) + "(Atomics_store(HEAP32, " + getShiftedPtr(CI->getOperand(0), 4) + ", " + getValueAsStr(CI->getOperand(1)) + ")|0)"; +}) +DEF_CALL_HANDLER(emscripten_atomic_store_f32, { + // TODO: If https://bugzilla.mozilla.org/show_bug.cgi?id=1131613 is implemented, we could use the commented out version. Until then, + // we must emulate manually. + Declares.insert("emscripten_atomic_store_f32"); + return getAssign(CI) + "_emscripten_atomic_store_f32(" + getShiftedPtr(CI->getOperand(0), 4) + ", " + getValueAsStr(CI->getOperand(1)) + ")"; +// return getAssign(CI) + "Atomics_store(HEAPF32, " + getShiftedPtr(CI->getOperand(0), 4) + ", " + getValueAsStr(CI->getOperand(1)) + ")"; +}) +DEF_CALL_HANDLER(emscripten_atomic_store_f64, { + // TODO: If https://bugzilla.mozilla.org/show_bug.cgi?id=1131624 is implemented, we could use the commented out version. Until then, + // we must emulate manually. 
+ Declares.insert("emscripten_atomic_store_f64"); + return getAssign(CI) + "+_emscripten_atomic_store_f64(" + getShiftedPtr(CI->getOperand(0), 8) + ", " + getValueAsStr(CI->getOperand(1)) + ")"; +// return getAssign(CI) + "Atomics_store(HEAPF64, " + getShiftedPtr(CI->getOperand(0), 8) + ", " + getValueAsStr(CI->getOperand(1)) + ")"; +}) + +DEF_CALL_HANDLER(emscripten_atomic_add_u8, { + return getAssign(CI) + "(Atomics_add(HEAP8, " + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ")|0)"; +}) +DEF_CALL_HANDLER(emscripten_atomic_add_u16, { + return getAssign(CI) + "(Atomics_add(HEAP16, " + getShiftedPtr(CI->getOperand(0), 2) + ", " + getValueAsStr(CI->getOperand(1)) + ")|0)"; +}) +DEF_CALL_HANDLER(emscripten_atomic_add_u32, { + return getAssign(CI) + "(Atomics_add(HEAP32, " + getShiftedPtr(CI->getOperand(0), 4) + ", " + getValueAsStr(CI->getOperand(1)) + ")|0)"; +}) + +DEF_CALL_HANDLER(emscripten_atomic_sub_u8, { + return getAssign(CI) + "(Atomics_sub(HEAP8, " + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ")|0)"; +}) +DEF_CALL_HANDLER(emscripten_atomic_sub_u16, { + return getAssign(CI) + "(Atomics_sub(HEAP16, " + getShiftedPtr(CI->getOperand(0), 2) + ", " + getValueAsStr(CI->getOperand(1)) + ")|0)"; +}) +DEF_CALL_HANDLER(emscripten_atomic_sub_u32, { + return getAssign(CI) + "(Atomics_sub(HEAP32, " + getShiftedPtr(CI->getOperand(0), 4) + ", " + getValueAsStr(CI->getOperand(1)) + ")|0)"; +}) + +DEF_CALL_HANDLER(emscripten_atomic_and_u8, { + return getAssign(CI) + "(Atomics_and(HEAP8, " + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ")|0)"; +}) +DEF_CALL_HANDLER(emscripten_atomic_and_u16, { + return getAssign(CI) + "(Atomics_and(HEAP16, " + getShiftedPtr(CI->getOperand(0), 2) + ", " + getValueAsStr(CI->getOperand(1)) + ")|0)"; +}) +DEF_CALL_HANDLER(emscripten_atomic_and_u32, { + return getAssign(CI) + "(Atomics_and(HEAP32, " + getShiftedPtr(CI->getOperand(0), 4) + ", " + getValueAsStr(CI->getOperand(1)) + ")|0)"; +}) + +DEF_CALL_HANDLER(emscripten_atomic_or_u8, { + return getAssign(CI) + "(Atomics_or(HEAP8, " + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ")|0)"; +}) +DEF_CALL_HANDLER(emscripten_atomic_or_u16, { + return getAssign(CI) + "(Atomics_or(HEAP16, " + getShiftedPtr(CI->getOperand(0), 2) + ", " + getValueAsStr(CI->getOperand(1)) + ")|0)"; +}) +DEF_CALL_HANDLER(emscripten_atomic_or_u32, { + return getAssign(CI) + "(Atomics_or(HEAP32, " + getShiftedPtr(CI->getOperand(0), 4) + ", " + getValueAsStr(CI->getOperand(1)) + ")|0)"; +}) + +DEF_CALL_HANDLER(emscripten_atomic_xor_u8, { + return getAssign(CI) + "(Atomics_xor(HEAP8, " + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ")|0)"; +}) +DEF_CALL_HANDLER(emscripten_atomic_xor_u16, { + return getAssign(CI) + "(Atomics_xor(HEAP16, " + getShiftedPtr(CI->getOperand(0), 2) + ", " + getValueAsStr(CI->getOperand(1)) + ")|0)"; +}) +DEF_CALL_HANDLER(emscripten_atomic_xor_u32, { + return getAssign(CI) + "(Atomics_xor(HEAP32, " + getShiftedPtr(CI->getOperand(0), 4) + ", " + getValueAsStr(CI->getOperand(1)) + ")|0)"; +}) + +#define DEF_BUILTIN_HANDLER(name, to) \ +DEF_CALL_HANDLER(name, { \ + return CH___default__(CI, #to); \ +}) + +#define DEF_MAYBE_BUILTIN_HANDLER(name, to) \ +DEF_CALL_HANDLER(name, { \ + if (!WebAssembly) return CH___default__(CI, #to); \ + Declares.insert(#name); \ + return CH___default__(CI, "_" #name); \ +}) + +// Various simple redirects for our js libc, see library.js and 
LibraryManager.load +DEF_BUILTIN_HANDLER(abs, Math_abs); +DEF_BUILTIN_HANDLER(labs, Math_abs); +DEF_MAYBE_BUILTIN_HANDLER(cos, Math_cos); +DEF_MAYBE_BUILTIN_HANDLER(cosf, Math_cos); +DEF_MAYBE_BUILTIN_HANDLER(cosl, Math_cos); +DEF_MAYBE_BUILTIN_HANDLER(sin, Math_sin); +DEF_MAYBE_BUILTIN_HANDLER(sinf, Math_sin); +DEF_MAYBE_BUILTIN_HANDLER(sinl, Math_sin); +DEF_MAYBE_BUILTIN_HANDLER(tan, Math_tan); +DEF_MAYBE_BUILTIN_HANDLER(tanf, Math_tan); +DEF_MAYBE_BUILTIN_HANDLER(tanl, Math_tan); +DEF_MAYBE_BUILTIN_HANDLER(acos, Math_acos); +DEF_MAYBE_BUILTIN_HANDLER(acosf, Math_acos); +DEF_MAYBE_BUILTIN_HANDLER(acosl, Math_acos); +DEF_MAYBE_BUILTIN_HANDLER(asin, Math_asin); +DEF_MAYBE_BUILTIN_HANDLER(asinf, Math_asin); +DEF_MAYBE_BUILTIN_HANDLER(asinl, Math_asin); +DEF_MAYBE_BUILTIN_HANDLER(atan, Math_atan); +DEF_MAYBE_BUILTIN_HANDLER(atanf, Math_atan); +DEF_MAYBE_BUILTIN_HANDLER(atanl, Math_atan); +DEF_MAYBE_BUILTIN_HANDLER(atan2, Math_atan2); +DEF_MAYBE_BUILTIN_HANDLER(atan2f, Math_atan2); +DEF_MAYBE_BUILTIN_HANDLER(atan2l, Math_atan2); +DEF_MAYBE_BUILTIN_HANDLER(exp, Math_exp); +DEF_MAYBE_BUILTIN_HANDLER(expf, Math_exp); +DEF_MAYBE_BUILTIN_HANDLER(expl, Math_exp); +DEF_MAYBE_BUILTIN_HANDLER(log, Math_log); +DEF_MAYBE_BUILTIN_HANDLER(logf, Math_log); +DEF_MAYBE_BUILTIN_HANDLER(logl, Math_log); +DEF_BUILTIN_HANDLER(sqrt, Math_sqrt); +DEF_BUILTIN_HANDLER(sqrtf, Math_sqrt); +DEF_BUILTIN_HANDLER(sqrtl, Math_sqrt); +DEF_BUILTIN_HANDLER(fabs, Math_abs); +DEF_BUILTIN_HANDLER(fabsf, Math_abs); +DEF_BUILTIN_HANDLER(fabsl, Math_abs); +DEF_BUILTIN_HANDLER(llvm_fabs_f32, Math_abs); +DEF_BUILTIN_HANDLER(llvm_fabs_f64, Math_abs); +DEF_BUILTIN_HANDLER(ceil, Math_ceil); +DEF_BUILTIN_HANDLER(ceilf, Math_ceil); +DEF_BUILTIN_HANDLER(ceill, Math_ceil); +DEF_BUILTIN_HANDLER(llvm_ceil_f32, Math_ceil); +DEF_BUILTIN_HANDLER(llvm_ceil_f64, Math_ceil); +DEF_BUILTIN_HANDLER(floor, Math_floor); +DEF_BUILTIN_HANDLER(floorf, Math_floor); +DEF_BUILTIN_HANDLER(floorl, Math_floor); +DEF_BUILTIN_HANDLER(llvm_floor_f32, Math_floor); +DEF_BUILTIN_HANDLER(llvm_floor_f64, Math_floor); +DEF_MAYBE_BUILTIN_HANDLER(pow, Math_pow); +DEF_MAYBE_BUILTIN_HANDLER(powf, Math_pow); +DEF_MAYBE_BUILTIN_HANDLER(powl, Math_pow); +DEF_BUILTIN_HANDLER(llvm_sqrt_f32, Math_sqrt); +DEF_BUILTIN_HANDLER(llvm_sqrt_f64, Math_sqrt); +DEF_BUILTIN_HANDLER(llvm_pow_f32, Math_pow); // XXX these will be slow in wasm, but need to link in libc before getting here, or stop +DEF_BUILTIN_HANDLER(llvm_pow_f64, Math_pow); // LLVM from creating these intrinsics +DEF_MAYBE_BUILTIN_HANDLER(llvm_cos_f32, Math_cos); +DEF_MAYBE_BUILTIN_HANDLER(llvm_cos_f64, Math_cos); +DEF_MAYBE_BUILTIN_HANDLER(llvm_sin_f32, Math_sin); +DEF_MAYBE_BUILTIN_HANDLER(llvm_sin_f64, Math_sin); + +DEF_CALL_HANDLER(llvm_powi_f32, { + return getAssign(CI) + getParenCast("Math_pow(" + getValueAsCastStr(CI->getOperand(0)) + ", " + getCast(getValueAsCastStr(CI->getOperand(1)), CI->getOperand(0)->getType()) + ")", CI->getType()); +}) +DEF_CALL_HANDLER(llvm_powi_f64, { + return getAssign(CI) + getParenCast("Math_pow(" + getValueAsCastStr(CI->getOperand(0)) + ", " + getCast(getValueAsCastStr(CI->getOperand(1)), CI->getOperand(0)->getType()) + ")", CI->getType()); +}) + +DEF_BUILTIN_HANDLER(llvm_log_f32, Math_log); +DEF_BUILTIN_HANDLER(llvm_log_f64, Math_log); +DEF_BUILTIN_HANDLER(llvm_exp_f32, Math_exp); +DEF_BUILTIN_HANDLER(llvm_exp_f64, Math_exp); + +// SIMD.js Float64x2 +DEF_BUILTIN_HANDLER(emscripten_float64x2_set, SIMD_Float64x2); +DEF_BUILTIN_HANDLER(emscripten_float64x2_splat, SIMD_Float64x2_splat); 
+DEF_BUILTIN_HANDLER(emscripten_float64x2_add, SIMD_Float64x2_add); +DEF_BUILTIN_HANDLER(emscripten_float64x2_sub, SIMD_Float64x2_sub); +DEF_BUILTIN_HANDLER(emscripten_float64x2_mul, SIMD_Float64x2_mul); +DEF_BUILTIN_HANDLER(emscripten_float64x2_div, SIMD_Float64x2_div); +DEF_BUILTIN_HANDLER(emscripten_float64x2_max, SIMD_Float64x2_max); +DEF_BUILTIN_HANDLER(emscripten_float64x2_min, SIMD_Float64x2_min); +DEF_BUILTIN_HANDLER(emscripten_float64x2_maxNum, SIMD_Float64x2_maxNum); +DEF_BUILTIN_HANDLER(emscripten_float64x2_minNum, SIMD_Float64x2_minNum); +DEF_BUILTIN_HANDLER(emscripten_float64x2_neg, SIMD_Float64x2_neg); +DEF_BUILTIN_HANDLER(emscripten_float64x2_sqrt, SIMD_Float64x2_sqrt); +DEF_BUILTIN_HANDLER(emscripten_float64x2_reciprocalApproximation, SIMD_Float64x2_reciprocalApproximation); +DEF_BUILTIN_HANDLER(emscripten_float64x2_reciprocalSqrtApproximation, SIMD_Float64x2_reciprocalSqrtApproximation); +DEF_BUILTIN_HANDLER(emscripten_float64x2_abs, SIMD_Float64x2_abs); +// n.b. No emscripten_float64x2_and, only defined on boolean and integer SIMD types. +// n.b. No emscripten_float64x2_xor, only defined on boolean and integer SIMD types. +// n.b. No emscripten_float64x2_or, only defined on boolean and integer SIMD types. +// n.b. No emscripten_float64x2_not, only defined on boolean and integer SIMD types. +static std::string castBool64x2ToInt32x4(const std::string &valueStr) { + return std::string("SIMD_Int32x4_fromBool64x2Bits(") + valueStr + ')'; +} +DEF_CALL_HANDLER(emscripten_float64x2_lessThan, { + return getAssign(CI) + castBool64x2ToInt32x4("SIMD_Float64x2_lessThan(" + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ")"); +}) +DEF_CALL_HANDLER(emscripten_float64x2_lessThanOrEqual, { + return getAssign(CI) + castBool64x2ToInt32x4("SIMD_Float64x2_lessThanOrEqual(" + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ")"); +}) +DEF_CALL_HANDLER(emscripten_float64x2_greaterThan, { + return getAssign(CI) + castBool64x2ToInt32x4("SIMD_Float64x2_greaterThan(" + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ")"); +}) +DEF_CALL_HANDLER(emscripten_float64x2_greaterThanOrEqual, { + return getAssign(CI) + castBool64x2ToInt32x4("SIMD_Float64x2_greaterThanOrEqual(" + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ")"); +}) +DEF_CALL_HANDLER(emscripten_float64x2_equal, { + return getAssign(CI) + castBool64x2ToInt32x4("SIMD_Float64x2_equal(" + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ")"); +}) +DEF_CALL_HANDLER(emscripten_float64x2_notEqual, { + return getAssign(CI) + castBool64x2ToInt32x4("SIMD_Float64x2_notEqual(" + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ")"); +}) +// n.b. No emscripten_float64x2_anyTrue, only defined on boolean SIMD types. +// n.b. No emscripten_float64x2_allTrue, only defined on boolean SIMD types. +DEF_BUILTIN_HANDLER(emscripten_float64x2_select, SIMD_Float64x2_select); +// n.b. No emscripten_float64x2_addSaturate, only defined on 8-bit and 16-bit integer SIMD types. +// n.b. No emscripten_float64x2_subSaturate, only defined on 8-bit and 16-bit integer SIMD types. +// n.b. No emscripten_float64x2_shiftLeftByScalar, only defined on integer SIMD types. +// n.b. No emscripten_float64x2_shiftRightByScalar, only defined on integer SIMD types. 
+DEF_BUILTIN_HANDLER(emscripten_float64x2_extractLane, SIMD_Float64x2_extractLane); +DEF_BUILTIN_HANDLER(emscripten_float64x2_replaceLane, SIMD_Float64x2_replaceLane); +DEF_CALL_HANDLER(emscripten_float64x2_store, { + UsesSIMDFloat64x2 = true; + return "SIMD_Float64x2_store(HEAPU8, " + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ")"; +}) +DEF_CALL_HANDLER(emscripten_float64x2_store1, { + UsesSIMDFloat64x2 = true; + return "SIMD_Float64x2_store1(HEAPU8, " + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ")"; +}) +DEF_CALL_HANDLER(emscripten_float64x2_load, { + UsesSIMDFloat64x2 = true; + return getAssign(CI) + "SIMD_Float64x2_load(HEAPU8, " + getValueAsStr(CI->getOperand(0)) + ")"; +}) +DEF_CALL_HANDLER(emscripten_float64x2_load1, { + UsesSIMDFloat64x2 = true; + return getAssign(CI) + "SIMD_Float64x2_load1(HEAPU8, " + getValueAsStr(CI->getOperand(0)) + ")"; +}) +DEF_BUILTIN_HANDLER(emscripten_float64x2_fromFloat32x4Bits, SIMD_Float64x2_fromFloat32x4Bits); +DEF_BUILTIN_HANDLER(emscripten_float64x2_fromInt32x4Bits, SIMD_Float64x2_fromInt32x4Bits); +DEF_BUILTIN_HANDLER(emscripten_float64x2_fromUint32x4Bits, SIMD_Float64x2_fromUint32x4Bits); +DEF_BUILTIN_HANDLER(emscripten_float64x2_fromInt16x8Bits, SIMD_Float64x2_fromInt16x8Bits); +DEF_BUILTIN_HANDLER(emscripten_float64x2_fromUint16x8Bits, SIMD_Float64x2_fromUint16x8Bits); +DEF_BUILTIN_HANDLER(emscripten_float64x2_fromInt8x16Bits, SIMD_Float64x2_fromInt8x16Bits); +DEF_BUILTIN_HANDLER(emscripten_float64x2_fromUint8x16Bits, SIMD_Float64x2_fromUint8x16Bits); +DEF_BUILTIN_HANDLER(emscripten_float64x2_swizzle, SIMD_Float64x2_swizzle); +DEF_BUILTIN_HANDLER(emscripten_float64x2_shuffle, SIMD_Float64x2_shuffle); + +// SIMD.js Float32x4 +DEF_BUILTIN_HANDLER(emscripten_float32x4_set, SIMD_Float32x4); +DEF_BUILTIN_HANDLER(emscripten_float32x4_splat, SIMD_Float32x4_splat); +DEF_BUILTIN_HANDLER(emscripten_float32x4_add, SIMD_Float32x4_add); +DEF_BUILTIN_HANDLER(emscripten_float32x4_sub, SIMD_Float32x4_sub); +DEF_BUILTIN_HANDLER(emscripten_float32x4_mul, SIMD_Float32x4_mul); +DEF_BUILTIN_HANDLER(emscripten_float32x4_div, SIMD_Float32x4_div); +DEF_BUILTIN_HANDLER(emscripten_float32x4_max, SIMD_Float32x4_max); +DEF_BUILTIN_HANDLER(emscripten_float32x4_min, SIMD_Float32x4_min); +DEF_BUILTIN_HANDLER(emscripten_float32x4_maxNum, SIMD_Float32x4_maxNum); +DEF_BUILTIN_HANDLER(emscripten_float32x4_minNum, SIMD_Float32x4_minNum); +DEF_BUILTIN_HANDLER(emscripten_float32x4_neg, SIMD_Float32x4_neg); +DEF_BUILTIN_HANDLER(emscripten_float32x4_sqrt, SIMD_Float32x4_sqrt); +DEF_BUILTIN_HANDLER(emscripten_float32x4_reciprocalApproximation, SIMD_Float32x4_reciprocalApproximation); +DEF_BUILTIN_HANDLER(emscripten_float32x4_reciprocalSqrtApproximation, SIMD_Float32x4_reciprocalSqrtApproximation); +DEF_BUILTIN_HANDLER(emscripten_float32x4_abs, SIMD_Float32x4_abs); +// n.b. No emscripten_float32x4_and, only defined on boolean and integer SIMD types. +// n.b. No emscripten_float32x4_xor, only defined on boolean and integer SIMD types. +// n.b. No emscripten_float32x4_or, only defined on boolean and integer SIMD types. +// n.b. No emscripten_float32x4_not, only defined on boolean and integer SIMD types. +std::string castBoolVecToIntVec(int numElems, const std::string &str, bool signExtend) +{ + int elemWidth = 128 / numElems; + std::string simdType = "SIMD_Int" + llvm::to_string(elemWidth) + "x" + llvm::to_string(numElems); + return simdType + "_select(" + str + ", " + simdType + "_splat(" + (signExtend ? 
"-1" : "1") + "), " + simdType + "_splat(0))"; +} +DEF_CALL_HANDLER(emscripten_float32x4_lessThan, { + return getAssign(CI) + castBoolVecToIntVec(4, "SIMD_Float32x4_lessThan(" + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ")", true); +}) +DEF_CALL_HANDLER(emscripten_float32x4_lessThanOrEqual, { + return getAssign(CI) + castBoolVecToIntVec(4, "SIMD_Float32x4_lessThanOrEqual(" + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ")", true); +}) +DEF_CALL_HANDLER(emscripten_float32x4_greaterThan, { + return getAssign(CI) + castBoolVecToIntVec(4, "SIMD_Float32x4_greaterThan(" + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ")", true); +}) +DEF_CALL_HANDLER(emscripten_float32x4_greaterThanOrEqual, { + return getAssign(CI) + castBoolVecToIntVec(4, "SIMD_Float32x4_greaterThanOrEqual(" + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ")", true); +}) +DEF_CALL_HANDLER(emscripten_float32x4_equal, { + return getAssign(CI) + castBoolVecToIntVec(4, "SIMD_Float32x4_equal(" + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ")", true); +}) +DEF_CALL_HANDLER(emscripten_float32x4_notEqual, { + return getAssign(CI) + castBoolVecToIntVec(4, "SIMD_Float32x4_notEqual(" + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ")", true); +}) +// n.b. No emscripten_float32x4_anyTrue, only defined on boolean SIMD types. +// n.b. No emscripten_float32x4_allTrue, only defined on boolean SIMD types. +DEF_CALL_HANDLER(emscripten_float32x4_select, { + // FIXME: We really need a more general way of handling boolean types, + // including an optimization to allow more Int32x4 operations to be + // translated as Bool32x4 operations. + std::string Op; + if (SExtInst *SE = dyn_cast(CI->getOperand(0))) { + Op = getValueAsStr(SE->getOperand(0)); + } else { + Op = "SIMD_Int32x4_notEqual(" + getValueAsStr(CI->getOperand(0)) + ", SIMD_Int32x4_splat(0))"; + } + return getAssign(CI) + "SIMD_Float32x4_select(" + Op + "," + getValueAsStr(CI->getOperand(1)) + "," + getValueAsStr(CI->getOperand(2)) + ")"; +}) +// n.b. No emscripten_float32x4_addSaturate, only defined on 8-bit and 16-bit integer SIMD types. +// n.b. No emscripten_float32x4_subSaturate, only defined on 8-bit and 16-bit integer SIMD types. +// n.b. No emscripten_float32x4_shiftLeftByScalar, only defined on integer SIMD types. +// n.b. No emscripten_float32x4_shiftRightByScalar, only defined on integer SIMD types. 
+DEF_BUILTIN_HANDLER(emscripten_float32x4_extractLane, SIMD_Float32x4_extractLane); +DEF_BUILTIN_HANDLER(emscripten_float32x4_replaceLane, SIMD_Float32x4_replaceLane); +DEF_CALL_HANDLER(emscripten_float32x4_store, { + UsesSIMDFloat32x4 = true; + return "SIMD_Float32x4_store(HEAPU8, " + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ")"; +}) +DEF_CALL_HANDLER(emscripten_float32x4_store1, { + UsesSIMDFloat32x4 = true; + return "SIMD_Float32x4_store1(HEAPU8, " + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ")"; +}) +DEF_CALL_HANDLER(emscripten_float32x4_store2, { + UsesSIMDFloat32x4 = true; + return "SIMD_Float32x4_store2(HEAPU8, " + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ")"; +}) +DEF_CALL_HANDLER(emscripten_float32x4_store3, { + UsesSIMDFloat32x4 = true; + return "SIMD_Float32x4_store3(HEAPU8, " + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ")"; +}) +DEF_CALL_HANDLER(emscripten_float32x4_load, { + UsesSIMDFloat32x4 = true; + return getAssign(CI) + "SIMD_Float32x4_load(HEAPU8, " + getValueAsStr(CI->getOperand(0)) + ")"; +}) +DEF_CALL_HANDLER(emscripten_float32x4_load1, { + UsesSIMDFloat32x4 = true; + return getAssign(CI) + "SIMD_Float32x4_load1(HEAPU8, " + getValueAsStr(CI->getOperand(0)) + ")"; +}) +DEF_CALL_HANDLER(emscripten_float32x4_load2, { + UsesSIMDFloat32x4 = true; + return getAssign(CI) + "SIMD_Float32x4_load2(HEAPU8, " + getValueAsStr(CI->getOperand(0)) + ")"; +}) +DEF_CALL_HANDLER(emscripten_float32x4_load3, { + UsesSIMDFloat32x4 = true; + return getAssign(CI) + "SIMD_Float32x4_load3(HEAPU8, " + getValueAsStr(CI->getOperand(0)) + ")"; +}) +DEF_BUILTIN_HANDLER(emscripten_float32x4_fromFloat64x2Bits, SIMD_Float32x4_fromFloat64x2Bits); +DEF_BUILTIN_HANDLER(emscripten_float32x4_fromInt32x4Bits, SIMD_Float32x4_fromInt32x4Bits); +DEF_BUILTIN_HANDLER(emscripten_float32x4_fromUint32x4Bits, SIMD_Float32x4_fromUint32x4Bits); +DEF_BUILTIN_HANDLER(emscripten_float32x4_fromInt16x8Bits, SIMD_Float32x4_fromInt16x8Bits); +DEF_BUILTIN_HANDLER(emscripten_float32x4_fromUint16x8Bits, SIMD_Float32x4_fromUint16x8Bits); +DEF_BUILTIN_HANDLER(emscripten_float32x4_fromInt8x16Bits, SIMD_Float32x4_fromInt8x16Bits); +DEF_BUILTIN_HANDLER(emscripten_float32x4_fromUint8x16Bits, SIMD_Float32x4_fromUint8x16Bits); +DEF_BUILTIN_HANDLER(emscripten_float32x4_fromInt32x4, SIMD_Float32x4_fromInt32x4); +DEF_BUILTIN_HANDLER(emscripten_float32x4_fromUint32x4, SIMD_Float32x4_fromUint32x4); +DEF_BUILTIN_HANDLER(emscripten_float32x4_swizzle, SIMD_Float32x4_swizzle); +DEF_BUILTIN_HANDLER(emscripten_float32x4_shuffle, SIMD_Float32x4_shuffle); + +// SIMD.js Int32x4 +DEF_BUILTIN_HANDLER(emscripten_int32x4_set, SIMD_Int32x4); +DEF_BUILTIN_HANDLER(emscripten_int32x4_splat, SIMD_Int32x4_splat); +DEF_BUILTIN_HANDLER(emscripten_int32x4_add, SIMD_Int32x4_add); +DEF_BUILTIN_HANDLER(emscripten_int32x4_sub, SIMD_Int32x4_sub); +DEF_BUILTIN_HANDLER(emscripten_int32x4_mul, SIMD_Int32x4_mul); +// n.b. No emscripten_int32x4_div, division is only defined on floating point types. +// n.b. No emscripten_int32x4_max, only defined on floating point types. +// n.b. No emscripten_int32x4_min, only defined on floating point types. +// n.b. No emscripten_int32x4_maxNum, only defined on floating point types. +// n.b. No emscripten_int32x4_minNum, only defined on floating point types. +DEF_BUILTIN_HANDLER(emscripten_int32x4_neg, SIMD_Int32x4_neg); +// n.b. No emscripten_int32x4_sqrt, only defined on floating point types. 
+// n.b. No emscripten_int32x4_reciprocalApproximation, only defined on floating point types. +// n.b. No emscripten_int32x4_reciprocalSqrtApproximation, only defined on floating point types. +// n.b. No emscripten_int32x4_abs, only defined on floating point types. +DEF_BUILTIN_HANDLER(emscripten_int32x4_and, SIMD_Int32x4_and); +DEF_BUILTIN_HANDLER(emscripten_int32x4_xor, SIMD_Int32x4_xor); +DEF_BUILTIN_HANDLER(emscripten_int32x4_or, SIMD_Int32x4_or); +DEF_BUILTIN_HANDLER(emscripten_int32x4_not, SIMD_Int32x4_not); +DEF_CALL_HANDLER(emscripten_int32x4_lessThan, { + return getAssign(CI) + castBoolVecToIntVec(4, "SIMD_Int32x4_lessThan(" + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ")", true); +}) +DEF_CALL_HANDLER(emscripten_int32x4_lessThanOrEqual, { + return getAssign(CI) + castBoolVecToIntVec(4, "SIMD_Int32x4_lessThanOrEqual(" + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ")", true); +}) +DEF_CALL_HANDLER(emscripten_int32x4_greaterThan, { + return getAssign(CI) + castBoolVecToIntVec(4, "SIMD_Int32x4_greaterThan(" + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ")", true); +}) +DEF_CALL_HANDLER(emscripten_int32x4_greaterThanOrEqual, { + return getAssign(CI) + castBoolVecToIntVec(4, "SIMD_Int32x4_greaterThanOrEqual(" + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ")", true); +}) +DEF_CALL_HANDLER(emscripten_int32x4_equal, { + return getAssign(CI) + castBoolVecToIntVec(4, "SIMD_Int32x4_equal(" + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ")", true); +}) +DEF_CALL_HANDLER(emscripten_int32x4_notEqual, { + return getAssign(CI) + castBoolVecToIntVec(4, "SIMD_Int32x4_notEqual(" + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ")", true); +}) +DEF_CALL_HANDLER(emscripten_int32x4_select, { + // FIXME: We really need a more general way of handling boolean types, + // including an optimization to allow more Int32x4 operations to be + // translated as Bool32x4 operations. + std::string Op; + if (SExtInst *SE = dyn_cast(CI->getOperand(0))) { + Op = getValueAsStr(SE->getOperand(0)); + } else { + Op = "SIMD_Int32x4_notEqual(" + getValueAsStr(CI->getOperand(0)) + ", SIMD_Int32x4_splat(0))"; + } + return getAssign(CI) + "SIMD_Int32x4_select(" + Op + "," + getValueAsStr(CI->getOperand(1)) + "," + getValueAsStr(CI->getOperand(2)) + ")"; +}) +// n.b. No emscripten_int32x4_addSaturate, only defined on 8-bit and 16-bit integer SIMD types. +// n.b. No emscripten_int32x4_subSaturate, only defined on 8-bit and 16-bit integer SIMD types. 
+DEF_BUILTIN_HANDLER(emscripten_int32x4_shiftLeftByScalar, SIMD_Int32x4_shiftLeftByScalar); +DEF_BUILTIN_HANDLER(emscripten_int32x4_shiftRightByScalar, SIMD_Int32x4_shiftRightByScalar); +DEF_BUILTIN_HANDLER(emscripten_int32x4_extractLane, SIMD_Int32x4_extractLane); +DEF_BUILTIN_HANDLER(emscripten_int32x4_replaceLane, SIMD_Int32x4_replaceLane); +DEF_CALL_HANDLER(emscripten_int32x4_store, { + UsesSIMDInt32x4 = true; + return "SIMD_Int32x4_store(HEAPU8, " + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ")"; +}) +DEF_CALL_HANDLER(emscripten_int32x4_store1, { + UsesSIMDInt32x4 = true; + return "SIMD_Int32x4_store1(HEAPU8, " + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ")"; +}) +DEF_CALL_HANDLER(emscripten_int32x4_store2, { + UsesSIMDInt32x4 = true; + return "SIMD_Int32x4_store2(HEAPU8, " + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ")"; +}) +DEF_CALL_HANDLER(emscripten_int32x4_store3, { + UsesSIMDInt32x4 = true; + return "SIMD_Int32x4_store3(HEAPU8, " + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ")"; +}) +DEF_CALL_HANDLER(emscripten_int32x4_load, { + UsesSIMDInt32x4 = true; + return getAssign(CI) + "SIMD_Int32x4_load(HEAPU8, " + getValueAsStr(CI->getOperand(0)) + ")"; +}) +DEF_CALL_HANDLER(emscripten_int32x4_load1, { + UsesSIMDInt32x4 = true; + return getAssign(CI) + "SIMD_Int32x4_load1(HEAPU8, " + getValueAsStr(CI->getOperand(0)) + ")"; +}) +DEF_CALL_HANDLER(emscripten_int32x4_load2, { + UsesSIMDInt32x4 = true; + return getAssign(CI) + "SIMD_Int32x4_load2(HEAPU8, " + getValueAsStr(CI->getOperand(0)) + ")"; +}) +DEF_CALL_HANDLER(emscripten_int32x4_load3, { + UsesSIMDInt32x4 = true; + return getAssign(CI) + "SIMD_Int32x4_load3(HEAPU8, " + getValueAsStr(CI->getOperand(0)) + ")"; +}) +DEF_BUILTIN_HANDLER(emscripten_int32x4_fromFloat64x2Bits, SIMD_Int32x4_fromFloat64x2Bits); +DEF_BUILTIN_HANDLER(emscripten_int32x4_fromFloat32x4Bits, SIMD_Int32x4_fromFloat32x4Bits); +DEF_BUILTIN_HANDLER(emscripten_int32x4_fromUint32x4Bits, SIMD_Int32x4_fromUint32x4Bits); +DEF_BUILTIN_HANDLER(emscripten_int32x4_fromInt16x8Bits, SIMD_Int32x4_fromInt16x8Bits); +DEF_BUILTIN_HANDLER(emscripten_int32x4_fromUint16x8Bits, SIMD_Int32x4_fromUint16x8Bits); +DEF_BUILTIN_HANDLER(emscripten_int32x4_fromInt8x16Bits, SIMD_Int32x4_fromInt8x16Bits); +DEF_BUILTIN_HANDLER(emscripten_int32x4_fromUint8x16Bits, SIMD_Int32x4_fromUint8x16Bits); +DEF_BUILTIN_HANDLER(emscripten_int32x4_fromFloat32x4, SIMD_Int32x4_fromFloat32x4); +DEF_BUILTIN_HANDLER(emscripten_int32x4_fromUint32x4, SIMD_Int32x4_fromUint32x4); +// TODO: emscripten_int32x4_fromFloat64x2? +DEF_BUILTIN_HANDLER(emscripten_int32x4_swizzle, SIMD_Int32x4_swizzle); +DEF_BUILTIN_HANDLER(emscripten_int32x4_shuffle, SIMD_Int32x4_shuffle); + +// SIMD.js Uint32x4 +DEF_BUILTIN_HANDLER(emscripten_uint32x4_set, SIMD_Uint32x4); +DEF_BUILTIN_HANDLER(emscripten_uint32x4_splat, SIMD_Uint32x4_splat); +DEF_BUILTIN_HANDLER(emscripten_uint32x4_add, SIMD_Uint32x4_add); +DEF_BUILTIN_HANDLER(emscripten_uint32x4_sub, SIMD_Uint32x4_sub); +DEF_BUILTIN_HANDLER(emscripten_uint32x4_mul, SIMD_Uint32x4_mul); +// n.b. No emscripten_uint32x4_div, division is only defined on floating point types. +// n.b. No emscripten_uint32x4_max, only defined on floating point types. +// n.b. No emscripten_uint32x4_min, only defined on floating point types. +// n.b. No emscripten_uint32x4_maxNum, only defined on floating point types. +// n.b. 
No emscripten_uint32x4_minNum, only defined on floating point types. +DEF_BUILTIN_HANDLER(emscripten_uint32x4_neg, SIMD_Uint32x4_neg); +// n.b. No emscripten_uint32x4_sqrt, only defined on floating point types. +// n.b. No emscripten_uint32x4_reciprocalApproximation, only defined on floating point types. +// n.b. No emscripten_uint32x4_reciprocalSqrtApproximation, only defined on floating point types. +// n.b. No emscripten_uint32x4_abs, only defined on floating point types. +DEF_BUILTIN_HANDLER(emscripten_uint32x4_and, SIMD_Uint32x4_and); +DEF_BUILTIN_HANDLER(emscripten_uint32x4_xor, SIMD_Uint32x4_xor); +DEF_BUILTIN_HANDLER(emscripten_uint32x4_or, SIMD_Uint32x4_or); +DEF_BUILTIN_HANDLER(emscripten_uint32x4_not, SIMD_Uint32x4_not); +DEF_BUILTIN_HANDLER(emscripten_uint32x4_lessThan, SIMD_Uint32x4_lessThan); +DEF_BUILTIN_HANDLER(emscripten_uint32x4_lessThanOrEqual, SIMD_Uint32x4_lessThanOrEqual); +DEF_BUILTIN_HANDLER(emscripten_uint32x4_greaterThan, SIMD_Uint32x4_greaterThan); +DEF_BUILTIN_HANDLER(emscripten_uint32x4_greaterThanOrEqual, SIMD_Uint32x4_greaterThanOrEqual); +DEF_BUILTIN_HANDLER(emscripten_uint32x4_equal, SIMD_Uint32x4_equal); +DEF_BUILTIN_HANDLER(emscripten_uint32x4_notEqual, SIMD_Uint32x4_notEqual); +DEF_BUILTIN_HANDLER(emscripten_uint32x4_select, SIMD_Uint32x4_select); +// n.b. No emscripten_uint32x4_addSaturate, only defined on 8-bit and 16-bit integer SIMD types. +// n.b. No emscripten_uint32x4_subSaturate, only defined on 8-bit and 16-bit integer SIMD types. +DEF_BUILTIN_HANDLER(emscripten_uint32x4_shiftLeftByScalar, SIMD_Uint32x4_shiftLeftByScalar); +DEF_CALL_HANDLER(emscripten_uint32x4_shiftRightByScalar, { + UsesSIMDUint32x4 = true; + UsesSIMDInt32x4 = true; + return getAssign(CI) + "SIMD_Int32x4_fromUint32x4Bits(SIMD_Uint32x4_shiftRightByScalar(SIMD_Uint32x4_fromInt32x4Bits(" + getValueAsStr(CI->getOperand(0)) + "), " + getValueAsStr(CI->getOperand(1)) + "))"; +}) +DEF_BUILTIN_HANDLER(emscripten_uint32x4_extractLane, SIMD_Uint32x4_extractLane); +DEF_BUILTIN_HANDLER(emscripten_uint32x4_replaceLane, SIMD_Uint32x4_replaceLane); +DEF_CALL_HANDLER(emscripten_uint32x4_store, { + UsesSIMDUint32x4 = true; + return "SIMD_Uint32x4_store(HEAPU8, " + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ")"; +}) +DEF_CALL_HANDLER(emscripten_uint32x4_store1, { + UsesSIMDUint32x4 = true; + return "SIMD_Uint32x4_store1(HEAPU8, " + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ")"; +}) +DEF_CALL_HANDLER(emscripten_uint32x4_store2, { + UsesSIMDUint32x4 = true; + return "SIMD_Uint32x4_store2(HEAPU8, " + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ")"; +}) +DEF_CALL_HANDLER(emscripten_uint32x4_store3, { + UsesSIMDUint32x4 = true; + return "SIMD_Uint32x4_store3(HEAPU8, " + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ", " + ")"; +}) +DEF_CALL_HANDLER(emscripten_uint32x4_load, { + UsesSIMDUint32x4 = true; + return getAssign(CI) + "SIMD_Uint32x4_load(HEAPU8, " + getValueAsStr(CI->getOperand(0)) + ")"; +}) +DEF_CALL_HANDLER(emscripten_uint32x4_load1, { + UsesSIMDUint32x4 = true; + return getAssign(CI) + "SIMD_Uint32x4_load1(HEAPU8, " + getValueAsStr(CI->getOperand(0)) + ")"; +}) +DEF_CALL_HANDLER(emscripten_uint32x4_load2, { + UsesSIMDUint32x4 = true; + return getAssign(CI) + "SIMD_Uint32x4_load2(HEAPU8, " + getValueAsStr(CI->getOperand(0)) + ")"; +}) +DEF_CALL_HANDLER(emscripten_uint32x4_load3, { + UsesSIMDUint32x4 = true; + return getAssign(CI) + 
"SIMD_Uint32x4_load3(HEAPU8, " + getValueAsStr(CI->getOperand(0)) + ")"; +}) +DEF_BUILTIN_HANDLER(emscripten_uint32x4_fromFloat64x2Bits, SIMD_Uint32x4_fromFloat64x2Bits); +DEF_BUILTIN_HANDLER(emscripten_uint32x4_fromFloat32x4Bits, SIMD_Uint32x4_fromFloat32x4Bits); +DEF_BUILTIN_HANDLER(emscripten_uint32x4_fromInt32x4Bits, SIMD_Uint32x4_fromInt32x4Bits); +DEF_BUILTIN_HANDLER(emscripten_uint32x4_fromInt16x8Bits, SIMD_Uint32x4_fromInt16x8Bits); +DEF_BUILTIN_HANDLER(emscripten_uint32x4_fromUint16x8Bits, SIMD_Uint32x4_fromUint16x8Bits); +DEF_BUILTIN_HANDLER(emscripten_uint32x4_fromInt8x16Bits, SIMD_Uint32x4_fromInt8x16Bits); +DEF_BUILTIN_HANDLER(emscripten_uint32x4_fromUint8x16Bits, SIMD_Uint32x4_fromUint8x16Bits); +DEF_BUILTIN_HANDLER(emscripten_uint32x4_fromFloat32x4, SIMD_Uint32x4_fromFloat32x4); +DEF_BUILTIN_HANDLER(emscripten_uint32x4_fromInt32x4, SIMD_Uint32x4_fromInt32x4); +// TODO: emscripten_uint32x4_fromFloat64x2? +DEF_BUILTIN_HANDLER(emscripten_uint32x4_swizzle, SIMD_Uint32x4_swizzle); +DEF_BUILTIN_HANDLER(emscripten_uint32x4_shuffle, SIMD_Uint32x4_shuffle); + +// SIMD.js Int16x8 +DEF_BUILTIN_HANDLER(emscripten_int16x8_set, SIMD_Int16x8); +DEF_BUILTIN_HANDLER(emscripten_int16x8_splat, SIMD_Int16x8_splat); +DEF_BUILTIN_HANDLER(emscripten_int16x8_add, SIMD_Int16x8_add); +DEF_BUILTIN_HANDLER(emscripten_int16x8_sub, SIMD_Int16x8_sub); +DEF_BUILTIN_HANDLER(emscripten_int16x8_mul, SIMD_Int16x8_mul); +// n.b. No emscripten_int16x8_div, division is only defined on floating point types. +// n.b. No emscripten_int16x8_max, only defined on floating point types. +// n.b. No emscripten_int16x8_min, only defined on floating point types. +// n.b. No emscripten_int16x8_maxNum, only defined on floating point types. +// n.b. No emscripten_int16x8_minNum, only defined on floating point types. +DEF_BUILTIN_HANDLER(emscripten_int16x8_neg, SIMD_Int16x8_neg); +// n.b. No emscripten_int16x8_sqrt, only defined on floating point types. +// n.b. No emscripten_int16x8_reciprocalApproximation, only defined on floating point types. +// n.b. No emscripten_int16x8_reciprocalSqrtApproximation, only defined on floating point types. +// n.b. No emscripten_int16x8_abs, only defined on floating point types. 
+DEF_BUILTIN_HANDLER(emscripten_int16x8_and, SIMD_Int16x8_and); +DEF_BUILTIN_HANDLER(emscripten_int16x8_xor, SIMD_Int16x8_xor); +DEF_BUILTIN_HANDLER(emscripten_int16x8_or, SIMD_Int16x8_or); +DEF_BUILTIN_HANDLER(emscripten_int16x8_not, SIMD_Int16x8_not); +DEF_CALL_HANDLER(emscripten_int16x8_lessThan, { + return getAssign(CI) + castBoolVecToIntVec(8, "SIMD_Int16x8_lessThan(" + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ")", true); +}) +DEF_CALL_HANDLER(emscripten_int16x8_lessThanOrEqual, { + return getAssign(CI) + castBoolVecToIntVec(8, "SIMD_Int16x8_lessThanOrEqual(" + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ")", true); +}) +DEF_CALL_HANDLER(emscripten_int16x8_greaterThan, { + return getAssign(CI) + castBoolVecToIntVec(8, "SIMD_Int16x8_greaterThan(" + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ")", true); +}) +DEF_CALL_HANDLER(emscripten_int16x8_greaterThanOrEqual, { + return getAssign(CI) + castBoolVecToIntVec(8, "SIMD_Int16x8_greaterThanOrEqual(" + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ")", true); +}) +DEF_CALL_HANDLER(emscripten_int16x8_equal, { + return getAssign(CI) + castBoolVecToIntVec(8, "SIMD_Int16x8_equal(" + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ")", true); +}) +DEF_CALL_HANDLER(emscripten_int16x8_notEqual, { + return getAssign(CI) + castBoolVecToIntVec(8, "SIMD_Int16x8_notEqual(" + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ")", true); +}) +DEF_CALL_HANDLER(emscripten_int16x8_select, { + // FIXME: We really need a more general way of handling boolean types, + // including an optimization to allow more Int16x8 operations to be + // translated as Bool16x8 operations. 
+ std::string Op; + if (SExtInst *SE = dyn_cast(CI->getOperand(0))) { + Op = getValueAsStr(SE->getOperand(0)); + } else { + Op = "SIMD_Int16x8_notEqual(" + getValueAsStr(CI->getOperand(0)) + ", SIMD_Int16x8_splat(0))"; + } + return getAssign(CI) + "SIMD_Int16x8_select(" + Op + "," + getValueAsStr(CI->getOperand(1)) + "," + getValueAsStr(CI->getOperand(2)) + ")"; +}) +DEF_BUILTIN_HANDLER(emscripten_int16x8_addSaturate, SIMD_Int16x8_addSaturate); +DEF_BUILTIN_HANDLER(emscripten_int16x8_subSaturate, SIMD_Int16x8_subSaturate); +DEF_BUILTIN_HANDLER(emscripten_int16x8_shiftLeftByScalar, SIMD_Int16x8_shiftLeftByScalar); +DEF_BUILTIN_HANDLER(emscripten_int16x8_shiftRightByScalar, SIMD_Int16x8_shiftRightByScalar); +DEF_BUILTIN_HANDLER(emscripten_int16x8_extractLane, SIMD_Int16x8_extractLane); +DEF_BUILTIN_HANDLER(emscripten_int16x8_replaceLane, SIMD_Int16x8_replaceLane); +DEF_CALL_HANDLER(emscripten_int16x8_store, { + UsesSIMDInt16x8 = true; + return "SIMD_Int16x8_store(HEAPU8, " + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ")"; +}) +DEF_CALL_HANDLER(emscripten_int16x8_load, { + UsesSIMDInt16x8 = true; + return getAssign(CI) + "SIMD_Int16x8_load(HEAPU8, " + getValueAsStr(CI->getOperand(0)) + ")"; +}) +DEF_BUILTIN_HANDLER(emscripten_int16x8_fromFloat64x2Bits, SIMD_Int16x8_fromFloat64x2Bits); +DEF_BUILTIN_HANDLER(emscripten_int16x8_fromFloat32x4Bits, SIMD_Int16x8_fromFloat32x4Bits); +DEF_BUILTIN_HANDLER(emscripten_int16x8_fromInt32x4Bits, SIMD_Int16x8_fromInt32x4Bits); +DEF_BUILTIN_HANDLER(emscripten_int16x8_fromUint32x4Bits, SIMD_Int16x8_fromUint32x4Bits); +DEF_BUILTIN_HANDLER(emscripten_int16x8_fromUint16x8Bits, SIMD_Int16x8_fromUint16x8Bits); +DEF_BUILTIN_HANDLER(emscripten_int16x8_fromInt8x16Bits, SIMD_Int16x8_fromInt8x16Bits); +DEF_BUILTIN_HANDLER(emscripten_int16x8_fromUint8x16Bits, SIMD_Int16x8_fromUint8x16Bits); +DEF_BUILTIN_HANDLER(emscripten_int16x8_fromUint16x8, SIMD_Int16x8_fromUint16x8); +DEF_BUILTIN_HANDLER(emscripten_int16x8_swizzle, SIMD_Int16x8_swizzle); +DEF_BUILTIN_HANDLER(emscripten_int16x8_shuffle, SIMD_Int16x8_shuffle); + +// SIMD.js Uint16x8 +DEF_BUILTIN_HANDLER(emscripten_uint16x8_set, SIMD_Uint16x8); +DEF_BUILTIN_HANDLER(emscripten_uint16x8_splat, SIMD_Uint16x8_splat); +DEF_BUILTIN_HANDLER(emscripten_uint16x8_add, SIMD_Uint16x8_add); +DEF_BUILTIN_HANDLER(emscripten_uint16x8_sub, SIMD_Uint16x8_sub); +DEF_BUILTIN_HANDLER(emscripten_uint16x8_mul, SIMD_Uint16x8_mul); +// n.b. No emscripten_uint16x8_div, division is only defined on floating point types. +// n.b. No emscripten_uint16x8_max, only defined on floating point types. +// n.b. No emscripten_uint16x8_min, only defined on floating point types. +// n.b. No emscripten_uint16x8_maxNum, only defined on floating point types. +// n.b. No emscripten_uint16x8_minNum, only defined on floating point types. +DEF_BUILTIN_HANDLER(emscripten_uint16x8_neg, SIMD_Uint16x8_neg); +// n.b. No emscripten_uint16x8_sqrt, only defined on floating point types. +// n.b. No emscripten_uint16x8_reciprocalApproximation, only defined on floating point types. +// n.b. No emscripten_uint16x8_reciprocalSqrtApproximation, only defined on floating point types. +// n.b. No emscripten_uint16x8_abs, only defined on floating point types. 
+DEF_BUILTIN_HANDLER(emscripten_uint16x8_and, SIMD_Uint16x8_and); +DEF_BUILTIN_HANDLER(emscripten_uint16x8_xor, SIMD_Uint16x8_xor); +DEF_BUILTIN_HANDLER(emscripten_uint16x8_or, SIMD_Uint16x8_or); +DEF_BUILTIN_HANDLER(emscripten_uint16x8_not, SIMD_Uint16x8_not); +DEF_BUILTIN_HANDLER(emscripten_uint16x8_lessThan, SIMD_Uint16x8_lessThan); +DEF_BUILTIN_HANDLER(emscripten_uint16x8_lessThanOrEqual, SIMD_Uint16x8_lessThanOrEqual); +DEF_BUILTIN_HANDLER(emscripten_uint16x8_greaterThan, SIMD_Uint16x8_greaterThan); +DEF_BUILTIN_HANDLER(emscripten_uint16x8_greaterThanOrEqual, SIMD_Uint16x8_greaterThanOrEqual); +DEF_BUILTIN_HANDLER(emscripten_uint16x8_equal, SIMD_Uint16x8_equal); +DEF_BUILTIN_HANDLER(emscripten_uint16x8_notEqual, SIMD_Uint16x8_notEqual); +DEF_BUILTIN_HANDLER(emscripten_uint16x8_select, SIMD_Uint16x8_select); +DEF_BUILTIN_HANDLER(emscripten_uint16x8_addSaturate, SIMD_Uint16x8_addSaturate); +DEF_BUILTIN_HANDLER(emscripten_uint16x8_subSaturate, SIMD_Uint16x8_subSaturate); +DEF_BUILTIN_HANDLER(emscripten_uint16x8_shiftLeftByScalar, SIMD_Uint16x8_shiftLeftByScalar); +DEF_CALL_HANDLER(emscripten_uint16x8_shiftRightByScalar, { + UsesSIMDInt16x8 = true; + UsesSIMDUint16x8 = true; + return getAssign(CI) + "SIMD_Int16x8_fromUint16x8Bits(SIMD_Uint16x8_shiftRightByScalar(SIMD_Uint16x8_fromInt16x8Bits(" + getValueAsStr(CI->getOperand(0)) + "), " + getValueAsStr(CI->getOperand(1)) + "))"; +}) +DEF_BUILTIN_HANDLER(emscripten_uint16x8_extractLane, SIMD_Uint16x8_extractLane); +DEF_BUILTIN_HANDLER(emscripten_uint16x8_replaceLane, SIMD_Uint16x8_replaceLane); +DEF_BUILTIN_HANDLER(emscripten_uint16x8_store, SIMD_Uint16x8_store); +DEF_BUILTIN_HANDLER(emscripten_uint16x8_load, SIMD_Uint16x8_load); +DEF_BUILTIN_HANDLER(emscripten_uint16x8_fromFloat64x2Bits, SIMD_Uint16x8_fromFloat64x2Bits); +DEF_BUILTIN_HANDLER(emscripten_uint16x8_fromFloat32x4Bits, SIMD_Uint16x8_fromFloat32x4Bits); +DEF_BUILTIN_HANDLER(emscripten_uint16x8_fromInt32x4Bits, SIMD_Uint16x8_fromInt32x4Bits); +DEF_BUILTIN_HANDLER(emscripten_uint16x8_fromUint32x4Bits, SIMD_Uint16x8_fromUint32x4Bits); +DEF_BUILTIN_HANDLER(emscripten_uint16x8_fromInt16x8Bits, SIMD_Uint16x8_fromInt16x8Bits); +DEF_BUILTIN_HANDLER(emscripten_uint16x8_fromInt8x16Bits, SIMD_Uint16x8_fromInt8x16Bits); +DEF_BUILTIN_HANDLER(emscripten_uint16x8_fromUint8x16Bits, SIMD_Uint16x8_fromUint8x16Bits); +DEF_BUILTIN_HANDLER(emscripten_uint16x8_fromInt16x8, SIMD_Uint16x8_fromInt16x8); +DEF_BUILTIN_HANDLER(emscripten_uint16x8_swizzle, SIMD_Uint16x8_swizzle); +DEF_BUILTIN_HANDLER(emscripten_uint16x8_shuffle, SIMD_Uint16x8_shuffle); + +// SIMD.js Int8x16 +DEF_BUILTIN_HANDLER(emscripten_int8x16_set, SIMD_Int8x16); +DEF_BUILTIN_HANDLER(emscripten_int8x16_splat, SIMD_Int8x16_splat); +DEF_BUILTIN_HANDLER(emscripten_int8x16_add, SIMD_Int8x16_add); +DEF_BUILTIN_HANDLER(emscripten_int8x16_sub, SIMD_Int8x16_sub); +DEF_BUILTIN_HANDLER(emscripten_int8x16_mul, SIMD_Int8x16_mul); +// n.b. No emscripten_int8x16_div, division is only defined on floating point types. +// n.b. No emscripten_int8x16_max, only defined on floating point types. +// n.b. No emscripten_int8x16_min, only defined on floating point types. +// n.b. No emscripten_int8x16_maxNum, only defined on floating point types. +// n.b. No emscripten_int8x16_minNum, only defined on floating point types. +DEF_BUILTIN_HANDLER(emscripten_int8x16_neg, SIMD_Int8x16_neg); +// n.b. No emscripten_int8x16_sqrt, only defined on floating point types. +// n.b. No emscripten_int8x16_reciprocalApproximation, only defined on floating point types. +// n.b. 
No emscripten_int8x16_reciprocalSqrtApproximation, only defined on floating point types. +// n.b. No emscripten_int8x16_abs, only defined on floating point types. +DEF_BUILTIN_HANDLER(emscripten_int8x16_and, SIMD_Int8x16_and); +DEF_BUILTIN_HANDLER(emscripten_int8x16_xor, SIMD_Int8x16_xor); +DEF_BUILTIN_HANDLER(emscripten_int8x16_or, SIMD_Int8x16_or); +DEF_BUILTIN_HANDLER(emscripten_int8x16_not, SIMD_Int8x16_not); +DEF_CALL_HANDLER(emscripten_int8x16_lessThan, { + return getAssign(CI) + castBoolVecToIntVec(16, "SIMD_Int8x16_lessThan(" + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ")", true); +}) +DEF_CALL_HANDLER(emscripten_int8x16_lessThanOrEqual, { + return getAssign(CI) + castBoolVecToIntVec(16, "SIMD_Int8x16_lessThanOrEqual(" + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ")", true); +}) +DEF_CALL_HANDLER(emscripten_int8x16_greaterThan, { + return getAssign(CI) + castBoolVecToIntVec(16, "SIMD_Int8x16_greaterThan(" + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ")", true); +}) +DEF_CALL_HANDLER(emscripten_int8x16_greaterThanOrEqual, { + return getAssign(CI) + castBoolVecToIntVec(16, "SIMD_Int8x16_greaterThanOrEqual(" + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ")", true); +}) +DEF_CALL_HANDLER(emscripten_int8x16_equal, { + return getAssign(CI) + castBoolVecToIntVec(16, "SIMD_Int8x16_equal(" + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ")", true); +}) +DEF_CALL_HANDLER(emscripten_int8x16_notEqual, { + return getAssign(CI) + castBoolVecToIntVec(16, "SIMD_Int8x16_notEqual(" + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ")", true); +}) +DEF_CALL_HANDLER(emscripten_int8x16_select, { + // FIXME: We really need a more general way of handling boolean types, + // including an optimization to allow more Int8x16 operations to be + // translated as Bool8x16 operations. 
+ std::string Op; + if (SExtInst *SE = dyn_cast(CI->getOperand(0))) { + Op = getValueAsStr(SE->getOperand(0)); + } else { + Op = "SIMD_Int8x16_notEqual(" + getValueAsStr(CI->getOperand(0)) + ", SIMD_Int8x16_splat(0))"; + } + return getAssign(CI) + "SIMD_Int8x16_select(" + Op + "," + getValueAsStr(CI->getOperand(1)) + "," + getValueAsStr(CI->getOperand(2)) + ")"; +}) +DEF_BUILTIN_HANDLER(emscripten_int8x16_addSaturate, SIMD_Int8x16_addSaturate); +DEF_BUILTIN_HANDLER(emscripten_int8x16_subSaturate, SIMD_Int8x16_subSaturate); +DEF_BUILTIN_HANDLER(emscripten_int8x16_shiftLeftByScalar, SIMD_Int8x16_shiftLeftByScalar); +DEF_BUILTIN_HANDLER(emscripten_int8x16_shiftRightByScalar, SIMD_Int8x16_shiftRightByScalar); +DEF_BUILTIN_HANDLER(emscripten_int8x16_extractLane, SIMD_Int8x16_extractLane); +DEF_BUILTIN_HANDLER(emscripten_int8x16_replaceLane, SIMD_Int8x16_replaceLane); +DEF_CALL_HANDLER(emscripten_int8x16_store, { + UsesSIMDInt8x16 = true; + return "SIMD_Int8x16_store(HEAPU8, " + getValueAsStr(CI->getOperand(0)) + ", " + getValueAsStr(CI->getOperand(1)) + ")"; +}) +DEF_CALL_HANDLER(emscripten_int8x16_load, { + UsesSIMDInt8x16 = true; + return getAssign(CI) + "SIMD_Int8x16_load(HEAPU8, " + getValueAsStr(CI->getOperand(0)) + ")"; +}) +DEF_BUILTIN_HANDLER(emscripten_int8x16_fromFloat64x2Bits, SIMD_Int8x16_fromFloat64x2Bits); +DEF_BUILTIN_HANDLER(emscripten_int8x16_fromFloat32x4Bits, SIMD_Int8x16_fromFloat32x4Bits); +DEF_BUILTIN_HANDLER(emscripten_int8x16_fromInt32x4Bits, SIMD_Int8x16_fromInt32x4Bits); +DEF_BUILTIN_HANDLER(emscripten_int8x16_fromUint32x4Bits, SIMD_Int8x16_fromUint32x4Bits); +DEF_BUILTIN_HANDLER(emscripten_int8x16_fromInt16x8Bits, SIMD_Int8x16_fromInt16x8Bits); +DEF_BUILTIN_HANDLER(emscripten_int8x16_fromUint16x8Bits, SIMD_Int8x16_fromUint16x8Bits); +DEF_BUILTIN_HANDLER(emscripten_int8x16_fromUint8x16Bits, SIMD_Int8x16_fromUint8x16Bits); +DEF_BUILTIN_HANDLER(emscripten_int8x16_fromUint8x16, SIMD_Int8x16_fromUint8x16); +DEF_BUILTIN_HANDLER(emscripten_int8x16_swizzle, SIMD_Int8x16_swizzle); +DEF_BUILTIN_HANDLER(emscripten_int8x16_shuffle, SIMD_Int8x16_shuffle); + +// SIMD.js Uint8x16 +DEF_BUILTIN_HANDLER(emscripten_uint8x16_set, SIMD_Uint8x16); +DEF_BUILTIN_HANDLER(emscripten_uint8x16_splat, SIMD_Uint8x16_splat); +DEF_BUILTIN_HANDLER(emscripten_uint8x16_add, SIMD_Uint8x16_add); +DEF_BUILTIN_HANDLER(emscripten_uint8x16_sub, SIMD_Uint8x16_sub); +DEF_BUILTIN_HANDLER(emscripten_uint8x16_mul, SIMD_Uint8x16_mul); +// n.b. No emscripten_uint8x16_div, division is only defined on floating point types. +// n.b. No emscripten_uint8x16_max, only defined on floating point types. +// n.b. No emscripten_uint8x16_min, only defined on floating point types. +// n.b. No emscripten_uint8x16_maxNum, only defined on floating point types. +// n.b. No emscripten_uint8x16_minNum, only defined on floating point types. +DEF_BUILTIN_HANDLER(emscripten_uint8x16_neg, SIMD_Uint8x16_neg); +// n.b. No emscripten_uint8x16_sqrt, only defined on floating point types. +// n.b. No emscripten_uint8x16_reciprocalApproximation, only defined on floating point types. +// n.b. No emscripten_uint8x16_reciprocalSqrtApproximation, only defined on floating point types. +// n.b. No emscripten_uint8x16_abs, only defined on floating point types. 
+DEF_BUILTIN_HANDLER(emscripten_uint8x16_and, SIMD_Uint8x16_and); +DEF_BUILTIN_HANDLER(emscripten_uint8x16_xor, SIMD_Uint8x16_xor); +DEF_BUILTIN_HANDLER(emscripten_uint8x16_or, SIMD_Uint8x16_or); +DEF_BUILTIN_HANDLER(emscripten_uint8x16_not, SIMD_Uint8x16_not); +DEF_BUILTIN_HANDLER(emscripten_uint8x16_lessThan, SIMD_Uint8x16_lessThan); +DEF_BUILTIN_HANDLER(emscripten_uint8x16_lessThanOrEqual, SIMD_Uint8x16_lessThanOrEqual); +DEF_BUILTIN_HANDLER(emscripten_uint8x16_greaterThan, SIMD_Uint8x16_greaterThan); +DEF_BUILTIN_HANDLER(emscripten_uint8x16_greaterThanOrEqual, SIMD_Uint8x16_greaterThanOrEqual); +DEF_BUILTIN_HANDLER(emscripten_uint8x16_equal, SIMD_Uint8x16_equal); +DEF_BUILTIN_HANDLER(emscripten_uint8x16_notEqual, SIMD_Uint8x16_notEqual); +DEF_BUILTIN_HANDLER(emscripten_uint8x16_select, SIMD_Uint8x16_select); +DEF_BUILTIN_HANDLER(emscripten_uint8x16_addSaturate, SIMD_Uint8x16_addSaturate); +DEF_BUILTIN_HANDLER(emscripten_uint8x16_subSaturate, SIMD_Uint8x16_subSaturate); +DEF_BUILTIN_HANDLER(emscripten_uint8x16_shiftLeftByScalar, SIMD_Uint8x16_shiftLeftByScalar); +DEF_CALL_HANDLER(emscripten_uint8x16_shiftRightByScalar, { + UsesSIMDInt8x16 = true; + UsesSIMDUint8x16 = true; + return getAssign(CI) + "SIMD_Int8x16_fromUint8x16Bits(SIMD_Uint8x16_shiftRightByScalar(SIMD_Uint8x16_fromInt8x16Bits(" + getValueAsStr(CI->getOperand(0)) + "), " + getValueAsStr(CI->getOperand(1)) + "))"; +}) +DEF_BUILTIN_HANDLER(emscripten_uint8x16_extractLane, SIMD_Uint8x16_extractLane); +DEF_BUILTIN_HANDLER(emscripten_uint8x16_replaceLane, SIMD_Uint8x16_replaceLane); +DEF_BUILTIN_HANDLER(emscripten_uint8x16_store, SIMD_Uint8x16_store); +DEF_BUILTIN_HANDLER(emscripten_uint8x16_load, SIMD_Uint8x16_load); +DEF_BUILTIN_HANDLER(emscripten_uint8x16_fromFloat64x2Bits, SIMD_Uint8x16_fromFloat64x2Bits); +DEF_BUILTIN_HANDLER(emscripten_uint8x16_fromFloat32x4Bits, SIMD_Uint8x16_fromFloat32x4Bits); +DEF_BUILTIN_HANDLER(emscripten_uint8x16_fromInt32x4Bits, SIMD_Uint8x16_fromInt32x4Bits); +DEF_BUILTIN_HANDLER(emscripten_uint8x16_fromUint32x4Bits, SIMD_Uint8x16_fromUint32x4Bits); +DEF_BUILTIN_HANDLER(emscripten_uint8x16_fromInt16x8Bits, SIMD_Uint8x16_fromInt16x8Bits); +DEF_BUILTIN_HANDLER(emscripten_uint8x16_fromUint16x8Bits, SIMD_Uint8x16_fromUint16x8Bits); +DEF_BUILTIN_HANDLER(emscripten_uint8x16_fromInt8x16Bits, SIMD_Uint8x16_fromInt8x16Bits); +DEF_BUILTIN_HANDLER(emscripten_uint8x16_fromInt8x16, SIMD_Uint8x16_fromInt8x16); +DEF_BUILTIN_HANDLER(emscripten_uint8x16_swizzle, SIMD_Uint8x16_swizzle); +DEF_BUILTIN_HANDLER(emscripten_uint8x16_shuffle, SIMD_Uint8x16_shuffle); + +// SIMD.js Bool64x2 +DEF_BUILTIN_HANDLER(emscripten_bool64x2_anyTrue, SIMD_Bool64x2_anyTrue); +DEF_BUILTIN_HANDLER(emscripten_bool64x2_allTrue, SIMD_Bool64x2_allTrue); + +// SIMD.js Bool32x4 +DEF_BUILTIN_HANDLER(emscripten_bool32x4_anyTrue, SIMD_Bool32x4_anyTrue); +DEF_BUILTIN_HANDLER(emscripten_bool32x4_allTrue, SIMD_Bool32x4_allTrue); + +// SIMD.js Bool16x8 +DEF_BUILTIN_HANDLER(emscripten_bool16x8_anyTrue, SIMD_Bool16x8_anyTrue); +DEF_BUILTIN_HANDLER(emscripten_bool16x8_allTrue, SIMD_Bool16x8_allTrue); + +// SIMD.js Bool8x16 +DEF_BUILTIN_HANDLER(emscripten_bool8x16_anyTrue, SIMD_Bool8x16_anyTrue); +DEF_BUILTIN_HANDLER(emscripten_bool8x16_allTrue, SIMD_Bool8x16_allTrue); + +DEF_CALL_HANDLER(emscripten_atomic_fence, { + if (EnablePthreads) return "(Atomics_add(HEAP32, 0, 0)|0) /* fence */"; + else return "/* fence */"; +}) + +// Setups + +void setupCallHandlers() { + assert(CallHandlers.empty()); + #define SETUP_CALL_HANDLER(Ident) \ + CallHandlers["_" 
#Ident] = &JSWriter::CH_##Ident; + + SETUP_CALL_HANDLER(__default__); + SETUP_CALL_HANDLER(emscripten_preinvoke); + SETUP_CALL_HANDLER(emscripten_postinvoke); + SETUP_CALL_HANDLER(emscripten_landingpad); + SETUP_CALL_HANDLER(emscripten_resume); + SETUP_CALL_HANDLER(emscripten_prep_setjmp); + SETUP_CALL_HANDLER(emscripten_cleanup_setjmp); + SETUP_CALL_HANDLER(emscripten_setjmp); + SETUP_CALL_HANDLER(emscripten_longjmp); + SETUP_CALL_HANDLER(emscripten_check_longjmp); + SETUP_CALL_HANDLER(emscripten_get_longjmp_result); + SETUP_CALL_HANDLER(emscripten_alloc_async_context); + SETUP_CALL_HANDLER(emscripten_check_async); + SETUP_CALL_HANDLER(emscripten_do_not_unwind); + SETUP_CALL_HANDLER(emscripten_do_not_unwind_async); + SETUP_CALL_HANDLER(emscripten_get_async_return_value_addr); + SETUP_CALL_HANDLER(emscripten_debugger); + SETUP_CALL_HANDLER(llvm_debugtrap); + SETUP_CALL_HANDLER(getHigh32); + SETUP_CALL_HANDLER(setHigh32); + SETUP_CALL_HANDLER(FtoILow); + SETUP_CALL_HANDLER(FtoIHigh); + SETUP_CALL_HANDLER(DtoILow); + SETUP_CALL_HANDLER(DtoIHigh); + SETUP_CALL_HANDLER(BDtoILow); + SETUP_CALL_HANDLER(BDtoIHigh); + SETUP_CALL_HANDLER(SItoF); + SETUP_CALL_HANDLER(UItoF); + SETUP_CALL_HANDLER(SItoD); + SETUP_CALL_HANDLER(UItoD); + SETUP_CALL_HANDLER(BItoD); + SETUP_CALL_HANDLER(llvm_nacl_atomic_store_i32); + SETUP_CALL_HANDLER(llvm_nacl_atomic_cmpxchg_i8); + SETUP_CALL_HANDLER(llvm_nacl_atomic_cmpxchg_i16); + SETUP_CALL_HANDLER(llvm_nacl_atomic_cmpxchg_i32); + SETUP_CALL_HANDLER(llvm_memcpy_p0i8_p0i8_i32); + SETUP_CALL_HANDLER(llvm_memset_p0i8_i32); + SETUP_CALL_HANDLER(llvm_memmove_p0i8_p0i8_i32); + SETUP_CALL_HANDLER(llvm_expect_i32); + SETUP_CALL_HANDLER(llvm_expect_i1); + SETUP_CALL_HANDLER(llvm_dbg_declare); + SETUP_CALL_HANDLER(llvm_dbg_value); + SETUP_CALL_HANDLER(llvm_lifetime_start); + SETUP_CALL_HANDLER(llvm_lifetime_end); + SETUP_CALL_HANDLER(llvm_invariant_start_p0i8); + SETUP_CALL_HANDLER(llvm_invariant_end_p0i8); + SETUP_CALL_HANDLER(llvm_prefetch); + SETUP_CALL_HANDLER(llvm_objectsize_i32_p0i8); + SETUP_CALL_HANDLER(llvm_flt_rounds); + SETUP_CALL_HANDLER(bitshift64Lshr); + SETUP_CALL_HANDLER(bitshift64Ashr); + SETUP_CALL_HANDLER(bitshift64Shl); + SETUP_CALL_HANDLER(llvm_ctlz_i32); + SETUP_CALL_HANDLER(llvm_cttz_i32); + SETUP_CALL_HANDLER(llvm_ctlz_i64); + SETUP_CALL_HANDLER(llvm_cttz_i64); + SETUP_CALL_HANDLER(llvm_ctpop_i32); + SETUP_CALL_HANDLER(llvm_ctpop_i64); + SETUP_CALL_HANDLER(llvm_maxnum_f32); + SETUP_CALL_HANDLER(llvm_maxnum_f64); + SETUP_CALL_HANDLER(llvm_copysign_f32); + SETUP_CALL_HANDLER(llvm_copysign_f64); + + // SIMD.js Float64x2 + SETUP_CALL_HANDLER(emscripten_float64x2_set); + SETUP_CALL_HANDLER(emscripten_float64x2_splat); + SETUP_CALL_HANDLER(emscripten_float64x2_add); + SETUP_CALL_HANDLER(emscripten_float64x2_sub); + SETUP_CALL_HANDLER(emscripten_float64x2_mul); + SETUP_CALL_HANDLER(emscripten_float64x2_div); + SETUP_CALL_HANDLER(emscripten_float64x2_max); + SETUP_CALL_HANDLER(emscripten_float64x2_min); + SETUP_CALL_HANDLER(emscripten_float64x2_maxNum); + SETUP_CALL_HANDLER(emscripten_float64x2_minNum); + SETUP_CALL_HANDLER(emscripten_float64x2_neg); + SETUP_CALL_HANDLER(emscripten_float64x2_sqrt); + SETUP_CALL_HANDLER(emscripten_float64x2_reciprocalApproximation); + SETUP_CALL_HANDLER(emscripten_float64x2_reciprocalSqrtApproximation); + SETUP_CALL_HANDLER(emscripten_float64x2_abs); + // n.b. No emscripten_float64x2_and, only defined on boolean and integer SIMD types. + // n.b. No emscripten_float64x2_xor, only defined on boolean and integer SIMD types. 
+ // n.b. No emscripten_float64x2_or, only defined on boolean and integer SIMD types. + // n.b. No emscripten_float64x2_not, only defined on boolean and integer SIMD types. + SETUP_CALL_HANDLER(emscripten_float64x2_lessThan); + SETUP_CALL_HANDLER(emscripten_float64x2_lessThanOrEqual); + SETUP_CALL_HANDLER(emscripten_float64x2_greaterThan); + SETUP_CALL_HANDLER(emscripten_float64x2_greaterThanOrEqual); + SETUP_CALL_HANDLER(emscripten_float64x2_equal); + SETUP_CALL_HANDLER(emscripten_float64x2_notEqual); + // n.b. No emscripten_float64x2_anyTrue, only defined on boolean SIMD types. + // n.b. No emscripten_float64x2_allTrue, only defined on boolean SIMD types. + SETUP_CALL_HANDLER(emscripten_float64x2_select); + // n.b. No emscripten_float64x2_addSaturate, only defined on 8-bit and 16-bit integer SIMD types. + // n.b. No emscripten_float64x2_subSaturate, only defined on 8-bit and 16-bit integer SIMD types. + // n.b. No emscripten_float64x2_shiftLeftByScalar, only defined on integer SIMD types. + // n.b. No emscripten_float64x2_shiftRightByScalar, only defined on integer SIMD types. + SETUP_CALL_HANDLER(emscripten_float64x2_extractLane); + SETUP_CALL_HANDLER(emscripten_float64x2_replaceLane); + SETUP_CALL_HANDLER(emscripten_float64x2_store); + SETUP_CALL_HANDLER(emscripten_float64x2_store1); + SETUP_CALL_HANDLER(emscripten_float64x2_load); + SETUP_CALL_HANDLER(emscripten_float64x2_load1); + SETUP_CALL_HANDLER(emscripten_float64x2_fromFloat32x4Bits); + SETUP_CALL_HANDLER(emscripten_float64x2_fromInt32x4Bits); + SETUP_CALL_HANDLER(emscripten_float64x2_fromUint32x4Bits); + SETUP_CALL_HANDLER(emscripten_float64x2_fromInt16x8Bits); + SETUP_CALL_HANDLER(emscripten_float64x2_fromUint16x8Bits); + SETUP_CALL_HANDLER(emscripten_float64x2_fromInt8x16Bits); + SETUP_CALL_HANDLER(emscripten_float64x2_fromUint8x16Bits); + SETUP_CALL_HANDLER(emscripten_float64x2_swizzle); + SETUP_CALL_HANDLER(emscripten_float64x2_shuffle); + + // SIMD.js Float32x4 + SETUP_CALL_HANDLER(emscripten_float32x4_set); + SETUP_CALL_HANDLER(emscripten_float32x4_splat); + SETUP_CALL_HANDLER(emscripten_float32x4_add); + SETUP_CALL_HANDLER(emscripten_float32x4_sub); + SETUP_CALL_HANDLER(emscripten_float32x4_mul); + SETUP_CALL_HANDLER(emscripten_float32x4_div); + SETUP_CALL_HANDLER(emscripten_float32x4_max); + SETUP_CALL_HANDLER(emscripten_float32x4_min); + SETUP_CALL_HANDLER(emscripten_float32x4_maxNum); + SETUP_CALL_HANDLER(emscripten_float32x4_minNum); + SETUP_CALL_HANDLER(emscripten_float32x4_neg); + SETUP_CALL_HANDLER(emscripten_float32x4_sqrt); + SETUP_CALL_HANDLER(emscripten_float32x4_reciprocalApproximation); + SETUP_CALL_HANDLER(emscripten_float32x4_reciprocalSqrtApproximation); + SETUP_CALL_HANDLER(emscripten_float32x4_abs); + // n.b. No emscripten_float32x4_and, only defined on boolean and integer SIMD types. + // n.b. No emscripten_float32x4_xor, only defined on boolean and integer SIMD types. + // n.b. No emscripten_float32x4_or, only defined on boolean and integer SIMD types. + // n.b. No emscripten_float32x4_not, only defined on boolean and integer SIMD types. + SETUP_CALL_HANDLER(emscripten_float32x4_lessThan); + SETUP_CALL_HANDLER(emscripten_float32x4_lessThanOrEqual); + SETUP_CALL_HANDLER(emscripten_float32x4_greaterThan); + SETUP_CALL_HANDLER(emscripten_float32x4_greaterThanOrEqual); + SETUP_CALL_HANDLER(emscripten_float32x4_equal); + SETUP_CALL_HANDLER(emscripten_float32x4_notEqual); + // n.b. No emscripten_float32x4_anyTrue, only defined on boolean SIMD types. + // n.b. 
No emscripten_float32x4_allTrue, only defined on boolean SIMD types. + SETUP_CALL_HANDLER(emscripten_float32x4_select); + // n.b. No emscripten_float32x4_addSaturate, only defined on 8-bit and 16-bit integer SIMD types. + // n.b. No emscripten_float32x4_subSaturate, only defined on 8-bit and 16-bit integer SIMD types. + // n.b. No emscripten_float32x4_shiftLeftByScalar, only defined on integer SIMD types. + // n.b. No emscripten_float32x4_shiftRightByScalar, only defined on integer SIMD types. + SETUP_CALL_HANDLER(emscripten_float32x4_extractLane); + SETUP_CALL_HANDLER(emscripten_float32x4_replaceLane); + SETUP_CALL_HANDLER(emscripten_float32x4_store); + SETUP_CALL_HANDLER(emscripten_float32x4_store1); + SETUP_CALL_HANDLER(emscripten_float32x4_store2); + SETUP_CALL_HANDLER(emscripten_float32x4_store3); + SETUP_CALL_HANDLER(emscripten_float32x4_load); + SETUP_CALL_HANDLER(emscripten_float32x4_load1); + SETUP_CALL_HANDLER(emscripten_float32x4_load2); + SETUP_CALL_HANDLER(emscripten_float32x4_load3); + SETUP_CALL_HANDLER(emscripten_float32x4_fromFloat64x2Bits); + SETUP_CALL_HANDLER(emscripten_float32x4_fromInt32x4Bits); + SETUP_CALL_HANDLER(emscripten_float32x4_fromUint32x4Bits); + SETUP_CALL_HANDLER(emscripten_float32x4_fromInt16x8Bits); + SETUP_CALL_HANDLER(emscripten_float32x4_fromUint16x8Bits); + SETUP_CALL_HANDLER(emscripten_float32x4_fromInt8x16Bits); + SETUP_CALL_HANDLER(emscripten_float32x4_fromUint8x16Bits); + SETUP_CALL_HANDLER(emscripten_float32x4_fromInt32x4); + SETUP_CALL_HANDLER(emscripten_float32x4_fromUint32x4); + SETUP_CALL_HANDLER(emscripten_float32x4_swizzle); + SETUP_CALL_HANDLER(emscripten_float32x4_shuffle); + + // SIMD.js Int32x4 + SETUP_CALL_HANDLER(emscripten_int32x4_set); + SETUP_CALL_HANDLER(emscripten_int32x4_splat); + SETUP_CALL_HANDLER(emscripten_int32x4_add); + SETUP_CALL_HANDLER(emscripten_int32x4_sub); + SETUP_CALL_HANDLER(emscripten_int32x4_mul); + // n.b. No emscripten_int32x4_div, division is only defined on floating point types. + // n.b. No emscripten_int32x4_max, only defined on floating point types. + // n.b. No emscripten_int32x4_min, only defined on floating point types. + // n.b. No emscripten_int32x4_maxNum, only defined on floating point types. + // n.b. No emscripten_int32x4_minNum, only defined on floating point types. + SETUP_CALL_HANDLER(emscripten_int32x4_neg); + // n.b. No emscripten_int32x4_sqrt, only defined on floating point types. + // n.b. No emscripten_int32x4_reciprocalApproximation, only defined on floating point types. + // n.b. No emscripten_int32x4_reciprocalSqrtApproximation, only defined on floating point types. + // n.b. No emscripten_int32x4_abs, only defined on floating point types. + SETUP_CALL_HANDLER(emscripten_int32x4_and); + SETUP_CALL_HANDLER(emscripten_int32x4_xor); + SETUP_CALL_HANDLER(emscripten_int32x4_or); + SETUP_CALL_HANDLER(emscripten_int32x4_not); + SETUP_CALL_HANDLER(emscripten_int32x4_lessThan); + SETUP_CALL_HANDLER(emscripten_int32x4_lessThanOrEqual); + SETUP_CALL_HANDLER(emscripten_int32x4_greaterThan); + SETUP_CALL_HANDLER(emscripten_int32x4_greaterThanOrEqual); + SETUP_CALL_HANDLER(emscripten_int32x4_equal); + SETUP_CALL_HANDLER(emscripten_int32x4_notEqual); + // n.b. No emscripten_int32x4_anyTrue, only defined on boolean SIMD types. + // n.b. No emscripten_int32x4_allTrue, only defined on boolean SIMD types. + SETUP_CALL_HANDLER(emscripten_int32x4_select); + // n.b. No emscripten_int32x4_addSaturate, only defined on 8-bit and 16-bit integer SIMD types. + // n.b. 
No emscripten_int32x4_subSaturate, only defined on 8-bit and 16-bit integer SIMD types. + SETUP_CALL_HANDLER(emscripten_int32x4_shiftLeftByScalar); + SETUP_CALL_HANDLER(emscripten_int32x4_shiftRightByScalar); + SETUP_CALL_HANDLER(emscripten_int32x4_extractLane); + SETUP_CALL_HANDLER(emscripten_int32x4_replaceLane); + SETUP_CALL_HANDLER(emscripten_int32x4_store); + SETUP_CALL_HANDLER(emscripten_int32x4_store1); + SETUP_CALL_HANDLER(emscripten_int32x4_store2); + SETUP_CALL_HANDLER(emscripten_int32x4_store3); + SETUP_CALL_HANDLER(emscripten_int32x4_load); + SETUP_CALL_HANDLER(emscripten_int32x4_load1); + SETUP_CALL_HANDLER(emscripten_int32x4_load2); + SETUP_CALL_HANDLER(emscripten_int32x4_load3); + SETUP_CALL_HANDLER(emscripten_int32x4_fromFloat64x2Bits); + SETUP_CALL_HANDLER(emscripten_int32x4_fromFloat32x4Bits); + SETUP_CALL_HANDLER(emscripten_int32x4_fromUint32x4Bits); + SETUP_CALL_HANDLER(emscripten_int32x4_fromInt16x8Bits); + SETUP_CALL_HANDLER(emscripten_int32x4_fromUint16x8Bits); + SETUP_CALL_HANDLER(emscripten_int32x4_fromInt8x16Bits); + SETUP_CALL_HANDLER(emscripten_int32x4_fromUint8x16Bits); + SETUP_CALL_HANDLER(emscripten_int32x4_fromFloat32x4); + SETUP_CALL_HANDLER(emscripten_int32x4_fromUint32x4); +// SETUP_CALL_HANDLER(emscripten_int32x4_fromFloat64x2); // TODO: Unofficial extension + SETUP_CALL_HANDLER(emscripten_int32x4_swizzle); + SETUP_CALL_HANDLER(emscripten_int32x4_shuffle); + + // SIMD.js Uint32x4 + SETUP_CALL_HANDLER(emscripten_uint32x4_set); + SETUP_CALL_HANDLER(emscripten_uint32x4_splat); + SETUP_CALL_HANDLER(emscripten_uint32x4_add); + SETUP_CALL_HANDLER(emscripten_uint32x4_sub); + SETUP_CALL_HANDLER(emscripten_uint32x4_mul); + // n.b. No emscripten_uint32x4_div, division is only defined on floating point types. + // n.b. No emscripten_uint32x4_max, only defined on floating point types. + // n.b. No emscripten_uint32x4_min, only defined on floating point types. + // n.b. No emscripten_uint32x4_maxNum, only defined on floating point types. + // n.b. No emscripten_uint32x4_minNum, only defined on floating point types. + SETUP_CALL_HANDLER(emscripten_uint32x4_neg); + // n.b. No emscripten_uint32x4_sqrt, only defined on floating point types. + // n.b. No emscripten_uint32x4_reciprocalApproximation, only defined on floating point types. + // n.b. No emscripten_uint32x4_reciprocalSqrtApproximation, only defined on floating point types. + // n.b. No emscripten_uint32x4_abs, only defined on floating point types. + SETUP_CALL_HANDLER(emscripten_uint32x4_and); + SETUP_CALL_HANDLER(emscripten_uint32x4_xor); + SETUP_CALL_HANDLER(emscripten_uint32x4_or); + SETUP_CALL_HANDLER(emscripten_uint32x4_not); + SETUP_CALL_HANDLER(emscripten_uint32x4_lessThan); + SETUP_CALL_HANDLER(emscripten_uint32x4_lessThanOrEqual); + SETUP_CALL_HANDLER(emscripten_uint32x4_greaterThan); + SETUP_CALL_HANDLER(emscripten_uint32x4_greaterThanOrEqual); + SETUP_CALL_HANDLER(emscripten_uint32x4_equal); + SETUP_CALL_HANDLER(emscripten_uint32x4_notEqual); + // n.b. No emscripten_uint32x4_anyTrue, only defined on boolean SIMD types. + // n.b. No emscripten_uint32x4_allTrue, only defined on boolean SIMD types. + SETUP_CALL_HANDLER(emscripten_uint32x4_select); + // n.b. No emscripten_uint32x4_addSaturate, only defined on 8-bit and 16-bit integer SIMD types. + // n.b. No emscripten_uint32x4_subSaturate, only defined on 8-bit and 16-bit integer SIMD types. 
+ SETUP_CALL_HANDLER(emscripten_uint32x4_shiftLeftByScalar); + SETUP_CALL_HANDLER(emscripten_uint32x4_shiftRightByScalar); + SETUP_CALL_HANDLER(emscripten_uint32x4_extractLane); + SETUP_CALL_HANDLER(emscripten_uint32x4_replaceLane); + SETUP_CALL_HANDLER(emscripten_uint32x4_store); + SETUP_CALL_HANDLER(emscripten_uint32x4_store1); + SETUP_CALL_HANDLER(emscripten_uint32x4_store2); + SETUP_CALL_HANDLER(emscripten_uint32x4_store3); + SETUP_CALL_HANDLER(emscripten_uint32x4_load); + SETUP_CALL_HANDLER(emscripten_uint32x4_load1); + SETUP_CALL_HANDLER(emscripten_uint32x4_load2); + SETUP_CALL_HANDLER(emscripten_uint32x4_load3); + SETUP_CALL_HANDLER(emscripten_uint32x4_fromFloat64x2Bits); + SETUP_CALL_HANDLER(emscripten_uint32x4_fromFloat32x4Bits); + SETUP_CALL_HANDLER(emscripten_uint32x4_fromInt32x4Bits); + SETUP_CALL_HANDLER(emscripten_uint32x4_fromInt16x8Bits); + SETUP_CALL_HANDLER(emscripten_uint32x4_fromUint16x8Bits); + SETUP_CALL_HANDLER(emscripten_uint32x4_fromInt8x16Bits); + SETUP_CALL_HANDLER(emscripten_uint32x4_fromUint8x16Bits); + SETUP_CALL_HANDLER(emscripten_uint32x4_fromFloat32x4); + SETUP_CALL_HANDLER(emscripten_uint32x4_fromInt32x4); + // SETUP_CALL_HANDLER(emscripten_uint32x4_fromFloat64x2); // TODO: Unofficial extension + SETUP_CALL_HANDLER(emscripten_uint32x4_swizzle); + SETUP_CALL_HANDLER(emscripten_uint32x4_shuffle); + + // SIMD.js Int16x8 + SETUP_CALL_HANDLER(emscripten_int16x8_set); + SETUP_CALL_HANDLER(emscripten_int16x8_splat); + SETUP_CALL_HANDLER(emscripten_int16x8_add); + SETUP_CALL_HANDLER(emscripten_int16x8_sub); + SETUP_CALL_HANDLER(emscripten_int16x8_mul); + // n.b. No emscripten_int16x8_div, division is only defined on floating point types. + // n.b. No emscripten_int16x8_max, only defined on floating point types. + // n.b. No emscripten_int16x8_min, only defined on floating point types. + // n.b. No emscripten_int16x8_maxNum, only defined on floating point types. + // n.b. No emscripten_int16x8_minNum, only defined on floating point types. + SETUP_CALL_HANDLER(emscripten_int16x8_neg); + // n.b. No emscripten_int16x8_sqrt, only defined on floating point types. + // n.b. No emscripten_int16x8_reciprocalApproximation, only defined on floating point types. + // n.b. No emscripten_int16x8_reciprocalSqrtApproximation, only defined on floating point types. + // n.b. No emscripten_int16x8_abs, only defined on floating point types. + SETUP_CALL_HANDLER(emscripten_int16x8_and); + SETUP_CALL_HANDLER(emscripten_int16x8_xor); + SETUP_CALL_HANDLER(emscripten_int16x8_or); + SETUP_CALL_HANDLER(emscripten_int16x8_not); + SETUP_CALL_HANDLER(emscripten_int16x8_lessThan); + SETUP_CALL_HANDLER(emscripten_int16x8_lessThanOrEqual); + SETUP_CALL_HANDLER(emscripten_int16x8_greaterThan); + SETUP_CALL_HANDLER(emscripten_int16x8_greaterThanOrEqual); + SETUP_CALL_HANDLER(emscripten_int16x8_equal); + SETUP_CALL_HANDLER(emscripten_int16x8_notEqual); + // n.b. No emscripten_int16x8_anyTrue, only defined on boolean SIMD types. + // n.b. No emscripten_int16x8_allTrue, only defined on boolean SIMD types. 
+ SETUP_CALL_HANDLER(emscripten_int16x8_select); + SETUP_CALL_HANDLER(emscripten_int16x8_addSaturate); + SETUP_CALL_HANDLER(emscripten_int16x8_subSaturate); + SETUP_CALL_HANDLER(emscripten_int16x8_shiftLeftByScalar); + SETUP_CALL_HANDLER(emscripten_int16x8_shiftRightByScalar); + SETUP_CALL_HANDLER(emscripten_int16x8_extractLane); + SETUP_CALL_HANDLER(emscripten_int16x8_replaceLane); + SETUP_CALL_HANDLER(emscripten_int16x8_store); + SETUP_CALL_HANDLER(emscripten_int16x8_load); + SETUP_CALL_HANDLER(emscripten_int16x8_fromFloat64x2Bits); + SETUP_CALL_HANDLER(emscripten_int16x8_fromFloat32x4Bits); + SETUP_CALL_HANDLER(emscripten_int16x8_fromInt32x4Bits); + SETUP_CALL_HANDLER(emscripten_int16x8_fromUint32x4Bits); + SETUP_CALL_HANDLER(emscripten_int16x8_fromUint16x8Bits); + SETUP_CALL_HANDLER(emscripten_int16x8_fromInt8x16Bits); + SETUP_CALL_HANDLER(emscripten_int16x8_fromUint8x16Bits); + SETUP_CALL_HANDLER(emscripten_int16x8_fromUint16x8); + SETUP_CALL_HANDLER(emscripten_int16x8_swizzle); + SETUP_CALL_HANDLER(emscripten_int16x8_shuffle); + + // SIMD.js Uint16x8 + SETUP_CALL_HANDLER(emscripten_uint16x8_set); + SETUP_CALL_HANDLER(emscripten_uint16x8_splat); + SETUP_CALL_HANDLER(emscripten_uint16x8_add); + SETUP_CALL_HANDLER(emscripten_uint16x8_sub); + SETUP_CALL_HANDLER(emscripten_uint16x8_mul); + // n.b. No emscripten_uint16x8_div, division is only defined on floating point types. + // n.b. No emscripten_uint16x8_max, only defined on floating point types. + // n.b. No emscripten_uint16x8_min, only defined on floating point types. + // n.b. No emscripten_uint16x8_maxNum, only defined on floating point types. + // n.b. No emscripten_uint16x8_minNum, only defined on floating point types. + SETUP_CALL_HANDLER(emscripten_uint16x8_neg); + // n.b. No emscripten_uint16x8_sqrt, only defined on floating point types. + // n.b. No emscripten_uint16x8_reciprocalApproximation, only defined on floating point types. + // n.b. No emscripten_uint16x8_reciprocalSqrtApproximation, only defined on floating point types. + // n.b. No emscripten_uint16x8_abs, only defined on floating point types. + SETUP_CALL_HANDLER(emscripten_uint16x8_and); + SETUP_CALL_HANDLER(emscripten_uint16x8_xor); + SETUP_CALL_HANDLER(emscripten_uint16x8_or); + SETUP_CALL_HANDLER(emscripten_uint16x8_not); + SETUP_CALL_HANDLER(emscripten_uint16x8_lessThan); + SETUP_CALL_HANDLER(emscripten_uint16x8_lessThanOrEqual); + SETUP_CALL_HANDLER(emscripten_uint16x8_greaterThan); + SETUP_CALL_HANDLER(emscripten_uint16x8_greaterThanOrEqual); + SETUP_CALL_HANDLER(emscripten_uint16x8_equal); + SETUP_CALL_HANDLER(emscripten_uint16x8_notEqual); + // n.b. No emscripten_uint16x8_anyTrue, only defined on boolean SIMD types. + // n.b. No emscripten_uint16x8_allTrue, only defined on boolean SIMD types. 
+ SETUP_CALL_HANDLER(emscripten_uint16x8_select); + SETUP_CALL_HANDLER(emscripten_uint16x8_addSaturate); + SETUP_CALL_HANDLER(emscripten_uint16x8_subSaturate); + SETUP_CALL_HANDLER(emscripten_uint16x8_shiftLeftByScalar); + SETUP_CALL_HANDLER(emscripten_uint16x8_shiftRightByScalar); + SETUP_CALL_HANDLER(emscripten_uint16x8_extractLane); + SETUP_CALL_HANDLER(emscripten_uint16x8_replaceLane); + SETUP_CALL_HANDLER(emscripten_uint16x8_store); + SETUP_CALL_HANDLER(emscripten_uint16x8_load); + SETUP_CALL_HANDLER(emscripten_uint16x8_fromFloat64x2Bits); + SETUP_CALL_HANDLER(emscripten_uint16x8_fromFloat32x4Bits); + SETUP_CALL_HANDLER(emscripten_uint16x8_fromInt32x4Bits); + SETUP_CALL_HANDLER(emscripten_uint16x8_fromUint32x4Bits); + SETUP_CALL_HANDLER(emscripten_uint16x8_fromInt16x8Bits); + SETUP_CALL_HANDLER(emscripten_uint16x8_fromInt8x16Bits); + SETUP_CALL_HANDLER(emscripten_uint16x8_fromUint8x16Bits); + SETUP_CALL_HANDLER(emscripten_uint16x8_fromInt16x8); + SETUP_CALL_HANDLER(emscripten_uint16x8_swizzle); + SETUP_CALL_HANDLER(emscripten_uint16x8_shuffle); + + // SIMD.js Int8x16 + SETUP_CALL_HANDLER(emscripten_int8x16_set); + SETUP_CALL_HANDLER(emscripten_int8x16_splat); + SETUP_CALL_HANDLER(emscripten_int8x16_add); + SETUP_CALL_HANDLER(emscripten_int8x16_sub); + SETUP_CALL_HANDLER(emscripten_int8x16_mul); + // n.b. No emscripten_int8x16_div, division is only defined on floating point types. + // n.b. No emscripten_int8x16_max, only defined on floating point types. + // n.b. No emscripten_int8x16_min, only defined on floating point types. + // n.b. No emscripten_int8x16_maxNum, only defined on floating point types. + // n.b. No emscripten_int8x16_minNum, only defined on floating point types. + SETUP_CALL_HANDLER(emscripten_int8x16_neg); + // n.b. No emscripten_int8x16_sqrt, only defined on floating point types. + // n.b. No emscripten_int8x16_reciprocalApproximation, only defined on floating point types. + // n.b. No emscripten_int8x16_reciprocalSqrtApproximation, only defined on floating point types. + // n.b. No emscripten_int8x16_abs, only defined on floating point types. + SETUP_CALL_HANDLER(emscripten_int8x16_and); + SETUP_CALL_HANDLER(emscripten_int8x16_xor); + SETUP_CALL_HANDLER(emscripten_int8x16_or); + SETUP_CALL_HANDLER(emscripten_int8x16_not); + SETUP_CALL_HANDLER(emscripten_int8x16_lessThan); + SETUP_CALL_HANDLER(emscripten_int8x16_lessThanOrEqual); + SETUP_CALL_HANDLER(emscripten_int8x16_greaterThan); + SETUP_CALL_HANDLER(emscripten_int8x16_greaterThanOrEqual); + SETUP_CALL_HANDLER(emscripten_int8x16_equal); + SETUP_CALL_HANDLER(emscripten_int8x16_notEqual); + // n.b. No emscripten_int8x16_anyTrue, only defined on boolean SIMD types. + // n.b. No emscripten_int8x16_allTrue, only defined on boolean SIMD types. 
+ SETUP_CALL_HANDLER(emscripten_int8x16_select); + SETUP_CALL_HANDLER(emscripten_int8x16_addSaturate); + SETUP_CALL_HANDLER(emscripten_int8x16_subSaturate); + SETUP_CALL_HANDLER(emscripten_int8x16_shiftLeftByScalar); + SETUP_CALL_HANDLER(emscripten_int8x16_shiftRightByScalar); + SETUP_CALL_HANDLER(emscripten_int8x16_extractLane); + SETUP_CALL_HANDLER(emscripten_int8x16_replaceLane); + SETUP_CALL_HANDLER(emscripten_int8x16_store); + SETUP_CALL_HANDLER(emscripten_int8x16_load); + SETUP_CALL_HANDLER(emscripten_int8x16_fromFloat64x2Bits); + SETUP_CALL_HANDLER(emscripten_int8x16_fromFloat32x4Bits); + SETUP_CALL_HANDLER(emscripten_int8x16_fromInt32x4Bits); + SETUP_CALL_HANDLER(emscripten_int8x16_fromUint32x4Bits); + SETUP_CALL_HANDLER(emscripten_int8x16_fromInt16x8Bits); + SETUP_CALL_HANDLER(emscripten_int8x16_fromUint16x8Bits); + SETUP_CALL_HANDLER(emscripten_int8x16_fromUint8x16Bits); + SETUP_CALL_HANDLER(emscripten_int8x16_fromUint8x16); + SETUP_CALL_HANDLER(emscripten_int8x16_swizzle); + SETUP_CALL_HANDLER(emscripten_int8x16_shuffle); + + // SIMD.js Uint8x16 + SETUP_CALL_HANDLER(emscripten_uint8x16_set); + SETUP_CALL_HANDLER(emscripten_uint8x16_splat); + SETUP_CALL_HANDLER(emscripten_uint8x16_add); + SETUP_CALL_HANDLER(emscripten_uint8x16_sub); + SETUP_CALL_HANDLER(emscripten_uint8x16_mul); + // n.b. No emscripten_uint8x16_div, division is only defined on floating point types. + // n.b. No emscripten_uint8x16_max, only defined on floating point types. + // n.b. No emscripten_uint8x16_min, only defined on floating point types. + // n.b. No emscripten_uint8x16_maxNum, only defined on floating point types. + // n.b. No emscripten_uint8x16_minNum, only defined on floating point types. + SETUP_CALL_HANDLER(emscripten_uint8x16_neg); + // n.b. No emscripten_uint8x16_sqrt, only defined on floating point types. + // n.b. No emscripten_uint8x16_reciprocalApproximation, only defined on floating point types. + // n.b. No emscripten_uint8x16_reciprocalSqrtApproximation, only defined on floating point types. + // n.b. No emscripten_uint8x16_abs, only defined on floating point types. + SETUP_CALL_HANDLER(emscripten_uint8x16_and); + SETUP_CALL_HANDLER(emscripten_uint8x16_xor); + SETUP_CALL_HANDLER(emscripten_uint8x16_or); + SETUP_CALL_HANDLER(emscripten_uint8x16_not); + SETUP_CALL_HANDLER(emscripten_uint8x16_lessThan); + SETUP_CALL_HANDLER(emscripten_uint8x16_lessThanOrEqual); + SETUP_CALL_HANDLER(emscripten_uint8x16_greaterThan); + SETUP_CALL_HANDLER(emscripten_uint8x16_greaterThanOrEqual); + SETUP_CALL_HANDLER(emscripten_uint8x16_equal); + SETUP_CALL_HANDLER(emscripten_uint8x16_notEqual); + // n.b. No emscripten_uint8x16_anyTrue, only defined on boolean SIMD types. + // n.b. No emscripten_uint8x16_allTrue, only defined on boolean SIMD types. 
+ SETUP_CALL_HANDLER(emscripten_uint8x16_select); + SETUP_CALL_HANDLER(emscripten_uint8x16_addSaturate); + SETUP_CALL_HANDLER(emscripten_uint8x16_subSaturate); + SETUP_CALL_HANDLER(emscripten_uint8x16_shiftLeftByScalar); + SETUP_CALL_HANDLER(emscripten_uint8x16_shiftRightByScalar); + SETUP_CALL_HANDLER(emscripten_uint8x16_extractLane); + SETUP_CALL_HANDLER(emscripten_uint8x16_replaceLane); + SETUP_CALL_HANDLER(emscripten_uint8x16_store); + SETUP_CALL_HANDLER(emscripten_uint8x16_load); + SETUP_CALL_HANDLER(emscripten_uint8x16_fromFloat64x2Bits); + SETUP_CALL_HANDLER(emscripten_uint8x16_fromFloat32x4Bits); + SETUP_CALL_HANDLER(emscripten_uint8x16_fromInt32x4Bits); + SETUP_CALL_HANDLER(emscripten_uint8x16_fromUint32x4Bits); + SETUP_CALL_HANDLER(emscripten_uint8x16_fromInt16x8Bits); + SETUP_CALL_HANDLER(emscripten_uint8x16_fromUint16x8Bits); + SETUP_CALL_HANDLER(emscripten_uint8x16_fromInt8x16Bits); + SETUP_CALL_HANDLER(emscripten_uint8x16_fromInt8x16); + SETUP_CALL_HANDLER(emscripten_uint8x16_swizzle); + SETUP_CALL_HANDLER(emscripten_uint8x16_shuffle); + + // SIMD.js Bool64x2 + SETUP_CALL_HANDLER(emscripten_bool64x2_anyTrue); + SETUP_CALL_HANDLER(emscripten_bool64x2_allTrue); + + // SIMD.js Bool32x4 + SETUP_CALL_HANDLER(emscripten_bool32x4_anyTrue); + SETUP_CALL_HANDLER(emscripten_bool32x4_allTrue); + + // SIMD.js Bool16x8 + SETUP_CALL_HANDLER(emscripten_bool16x8_anyTrue); + SETUP_CALL_HANDLER(emscripten_bool16x8_allTrue); + + // SIMD.js Bool8x16 + SETUP_CALL_HANDLER(emscripten_bool8x16_anyTrue); + SETUP_CALL_HANDLER(emscripten_bool8x16_allTrue); + + SETUP_CALL_HANDLER(emscripten_asm_const); + SETUP_CALL_HANDLER(emscripten_asm_const_int); + SETUP_CALL_HANDLER(emscripten_asm_const_double); + + SETUP_CALL_HANDLER(emscripten_atomic_exchange_u8); + SETUP_CALL_HANDLER(emscripten_atomic_exchange_u16); + SETUP_CALL_HANDLER(emscripten_atomic_exchange_u32); + + SETUP_CALL_HANDLER(emscripten_atomic_cas_u8); + SETUP_CALL_HANDLER(emscripten_atomic_cas_u16); + SETUP_CALL_HANDLER(emscripten_atomic_cas_u32); + + SETUP_CALL_HANDLER(emscripten_atomic_load_u8); + SETUP_CALL_HANDLER(emscripten_atomic_load_u16); + SETUP_CALL_HANDLER(emscripten_atomic_load_u32); + SETUP_CALL_HANDLER(emscripten_atomic_load_f32); + SETUP_CALL_HANDLER(emscripten_atomic_load_f64); + + SETUP_CALL_HANDLER(emscripten_atomic_store_u8); + SETUP_CALL_HANDLER(emscripten_atomic_store_u16); + SETUP_CALL_HANDLER(emscripten_atomic_store_u32); + SETUP_CALL_HANDLER(emscripten_atomic_store_f32); + SETUP_CALL_HANDLER(emscripten_atomic_store_f64); + + SETUP_CALL_HANDLER(emscripten_atomic_add_u8); + SETUP_CALL_HANDLER(emscripten_atomic_add_u16); + SETUP_CALL_HANDLER(emscripten_atomic_add_u32); + + SETUP_CALL_HANDLER(emscripten_atomic_sub_u8); + SETUP_CALL_HANDLER(emscripten_atomic_sub_u16); + SETUP_CALL_HANDLER(emscripten_atomic_sub_u32); + + SETUP_CALL_HANDLER(emscripten_atomic_and_u8); + SETUP_CALL_HANDLER(emscripten_atomic_and_u16); + SETUP_CALL_HANDLER(emscripten_atomic_and_u32); + + SETUP_CALL_HANDLER(emscripten_atomic_or_u8); + SETUP_CALL_HANDLER(emscripten_atomic_or_u16); + SETUP_CALL_HANDLER(emscripten_atomic_or_u32); + + SETUP_CALL_HANDLER(emscripten_atomic_xor_u8); + SETUP_CALL_HANDLER(emscripten_atomic_xor_u16); + SETUP_CALL_HANDLER(emscripten_atomic_xor_u32); + + SETUP_CALL_HANDLER(emscripten_atomic_fence); + + SETUP_CALL_HANDLER(abs); + SETUP_CALL_HANDLER(labs); + SETUP_CALL_HANDLER(cos); + SETUP_CALL_HANDLER(cosf); + SETUP_CALL_HANDLER(cosl); + SETUP_CALL_HANDLER(sin); + SETUP_CALL_HANDLER(sinf); + SETUP_CALL_HANDLER(sinl); + 
SETUP_CALL_HANDLER(tan);
+  SETUP_CALL_HANDLER(tanf);
+  SETUP_CALL_HANDLER(tanl);
+  SETUP_CALL_HANDLER(acos);
+  SETUP_CALL_HANDLER(acosf);
+  SETUP_CALL_HANDLER(acosl);
+  SETUP_CALL_HANDLER(asin);
+  SETUP_CALL_HANDLER(asinf);
+  SETUP_CALL_HANDLER(asinl);
+  SETUP_CALL_HANDLER(atan);
+  SETUP_CALL_HANDLER(atanf);
+  SETUP_CALL_HANDLER(atanl);
+  SETUP_CALL_HANDLER(atan2);
+  SETUP_CALL_HANDLER(atan2f);
+  SETUP_CALL_HANDLER(atan2l);
+  SETUP_CALL_HANDLER(exp);
+  SETUP_CALL_HANDLER(expf);
+  SETUP_CALL_HANDLER(expl);
+  SETUP_CALL_HANDLER(log);
+  SETUP_CALL_HANDLER(logf);
+  SETUP_CALL_HANDLER(logl);
+  SETUP_CALL_HANDLER(sqrt);
+  SETUP_CALL_HANDLER(sqrtf);
+  SETUP_CALL_HANDLER(sqrtl);
+  SETUP_CALL_HANDLER(fabs);
+  SETUP_CALL_HANDLER(fabsf);
+  SETUP_CALL_HANDLER(fabsl);
+  SETUP_CALL_HANDLER(llvm_fabs_f32);
+  SETUP_CALL_HANDLER(llvm_fabs_f64);
+  SETUP_CALL_HANDLER(ceil);
+  SETUP_CALL_HANDLER(ceilf);
+  SETUP_CALL_HANDLER(ceill);
+  SETUP_CALL_HANDLER(llvm_ceil_f32);
+  SETUP_CALL_HANDLER(llvm_ceil_f64);
+  SETUP_CALL_HANDLER(floor);
+  SETUP_CALL_HANDLER(floorf);
+  SETUP_CALL_HANDLER(floorl);
+  SETUP_CALL_HANDLER(llvm_floor_f32);
+  SETUP_CALL_HANDLER(llvm_floor_f64);
+  SETUP_CALL_HANDLER(pow);
+  SETUP_CALL_HANDLER(powf);
+  SETUP_CALL_HANDLER(powl);
+  SETUP_CALL_HANDLER(llvm_sqrt_f32);
+  SETUP_CALL_HANDLER(llvm_sqrt_f64);
+  SETUP_CALL_HANDLER(llvm_pow_f32);
+  SETUP_CALL_HANDLER(llvm_pow_f64);
+  SETUP_CALL_HANDLER(llvm_powi_f32);
+  SETUP_CALL_HANDLER(llvm_powi_f64);
+  SETUP_CALL_HANDLER(llvm_log_f32);
+  SETUP_CALL_HANDLER(llvm_log_f64);
+  SETUP_CALL_HANDLER(llvm_exp_f32);
+  SETUP_CALL_HANDLER(llvm_exp_f64);
+  SETUP_CALL_HANDLER(llvm_cos_f32);
+  SETUP_CALL_HANDLER(llvm_cos_f64);
+  SETUP_CALL_HANDLER(llvm_sin_f32);
+  SETUP_CALL_HANDLER(llvm_sin_f64);
+}
+
+std::string handleCall(const Instruction *CI) {
+  const Value *CV = getActuallyCalledValue(CI);
+  if (const InlineAsm* IA = dyn_cast<InlineAsm>(CV)) {
+    if (IA->hasSideEffects() && IA->getAsmString() == "") {
+      return "/* asm() memory 'barrier' */";
+    } else {
+      errs() << "In function " << CI->getParent()->getParent()->getName() << "()\n";
+      errs() << *IA << "\n";
+      report_fatal_error("asm() with non-empty content not supported, use EM_ASM() (see emscripten.h)");
+    }
+  }
+
+  // Get the name to call this function by. If it's a direct call, meaning
+  // we know which Function we're calling, avoid calling getValueAsStr, as
+  // we don't need to use a function index.
+  const std::string &Name = isa<Function>(CV) ? getJSName(CV) : getValueAsStr(CV);
+
+  CallHandlerMap::iterator CH = CallHandlers.find("___default__");
+  if (isa<Function>(CV)) {
+    CallHandlerMap::iterator Custom = CallHandlers.find(Name);
+    if (Custom != CallHandlers.end()) CH = Custom;
+  }
+  return (this->*(CH->second))(CI, Name, -1);
+}
diff --git a/lib/Target/JSBackend/ExpandBigSwitches.cpp b/lib/Target/JSBackend/ExpandBigSwitches.cpp
new file mode 100644
index 000000000000..c28e04a38db0
--- /dev/null
+++ b/lib/Target/JSBackend/ExpandBigSwitches.cpp
@@ -0,0 +1,157 @@
+//===-- ExpandBigSwitches.cpp - Split large switches ---------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===-----------------------------------------------------------------------===//
+//
+// Very large switches can be a problem for JS engines. We split them up here.
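+// A switch is left alone while it has fewer than 1024 cases, its case values
+// span no more than 10*1024, and there is on average at least one case per
+// 1024 values of that range; anything bigger or sparser is split repeatedly
+// around the median case value (see ConsiderSplit and DoSplit below).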
+//
+//===-----------------------------------------------------------------------===//
+
+#include "OptPasses.h"
+
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <algorithm>
+#include <vector>
+
+namespace llvm {
+
+struct ExpandBigSwitches : public FunctionPass {
+  static char ID; // Pass identification, replacement for typeid
+  ExpandBigSwitches() : FunctionPass(ID) {}
+  // XXX initialize..(*PassRegistry::getPassRegistry()); }
+
+  bool runOnFunction(Function &Func) override;
+
+  StringRef getPassName() const override { return "ExpandBigSwitches"; }
+};
+
+char ExpandBigSwitches::ID = 0;
+
+// Check whether we need to split a switch. If so, compute the median case
+// value, which is where we will split.
+static bool ConsiderSplit(const SwitchInst *SI, int64_t& Median) {
+  int64_t Minn = INT64_MAX, Maxx = INT64_MIN;
+  std::vector<int64_t> Values;
+  for (SwitchInst::ConstCaseIt i = SI->case_begin(), e = SI->case_end(); i != e; ++i) {
+    int64_t Curr = i.getCaseValue()->getSExtValue();
+    if (Curr < Minn) Minn = Curr;
+    if (Curr > Maxx) Maxx = Curr;
+    Values.push_back(Curr);
+  }
+  int64_t Range = Maxx - Minn;
+  int Num = SI->getNumCases();
+  if (Num < 1024 && Range <= 10*1024 && (Range/Num) <= 1024) return false;
+  // this is either too big, or too rangey
+  std::sort(Values.begin(), Values.end());
+  Median = Values[Values.size()/2];
+  return true;
+}
+
+static void DoSplit(SwitchInst *SI, int64_t Median) {
+  // switch (x) { ..very many.. }
+  //
+  //   ==>
+  //
+  // if (x < median) {
+  //   switch (x) { ..first half.. }
+  // } else {
+  //   switch (x) { ..second half.. }
+  // }
+
+  BasicBlock *SwitchBB = SI->getParent();
+  Function *F = SwitchBB->getParent();
+  Value *Condition = SI->getOperand(0);
+  BasicBlock *DD = SI->getDefaultDest();
+  unsigned NumItems = SI->getNumCases();
+  Type *T = Condition->getType();
+
+  Instruction *Check = new ICmpInst(SI, ICmpInst::ICMP_SLT, Condition, ConstantInt::get(T, Median), "switch-split");
+  BasicBlock *LowBB = BasicBlock::Create(SI->getContext(), "switchsplit_low", F);
+  BasicBlock *HighBB = BasicBlock::Create(SI->getContext(), "switchsplit_high", F);
+  BranchInst *Br = BranchInst::Create(LowBB, HighBB, Check, SwitchBB);
+
+  SwitchInst *LowSI = SwitchInst::Create(Condition, DD, NumItems/2, LowBB);
+  SwitchInst *HighSI = SwitchInst::Create(Condition, DD, NumItems/2, HighBB);
+
+  for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); i != e; ++i) {
+    BasicBlock *BB = i.getCaseSuccessor();
+    auto Value = i.getCaseValue();
+    SwitchInst *NewSI = Value->getSExtValue() < Median ? LowSI : HighSI;
+    NewSI->addCase(Value, BB);
+    // update phis
+    BasicBlock *NewBB = NewSI->getParent();
+    for (BasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) {
+      PHINode *Phi = dyn_cast<PHINode>(I);
+      if (!Phi) break;
+      int Index = Phi->getBasicBlockIndex(SwitchBB);
+      if (Index < 0) continue;
+      Phi->addIncoming(Phi->getIncomingValue(Index), NewBB);
+      Phi->removeIncomingValue(Index);
+    }
+  }
+
+  // fix default dest
+  for (BasicBlock::iterator I = DD->begin(); I != DD->end(); ++I) {
+    PHINode *Phi = dyn_cast<PHINode>(I);
+    if (!Phi) break;
+    int Index = Phi->getBasicBlockIndex(SwitchBB);
+    if (Index < 0) continue;
+    Phi->addIncoming(Phi->getIncomingValue(Index), LowBB);
+    Phi->addIncoming(Phi->getIncomingValue(Index), HighBB);
+    Phi->removeIncomingValue(Index);
+  }
+
+  // finish up
+  SI->eraseFromParent();
+  assert(SwitchBB->getTerminator() == Br);
+  assert(LowSI->getNumCases() + HighSI->getNumCases() == NumItems);
+  assert(LowSI->getNumCases() < HighSI->getNumCases() + 2);
+  assert(HighSI->getNumCases() < LowSI->getNumCases() + 2);
+}
+
+bool ExpandBigSwitches::runOnFunction(Function &Func) {
+  bool Changed = false;
+
+  struct SplitInfo {
+    SwitchInst *SI;
+    int64_t Median;
+  };
+
+  while (1) { // repeatedly split in 2
+    std::vector<SplitInfo> ToSplit;
+    // find switches we need to split
+    for (Function::iterator B = Func.begin(), E = Func.end(); B != E; ++B) {
+      Instruction *I = B->getTerminator();
+      SwitchInst *SI = dyn_cast<SwitchInst>(I);
+      if (!SI) continue;
+      SplitInfo Curr;
+      if (!ConsiderSplit(SI, Curr.Median)) continue;
+      Curr.SI = SI;
+      Changed = true;
+      ToSplit.push_back(Curr);
+    }
+    if (ToSplit.size() == 0) break;
+    // split them
+    for (auto& Curr : ToSplit) {
+      DoSplit(Curr.SI, Curr.Median);
+    }
+  }
+
+  return Changed;
+}
+
+//
+
+extern FunctionPass *createEmscriptenExpandBigSwitchesPass() {
+  return new ExpandBigSwitches();
+}
+
+} // End llvm namespace
diff --git a/lib/Target/JSBackend/JS.h b/lib/Target/JSBackend/JS.h
new file mode 100644
index 000000000000..6fe22426b8ea
--- /dev/null
+++ b/lib/Target/JSBackend/JS.h
@@ -0,0 +1,29 @@
+//===-- JS.h - Top-level interface for JS representation ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the JS
+// target library, as used by the LLVM JIT.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TARGET_JS_H
+#define TARGET_JS_H
+
+namespace llvm {
+
+class ImmutablePass;
+class JSTargetMachine;
+
+/// \brief Creates a JS-specific Target Transformation Info pass.
+ImmutablePass *createJSTargetTransformInfoPass(const JSTargetMachine *TM);
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/JSBackend/JSBackend.cpp b/lib/Target/JSBackend/JSBackend.cpp
new file mode 100644
index 000000000000..8ff9911a0ced
--- /dev/null
+++ b/lib/Target/JSBackend/JSBackend.cpp
@@ -0,0 +1,4475 @@
+//===-- JSBackend.cpp - Library for converting LLVM code to JS -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// +// +// This file implements compiling of LLVM IR, which is assumed to have been +// simplified using the PNaCl passes, i64 legalization, and other necessary +// transformations, into JavaScript in asm.js format, suitable for passing +// to emscripten for final processing. +// +//===----------------------------------------------------------------------===// + +#include "JSTargetMachine.h" +#include "MCTargetDesc/JSBackendMCTargetDesc.h" +#include "AllocaManager.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Config/config.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" +#include "llvm/Pass.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/IR/CallSite.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/IR/GetElementPtrTypeIterator.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/ScopedPrinter.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/NaCl.h" +#include "llvm/Transforms/Scalar.h" +#include +#include +#include +#include // TODO: unordered_set? +#include +using namespace llvm; + +#include +#include + +raw_ostream &prettyWarning() { + errs().changeColor(raw_ostream::YELLOW); + errs() << "warning:"; + errs().resetColor(); + errs() << " "; + return errs(); +} + +static cl::opt +PreciseF32("emscripten-precise-f32", + cl::desc("Enables Math.fround usage to implement precise float32 semantics and performance (see emscripten PRECISE_F32 option)"), + cl::init(false)); + +static cl::opt +EnablePthreads("emscripten-enable-pthreads", + cl::desc("Enables compilation targeting JavaScript Shared Array Buffer and Atomics API to implement support for pthreads-based multithreading"), + cl::init(false)); + +static cl::opt +WarnOnUnaligned("emscripten-warn-unaligned", + cl::desc("Warns about unaligned loads and stores (which can negatively affect performance)"), + cl::init(false)); + +static cl::opt +WarnOnNoncanonicalNans("emscripten-warn-noncanonical-nans", + cl::desc("Warns about detected noncanonical bit patterns in NaNs that will not be preserved in the generated output (this can cause code to run wrong if the exact bits were important)"), + cl::init(true)); + +static cl::opt +ReservedFunctionPointers("emscripten-reserved-function-pointers", + cl::desc("Number of reserved slots in function tables for functions to be added at runtime (see emscripten RESERVED_FUNCTION_POINTERS option)"), + cl::init(0)); + +static cl::opt +EmulatedFunctionPointers("emscripten-emulated-function-pointers", + cl::desc("Emulate function pointers, avoiding asm.js function tables (see emscripten EMULATED_FUNCTION_POINTERS option)"), + cl::init(false)); + +static cl::opt +EmscriptenAssertions("emscripten-assertions", + cl::desc("Additional JS-specific assertions (see emscripten ASSERTIONS)"), + cl::init(0)); + +static cl::opt +NoAliasingFunctionPointers("emscripten-no-aliasing-function-pointers", + cl::desc("Forces function pointers to not alias (this is more correct, but rarely needed, and has the cost of much larger 
function tables; it is useful for debugging though; see emscripten ALIASING_FUNCTION_POINTERS option)"), + cl::init(false)); + +static cl::opt +GlobalBase("emscripten-global-base", + cl::desc("Where global variables start out in memory (see emscripten GLOBAL_BASE option)"), + cl::init(8)); + +static cl::opt +Relocatable("emscripten-relocatable", + cl::desc("Whether to emit relocatable code (see emscripten RELOCATABLE option)"), + cl::init(false)); + +static cl::opt +SideModule("emscripten-side-module", + cl::desc("Whether to emit a side module (see emscripten SIDE_MODULE option)"), + cl::init(false)); + +static cl::opt +StackSize("emscripten-stack-size", + cl::desc("How large a stack to create (important in wasm side modules; see emscripten TOTAL_STACK option)"), + cl::init(0)); + +static cl::opt +EnableSjLjEH("enable-pnacl-sjlj-eh", + cl::desc("Enable use of SJLJ-based C++ exception handling " + "as part of the pnacl-abi-simplify passes"), + cl::init(false)); + +static cl::opt +EnableEmCxxExceptions("enable-emscripten-cpp-exceptions", + cl::desc("Enables C++ exceptions in emscripten"), + cl::init(false)); + +static cl::opt +EnableEmAsyncify("emscripten-asyncify", + cl::desc("Enable asyncify transformation (see emscripten ASYNCIFY option)"), + cl::init(false)); + +static cl::opt +NoExitRuntime("emscripten-no-exit-runtime", + cl::desc("Generate code which assumes the runtime is never exited (so atexit etc. is unneeded; see emscripten NO_EXIT_RUNTIME setting)"), + cl::init(false)); + +static cl::opt + +EnableCyberDWARF("enable-cyberdwarf", + cl::desc("Include CyberDWARF debug information"), + cl::init(false)); + +static cl::opt +EnableCyberDWARFIntrinsics("enable-debug-intrinsics", + cl::desc("Include debug intrinsics in generated output"), + cl::init(false)); + +static cl::opt +WebAssembly("emscripten-wasm", + cl::desc("Generate asm.js which will later be compiled to WebAssembly (see emscripten BINARYEN setting)"), + cl::init(false)); + +static cl::opt +OnlyWebAssembly("emscripten-only-wasm", + cl::desc("Generate code that will only ever be used as WebAssembly, and is not valid JS or asm.js"), + cl::init(false)); + + +extern "C" void LLVMInitializeJSBackendTarget() { + // Register the target. + RegisterTargetMachine X(TheJSBackendTarget); +} + +namespace { + #define ASM_SIGNED 0 + #define ASM_UNSIGNED 1 + #define ASM_NONSPECIFIC 2 // nonspecific means to not differentiate ints. |0 for all, regardless of size and sign + #define ASM_FFI_IN 4 // FFI return values are limited to things that work in ffis + #define ASM_FFI_OUT 8 // params to FFIs are limited to things that work in ffis + #define ASM_MUST_CAST 16 // this value must be explicitly cast (or be an integer constant) + #define ASM_FORCE_FLOAT_AS_INTBITS 32 // if the value is a float, it should be returned as an integer representing the float bits (or NaN canonicalization will eat them away). This flag cannot be used with ASM_UNSIGNED set. 
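// For illustration only: the ASM_* values defined above are bit flags that
// combine into a single AsmCast. A minimal standalone sketch of how they are
// meant to be combined and tested; the enumerator names here are local
// stand-ins for the #defines above.
#include <cassert>

typedef unsigned AsmCast;
enum : unsigned {
  SK_UNSIGNED = 1, SK_NONSPECIFIC = 2, SK_FFI_IN = 4,
  SK_FFI_OUT = 8, SK_MUST_CAST = 16, SK_FORCE_FLOAT_AS_INTBITS = 32,
};

int main() {
  // A typical combination: an outgoing FFI argument whose exact integer
  // signedness does not matter (signed is the default, value 0).
  AsmCast C = SK_NONSPECIFIC | SK_FFI_OUT;
  assert((C & SK_NONSPECIFIC) && (C & SK_FFI_OUT) && !(C & SK_MUST_CAST));
  // As the comment above notes, the float-as-intbits flag must never be
  // combined with the unsigned flag.
  return 0;
}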
+ typedef unsigned AsmCast; + + typedef std::map ValueMap; + typedef std::set NameSet; + typedef std::set IntSet; + typedef std::vector HeapData; + typedef std::map HeapDataMap; + typedef std::vector AlignedHeapStartMap; + struct Address { + unsigned Offset, Alignment; + bool ZeroInit; + Address() {} + Address(unsigned Offset, unsigned Alignment, bool ZeroInit) : Offset(Offset), Alignment(Alignment), ZeroInit(ZeroInit) {} + }; + typedef std::map VarMap; + typedef std::map GlobalAddressMap; + typedef std::vector FunctionTable; + typedef std::map FunctionTableMap; + typedef std::map StringMap; + typedef std::map NameIntMap; + typedef std::map IntIntSetMap; + typedef std::map BlockIndexMap; + typedef std::map BlockAddressMap; + typedef std::map LLVMToRelooperMap; + struct AsmConstInfo { + int Id; + std::set Sigs; + }; + + /// JSWriter - This class is the main chunk of code that converts an LLVM + /// module to JavaScript. + class JSWriter : public ModulePass { + raw_pwrite_stream &Out; + Module *TheModule; + unsigned UniqueNum; + unsigned NextFunctionIndex; // used with NoAliasingFunctionPointers + ValueMap ValueNames; + VarMap UsedVars; + AllocaManager Allocas; + HeapDataMap GlobalDataMap; + std::vector ZeroInitSizes; // alignment => used offset in the zeroinit zone + AlignedHeapStartMap AlignedHeapStarts, ZeroInitStarts; + GlobalAddressMap GlobalAddresses; + NameSet Externals; // vars + NameSet Declares; // funcs + StringMap Redirects; // library function redirects actually used, needed for wrapper funcs in tables + std::vector PostSets; + NameIntMap NamedGlobals; // globals that we export as metadata to JS, so it can access them by name + std::map IndexedFunctions; // name -> index + FunctionTableMap FunctionTables; // sig => list of functions + std::vector GlobalInitializers; + std::vector Exports; // additional exports + StringMap Aliases; + BlockAddressMap BlockAddresses; + std::map AsmConsts; // code => { index, list of seen sigs } + NameSet FuncRelocatableExterns; // which externals are accessed in this function; we load them once at the beginning (avoids a potential call in a heap access, and might be faster) + std::vector ExtraFunctions; + std::set DeclaresNeedingTypeDeclarations; // list of declared funcs whose type we must declare asm.js-style with a usage, as they may not have another usage + + struct { + // 0 is reserved for void type + unsigned MetadataNum = 1; + std::map IndexedMetadata; + std::map VtableOffsets; + std::ostringstream TypeDebugData; + std::ostringstream TypeNameMap; + std::ostringstream FunctionMembers; + } cyberDWARFData; + + std::string CantValidate; + bool UsesSIMDUint8x16; + bool UsesSIMDInt8x16; + bool UsesSIMDUint16x8; + bool UsesSIMDInt16x8; + bool UsesSIMDUint32x4; + bool UsesSIMDInt32x4; + bool UsesSIMDFloat32x4; + bool UsesSIMDFloat64x2; + bool UsesSIMDBool8x16; + bool UsesSIMDBool16x8; + bool UsesSIMDBool32x4; + bool UsesSIMDBool64x2; + int InvokeState; // cycles between 0, 1 after preInvoke, 2 after call, 0 again after postInvoke. hackish, no argument there. 
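// For illustration only: a simplified standalone model of the static-layout
// bookkeeping declared above (GlobalDataMap, GlobalAddresses and the Address
// record), which the allocation helpers further down fill in. Names here are
// hypothetical and the separate zero-init zone is omitted.
#include <cassert>
#include <cstdint>
#include <map>
#include <string>
#include <vector>

struct SketchAddress { unsigned Offset, Alignment; bool ZeroInit; };

struct SketchLayout {
  std::map<unsigned, std::vector<uint8_t>> Blocks;  // alignment -> raw bytes
  std::map<std::string, SketchAddress> Addresses;   // global name -> placement

  // Pad this alignment's block up to the alignment, then record where the
  // global's data starts; the caller appends the initializer bytes.
  std::vector<uint8_t> &allocate(const std::string &Name, unsigned Align) {
    std::vector<uint8_t> &Data = Blocks[Align];
    while (Data.size() & (Align - 1)) Data.push_back(0);
    Addresses[Name] = SketchAddress{(unsigned)Data.size(), Align, false};
    return Data;
  }
};

int main() {
  SketchLayout L;
  std::vector<uint8_t> &A = L.allocate("g_a", 4);
  A.resize(A.size() + 6);        // 6 bytes of initializer data
  L.allocate("g_b", 4);          // padded up, so it lands at offset 8
  assert(L.Addresses["g_b"].Offset == 8);
  return 0;
}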
+ CodeGenOpt::Level OptLevel; + const DataLayout *DL; + bool StackBumped; + int GlobalBasePadding; + int MaxGlobalAlign; + int StaticBump; + const Instruction* CurrInstruction; + Type* i32; // the type of i32 + + #include "CallHandlers.h" + + public: + static char ID; + JSWriter(raw_pwrite_stream &o, CodeGenOpt::Level OptLevel) + : ModulePass(ID), Out(o), UniqueNum(0), NextFunctionIndex(0), CantValidate(""), + UsesSIMDUint8x16(false), UsesSIMDInt8x16(false), UsesSIMDUint16x8(false), + UsesSIMDInt16x8(false), UsesSIMDUint32x4(false), UsesSIMDInt32x4(false), + UsesSIMDFloat32x4(false), UsesSIMDFloat64x2(false), UsesSIMDBool8x16(false), + UsesSIMDBool16x8(false), UsesSIMDBool32x4(false), UsesSIMDBool64x2(false), InvokeState(0), + OptLevel(OptLevel), StackBumped(false), GlobalBasePadding(0), MaxGlobalAlign(0), + CurrInstruction(nullptr) {} + + StringRef getPassName() const override { return "JavaScript backend"; } + + bool runOnModule(Module &M) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + ModulePass::getAnalysisUsage(AU); + } + + void printProgram(const std::string& fname, const std::string& modName ); + void printModule(const std::string& fname, const std::string& modName ); + void printFunction(const Function *F); + + LLVM_ATTRIBUTE_NORETURN void error(const std::string& msg); + + raw_pwrite_stream& nl(raw_pwrite_stream &Out, int delta = 0); + + private: + + // LLVM changed stripPointerCasts to use the "returned" attribute on + // calls and invokes, i.e., stripping pointer casts of a call to + // define internal i8* @strupr(i8* returned %str) #2 { + // will return the pointer, and ignore the call which has side + // effects. We sometimes do care about the side effects. + const Value* stripPointerCastsWithoutSideEffects(const Value* V) { + if (isa(V) || isa(V)) { + return V; // in theory we could check if there actually are side effects + } + return V->stripPointerCasts(); + } + + void printCommaSeparated(const HeapData v); + + // parsing of constants has two phases: calculate, and then emit + void parseConstant(const std::string& name, const Constant* CV, int Alignment, bool calculate); + + #define DEFAULT_MEM_ALIGN 8 + + #define STACK_ALIGN 16 + #define STACK_ALIGN_BITS 128 + + unsigned stackAlign(unsigned x) { + return alignTo(x, STACK_ALIGN); + } + std::string stackAlignStr(std::string x) { + return "((" + x + "+" + utostr(STACK_ALIGN-1) + ")&-" + utostr(STACK_ALIGN) + ")"; + } + + void ensureAligned(int Alignment, HeapData* GlobalData) { + assert(isPowerOf2_32(Alignment) && Alignment > 0); + while (GlobalData->size() & (Alignment-1)) GlobalData->push_back(0); + } + void ensureAligned(int Alignment, HeapData& GlobalData) { + assert(isPowerOf2_32(Alignment) && Alignment > 0); + while (GlobalData.size() & (Alignment-1)) GlobalData.push_back(0); + } + + HeapData *allocateAddress(const std::string& Name, unsigned Alignment) { + assert(isPowerOf2_32(Alignment) && Alignment > 0); + HeapData* GlobalData = &GlobalDataMap[Alignment]; + ensureAligned(Alignment, GlobalData); + GlobalAddresses[Name] = Address(GlobalData->size(), Alignment*8, false); + return GlobalData; + } + + void allocateZeroInitAddress(const std::string& Name, unsigned Alignment, unsigned Size) { + assert(isPowerOf2_32(Alignment) && Alignment > 0); + while (ZeroInitSizes.size() <= Alignment) ZeroInitSizes.push_back(0); + GlobalAddresses[Name] = Address(ZeroInitSizes[Alignment], Alignment*8, true); + ZeroInitSizes[Alignment] += Size; + while (ZeroInitSizes[Alignment] & 
(Alignment-1)) ZeroInitSizes[Alignment]++; + } + + // return the absolute offset of a global + unsigned getGlobalAddress(const std::string &s) { + GlobalAddressMap::const_iterator I = GlobalAddresses.find(s); + if (I == GlobalAddresses.end()) { + report_fatal_error("cannot find global address " + Twine(s)); + } + Address a = I->second; + int Alignment = a.Alignment/8; + assert(AlignedHeapStarts.size() > (unsigned)Alignment); + int Ret = a.Offset + (a.ZeroInit ? ZeroInitStarts[Alignment] : AlignedHeapStarts[Alignment]); + assert(Alignment < (int)(a.ZeroInit ? ZeroInitStarts.size() : AlignedHeapStarts.size())); + assert(Ret % Alignment == 0); + return Ret; + } + // returns the internal offset inside the proper block: GlobalData8, 32, 64 + unsigned getRelativeGlobalAddress(const std::string &s) { + GlobalAddressMap::const_iterator I = GlobalAddresses.find(s); + if (I == GlobalAddresses.end()) { + report_fatal_error("cannot find global address " + Twine(s)); + } + Address a = I->second; + return a.Offset; + } + char getFunctionSignatureLetter(Type *T) { + if (T->isVoidTy()) return 'v'; + else if (T->isFloatingPointTy()) { + if (PreciseF32 && T->isFloatTy()) { + return 'f'; + } else { + return 'd'; + } + } else if (VectorType *VT = dyn_cast(T)) { + checkVectorType(VT); + if (VT->getElementType()->isIntegerTy()) { + return 'I'; + } else { + return 'F'; + } + } else { + if (OnlyWebAssembly && T->isIntegerTy() && T->getIntegerBitWidth() == 64) { + return 'j'; + } else { + return 'i'; + } + } + } + std::string getFunctionSignature(const FunctionType *F) { + std::string Ret; + Ret += getFunctionSignatureLetter(F->getReturnType()); + for (FunctionType::param_iterator AI = F->param_begin(), + AE = F->param_end(); AI != AE; ++AI) { + Ret += getFunctionSignatureLetter(*AI); + } + return Ret; + } + FunctionTable& ensureFunctionTable(const FunctionType *FT) { + std::string Sig = getFunctionSignature(FT); + if (WebAssembly && EmulatedFunctionPointers) { + // wasm function pointer emulation uses a single simple wasm table. ensure the specific tables + // exist (so we have properly typed calls to the outside), but only fill in the singleton. + FunctionTables[Sig]; + Sig = "X"; + } + FunctionTable &Table = FunctionTables[Sig]; + unsigned MinSize = ReservedFunctionPointers ? 2*(ReservedFunctionPointers+1) : 1; // each reserved slot must be 2-aligned + while (Table.size() < MinSize) Table.push_back("0"); + return Table; + } + bool usesFloat32(FunctionType* F) { + if (F->getReturnType()->isFloatTy()) return true; + for (FunctionType::param_iterator AI = F->param_begin(), + AE = F->param_end(); AI != AE; ++AI) { + if ((*AI)->isFloatTy()) return true; + } + return false; + } + // create a lettered argument name (a, b, c, etc.) 
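// For illustration only: getFunctionSignature() above composes one-letter
// type codes ('v' void, 'i' int/pointer, 'j' wasm-only i64, 'f' float under
// PRECISE_F32, 'd' double, 'I'/'F' for SIMD vectors) into a signature string
// that keys the function tables. A standalone sketch over simple scalar type
// tags; vector types are omitted.
#include <cassert>
#include <string>
#include <vector>

enum class Tag { Void, I32, I64, F32, F64 };

static char sigLetter(Tag T, bool PreciseF32, bool WasmOnly) {
  switch (T) {
  case Tag::Void: return 'v';
  case Tag::F32:  return PreciseF32 ? 'f' : 'd';
  case Tag::F64:  return 'd';
  case Tag::I64:  return WasmOnly ? 'j' : 'i';
  case Tag::I32:  return 'i';
  }
  return 'i';
}

static std::string signature(Tag Ret, const std::vector<Tag> &Params,
                             bool PreciseF32, bool WasmOnly) {
  std::string S(1, sigLetter(Ret, PreciseF32, WasmOnly));
  for (Tag P : Params) S += sigLetter(P, PreciseF32, WasmOnly);
  return S;
}

int main() {
  // e.g. "double f(int, float)" with PRECISE_F32 on, plain asm.js target:
  assert(signature(Tag::F64, {Tag::I32, Tag::F32}, true, false) == "dif");
  return 0;
}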
+ std::string getArgLetter(int Index) { + std::string Ret = ""; + while (1) { + auto Curr = Index % 26; + Ret += char('a' + Curr); + Index = Index / 26; + if (Index == 0) return Ret; + } + } + std::string makeFloat32Legalizer(const Function *F) { + auto* FT = F->getFunctionType(); + const std::string& Name = getJSName(F); + std::string LegalName = Name + "$legalf32"; + std::string LegalFunc = "function " + LegalName + "("; + std::string Declares = ""; + std::string Call = Name + "("; + int Index = 0; + for (FunctionType::param_iterator AI = FT->param_begin(), + AE = FT->param_end(); AI != AE; ++AI) { + if (Index > 0) { + LegalFunc += ", "; + Declares += " "; + Call += ", "; + } + auto Arg = getArgLetter(Index); + LegalFunc += Arg; + Declares += Arg + " = " + getCast(Arg, *AI) + ';'; + Call += getCast(Arg, *AI, ASM_NONSPECIFIC | ASM_FFI_OUT); + Index++; + } + LegalFunc += ") {\n "; + LegalFunc += Declares + "\n "; + Call += ")"; + if (!FT->getReturnType()->isVoidTy()) { + Call = "return " + getCast(Call, FT->getReturnType(), ASM_FFI_IN); + } + LegalFunc += Call + ";\n}"; + ExtraFunctions.push_back(LegalFunc); + return LegalName; + } + unsigned getFunctionIndex(const Function *F) { + const std::string &Name = getJSName(F); + if (IndexedFunctions.find(Name) != IndexedFunctions.end()) return IndexedFunctions[Name]; + FunctionTable& Table = ensureFunctionTable(F->getFunctionType()); + if (NoAliasingFunctionPointers) { + while (Table.size() < NextFunctionIndex) Table.push_back("0"); + } + // XXX this is wrong, it's always 1. but, that's fine in the ARM-like ABI + // we have which allows unaligned func the one risk is if someone forces a + // function to be aligned, and relies on that. Could do F->getAlignment() + // instead. + unsigned Alignment = 1; + while (Table.size() % Alignment) Table.push_back("0"); + unsigned Index = Table.size(); + // add the name to the table. normally we can just add the function itself, + // however, that may not be valid in wasm. consider an imported function with an + // f32 parameter - due to asm.js ffi rules, we must send it f64s. So its + // uses will appear to use f64s, but when called through the function table, + // it must use an f32 for wasm correctness. so we must have an import with + // f64, and put a thunk in the table which accepts f32 and redirects to the + // import. Note that this cannot be done in a later stage, like binaryen's + // legalization, as f32/f64 asm.js overloading can mask it. Note that this + // isn't an issue for i64s even though they are illegal, precisely because + // f32/f64 overloading is possible but i64s don't overload in asm.js with + // anything. + // TODO: if there are no uses of F (aside from being in the table) then + // we don't need this, as we'll add a use in + // DeclaresNeedingTypeDeclarations which will have the proper type, + // and nothing will contradict it/overload it. + if (WebAssembly && F->isDeclaration() && usesFloat32(F->getFunctionType())) { + Table.push_back(makeFloat32Legalizer(F)); + } else { + Table.push_back(Name); + } + IndexedFunctions[Name] = Index; + if (NoAliasingFunctionPointers) { + NextFunctionIndex = Index+1; + } + + // invoke the callHandler for this, if there is one. the function may only be indexed but never called directly, and we may need to do things in the handler + CallHandlerMap::const_iterator CH = CallHandlers.find(Name); + if (CH != CallHandlers.end()) { + (this->*(CH->second))(NULL, Name, -1); + } + + // in asm.js, types are inferred from use. 
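// For illustration only: the lettered argument names used by the float32
// legalizer thunks above are a little-endian base-26 encoding of the
// argument index (0 -> "a", 25 -> "z", 26 -> "ab", ...). A standalone sketch
// of that encoding.
#include <cassert>
#include <string>

static std::string argLetter(int Index) {
  std::string Ret;
  while (true) {
    Ret += char('a' + Index % 26);
    Index /= 26;
    if (Index == 0) return Ret;
  }
}

int main() {
  assert(argLetter(0) == "a");
  assert(argLetter(25) == "z");
  assert(argLetter(26) == "ab");  // least significant "digit" comes first
  return 0;
}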
so if we have a method that *only* appears in a table, it therefore has no use, + // and we are in trouble; emit a fake dce-able use for it. + if (WebAssembly) { + if (F->isDeclaration()) { + DeclaresNeedingTypeDeclarations.insert(F); + } + } + + return Index; + } + + unsigned getBlockAddress(const Function *F, const BasicBlock *BB) { + BlockIndexMap& Blocks = BlockAddresses[F]; + if (Blocks.find(BB) == Blocks.end()) { + Blocks[BB] = Blocks.size(); // block addresses start from 0 + } + return Blocks[BB]; + } + + unsigned getBlockAddress(const BlockAddress *BA) { + return getBlockAddress(BA->getFunction(), BA->getBasicBlock()); + } + + const Value *resolveFully(const Value *V) { + bool More = true; + while (More) { + More = false; + if (const GlobalAlias *GA = dyn_cast(V)) { + V = GA->getAliasee(); + More = true; + } + if (const ConstantExpr *CE = dyn_cast(V)) { + V = CE->getOperand(0); // ignore bitcasts + More = true; + } + } + return V; + } + + std::string relocateFunctionPointer(std::string FP) { + if (Relocatable && WebAssembly && SideModule) { + return "(tableBase + (" + FP + ") | 0)"; + } + return Relocatable ? "(fb + (" + FP + ") | 0)" : FP; + } + + std::string relocateGlobal(std::string G) { + if (Relocatable && WebAssembly && SideModule) { + return "(memoryBase + (" + G + ") | 0)"; + } + return Relocatable ? "(gb + (" + G + ") | 0)" : G; + } + + unsigned getIDForMetadata(Metadata *MD) { + if (cyberDWARFData.IndexedMetadata.find(MD) == cyberDWARFData.IndexedMetadata.end()) { + cyberDWARFData.IndexedMetadata[MD] = cyberDWARFData.MetadataNum++; + } + return cyberDWARFData.IndexedMetadata[MD]; + } + + // Return a constant we are about to write into a global as a numeric offset. If the + // value is not known at compile time, emit a postSet to that location. + unsigned getConstAsOffset(const Value *V, unsigned AbsoluteTarget) { + V = resolveFully(V); + if (const Function *F = dyn_cast(V)) { + if (Relocatable) { + PostSets.push_back("\n HEAP32[" + relocateGlobal(utostr(AbsoluteTarget)) + " >> 2] = " + relocateFunctionPointer(utostr(getFunctionIndex(F))) + ';'); + return 0; // emit zero in there for now, until the postSet + } + return getFunctionIndex(F); + } else if (const BlockAddress *BA = dyn_cast(V)) { + return getBlockAddress(BA); + } else { + if (const GlobalVariable *GV = dyn_cast(V)) { + if (!GV->hasInitializer()) { + // We don't have a constant to emit here, so we must emit a postSet + // All postsets are of external values, so they are pointers, hence 32-bit + std::string Name = getOpName(V); + Externals.insert(Name); + if (Relocatable) { + PostSets.push_back("\n temp = g$" + Name + "() | 0;"); // we access linked externs through calls, and must do so to a temp for heap growth validation + // see later down about adding to an offset + std::string access = "HEAP32[" + relocateGlobal(utostr(AbsoluteTarget)) + " >> 2]"; + PostSets.push_back("\n " + access + " = (" + access + " | 0) + temp;"); + } else { + PostSets.push_back("\n HEAP32[" + relocateGlobal(utostr(AbsoluteTarget)) + " >> 2] = " + Name + ';'); + } + return 0; // emit zero in there for now, until the postSet + } else if (Relocatable) { + // this is one of our globals, but we must relocate it. 
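// For illustration only: relocateFunctionPointer()/relocateGlobal() above
// only wrap an address expression in a base offset at the string level. A
// standalone sketch producing the same shapes; the base names (fb, gb,
// tableBase, memoryBase) come from the code above, and "WasmSideModule" here
// folds together the Relocatable/WebAssembly/SideModule checks.
#include <cassert>
#include <string>

static std::string relocate(const std::string &Expr, bool Relocatable,
                            bool WasmSideModule, bool IsFunctionPtr) {
  if (!Relocatable) return Expr;  // non-relocatable output keeps absolute values
  const char *Base = WasmSideModule
                         ? (IsFunctionPtr ? "tableBase" : "memoryBase")
                         : (IsFunctionPtr ? "fb" : "gb");
  return std::string("(") + Base + " + (" + Expr + ") | 0)";
}

int main() {
  assert(relocate("1024", false, false, false) == "1024");
  assert(relocate("1024", true, false, false) == "(gb + (1024) | 0)");
  assert(relocate("3", true, true, true) == "(tableBase + (3) | 0)");
  return 0;
}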
we return zero, but the caller may store + // an added offset, which we read at postSet time; in other words, we just add to that offset + std::string access = "HEAP32[" + relocateGlobal(utostr(AbsoluteTarget)) + " >> 2]"; + PostSets.push_back("\n " + access + " = (" + access + " | 0) + " + relocateGlobal(utostr(getGlobalAddress(V->getName().str()))) + ';'); + return 0; // emit zero in there for now, until the postSet + } + } + assert(!Relocatable); + return getGlobalAddress(V->getName().str()); + } + } + + // Transform the string input into emscripten_asm_const_*(str, args1, arg2) + // into an id. We emit a map of id => string contents, and emscripten + // wraps it up so that calling that id calls that function. + unsigned getAsmConstId(const Value *V, std::string Sig) { + V = resolveFully(V); + const Constant *CI = cast(V)->getInitializer(); + std::string code; + if (isa(CI)) { + code = " "; + } else { + const ConstantDataSequential *CDS = cast(CI); + code = CDS->getAsString(); + // replace newlines quotes with escaped newlines + size_t curr = 0; + while ((curr = code.find("\\n", curr)) != std::string::npos) { + code = code.replace(curr, 2, "\\\\n"); + curr += 3; // skip this one + } + // replace double quotes with escaped single quotes + curr = 0; + while ((curr = code.find('"', curr)) != std::string::npos) { + if (curr == 0 || code[curr-1] != '\\') { + code = code.replace(curr, 1, "\\" "\""); + curr += 2; // skip this one + } else { // already escaped, escape the slash as well + code = code.replace(curr, 1, "\\" "\\" "\""); + curr += 3; // skip this one + } + } + } + unsigned Id; + if (AsmConsts.count(code) > 0) { + auto& Info = AsmConsts[code]; + Id = Info.Id; + Info.Sigs.insert(Sig); + } else { + AsmConstInfo Info; + Info.Id = Id = AsmConsts.size(); + Info.Sigs.insert(Sig); + AsmConsts[code] = Info; + } + return Id; + } + + // Test whether the given value is known to be an absolute value or one we turn into an absolute value + bool isAbsolute(const Value *P) { + if (const IntToPtrInst *ITP = dyn_cast(P)) { + return isa(ITP->getOperand(0)); + } + if (isa(P) || isa(P)) { + return true; + } + return false; + } + + void checkVectorType(Type *T) { + VectorType *VT = cast(T); + // LLVM represents the results of vector comparison as vectors of i1. We + // represent them as vectors of integers the size of the vector elements + // of the compare that produced them. 
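// For illustration only: getAsmConstId() above deduplicates
// emscripten_asm_const code strings, giving each distinct string a stable id
// and collecting every call signature seen for it. A standalone sketch of
// that interning step; names are hypothetical and the string escaping is
// omitted.
#include <cassert>
#include <map>
#include <set>
#include <string>

struct ConstInfo { int Id; std::set<std::string> Sigs; };

static int internAsmConst(std::map<std::string, ConstInfo> &Consts,
                          const std::string &Code, const std::string &Sig) {
  auto It = Consts.find(Code);
  if (It == Consts.end()) {
    ConstInfo Info;
    Info.Id = (int)Consts.size();  // ids are handed out in first-seen order
    It = Consts.emplace(Code, Info).first;
  }
  It->second.Sigs.insert(Sig);
  return It->second.Id;
}

int main() {
  std::map<std::string, ConstInfo> Consts;
  assert(internAsmConst(Consts, "console.log($0)", "vi") == 0);
  assert(internAsmConst(Consts, "console.log($0)", "vd") == 0);  // same id
  assert(internAsmConst(Consts, "out('hi')", "v") == 1);
  return 0;
}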
+ assert(VT->getElementType()->getPrimitiveSizeInBits() == 8 || + VT->getElementType()->getPrimitiveSizeInBits() == 16 || + VT->getElementType()->getPrimitiveSizeInBits() == 32 || + VT->getElementType()->getPrimitiveSizeInBits() == 64 || + VT->getElementType()->getPrimitiveSizeInBits() == 128 || + VT->getElementType()->getPrimitiveSizeInBits() == 1); + assert(VT->getBitWidth() <= 128); + assert(VT->getNumElements() <= 16); + if (VT->getElementType()->isIntegerTy()) + { + if (VT->getNumElements() <= 16 && VT->getElementType()->getPrimitiveSizeInBits() == 8) UsesSIMDInt8x16 = true; + else if (VT->getNumElements() <= 8 && VT->getElementType()->getPrimitiveSizeInBits() == 16) UsesSIMDInt16x8 = true; + else if (VT->getNumElements() <= 4 && VT->getElementType()->getPrimitiveSizeInBits() == 32) UsesSIMDInt32x4 = true; + else if (VT->getElementType()->getPrimitiveSizeInBits() == 1) { + if (VT->getNumElements() == 16) UsesSIMDBool8x16 = true; + else if (VT->getNumElements() == 8) UsesSIMDBool16x8 = true; + else if (VT->getNumElements() == 4) UsesSIMDBool32x4 = true; + else if (VT->getNumElements() == 2) UsesSIMDBool64x2 = true; + else report_fatal_error("Unsupported boolean vector type with numElems: " + Twine(VT->getNumElements()) + ", primitiveSize: " + Twine(VT->getElementType()->getPrimitiveSizeInBits()) + "!"); + } else if (VT->getElementType()->getPrimitiveSizeInBits() != 1 && VT->getElementType()->getPrimitiveSizeInBits() != 128) { + report_fatal_error("Unsupported integer vector type with numElems: " + Twine(VT->getNumElements()) + ", primitiveSize: " + Twine(VT->getElementType()->getPrimitiveSizeInBits()) + "!"); + } + } + else + { + if (VT->getNumElements() <= 4 && VT->getElementType()->getPrimitiveSizeInBits() == 32) UsesSIMDFloat32x4 = true; + else if (VT->getNumElements() <= 2 && VT->getElementType()->getPrimitiveSizeInBits() == 64) UsesSIMDFloat64x2 = true; + else report_fatal_error("Unsupported floating point vector type numElems: " + Twine(VT->getNumElements()) + ", primitiveSize: " + Twine(VT->getElementType()->getPrimitiveSizeInBits()) + "!"); + } + } + + std::string ensureCast(std::string S, Type *T, AsmCast sign) { + if (sign & ASM_MUST_CAST) return getCast(S, T); + return S; + } + + static void emitDebugInfo(raw_ostream& Code, const Instruction *I) { + auto &Loc = I->getDebugLoc(); + if (Loc) { + unsigned Line = Loc.getLine(); + auto *Scope = cast_or_null(Loc.getScope()); + if (Scope) { + StringRef File = Scope->getFilename(); + if (Line > 0) + Code << " //@line " << utostr(Line) << " \"" << (File.size() > 0 ? File.str() : "?") << "\""; + } + } + } + + std::string emitI64Const(uint64_t value) { + return "i64_const(" + itostr(value & uint32_t(-1)) + "," + itostr((value >> 32) & uint32_t(-1)) + ")"; + } + + std::string emitI64Const(APInt i) { + return emitI64Const(i.getZExtValue()); + } + + std::string ftostr(const ConstantFP *CFP, AsmCast sign) { + const APFloat &flt = CFP->getValueAPF(); + + // Emscripten has its own spellings for infinity and NaN. + if (flt.getCategory() == APFloat::fcInfinity) return ensureCast(flt.isNegative() ? "-inf" : "inf", CFP->getType(), sign); + else if (flt.getCategory() == APFloat::fcNaN) { + APInt i = flt.bitcastToAPInt(); + if ((i.getBitWidth() == 32 && i != APInt(32, 0x7FC00000)) || (i.getBitWidth() == 64 && i != APInt(64, 0x7FF8000000000000ULL))) { + // If we reach here, things have already gone bad, and JS engine NaN canonicalization will kill the bits in the float. 
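// For illustration only: the NaN warning above compares a NaN's bit pattern
// against the single canonical quiet-NaN encoding per width (0x7FC00000 for
// f32, 0x7FF8000000000000 for f64); any other payload may be erased by
// NaN-canonicalizing JS engines. A standalone sketch of that check.
#include <cassert>
#include <cstdint>
#include <cstring>

static bool isCanonicalNaN32(float F) {
  uint32_t Bits; std::memcpy(&Bits, &F, sizeof Bits);
  return Bits == 0x7FC00000u;
}
static bool isCanonicalNaN64(double D) {
  uint64_t Bits; std::memcpy(&Bits, &D, sizeof Bits);
  return Bits == 0x7FF8000000000000ull;
}

int main() {
  uint32_t Canon = 0x7FC00000u, Payload = 0x7FC00001u;  // same NaN class,
  float A, B;                                           // different payloads
  std::memcpy(&A, &Canon, sizeof A);
  std::memcpy(&B, &Payload, sizeof B);
  assert(isCanonicalNaN32(A) && !isCanonicalNaN32(B));
  return 0;
}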
However can't make + // this a build error in order to not break people's existing code, so issue a warning instead. + if (WarnOnNoncanonicalNans) { + errs() << "emcc: warning: cannot represent a NaN literal '" << CFP << "' with custom bit pattern in NaN-canonicalizing JS engines (e.g. Firefox and Safari) without erasing bits!\n"; + if (CurrInstruction) { + errs() << " in " << *CurrInstruction << " in " << CurrInstruction->getParent()->getParent()->getName() << "() "; + emitDebugInfo(errs(), CurrInstruction); + errs() << '\n'; + } + } + } + return ensureCast("nan", CFP->getType(), sign); + } + + // Request 9 or 17 digits, aka FLT_DECIMAL_DIG or DBL_DECIMAL_DIG (our + // long double is the the same as our double), to avoid rounding errors. + SmallString<29> Str; + flt.toString(Str, PreciseF32 && CFP->getType()->isFloatTy() ? 9 : 17); + + // asm.js considers literals to be floating-point literals when they contain a + // dot, however our output may be processed by UglifyJS, which doesn't + // currently preserve dots in all cases. Mark floating-point literals with + // unary plus to force them to floating-point. + if (APFloat(flt).roundToIntegral(APFloat::rmNearestTiesToEven) == APFloat::opOK) { + return '+' + Str.str().str(); + } + + return Str.str().str(); + } + + std::string getPtrLoad(const Value* Ptr); + + /// Given a pointer to memory, returns the HEAP object and index to that object that is used to access that memory. + /// @param Ptr [in] The heap object. + /// @param HeapName [out] Receives the name of the HEAP object used to perform the memory acess. + /// @return The index to the heap HeapName for the memory access. + std::string getHeapNameAndIndex(const Value *Ptr, const char **HeapName); + + // Like getHeapNameAndIndex(), but uses the given memory operation size and whether it is an Integer instead of the type of Ptr. + std::string getHeapNameAndIndex(const Value *Ptr, const char **HeapName, unsigned Bytes, bool Integer); + + /// Like getHeapNameAndIndex(), but for global variables only. + std::string getHeapNameAndIndexToGlobal(const GlobalVariable *GV, unsigned Bytes, bool Integer, const char **HeapName); + + /// Like getHeapNameAndIndex(), but for pointers represented in string expression form. + static std::string getHeapNameAndIndexToPtr(const std::string& Ptr, unsigned Bytes, bool Integer, const char **HeapName); + + std::string getShiftedPtr(const Value *Ptr, unsigned Bytes); + + /// Returns a string expression for accessing the given memory address. + std::string getPtrUse(const Value* Ptr); + + /// Like getPtrUse(), but for pointers represented in string expression form. 
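// For illustration only: the literal-printing rule described above, as a
// standalone sketch. Enough significant digits are requested to round-trip
// (9 for float, 17 for double), and integral values get a leading '+' so the
// output stays a floating-point literal even after minification. The real
// code uses APFloat::toString() and roundToIntegral(); snprintf and trunc
// are stand-ins here.
#include <cassert>
#include <cmath>
#include <cstdio>
#include <string>

static std::string jsDoubleLiteral(double V, bool IsFloat32) {
  char Buf[64];
  std::snprintf(Buf, sizeof(Buf), "%.*g", IsFloat32 ? 9 : 17, V);
  std::string S(Buf);
  if (std::isfinite(V) && std::trunc(V) == V)
    S = "+" + S;  // e.g. 5 -> "+5", which parses as a double, not an int
  return S;
}

int main() {
  assert(jsDoubleLiteral(5.0, false) == "+5");
  assert(jsDoubleLiteral(0.5, false) == "0.5");
  return 0;
}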
+ static std::string getHeapAccess(const std::string& Name, unsigned Bytes, bool Integer=true); + + std::string getUndefValue(Type* T, AsmCast sign=ASM_SIGNED); + std::string getConstant(const Constant*, AsmCast sign=ASM_SIGNED); + template + std::string getConstantVector(const VectorType *C); + std::string getValueAsStr(const Value*, AsmCast sign=ASM_SIGNED); + std::string getValueAsCastStr(const Value*, AsmCast sign=ASM_SIGNED); + std::string getValueAsParenStr(const Value*); + std::string getValueAsCastParenStr(const Value*, AsmCast sign=ASM_SIGNED); + + const std::string &getJSName(const Value* val); + + std::string getPhiCode(const BasicBlock *From, const BasicBlock *To); + + void printAttributes(const AttributeSet &PAL, const std::string &name); + void printType(Type* Ty); + void printTypes(const Module* M); + + std::string getAdHocAssign(const StringRef &, Type *); + std::string getAssign(const Instruction *I); + std::string getAssignIfNeeded(const Value *V); + std::string getCast(const StringRef &, Type *, AsmCast sign=ASM_SIGNED); + std::string getParenCast(const StringRef &, Type *, AsmCast sign=ASM_SIGNED); + std::string getDoubleToInt(const StringRef &); + std::string getIMul(const Value *, const Value *); + std::string getLoad(const Instruction *I, const Value *P, Type *T, unsigned Alignment, char sep=';'); + std::string getStore(const Instruction *I, const Value *P, Type *T, const std::string& VS, unsigned Alignment, char sep=';'); + std::string getStackBump(unsigned Size); + std::string getStackBump(const std::string &Size); + + void addBlock(const BasicBlock *BB, Relooper& R, LLVMToRelooperMap& LLVMToRelooper); + void printFunctionBody(const Function *F); + void generateInsertElementExpression(const InsertElementInst *III, raw_string_ostream& Code); + void generateExtractElementExpression(const ExtractElementInst *EEI, raw_string_ostream& Code); + std::string getSIMDCast(VectorType *fromType, VectorType *toType, const std::string &valueStr, bool signExtend); + void generateShuffleVectorExpression(const ShuffleVectorInst *SVI, raw_string_ostream& Code); + void generateICmpExpression(const ICmpInst *I, raw_string_ostream& Code); + void generateFCmpExpression(const FCmpInst *I, raw_string_ostream& Code); + void generateShiftExpression(const BinaryOperator *I, raw_string_ostream& Code); + void generateUnrolledExpression(const User *I, raw_string_ostream& Code); + bool generateSIMDExpression(const User *I, raw_string_ostream& Code); + void generateExpression(const User *I, raw_string_ostream& Code); + + // debug information + std::string generateDebugRecordForVar(Metadata *MD); + void buildCyberDWARFData(); + + std::string getOpName(const Value*); + + void processConstants(); + + // nativization + + typedef std::set NativizedVarsMap; + NativizedVarsMap NativizedVars; + + void calculateNativizedVars(const Function *F); + + // special analyses + + bool canReloop(const Function *F); + + // main entry point + + void printModuleBody(); + }; +} // end anonymous namespace. + +raw_pwrite_stream &JSWriter::nl(raw_pwrite_stream &Out, int delta) { + Out << '\n'; + return Out; +} + +static inline char halfCharToHex(unsigned char half) { + assert(half <= 15); + if (half <= 9) { + return '0' + half; + } else { + return 'A' + half - 10; + } +} + +static inline void sanitizeGlobal(std::string& str) { + // Global names are prefixed with "_" to prevent them from colliding with + // names of things in normal JS. 
+ str = "_" + str; + + // functions and globals should already be in C-style format, + // in addition to . for llvm intrinsics and possibly $ and so forth. + // There is a risk of collisions here, we just lower all these + // invalid characters to _, but this should not happen in practice. + // TODO: in debug mode, check for such collisions. + size_t OriginalSize = str.size(); + for (size_t i = 1; i < OriginalSize; ++i) { + unsigned char c = str[i]; + if (!isalnum(c) && c != '_') str[i] = '_'; + } +} + +static inline void sanitizeLocal(std::string& str) { + // Local names are prefixed with "$" to prevent them from colliding with + // global names. + str = "$" + str; + + // We need to convert every string that is not a valid JS identifier into + // a valid one, without collisions - we cannot turn "x.a" into "x_a" while + // also leaving "x_a" as is, for example. + // + // We leave valid characters 0-9a-zA-Z and _ unchanged. Anything else + // we replace with $ and append a hex representation of that value, + // so for example x.a turns into x$a2e, x..a turns into x$$a2e2e. + // + // As an optimization, we replace . with $ without appending anything, + // unless there is another illegal character. The reason is that . is + // a common illegal character, and we want to avoid resizing strings + // for perf reasons, and we If we do see we need to append something, then + // for . we just append Z (one character, instead of the hex code). + // + + size_t OriginalSize = str.size(); + int Queued = 0; + for (size_t i = 1; i < OriginalSize; ++i) { + unsigned char c = str[i]; + if (!isalnum(c) && c != '_') { + str[i] = '$'; + if (c == '.') { + Queued++; + } else { + size_t s = str.size(); + str.resize(s+2+Queued); + for (int i = 0; i < Queued; i++) { + str[s++] = 'Z'; + } + Queued = 0; + str[s] = halfCharToHex(c >> 4); + str[s+1] = halfCharToHex(c & 0xf); + } + } + } +} + +static inline std::string ensureFloat(const std::string &S, Type *T) { + if (PreciseF32 && T->isFloatTy()) { + return "Math_fround(" + S + ')'; + } + return S; +} + +static inline std::string ensureFloat(const std::string &value, bool wrap) { + if (wrap) { + return "Math_fround(" + value + ')'; + } + return value; +} + +void JSWriter::error(const std::string& msg) { + report_fatal_error(msg); +} + +std::string JSWriter::getPhiCode(const BasicBlock *From, const BasicBlock *To) { + // FIXME this is all quite inefficient, and also done once per incoming to each phi + + // Find the phis, and generate assignments and dependencies + std::set PhiVars; + for (BasicBlock::const_iterator I = To->begin(), E = To->end(); + I != E; ++I) { + const PHINode* P = dyn_cast(I); + if (!P) break; + PhiVars.insert(getJSName(P)); + } + typedef std::map StringMap; + StringMap assigns; // variable -> assign statement + std::map values; // variable -> Value + StringMap deps; // variable -> dependency + StringMap undeps; // reverse: dependency -> variable + for (BasicBlock::const_iterator I = To->begin(), E = To->end(); + I != E; ++I) { + const PHINode* P = dyn_cast(I); + if (!P) break; + int index = P->getBasicBlockIndex(From); + if (index < 0) continue; + // we found it + const std::string &name = getJSName(P); + assigns[name] = getAssign(P); + // Get the operand, and strip pointer casts, since normal expression + // translation also strips pointer casts, and we want to see the same + // thing so that we can detect any resulting dependencies. 
+ const Value *V = P->getIncomingValue(index)->stripPointerCasts(); + values[name] = V; + std::string vname = getValueAsStr(V); + if (const Instruction *VI = dyn_cast(V)) { + if (VI->getParent() == To && PhiVars.find(vname) != PhiVars.end()) { + deps[name] = vname; + undeps[vname] = name; + } + } + } + // Emit assignments+values, taking into account dependencies, and breaking cycles + std::string pre = "", post = ""; + while (assigns.size() > 0) { + bool emitted = false; + for (StringMap::iterator I = assigns.begin(); I != assigns.end();) { + StringMap::iterator last = I; + std::string curr = last->first; + const Value *V = values[curr]; + std::string CV = getValueAsStr(V); + I++; // advance now, as we may erase + // if we have no dependencies, or we found none to emit and are at the end (so there is a cycle), emit + StringMap::const_iterator dep = deps.find(curr); + if (dep == deps.end() || (!emitted && I == assigns.end())) { + if (dep != deps.end()) { + // break a cycle + std::string depString = dep->second; + std::string temp = curr + "$phi"; + pre += getAdHocAssign(temp, V->getType()) + CV + ';'; + CV = temp; + deps.erase(curr); + undeps.erase(depString); + } + post += assigns[curr] + CV + ';'; + assigns.erase(last); + emitted = true; + } + } + } + return pre + post; +} + +const std::string &JSWriter::getJSName(const Value* val) { + ValueMap::const_iterator I = ValueNames.find(val); + if (I != ValueNames.end() && I->first == val) + return I->second; + + // If this is an alloca we've replaced with another, use the other name. + if (const AllocaInst *AI = dyn_cast(val)) { + if (AI->isStaticAlloca()) { + const AllocaInst *Rep = Allocas.getRepresentative(AI); + if (Rep != AI) { + return getJSName(Rep); + } + } + } + + std::string name; + if (val->hasName()) { + name = val->getName().str(); + } else { + name = utostr(UniqueNum++); + } + + if (isa(val)) { + sanitizeGlobal(name); + } else { + sanitizeLocal(name); + } + + return ValueNames[val] = name; +} + +std::string JSWriter::getAdHocAssign(const StringRef &s, Type *t) { + UsedVars[s] = t; + return (s + " = ").str(); +} + +std::string JSWriter::getAssign(const Instruction *I) { + return getAdHocAssign(getJSName(I), I->getType()); +} + +std::string JSWriter::getAssignIfNeeded(const Value *V) { + if (const Instruction *I = dyn_cast(V)) { + if (!I->use_empty()) return getAssign(I); + } + return std::string(); +} + +int SIMDNumElements(VectorType *t) { + assert(t->getElementType()->getPrimitiveSizeInBits() <= 128); + + if (t->getElementType()->getPrimitiveSizeInBits() == 1) { // Bool8x16, Bool16x8, Bool32x4 or Bool64x2 + if (t->getNumElements() <= 2) return 2; + if (t->getNumElements() <= 4) return 4; + if (t->getNumElements() <= 8) return 8; + if (t->getNumElements() <= 16) return 16; + // fall-through to error + } else { // Int/Float 8x16, 16x8, 32x4 or 64x2 + if (t->getElementType()->getPrimitiveSizeInBits() > 32 && t->getNumElements() <= 2) return 2; + if (t->getElementType()->getPrimitiveSizeInBits() > 16 && t->getNumElements() <= 4) return 4; + if (t->getElementType()->getPrimitiveSizeInBits() > 8 && t->getNumElements() <= 8) return 8; + if (t->getElementType()->getPrimitiveSizeInBits() <= 8 && t->getNumElements() <= 16) return 16; + // fall-through to error + } + errs() << *t << "\n"; + report_fatal_error("Unsupported type!"); + return 0; +} + +const char *SIMDType(VectorType *t) { + assert(t->getElementType()->getPrimitiveSizeInBits() <= 128); + + if (t->getElementType()->isIntegerTy()) { + if 
(t->getElementType()->getPrimitiveSizeInBits() == 1) { + if (t->getNumElements() == 2) return "Bool64x2"; + if (t->getNumElements() <= 4) return "Bool32x4"; + if (t->getNumElements() <= 8) return "Bool16x8"; + if (t->getNumElements() <= 16) return "Bool8x16"; + // fall-through to error + } else { + if (t->getElementType()->getPrimitiveSizeInBits() > 32 && t->getNumElements() <= 2) return "Int64x2"; + if (t->getElementType()->getPrimitiveSizeInBits() > 16 && t->getNumElements() <= 4) return "Int32x4"; + if (t->getElementType()->getPrimitiveSizeInBits() > 8 && t->getNumElements() <= 8) return "Int16x8"; + if (t->getElementType()->getPrimitiveSizeInBits() <= 8 && t->getNumElements() <= 16) return "Int8x16"; + // fall-through to error + } + } else { // float type + if (t->getElementType()->getPrimitiveSizeInBits() > 32 && t->getNumElements() <= 2) return "Float64x2"; + if (t->getElementType()->getPrimitiveSizeInBits() > 16 && t->getNumElements() <= 4) return "Float32x4"; + if (t->getElementType()->getPrimitiveSizeInBits() > 8 && t->getNumElements() <= 8) return "Float16x8"; + if (t->getElementType()->getPrimitiveSizeInBits() <= 8 && t->getNumElements() <= 16) return "Float8x16"; + // fall-through to error + } + errs() << *t << "\n"; + report_fatal_error("Unsupported type!"); +} + +std::string JSWriter::getCast(const StringRef &s, Type *t, AsmCast sign) { + switch (t->getTypeID()) { + default: { + errs() << *t << "\n"; + assert(false && "Unsupported type"); + } + case Type::VectorTyID: + return std::string("SIMD_") + SIMDType(cast(t)) + "_check(" + s.str() + ")"; + case Type::FloatTyID: { + if (PreciseF32 && !(sign & ASM_FFI_OUT)) { + if (sign & ASM_FFI_IN) { + return ("Math_fround(+(" + s + "))").str(); + } else { + return ("Math_fround(" + s + ")").str(); + } + } + // otherwise fall through to double + } + case Type::DoubleTyID: return ("+" + s).str(); + case Type::IntegerTyID: { + // fall through to the end for nonspecific + switch (t->getIntegerBitWidth()) { + case 1: if (!(sign & ASM_NONSPECIFIC)) return sign == ASM_UNSIGNED ? (s + "&1").str() : (s + "<<31>>31").str(); + case 8: if (!(sign & ASM_NONSPECIFIC)) return sign == ASM_UNSIGNED ? (s + "&255").str() : (s + "<<24>>24").str(); + case 16: if (!(sign & ASM_NONSPECIFIC)) return sign == ASM_UNSIGNED ? (s + "&65535").str() : (s + "<<16>>16").str(); + case 32: return (sign == ASM_SIGNED || (sign & ASM_NONSPECIFIC) ? s + "|0" : s + ">>>0").str(); + case 64: return ("i64(" + s + ")").str(); + default: llvm_unreachable("Unsupported integer cast bitwidth"); + } + } + case Type::PointerTyID: + return (sign == ASM_SIGNED || (sign & ASM_NONSPECIFIC) ? 
s + "|0" : s + ">>>0").str(); + } +} + +std::string JSWriter::getParenCast(const StringRef &s, Type *t, AsmCast sign) { + return getCast(("(" + s + ")").str(), t, sign); +} + +std::string JSWriter::getDoubleToInt(const StringRef &s) { + return ("~~(" + s + ")").str(); +} + +std::string JSWriter::getIMul(const Value *V1, const Value *V2) { + const ConstantInt *CI = NULL; + const Value *Other = NULL; + if ((CI = dyn_cast(V1))) { + Other = V2; + } else if ((CI = dyn_cast(V2))) { + Other = V1; + } + // we ignore optimizing the case of multiplying two constants - optimizer would have removed those + if (CI) { + std::string OtherStr = getValueAsStr(Other); + unsigned C = CI->getZExtValue(); + if (C == 0) return "0"; + if (C == 1) return OtherStr; + unsigned Orig = C, Shifts = 0; + while (C) { + if ((C & 1) && (C != 1)) break; // not power of 2 + C >>= 1; + Shifts++; + if (C == 0) return OtherStr + "<<" + utostr(Shifts-1); // power of 2, emit shift + } + if (Orig < (1<<20)) return "(" + OtherStr + "*" + utostr(Orig) + ")|0"; // small enough, avoid imul + } + return "Math_imul(" + getValueAsStr(V1) + ", " + getValueAsStr(V2) + ")|0"; // unknown or too large, emit imul +} + +static inline const char *getHeapName(int Bytes, int Integer) +{ + switch (Bytes) { + default: llvm_unreachable("Unsupported type"); + case 8: return "HEAPF64"; + case 4: return Integer ? "HEAP32" : "HEAPF32"; + case 2: return "HEAP16"; + case 1: return "HEAP8"; + } +} + +static inline int getHeapShift(int Bytes) +{ + switch (Bytes) { + default: llvm_unreachable("Unsupported type"); + case 8: return 3; + case 4: return 2; + case 2: return 1; + case 1: return 0; + } +} + +static inline const char *getHeapShiftStr(int Bytes) +{ + switch (Bytes) { + default: llvm_unreachable("Unsupported type"); + case 8: return ">>3"; + case 4: return ">>2"; + case 2: return ">>1"; + case 1: return ">>0"; + } +} + +std::string JSWriter::getHeapNameAndIndexToGlobal(const GlobalVariable *GV, unsigned Bytes, bool Integer, const char **HeapName) +{ + unsigned Addr = getGlobalAddress(GV->getName().str()); + *HeapName = getHeapName(Bytes, Integer); + if (!Relocatable) { + return utostr(Addr >> getHeapShift(Bytes)); + } else { + return relocateGlobal(utostr(Addr)) + getHeapShiftStr(Bytes); + } +} + +std::string JSWriter::getHeapNameAndIndexToPtr(const std::string& Ptr, unsigned Bytes, bool Integer, const char **HeapName) +{ + *HeapName = getHeapName(Bytes, Integer); + return Ptr + getHeapShiftStr(Bytes); +} + +std::string JSWriter::getHeapNameAndIndex(const Value *Ptr, const char **HeapName, unsigned Bytes, bool Integer) +{ + const GlobalVariable *GV; + if ((GV = dyn_cast(Ptr->stripPointerCasts())) && GV->hasInitializer()) { + // Note that we use the type of the pointer, as it might be a bitcast of the underlying global. We need the right type. 
+ return getHeapNameAndIndexToGlobal(GV, Bytes, Integer, HeapName); + } else { + return getHeapNameAndIndexToPtr(getValueAsStr(Ptr), Bytes, Integer, HeapName); + } +} + +std::string JSWriter::getHeapNameAndIndex(const Value *Ptr, const char **HeapName) +{ + Type *t = cast(Ptr->getType())->getElementType(); + return getHeapNameAndIndex(Ptr, HeapName, DL->getTypeAllocSize(t), t->isIntegerTy() || t->isPointerTy()); +} + +static const char *heapNameToAtomicTypeName(const char *HeapName) +{ + if (!strcmp(HeapName, "HEAPF32")) return "f32"; + if (!strcmp(HeapName, "HEAPF64")) return "f64"; + return ""; +} + +std::string JSWriter::getLoad(const Instruction *I, const Value *P, Type *T, unsigned Alignment, char sep) { + std::string Assign = getAssign(I); + unsigned Bytes = DL->getTypeAllocSize(T); + bool Aligned = Bytes <= Alignment || Alignment == 0; + if (OnlyWebAssembly) { + if (isAbsolute(P)) { + // loads from an absolute constants are either intentional segfaults (int x = *((int*)0)), or code problems + JSWriter::getAssign(I); // ensure the variable is defined, even if it isn't used + return "abort() /* segfault, load from absolute addr */"; + } + if (T->isIntegerTy() || T->isPointerTy()) { + switch (Bytes) { + case 1: return Assign + "load1(" + getValueAsStr(P) + ")"; + case 2: return Assign + "load2(" + getValueAsStr(P) + (Aligned ? "" : "," + itostr(Alignment)) + ")"; + case 4: return Assign + "load4(" + getValueAsStr(P) + (Aligned ? "" : "," + itostr(Alignment)) + ")"; + case 8: return Assign + "load8(" + getValueAsStr(P) + (Aligned ? "" : "," + itostr(Alignment)) + ")"; + default: llvm_unreachable("invalid wasm-only int load size"); + } + } else { + switch (Bytes) { + case 4: return Assign + "loadf(" + getValueAsStr(P) + (Aligned ? "" : "," + itostr(Alignment)) + ")"; + case 8: return Assign + "loadd(" + getValueAsStr(P) + (Aligned ? "" : "," + itostr(Alignment)) + ")"; + default: llvm_unreachable("invalid wasm-only float load size"); + } + } + } + std::string text; + if (Aligned) { + if (EnablePthreads && cast(I)->isVolatile()) { + const char *HeapName; + std::string Index = getHeapNameAndIndex(P, &HeapName); + if (!strcmp(HeapName, "HEAPF32") || !strcmp(HeapName, "HEAPF64")) { + bool fround = PreciseF32 && !strcmp(HeapName, "HEAPF32"); + // TODO: If https://bugzilla.mozilla.org/show_bug.cgi?id=1131613 and https://bugzilla.mozilla.org/show_bug.cgi?id=1131624 are + // implemented, we could remove the emulation, but until then we must emulate manually. + text = Assign + (fround ? "Math_fround(" : "+") + "_emscripten_atomic_load_" + heapNameToAtomicTypeName(HeapName) + "(" + getValueAsStr(P) + (fround ? 
"))" : ")"); + } else { + text = Assign + "(Atomics_load(" + HeapName + ',' + Index + ")|0)"; + } + } else { + text = Assign + getPtrLoad(P); + } + if (isAbsolute(P)) { + // loads from an absolute constants are either intentional segfaults (int x = *((int*)0)), or code problems + text += "; abort() /* segfault, load from absolute addr */"; + } + } else { + // unaligned in some manner + + if (EnablePthreads && cast(I)->isVolatile()) { + errs() << "emcc: warning: unable to implement unaligned volatile load as atomic in " << I->getParent()->getParent()->getName() << ":" << *I << " | "; + emitDebugInfo(errs(), I); + errs() << "\n"; + } + + if (WarnOnUnaligned) { + errs() << "emcc: warning: unaligned load in " << I->getParent()->getParent()->getName() << ":" << *I << " | "; + emitDebugInfo(errs(), I); + errs() << "\n"; + } + std::string PS = getValueAsStr(P); + switch (Bytes) { + case 8: { + switch (Alignment) { + case 4: { + text = "HEAP32[tempDoublePtr>>2]=HEAP32[" + PS + ">>2]" + sep + + "HEAP32[tempDoublePtr+4>>2]=HEAP32[" + PS + "+4>>2]"; + break; + } + case 2: { + text = "HEAP16[tempDoublePtr>>1]=HEAP16[" + PS + ">>1]" + sep + + "HEAP16[tempDoublePtr+2>>1]=HEAP16[" + PS + "+2>>1]" + sep + + "HEAP16[tempDoublePtr+4>>1]=HEAP16[" + PS + "+4>>1]" + sep + + "HEAP16[tempDoublePtr+6>>1]=HEAP16[" + PS + "+6>>1]"; + break; + } + case 1: { + text = "HEAP8[tempDoublePtr>>0]=HEAP8[" + PS + ">>0]" + sep + + "HEAP8[tempDoublePtr+1>>0]=HEAP8[" + PS + "+1>>0]" + sep + + "HEAP8[tempDoublePtr+2>>0]=HEAP8[" + PS + "+2>>0]" + sep + + "HEAP8[tempDoublePtr+3>>0]=HEAP8[" + PS + "+3>>0]" + sep + + "HEAP8[tempDoublePtr+4>>0]=HEAP8[" + PS + "+4>>0]" + sep + + "HEAP8[tempDoublePtr+5>>0]=HEAP8[" + PS + "+5>>0]" + sep + + "HEAP8[tempDoublePtr+6>>0]=HEAP8[" + PS + "+6>>0]" + sep + + "HEAP8[tempDoublePtr+7>>0]=HEAP8[" + PS + "+7>>0]"; + break; + } + default: assert(0 && "bad 8 store"); + } + text += sep + Assign + "+HEAPF64[tempDoublePtr>>3]"; + break; + } + case 4: { + if (T->isIntegerTy() || T->isPointerTy()) { + switch (Alignment) { + case 2: { + text = Assign + "HEAPU16[" + PS + ">>1]|" + + "(HEAPU16[" + PS + "+2>>1]<<16)"; + break; + } + case 1: { + text = Assign + "HEAPU8[" + PS + ">>0]|" + + "(HEAPU8[" + PS + "+1>>0]<<8)|" + + "(HEAPU8[" + PS + "+2>>0]<<16)|" + + "(HEAPU8[" + PS + "+3>>0]<<24)"; + break; + } + default: assert(0 && "bad 4i store"); + } + } else { // float + assert(T->isFloatingPointTy()); + switch (Alignment) { + case 2: { + text = "HEAP16[tempDoublePtr>>1]=HEAP16[" + PS + ">>1]" + sep + + "HEAP16[tempDoublePtr+2>>1]=HEAP16[" + PS + "+2>>1]"; + break; + } + case 1: { + text = "HEAP8[tempDoublePtr>>0]=HEAP8[" + PS + ">>0]" + sep + + "HEAP8[tempDoublePtr+1>>0]=HEAP8[" + PS + "+1>>0]" + sep + + "HEAP8[tempDoublePtr+2>>0]=HEAP8[" + PS + "+2>>0]" + sep + + "HEAP8[tempDoublePtr+3>>0]=HEAP8[" + PS + "+3>>0]"; + break; + } + default: assert(0 && "bad 4f store"); + } + text += sep + Assign + getCast("HEAPF32[tempDoublePtr>>2]", Type::getFloatTy(TheModule->getContext())); + } + break; + } + case 2: { + text = Assign + "HEAPU8[" + PS + ">>0]|" + + "(HEAPU8[" + PS + "+1>>0]<<8)"; + break; + } + default: assert(0 && "bad store"); + } + } + return text; +} + +std::string JSWriter::getStore(const Instruction *I, const Value *P, Type *T, const std::string& VS, unsigned Alignment, char sep) { + assert(sep == ';'); // FIXME when we need that + unsigned Bytes = DL->getTypeAllocSize(T); + bool Aligned = Bytes <= Alignment || Alignment == 0; + if (OnlyWebAssembly) { + if (Alignment == 536870912) { + return "abort() 
/* segfault */"; + } + if (T->isIntegerTy() || T->isPointerTy()) { + switch (Bytes) { + case 1: return "store1(" + getValueAsStr(P) + "," + VS + ")"; + case 2: return "store2(" + getValueAsStr(P) + "," + VS + (Aligned ? "" : "," + itostr(Alignment)) + ")"; + case 4: return "store4(" + getValueAsStr(P) + "," + VS + (Aligned ? "" : "," + itostr(Alignment)) + ")"; + case 8: return "store8(" + getValueAsStr(P) + "," + VS + (Aligned ? "" : "," + itostr(Alignment)) + ")"; + default: llvm_unreachable("invalid wasm-only int load size"); + } + } else { + switch (Bytes) { + case 4: return "storef(" + getValueAsStr(P) + "," + VS + (Aligned ? "" : "," + itostr(Alignment)) + ")"; + case 8: return "stored(" + getValueAsStr(P) + "," + VS + (Aligned ? "" : "," + itostr(Alignment)) + ")"; + default: llvm_unreachable("invalid wasm-only float load size"); + } + } + } + std::string text; + if (Aligned) { + if (EnablePthreads && cast(I)->isVolatile()) { + const char *HeapName; + std::string Index = getHeapNameAndIndex(P, &HeapName); + if (!strcmp(HeapName, "HEAPF32") || !strcmp(HeapName, "HEAPF64")) { + // TODO: If https://bugzilla.mozilla.org/show_bug.cgi?id=1131613 and https://bugzilla.mozilla.org/show_bug.cgi?id=1131624 are + // implemented, we could remove the emulation, but until then we must emulate manually. + text = std::string("_emscripten_atomic_store_") + heapNameToAtomicTypeName(HeapName) + "(" + getValueAsStr(P) + ',' + VS + ')'; + if (PreciseF32 && !strcmp(HeapName, "HEAPF32")) + text = "Math_fround(" + text + ")"; + else + text = "+" + text; + } else { + text = std::string("Atomics_store(") + HeapName + ',' + Index + ',' + VS + ")|0"; + } + } else { + text = getPtrUse(P) + " = " + VS; + } + if (Alignment == 536870912) text += "; abort() /* segfault */"; + } else { + // unaligned in some manner + + if (EnablePthreads && cast(I)->isVolatile()) { + errs() << "emcc: warning: unable to implement unaligned volatile store as atomic in " << I->getParent()->getParent()->getName() << ":" << *I << " | "; + emitDebugInfo(errs(), I); + errs() << "\n"; + } + + if (WarnOnUnaligned) { + errs() << "emcc: warning: unaligned store in " << I->getParent()->getParent()->getName() << ":" << *I << " | "; + emitDebugInfo(errs(), I); + errs() << "\n"; + } + std::string PS = getValueAsStr(P); + switch (Bytes) { + case 8: { + text = "HEAPF64[tempDoublePtr>>3]=" + VS + ';'; + switch (Alignment) { + case 4: { + text += "HEAP32[" + PS + ">>2]=HEAP32[tempDoublePtr>>2];" + + "HEAP32[" + PS + "+4>>2]=HEAP32[tempDoublePtr+4>>2]"; + break; + } + case 2: { + text += "HEAP16[" + PS + ">>1]=HEAP16[tempDoublePtr>>1];" + + "HEAP16[" + PS + "+2>>1]=HEAP16[tempDoublePtr+2>>1];" + + "HEAP16[" + PS + "+4>>1]=HEAP16[tempDoublePtr+4>>1];" + + "HEAP16[" + PS + "+6>>1]=HEAP16[tempDoublePtr+6>>1]"; + break; + } + case 1: { + text += "HEAP8[" + PS + ">>0]=HEAP8[tempDoublePtr>>0];" + + "HEAP8[" + PS + "+1>>0]=HEAP8[tempDoublePtr+1>>0];" + + "HEAP8[" + PS + "+2>>0]=HEAP8[tempDoublePtr+2>>0];" + + "HEAP8[" + PS + "+3>>0]=HEAP8[tempDoublePtr+3>>0];" + + "HEAP8[" + PS + "+4>>0]=HEAP8[tempDoublePtr+4>>0];" + + "HEAP8[" + PS + "+5>>0]=HEAP8[tempDoublePtr+5>>0];" + + "HEAP8[" + PS + "+6>>0]=HEAP8[tempDoublePtr+6>>0];" + + "HEAP8[" + PS + "+7>>0]=HEAP8[tempDoublePtr+7>>0]"; + break; + } + default: assert(0 && "bad 8 store"); + } + break; + } + case 4: { + if (T->isIntegerTy() || T->isPointerTy()) { + switch (Alignment) { + case 2: { + text = "HEAP16[" + PS + ">>1]=" + VS + "&65535;" + + "HEAP16[" + PS + "+2>>1]=" + VS + ">>>16"; + break; + } + case 1: { 
+ text = "HEAP8[" + PS + ">>0]=" + VS + "&255;" + + "HEAP8[" + PS + "+1>>0]=(" + VS + ">>8)&255;" + + "HEAP8[" + PS + "+2>>0]=(" + VS + ">>16)&255;" + + "HEAP8[" + PS + "+3>>0]=" + VS + ">>24"; + break; + } + default: assert(0 && "bad 4i store"); + } + } else { // float + assert(T->isFloatingPointTy()); + text = "HEAPF32[tempDoublePtr>>2]=" + VS + ';'; + switch (Alignment) { + case 2: { + text += "HEAP16[" + PS + ">>1]=HEAP16[tempDoublePtr>>1];" + + "HEAP16[" + PS + "+2>>1]=HEAP16[tempDoublePtr+2>>1]"; + break; + } + case 1: { + text += "HEAP8[" + PS + ">>0]=HEAP8[tempDoublePtr>>0];" + + "HEAP8[" + PS + "+1>>0]=HEAP8[tempDoublePtr+1>>0];" + + "HEAP8[" + PS + "+2>>0]=HEAP8[tempDoublePtr+2>>0];" + + "HEAP8[" + PS + "+3>>0]=HEAP8[tempDoublePtr+3>>0]"; + break; + } + default: assert(0 && "bad 4f store"); + } + } + break; + } + case 2: { + text = "HEAP8[" + PS + ">>0]=" + VS + "&255;" + + "HEAP8[" + PS + "+1>>0]=" + VS + ">>8"; + break; + } + default: assert(0 && "bad store"); + } + } + return text; +} + +std::string JSWriter::getStackBump(unsigned Size) { + return getStackBump(utostr(Size)); +} + +std::string JSWriter::getStackBump(const std::string &Size) { + std::string ret = "STACKTOP = STACKTOP + " + Size + "|0;"; + if (EmscriptenAssertions) { + ret += " if ((STACKTOP|0) >= (STACK_MAX|0)) abortStackOverflow(" + Size + "|0);"; + } + return ret; +} + +std::string JSWriter::getOpName(const Value* V) { // TODO: remove this + return getJSName(V); +} + +std::string JSWriter::getPtrLoad(const Value* Ptr) { + Type *t = cast(Ptr->getType())->getElementType(); + return getCast(getPtrUse(Ptr), t, ASM_NONSPECIFIC); +} + +std::string JSWriter::getHeapAccess(const std::string& Name, unsigned Bytes, bool Integer) { + const char *HeapName = 0; + std::string Index = getHeapNameAndIndexToPtr(Name, Bytes, Integer, &HeapName); + return std::string(HeapName) + '[' + Index + ']'; +} + +std::string JSWriter::getShiftedPtr(const Value *Ptr, unsigned Bytes) { + const char *HeapName = 0; // unused + return getHeapNameAndIndex(Ptr, &HeapName, Bytes, true /* Integer; doesn't matter */); +} + +std::string JSWriter::getPtrUse(const Value* Ptr) { + const char *HeapName = 0; + std::string Index = getHeapNameAndIndex(Ptr, &HeapName); + return std::string(HeapName) + '[' + Index + ']'; +} + +std::string JSWriter::getUndefValue(Type* T, AsmCast sign) { + std::string S; + if (VectorType *VT = dyn_cast(T)) { + checkVectorType(VT); + S = std::string("SIMD_") + SIMDType(VT) + "_splat(" + ensureFloat("0", !VT->getElementType()->isIntegerTy()) + ')'; + } else { + if (OnlyWebAssembly && T->isIntegerTy() && T->getIntegerBitWidth() == 64) { + return "i64(0)"; + } + S = T->isFloatingPointTy() ? 
"+0" : "0"; // XXX refactor this + if (PreciseF32 && T->isFloatTy() && !(sign & ASM_FFI_OUT)) { + S = "Math_fround(" + S + ")"; + } + } + return S; +} + +std::string JSWriter::getConstant(const Constant* CV, AsmCast sign) { + if (isa(CV)) return "0"; + + if (const Function *F = dyn_cast(CV)) { + return relocateFunctionPointer(utostr(getFunctionIndex(F))); + } + + if (const GlobalValue *GV = dyn_cast(CV)) { + if (GV->isDeclaration()) { + std::string Name = getOpName(GV); + Externals.insert(Name); + if (Relocatable) { + // we access linked externs through calls, which we load at the beginning of basic blocks + FuncRelocatableExterns.insert(Name); + Name = "t$" + Name; + UsedVars[Name] = i32; + } + return Name; + } + if (const GlobalAlias *GA = dyn_cast(CV)) { + // Since we don't currently support linking of our output, we don't need + // to worry about weak or other kinds of aliases. + return getConstant(GA->getAliasee()->stripPointerCasts(), sign); + } + return relocateGlobal(utostr(getGlobalAddress(GV->getName().str()))); + } + + if (const ConstantFP *CFP = dyn_cast(CV)) { + if (!(sign & ASM_FORCE_FLOAT_AS_INTBITS)) { + std::string S = ftostr(CFP, sign); + if (PreciseF32 && CV->getType()->isFloatTy() && !(sign & ASM_FFI_OUT)) { + S = "Math_fround(" + S + ")"; + } + return S; + } else { + const APFloat &flt = CFP->getValueAPF(); + APInt i = flt.bitcastToAPInt(); + assert(!(sign & ASM_UNSIGNED)); + if (i.getBitWidth() == 32) return itostr((int)(uint32_t)*i.getRawData()); + else return itostr(*i.getRawData()); + } + } else if (const ConstantInt *CI = dyn_cast(CV)) { + if (sign != ASM_UNSIGNED && CI->getValue().getBitWidth() == 1) { + sign = ASM_UNSIGNED; // bools must always be unsigned: either 0 or 1 + } + if (!OnlyWebAssembly || CI->getValue().getBitWidth() != 64) { + return CI->getValue().toString(10, sign != ASM_UNSIGNED); + } else { + // i64 constant. 
emit as 32 bits, 32 bits, for ease of parsing by a JS-style parser + return emitI64Const(CI->getValue()); + } + } else if (isa(CV)) { + return getUndefValue(CV->getType(), sign); + } else if (isa(CV)) { + if (VectorType *VT = dyn_cast(CV->getType())) { + checkVectorType(VT); + return std::string("SIMD_") + SIMDType(VT) + "_splat(" + ensureFloat("0", !VT->getElementType()->isIntegerTy()) + ')'; + } else { + // something like [0 x i8*] zeroinitializer, which clang can emit for landingpads + return "0"; + } + } else if (const ConstantDataVector *DV = dyn_cast(CV)) { + return getConstantVector(DV); + } else if (const ConstantVector *V = dyn_cast(CV)) { + return getConstantVector(V); + } else if (const ConstantArray *CA = dyn_cast(CV)) { + // handle things like [i8* bitcast (<{ i32, i32, i32 }>* @_ZTISt9bad_alloc to i8*)] which clang can emit for landingpads + assert(CA->getNumOperands() == 1); + CV = CA->getOperand(0); + const ConstantExpr *CE = cast(CV); + CV = CE->getOperand(0); // ignore bitcast + return getConstant(CV); + } else if (const BlockAddress *BA = dyn_cast(CV)) { + return utostr(getBlockAddress(BA)); + } else if (const ConstantExpr *CE = dyn_cast(CV)) { + std::string Code; + raw_string_ostream CodeStream(Code); + CodeStream << '('; + generateExpression(CE, CodeStream); + CodeStream << ')'; + return CodeStream.str(); + } else { + CV->dump(); + llvm_unreachable("Unsupported constant kind"); + } +} + +template +class VectorOperandAccessor +{ +public: + static Constant *getOperand(const VectorType *C, unsigned index); +}; +template<> Constant *VectorOperandAccessor::getOperand(const ConstantVector *C, unsigned index) { return C->getOperand(index); } +template<> Constant *VectorOperandAccessor::getOperand(const ConstantDataVector *C, unsigned index) { return C->getElementAsConstant(index); } + +template +std::string JSWriter::getConstantVector(const ConstantVectorType *C) { + checkVectorType(C->getType()); + unsigned NumElts = cast(C->getType())->getNumElements(); + + bool isInt = C->getType()->getElementType()->isIntegerTy(); + + // Test if this is a float vector, but it contains NaNs that have non-canonical bits that can't be represented as nans. + // These must be casted via an integer vector. + bool hasSpecialNaNs = false; + + if (!isInt) { + const APInt nan32(32, 0x7FC00000); + const APInt nan64(64, 0x7FF8000000000000ULL); + + for (unsigned i = 0; i < NumElts; ++i) { + Constant *CV = VectorOperandAccessor::getOperand(C, i); + const ConstantFP *CFP = dyn_cast(CV); + if (CFP) { + const APFloat &flt = CFP->getValueAPF(); + if (flt.getCategory() == APFloat::fcNaN) { + APInt i = flt.bitcastToAPInt(); + if ((i.getBitWidth() == 32 && i != nan32) || (i.getBitWidth() == 64 && i != nan64)) { + hasSpecialNaNs = true; + break; + } + } + } + } + } + + AsmCast cast = hasSpecialNaNs ? ASM_FORCE_FLOAT_AS_INTBITS : 0; + + // Check for a splat. 
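// Editorial sketch, not part of the patch: a minimal host-side model of the
// "special NaN" test in getConstantVector above, using plain C++ in place of
// APFloat/APInt. A float lane can be printed as an ordinary JS literal only if
// it is not a NaN with a non-canonical payload; otherwise its exact bits must
// be emitted as an integer and bit-cast back (the ASM_FORCE_FLOAT_AS_INTBITS path).
#include <cmath>
#include <cstdint>
#include <cstring>

static bool needsIntBitsEmission(float f) {
  if (!std::isnan(f))
    return false;                       // non-NaN values round-trip as literals
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof bits);  // equivalent of bitcastToAPInt()
  return bits != 0x7FC00000u;           // 0x7FC00000 is the canonical quiet NaN
}
// Usage: needsIntBitsEmission(std::nanf("")) is typically false (canonical NaN),
// while a NaN carrying a payload returns true and must go through the int path.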
+ bool allEqual = true; + std::string op0 = getConstant(VectorOperandAccessor::getOperand(C, 0), cast); + for (unsigned i = 1; i < NumElts; ++i) { + if (getConstant(VectorOperandAccessor::getOperand(C, i), cast) != op0) { + allEqual = false; + break; + } + } + if (allEqual) { + if (!hasSpecialNaNs) { + return std::string("SIMD_") + SIMDType(C->getType()) + "_splat(" + ensureFloat(op0, !isInt) + ')'; + } else { + VectorType *IntTy = VectorType::getInteger(C->getType()); + checkVectorType(IntTy); + return getSIMDCast(IntTy, C->getType(), std::string("SIMD_") + SIMDType(IntTy) + "_splat(" + op0 + ')', true); + } + } + + const int SIMDJsRetNumElements = SIMDNumElements(C->getType()); + + std::string c; + if (!hasSpecialNaNs) { + c = std::string("SIMD_") + SIMDType(C->getType()) + '(' + ensureFloat(op0, !isInt); + for (unsigned i = 1; i < NumElts; ++i) { + c += ',' + ensureFloat(getConstant(VectorOperandAccessor::getOperand(C, i)), !isInt); + } + // Promote smaller than 128-bit vector types to 128-bit since smaller ones do not exist in SIMD.js. (pad with zero lanes) + for (int i = NumElts; i < SIMDJsRetNumElements; ++i) { + c += ',' + ensureFloat(isInt ? "0" : "+0", !isInt); + } + + return c + ')'; + } else { + VectorType *IntTy = VectorType::getInteger(C->getType()); + checkVectorType(IntTy); + c = std::string("SIMD_") + SIMDType(IntTy) + '(' + op0; + for (unsigned i = 1; i < NumElts; ++i) { + c += ',' + getConstant(VectorOperandAccessor::getOperand(C, i), ASM_FORCE_FLOAT_AS_INTBITS); + } + + // Promote smaller than 128-bit vector types to 128-bit since smaller ones do not exist in SIMD.js. (pad with zero lanes) + for (int i = NumElts; i < SIMDJsRetNumElements; ++i) { + c += ',' + ensureFloat(isInt ? "0" : "+0", !isInt); + } + + return getSIMDCast(IntTy, C->getType(), c + ")", true); + } +} + +std::string JSWriter::getValueAsStr(const Value* V, AsmCast sign) { + // Skip past no-op bitcasts and zero-index geps. + V = stripPointerCastsWithoutSideEffects(V); + + if (const Constant *CV = dyn_cast(V)) { + return getConstant(CV, sign); + } else { + return getJSName(V); + } +} + +std::string JSWriter::getValueAsCastStr(const Value* V, AsmCast sign) { + // Skip past no-op bitcasts and zero-index geps. + V = stripPointerCastsWithoutSideEffects(V); + + if (isa(V) || isa(V)) { + return getConstant(cast(V), sign); + } else { + return getCast(getValueAsStr(V), V->getType(), sign); + } +} + +std::string JSWriter::getValueAsParenStr(const Value* V) { + // Skip past no-op bitcasts and zero-index geps. + V = stripPointerCastsWithoutSideEffects(V); + + if (const Constant *CV = dyn_cast(V)) { + return getConstant(CV); + } else { + return "(" + getValueAsStr(V) + ")"; + } +} + +std::string JSWriter::getValueAsCastParenStr(const Value* V, AsmCast sign) { + // Skip past no-op bitcasts and zero-index geps. + V = stripPointerCastsWithoutSideEffects(V); + + if (isa(V) || isa(V) || isa(V)) { + return getConstant(cast(V), sign); + } else { + return "(" + getCast(getValueAsStr(V), V->getType(), sign) + ")"; + } +} + +void JSWriter::generateInsertElementExpression(const InsertElementInst *III, raw_string_ostream& Code) { + // LLVM has no vector type constructor operator; it uses chains of + // insertelement instructions instead. It also has no splat operator; it + // uses an insertelement followed by a shuffle instead. If this insertelement + // is part of either such sequence, skip it for now; we'll process it when we + // reach the end. 
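// Editorial sketch, not part of the patch: the constant-vector emission in
// getConstantVector above, modeled over plain strings. "simdType" and the lane
// strings stand in for SIMDType(VT) and getConstant() results, and jsLaneCount
// stands in for SIMDNumElements(), i.e. the 128-bit SIMD.js lane count.
#include <string>
#include <vector>

static std::string emitConstVector(const std::string &simdType,
                                   const std::vector<std::string> &lanes,
                                   size_t jsLaneCount) {
  bool allEqual = true;
  for (const std::string &L : lanes)
    if (L != lanes[0]) { allEqual = false; break; }
  if (allEqual)                          // every lane identical: use _splat
    return "SIMD_" + simdType + "_splat(" + lanes[0] + ")";
  std::string out = "SIMD_" + simdType + "(" + lanes[0];
  for (size_t i = 1; i < lanes.size(); ++i)
    out += "," + lanes[i];
  for (size_t i = lanes.size(); i < jsLaneCount; ++i)
    out += ",0";                         // pad narrow vectors out to 128 bits
  return out + ")";
}
// emitConstVector("Int32x4", {"1","1","1","1"}, 4) -> "SIMD_Int32x4_splat(1)"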
+ if (III->hasOneUse()) { + const User *U = *III->user_begin(); + if (isa(U)) + return; + if (isa(U) && + isa(cast(U)->getMask()) && + !isa(III->getOperand(0)) && + isa(III->getOperand(2)) && + cast(III->getOperand(2))->isZero()) + { + return; + } + } + + // This insertelement is at the base of a chain of single-user insertelement + // instructions. Collect all the inserted elements so that we can categorize + // the chain as either a splat, a constructor, or an actual series of inserts. + VectorType *VT = III->getType(); + checkVectorType(VT); + unsigned NumElems = VT->getNumElements(); + unsigned NumInserted = 0; + SmallVector Operands(NumElems, NULL); + const Value *Splat = III->getOperand(1); + const Value *Base = III; + do { + const InsertElementInst *BaseIII = cast(Base); + const ConstantInt *IndexInt = cast(BaseIII->getOperand(2)); + unsigned Index = IndexInt->getZExtValue(); + if (Operands[Index] == NULL) + ++NumInserted; + Value *Op = BaseIII->getOperand(1); + if (Operands[Index] == NULL) { + Operands[Index] = Op; + if (Op != Splat) + Splat = NULL; + } + Base = BaseIII->getOperand(0); + } while (Base->hasOneUse() && isa(Base)); + + // Emit code for the chain. + Code << getAssignIfNeeded(III); + if (NumInserted == NumElems) { + if (Splat) { + // Emit splat code. + if (VT->getElementType()->isIntegerTy()) { + Code << std::string("SIMD_") + SIMDType(VT) + "_splat(" << getValueAsStr(Splat) << ")"; + } else { + std::string operand = getValueAsStr(Splat); + if (!PreciseF32) { + // SIMD_Float32x4_splat requires an actual float32 even if we're + // otherwise not being precise about it. + operand = "Math_fround(" + operand + ")"; + } + Code << std::string("SIMD_") + SIMDType(VT) + "_splat(" << operand << ")"; + } + } else { + // Emit constructor code. + Code << std::string("SIMD_") + SIMDType(VT) + '('; + for (unsigned Index = 0; Index < NumElems; ++Index) { + if (Index != 0) + Code << ", "; + std::string operand = getValueAsStr(Operands[Index]); + if (!PreciseF32 && VT->getElementType()->isFloatTy()) { + // SIMD_Float32x4_splat requires an actual float32 even if we're + // otherwise not being precise about it. + operand = "Math_fround(" + operand + ")"; + } + Code << operand; + } + Code << ")"; + } + } else { + // Emit a series of inserts. 
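// Editorial sketch, not part of the patch: the insertelement-chain walk above,
// over plain data. "Insert" stands in for one InsertElementInst of the
// single-use chain, listed from the final insert back toward the base, which
// is the order the do/while loop visits them (a later insert wins a lane).
#include <string>
#include <vector>

struct Insert { unsigned Lane; std::string Value; };   // hypothetical stand-in

static std::string classifyChain(const std::vector<Insert> &newestFirst,
                                 unsigned numLanes) {
  std::vector<std::string> lanes(numLanes);
  unsigned numInserted = 0;
  bool isSplat = true;
  const std::string &splat = newestFirst.front().Value;
  for (const Insert &I : newestFirst) {
    if (!lanes[I.Lane].empty())
      continue;                          // a later insert already set this lane
    lanes[I.Lane] = I.Value;
    ++numInserted;
    if (I.Value != splat)
      isSplat = false;
  }
  if (numInserted == numLanes)
    return isSplat ? "splat" : "constructor";   // _splat(...) or SIMD_T(...)
  return "replaceLane series";                  // _replaceLane(...) on the base
}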
+ std::string Result = getValueAsStr(Base); + for (unsigned Index = 0; Index < NumElems; ++Index) { + if (!Operands[Index]) + continue; + std::string operand = getValueAsStr(Operands[Index]); + if (!PreciseF32 && VT->getElementType()->isFloatTy()) { + operand = "Math_fround(" + operand + ")"; + } + Result = std::string("SIMD_") + SIMDType(VT) + "_replaceLane(" + Result + ',' + utostr(Index) + ',' + operand + ')'; + } + Code << Result; + } +} + +void JSWriter::generateExtractElementExpression(const ExtractElementInst *EEI, raw_string_ostream& Code) { + VectorType *VT = cast(EEI->getVectorOperand()->getType()); + checkVectorType(VT); + const ConstantInt *IndexInt = dyn_cast(EEI->getIndexOperand()); + if (IndexInt) { + unsigned Index = IndexInt->getZExtValue(); + Code << getAssignIfNeeded(EEI); + std::string OperandCode; + raw_string_ostream CodeStream(OperandCode); + CodeStream << std::string("SIMD_") << SIMDType(VT) << "_extractLane(" << getValueAsStr(EEI->getVectorOperand()) << ',' << Index << ')'; + Code << getCast(CodeStream.str(), EEI->getType()); + return; + } + + error("SIMD extract element with non-constant index not implemented yet"); +} + + +std::string castIntVecToBoolVec(int numElems, const std::string &str) +{ + int elemWidth = 128 / numElems; + std::string simdType = "SIMD_Int" + to_string(elemWidth) + "x" + to_string(numElems); + return simdType + "_notEqual(" + str + ", " + simdType + "_splat(0))"; +} + +std::string JSWriter::getSIMDCast(VectorType *fromType, VectorType *toType, const std::string &valueStr, bool signExtend) +{ + bool toInt = toType->getElementType()->isIntegerTy(); + bool fromInt = fromType->getElementType()->isIntegerTy(); + int fromPrimSize = fromType->getElementType()->getPrimitiveSizeInBits(); + int toPrimSize = toType->getElementType()->getPrimitiveSizeInBits(); + + if (fromInt == toInt && fromPrimSize == toPrimSize) { + // To and from are the same types, no cast needed. + return valueStr; + } + + // Promote smaller than 128-bit vector types to 128-bit since smaller ones do not exist in SIMD.js. (pad with zero lanes) + const int toNumElems = SIMDNumElements(toType); + + bool fromIsBool = (fromInt && fromPrimSize == 1); + bool toIsBool = (toInt && toPrimSize == 1); + if (fromIsBool && !toIsBool) { // Casting from bool vector to a bit vector looks more complicated (e.g. Bool32x4 to Int32x4) + return castBoolVecToIntVec(toNumElems, valueStr, signExtend); + } + + if (fromType->getBitWidth() != toType->getBitWidth() && !fromIsBool && !toIsBool) { + error("Invalid SIMD cast between items of different bit sizes!"); + } + + return std::string("SIMD_") + SIMDType(toType) + "_from" + SIMDType(fromType) + "Bits(" + valueStr + ")"; +} + +void JSWriter::generateShuffleVectorExpression(const ShuffleVectorInst *SVI, raw_string_ostream& Code) { + Code << getAssignIfNeeded(SVI); + + // LLVM has no splat operator, so it makes do by using an insert and a + // shuffle. If that's what this shuffle is doing, the code in + // generateInsertElementExpression will have also detected it and skipped + // emitting the insert, so we can just emit a splat here. + if (isa(SVI->getMask()) && + isa(SVI->getOperand(0))) + { + InsertElementInst *IEI = cast(SVI->getOperand(0)); + if (ConstantInt *CI = dyn_cast(IEI->getOperand(2))) { + if (CI->isZero()) { + std::string operand = getValueAsStr(IEI->getOperand(1)); + if (!PreciseF32 && SVI->getType()->getElementType()->isFloatTy()) { + // SIMD_Float32x4_splat requires an actual float32 even if we're + // otherwise not being precise about it. 
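// Editorial sketch, not part of the patch: the cast selection in getSIMDCast
// above, over plain descriptors instead of VectorType. Identical lane shapes
// need no cast; same-width reinterprets go through the SIMD.js _from<T>Bits
// constructors (the bool-vector special cases are omitted here).
#include <string>

struct VecDesc { bool IsInt; int LaneBits; std::string SimdName; };  // stand-in

static std::string simdCast(const VecDesc &from, const VecDesc &to,
                            const std::string &v) {
  if (from.IsInt == to.IsInt && from.LaneBits == to.LaneBits)
    return v;                            // same element type: no cast needed
  return "SIMD_" + to.SimdName + "_from" + from.SimdName + "Bits(" + v + ")";
}
// simdCast({false, 32, "Float32x4"}, {true, 32, "Int32x4"}, "a")
//   -> "SIMD_Int32x4_fromFloat32x4Bits(a)"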
+ operand = "Math_fround(" + operand + ")"; + } + Code << "SIMD_" << SIMDType(SVI->getType()) << "_splat(" << operand << ')'; + return; + } + } + } + + // Check whether can generate SIMD.js swizzle or shuffle. + std::string A = getValueAsStr(SVI->getOperand(0)); + std::string B = getValueAsStr(SVI->getOperand(1)); + VectorType *op0 = cast(SVI->getOperand(0)->getType()); + int OpNumElements = op0->getNumElements(); + int ResultNumElements = SVI->getType()->getNumElements(); + // Promote smaller than 128-bit vector types to 128-bit since smaller ones do not exist in SIMD.js. (pad with zero lanes) + const int SIMDJsRetNumElements = SIMDNumElements(cast(SVI->getType())); + const int SIMDJsOp0NumElements = SIMDNumElements(op0); + bool swizzleA = true; + bool swizzleB = true; + for(int i = 0; i < ResultNumElements; ++i) { + if (SVI->getMaskValue(i) >= OpNumElements) swizzleA = false; + if (SVI->getMaskValue(i) < OpNumElements) swizzleB = false; + } + assert(!(swizzleA && swizzleB)); + if (swizzleA || swizzleB) { + std::string T = (swizzleA ? A : B); + Code << "SIMD_" << SIMDType(SVI->getType()) << "_swizzle(" << T; + int i = 0; + for (; i < ResultNumElements; ++i) { + Code << ", "; + int Mask = SVI->getMaskValue(i); + if (Mask < 0) { + Code << 0; + } else if (Mask < OpNumElements) { + Code << Mask; + } else { + assert(Mask < OpNumElements * 2); + Code << (Mask-OpNumElements); + } + } + // Promote smaller than 128-bit vector types to 128-bit since smaller ones do not exist in SIMD.js. (pad with zero lanes) + for(int i = ResultNumElements; i < SIMDJsRetNumElements; ++i) { + Code << ", 0"; + } + Code << ")"; + return; + } + + // Emit a fully-general shuffle. + Code << "SIMD_" << SIMDType(SVI->getType()) << "_shuffle("; + + Code << getSIMDCast(cast(SVI->getOperand(0)->getType()), SVI->getType(), A, true) << ", " + << getSIMDCast(cast(SVI->getOperand(1)->getType()), SVI->getType(), B, true) << ", "; + + SmallVector Indices; + SVI->getShuffleMask(Indices); + for (unsigned int i = 0; i < Indices.size(); ++i) { + if (i != 0) + Code << ", "; + int Mask = Indices[i]; + if (Mask < 0) + Code << 0; + else if (Mask < OpNumElements) + Code << Mask; + else + Code << (Mask + SIMDJsOp0NumElements - OpNumElements); // Fix up indices to second operand, since the first operand has potentially different number of lanes in SIMD.js compared to LLVM. + } + + // Promote smaller than 128-bit vector types to 128-bit since smaller ones do not exist in SIMD.js. 
(pad with zero lanes) + for(int i = Indices.size(); i < SIMDJsRetNumElements; ++i) { + Code << ", 0"; + } + + Code << ')'; +} + +void JSWriter::generateICmpExpression(const ICmpInst *I, raw_string_ostream& Code) { + bool Invert = false; + const char *Name; + switch (cast(I)->getPredicate()) { + case ICmpInst::ICMP_EQ: Name = "equal"; break; + case ICmpInst::ICMP_NE: Name = "equal"; Invert = true; break; + case ICmpInst::ICMP_SLE: Name = "greaterThan"; Invert = true; break; + case ICmpInst::ICMP_SGE: Name = "lessThan"; Invert = true; break; + case ICmpInst::ICMP_ULE: Name = "unsignedLessThanOrEqual"; break; + case ICmpInst::ICMP_UGE: Name = "unsignedGreaterThanOrEqual"; break; + case ICmpInst::ICMP_ULT: Name = "unsignedLessThan"; break; + case ICmpInst::ICMP_SLT: Name = "lessThan"; break; + case ICmpInst::ICMP_UGT: Name = "unsignedGreaterThan"; break; + case ICmpInst::ICMP_SGT: Name = "greaterThan"; break; + default: I->dump(); error("invalid vector icmp"); break; + } + + checkVectorType(I->getOperand(0)->getType()); + checkVectorType(I->getOperand(1)->getType()); + + Code << getAssignIfNeeded(I); + + if (Invert) + Code << "SIMD_" << SIMDType(cast(I->getType())) << "_not("; + + Code << "SIMD_" << SIMDType(cast(I->getOperand(0)->getType())) << '_' << Name << '(' + << getValueAsStr(I->getOperand(0)) << ',' << getValueAsStr(I->getOperand(1)) << ')'; + + if (Invert) + Code << ')'; +} + +void JSWriter::generateFCmpExpression(const FCmpInst *I, raw_string_ostream& Code) { + const char *Name; + bool Invert = false; + VectorType *VT = cast(I->getType()); + checkVectorType(VT); + switch (cast(I)->getPredicate()) { + case ICmpInst::FCMP_FALSE: + Code << getAssignIfNeeded(I) << "SIMD_" << SIMDType(cast(I->getType())) << "_splat(" << ensureFloat("0", true) << ')'; + return; + case ICmpInst::FCMP_TRUE: + Code << getAssignIfNeeded(I) << "SIMD_" << SIMDType(cast(I->getType())) << "_splat(" << ensureFloat("-1", true) << ')'; + return; + case ICmpInst::FCMP_ONE: + checkVectorType(I->getOperand(0)->getType()); + checkVectorType(I->getOperand(1)->getType()); + Code << getAssignIfNeeded(I) + << castIntVecToBoolVec(VT->getNumElements(), std::string("SIMD_") + SIMDType(cast(I->getType())) + "_and(SIMD_" + SIMDType(cast(I->getType())) + "_and(" + + castBoolVecToIntVec(VT->getNumElements(), std::string("SIMD_") + SIMDType(cast(I->getOperand(0)->getType())) + "_equal(" + getValueAsStr(I->getOperand(0)) + ',' + getValueAsStr(I->getOperand(0)) + ')', true) + ',' + + castBoolVecToIntVec(VT->getNumElements(), std::string("SIMD_") + SIMDType(cast(I->getOperand(1)->getType())) + "_equal(" + getValueAsStr(I->getOperand(1)) + ',' + getValueAsStr(I->getOperand(1)) + ')', true) + ',' + + castBoolVecToIntVec(VT->getNumElements(), std::string("SIMD_") + SIMDType(cast(I->getOperand(0)->getType())) + "_notEqual(" + getValueAsStr(I->getOperand(0)) + ',' + getValueAsStr(I->getOperand(1)) + ')', true) + ')'); + return; + case ICmpInst::FCMP_UEQ: + checkVectorType(I->getOperand(0)->getType()); + checkVectorType(I->getOperand(1)->getType()); + Code << getAssignIfNeeded(I) + << castIntVecToBoolVec(VT->getNumElements(), std::string("SIMD_") + SIMDType(cast(I->getType())) + "_or(SIMD_" + SIMDType(cast(I->getType())) + "_or(" + + castBoolVecToIntVec(VT->getNumElements(), std::string("SIMD_") + SIMDType(cast(I->getOperand(0)->getType())) + "_notEqual(" + getValueAsStr(I->getOperand(0)) + ',' + getValueAsStr(I->getOperand(0)) + ')', true) + ',' + + castBoolVecToIntVec(VT->getNumElements(), std::string("SIMD_") + 
SIMDType(cast(I->getOperand(1)->getType())) + "_notEqual(" + getValueAsStr(I->getOperand(1)) + ',' + getValueAsStr(I->getOperand(1)) + ')', true) + ',' + + castBoolVecToIntVec(VT->getNumElements(), std::string("SIMD_") + SIMDType(cast(I->getOperand(0)->getType())) + "_equal(" + getValueAsStr(I->getOperand(0)) + ',' + getValueAsStr(I->getOperand(1)) + ')', true) + ')'); + return; + case FCmpInst::FCMP_ORD: + checkVectorType(I->getOperand(0)->getType()); + checkVectorType(I->getOperand(1)->getType()); + Code << getAssignIfNeeded(I) + << "SIMD_" << SIMDType(cast(I->getType())) << "_and(" + << "SIMD_" << SIMDType(cast(I->getOperand(0)->getType())) << "_equal(" << getValueAsStr(I->getOperand(0)) << ',' << getValueAsStr(I->getOperand(0)) << ")," + << "SIMD_" << SIMDType(cast(I->getOperand(1)->getType())) << "_equal(" << getValueAsStr(I->getOperand(1)) << ',' << getValueAsStr(I->getOperand(1)) << "))"; + return; + + case FCmpInst::FCMP_UNO: + checkVectorType(I->getOperand(0)->getType()); + checkVectorType(I->getOperand(1)->getType()); + Code << getAssignIfNeeded(I) + << "SIMD_" << SIMDType(cast(I->getType())) << "_or(" + << "SIMD_" << SIMDType(cast(I->getOperand(0)->getType())) << "_notEqual(" << getValueAsStr(I->getOperand(0)) << ',' << getValueAsStr(I->getOperand(0)) << ")," + << "SIMD_" << SIMDType(cast(I->getOperand(1)->getType())) << "_notEqual(" << getValueAsStr(I->getOperand(1)) << ',' << getValueAsStr(I->getOperand(1)) << "))"; + return; + + case ICmpInst::FCMP_OEQ: Name = "equal"; break; + case ICmpInst::FCMP_OGT: Name = "greaterThan"; break; + case ICmpInst::FCMP_OGE: Name = "greaterThanOrEqual"; break; + case ICmpInst::FCMP_OLT: Name = "lessThan"; break; + case ICmpInst::FCMP_OLE: Name = "lessThanOrEqual"; break; + case ICmpInst::FCMP_UGT: Name = "lessThanOrEqual"; Invert = true; break; + case ICmpInst::FCMP_UGE: Name = "lessThan"; Invert = true; break; + case ICmpInst::FCMP_ULT: Name = "greaterThanOrEqual"; Invert = true; break; + case ICmpInst::FCMP_ULE: Name = "greaterThan"; Invert = true; break; + case ICmpInst::FCMP_UNE: Name = "notEqual"; break; + default: I->dump(); error("invalid vector fcmp"); break; + } + + checkVectorType(I->getOperand(0)->getType()); + checkVectorType(I->getOperand(1)->getType()); + + Code << getAssignIfNeeded(I); + + if (Invert) + Code << "SIMD_" << SIMDType(cast(I->getType())) << "_not("; + + Code << "SIMD_" << SIMDType(cast(I->getOperand(0)->getType())) << "_" << Name << "(" + << getValueAsStr(I->getOperand(0)) << ", " << getValueAsStr(I->getOperand(1)) << ")"; + + if (Invert) + Code << ")"; +} + +static const Value *getElement(const Value *V, unsigned i) { + if (const InsertElementInst *II = dyn_cast(V)) { + if (ConstantInt *CI = dyn_cast(II->getOperand(2))) { + if (CI->equalsInt(i)) + return II->getOperand(1); + } + return getElement(II->getOperand(0), i); + } + return NULL; +} + +static const Value *getSplatValue(const Value *V) { + if (const Constant *C = dyn_cast(V)) + return C->getSplatValue(); + + VectorType *VTy = cast(V->getType()); + const Value *Result = NULL; + for (unsigned i = 0; i < VTy->getNumElements(); ++i) { + const Value *E = getElement(V, i); + if (!E) + return NULL; + if (!Result) + Result = E; + else if (Result != E) + return NULL; + } + return Result; + +} + +void JSWriter::generateShiftExpression(const BinaryOperator *I, raw_string_ostream& Code) { + // If we're shifting every lane by the same amount (shifting by a splat value + // then we can use a ByScalar shift. 
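// Editorial sketch, not part of the patch: the swizzle/shuffle classification
// in generateShuffleVectorExpression above. If every mask index stays within
// one operand, the one-input SIMD.js _swizzle form is used; otherwise the
// two-input _shuffle form is emitted. Mask entries are -1 for undef lanes,
// exactly as returned by getMaskValue().
#include <vector>

enum class ShufKind { SwizzleA, SwizzleB, Shuffle };

static ShufKind classifyMask(const std::vector<int> &mask, int opLanes) {
  bool swizzleA = true, swizzleB = true;
  for (int m : mask) {
    if (m >= opLanes) swizzleA = false;  // lane taken from the second operand
    if (m <  opLanes) swizzleB = false;  // lane taken from the first operand (or undef)
  }
  if (swizzleA) return ShufKind::SwizzleA;
  if (swizzleB) return ShufKind::SwizzleB;
  return ShufKind::Shuffle;
}
// classifyMask({0,1,2,3}, 4) -> SwizzleA
// classifyMask({4,5,6,7}, 4) -> SwizzleB
// classifyMask({0,4,1,5}, 4) -> Shuffle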
+ const Value *Count = I->getOperand(1); + if (const Value *Splat = getSplatValue(Count)) { + Code << getAssignIfNeeded(I) << "SIMD_" << SIMDType(cast(I->getType())) << '_'; + if (I->getOpcode() == Instruction::AShr) + Code << "shiftRightArithmeticByScalar"; + else if (I->getOpcode() == Instruction::LShr) + Code << "shiftRightLogicalByScalar"; + else + Code << "shiftLeftByScalar"; + Code << "(" << getValueAsStr(I->getOperand(0)) << ", " << getValueAsStr(Splat) << ")"; + return; + } + + // SIMD.js does not currently have vector-vector shifts. + generateUnrolledExpression(I, Code); +} + +void JSWriter::generateUnrolledExpression(const User *I, raw_string_ostream& Code) { + VectorType *VT = cast(I->getType()); + + Code << getAssignIfNeeded(I); + + Code << "SIMD_" << SIMDType(VT) << '('; + + int primSize = VT->getElementType()->getPrimitiveSizeInBits(); + int numElems = VT->getNumElements(); + if (primSize == 32 && numElems < 4) { + report_fatal_error("generateUnrolledExpression not expected to handle less than four-wide 32-bit vector types!"); + } + + for (unsigned Index = 0; Index < VT->getNumElements(); ++Index) { + if (Index != 0) + Code << ", "; + if (!PreciseF32 && VT->getElementType()->isFloatTy()) { + Code << "Math_fround("; + } + std::string Extract; + if (VT->getElementType()->isIntegerTy()) { + Extract = "SIMD_Int32x4_extractLane("; + UsesSIMDInt32x4 = true; + } else { + Extract = "SIMD_Float32x4_extractLane("; + UsesSIMDFloat32x4 = true; + } + switch (Operator::getOpcode(I)) { + case Instruction::SDiv: + Code << "(" << Extract << getValueAsStr(I->getOperand(0)) << "," << Index << ")|0)" + " / " + "(" << Extract << getValueAsStr(I->getOperand(1)) << "," << Index << ")|0)" + "|0"; + break; + case Instruction::UDiv: + Code << "(" << Extract << getValueAsStr(I->getOperand(0)) << "," << Index << ")>>>0)" + " / " + "(" << Extract << getValueAsStr(I->getOperand(1)) << "," << Index << ")>>>0)" + ">>>0"; + break; + case Instruction::SRem: + Code << "(" << Extract << getValueAsStr(I->getOperand(0)) << "," << Index << ")|0)" + " % " + "(" << Extract << getValueAsStr(I->getOperand(1)) << "," << Index << ")|0)" + "|0"; + break; + case Instruction::URem: + Code << "(" << Extract << getValueAsStr(I->getOperand(0)) << "," << Index << ")>>>0)" + " % " + "(" << Extract << getValueAsStr(I->getOperand(1)) << "," << Index << ")>>>0)" + ">>>0"; + break; + case Instruction::AShr: + Code << "(" << Extract << getValueAsStr(I->getOperand(0)) << "," << Index << ")|0)" + " >> " + "(" << Extract << getValueAsStr(I->getOperand(1)) << "," << Index << ")|0)" + "|0"; + break; + case Instruction::LShr: + Code << "(" << Extract << getValueAsStr(I->getOperand(0)) << "," << Index << ")|0)" + " >>> " + "(" << Extract << getValueAsStr(I->getOperand(1)) << "," << Index << ")|0)" + "|0"; + break; + case Instruction::Shl: + Code << "(" << Extract << getValueAsStr(I->getOperand(0)) << "," << Index << ")|0)" + " << " + "(" << Extract << getValueAsStr(I->getOperand(1)) << "," << Index << ")|0)" + "|0"; + break; + default: I->dump(); error("invalid unrolled vector instr"); break; + } + if (!PreciseF32 && VT->getElementType()->isFloatTy()) { + Code << ")"; + } + } + + Code << ")"; +} + +bool JSWriter::generateSIMDExpression(const User *I, raw_string_ostream& Code) { + VectorType *VT; + if ((VT = dyn_cast(I->getType()))) { + // vector-producing instructions + checkVectorType(VT); + std::string simdType = SIMDType(VT); + + switch (Operator::getOpcode(I)) { + default: I->dump(); error("invalid vector instr"); break; + case 
Instruction::Call: // return value is just a SIMD value, no special handling + return false; + case Instruction::PHI: // handled separately - we push them back into the relooper branchings + break; + case Instruction::ICmp: + generateICmpExpression(cast(I), Code); + break; + case Instruction::FCmp: + generateFCmpExpression(cast(I), Code); + break; + case Instruction::SExt: + assert(cast(I->getOperand(0)->getType())->getElementType()->isIntegerTy(1) && + "sign-extension from vector of other than i1 not yet supported"); + Code << getAssignIfNeeded(I) << getSIMDCast(cast(I->getOperand(0)->getType()), VT, getValueAsStr(I->getOperand(0)), true /* signExtend */); + break; + case Instruction::ZExt: + assert(cast(I->getOperand(0)->getType())->getElementType()->isIntegerTy(1) && + "sign-extension from vector of other than i1 not yet supported"); + Code << getAssignIfNeeded(I) << getSIMDCast(cast(I->getOperand(0)->getType()), VT, getValueAsStr(I->getOperand(0)), false /* signExtend */); + break; + case Instruction::Select: + // Since we represent vectors of i1 as vectors of sign extended wider integers, + // selecting on them is just an elementwise select. + if (isa(I->getOperand(0)->getType())) { + if (cast(I->getType())->getElementType()->isIntegerTy()) { + Code << getAssignIfNeeded(I) << "SIMD_" << simdType << "_select(" << getValueAsStr(I->getOperand(0)) << "," << getValueAsStr(I->getOperand(1)) << "," << getValueAsStr(I->getOperand(2)) << ")"; break; + } else { + Code << getAssignIfNeeded(I) << "SIMD_" << simdType << "_select(" << getValueAsStr(I->getOperand(0)) << "," << getValueAsStr(I->getOperand(1)) << "," << getValueAsStr(I->getOperand(2)) << ")"; break; + } + return true; + } + // Otherwise we have a scalar condition, so it's a ?: operator. + return false; + case Instruction::FAdd: Code << getAssignIfNeeded(I) << "SIMD_" << simdType << "_add(" << getValueAsStr(I->getOperand(0)) << "," << getValueAsStr(I->getOperand(1)) << ")"; break; + case Instruction::FMul: Code << getAssignIfNeeded(I) << "SIMD_" << simdType << "_mul(" << getValueAsStr(I->getOperand(0)) << "," << getValueAsStr(I->getOperand(1)) << ")"; break; + case Instruction::FDiv: Code << getAssignIfNeeded(I) << "SIMD_" << simdType << "_div(" << getValueAsStr(I->getOperand(0)) << "," << getValueAsStr(I->getOperand(1)) << ")"; break; + case Instruction::Add: Code << getAssignIfNeeded(I) << "SIMD_" << simdType << "_add(" << getValueAsStr(I->getOperand(0)) << "," << getValueAsStr(I->getOperand(1)) << ")"; break; + case Instruction::Sub: Code << getAssignIfNeeded(I) << "SIMD_" << simdType << "_sub(" << getValueAsStr(I->getOperand(0)) << "," << getValueAsStr(I->getOperand(1)) << ")"; break; + case Instruction::Mul: Code << getAssignIfNeeded(I) << "SIMD_" << simdType << "_mul(" << getValueAsStr(I->getOperand(0)) << "," << getValueAsStr(I->getOperand(1)) << ")"; break; + case Instruction::And: Code << getAssignIfNeeded(I) << "SIMD_" << simdType << "_and(" << getValueAsStr(I->getOperand(0)) << "," << getValueAsStr(I->getOperand(1)) << ")"; break; + case Instruction::Or: Code << getAssignIfNeeded(I) << "SIMD_" << simdType << "_or(" << getValueAsStr(I->getOperand(0)) << "," << getValueAsStr(I->getOperand(1)) << ")"; break; + case Instruction::Xor: + // LLVM represents a not(x) as -1 ^ x + Code << getAssignIfNeeded(I); + if (BinaryOperator::isNot(I)) { + Code << "SIMD_" << simdType << "_not(" << getValueAsStr(BinaryOperator::getNotArgument(I)) << ")"; break; + } else { + Code << "SIMD_" << simdType << "_xor(" << getValueAsStr(I->getOperand(0)) 
<< "," << getValueAsStr(I->getOperand(1)) << ")"; break; + } + break; + case Instruction::FSub: + // LLVM represents an fneg(x) as -0.0 - x. + Code << getAssignIfNeeded(I); + if (BinaryOperator::isFNeg(I)) { + Code << "SIMD_" << simdType << "_neg(" << getValueAsStr(BinaryOperator::getFNegArgument(I)) << ")"; + } else { + Code << "SIMD_" << simdType << "_sub(" << getValueAsStr(I->getOperand(0)) << "," << getValueAsStr(I->getOperand(1)) << ")"; + } + break; + case Instruction::BitCast: { + case Instruction::SIToFP: + Code << getAssignIfNeeded(I); + Code << getSIMDCast(cast(I->getOperand(0)->getType()), cast(I->getType()), getValueAsStr(I->getOperand(0)), true); + break; + } + case Instruction::Load: { + const LoadInst *LI = cast(I); + const Value *P = LI->getPointerOperand(); + std::string PS = getValueAsStr(P); + const char *load = "_load"; + if (VT->getElementType()->getPrimitiveSizeInBits() == 32) { + switch (VT->getNumElements()) { + case 1: load = "_load1"; break; + case 2: load = "_load2"; break; + case 3: load = "_load3"; break; + default: break; + } + } + Code << getAssignIfNeeded(I) << "SIMD_" << simdType << load << "(HEAPU8, " << PS << ")"; + break; + } + case Instruction::InsertElement: + generateInsertElementExpression(cast(I), Code); + break; + case Instruction::ShuffleVector: + generateShuffleVectorExpression(cast(I), Code); + break; + case Instruction::SDiv: + case Instruction::UDiv: + case Instruction::SRem: + case Instruction::URem: + // The SIMD API does not currently support these operations directly. + // Emulate them using scalar operations (which is essentially the same + // as what would happen if the API did support them, since hardware + // doesn't support them). + generateUnrolledExpression(I, Code); + break; + case Instruction::AShr: + case Instruction::LShr: + case Instruction::Shl: + generateShiftExpression(cast(I), Code); + break; + } + return true; + } else { + // vector-consuming instructions + if (Operator::getOpcode(I) == Instruction::Store && (VT = dyn_cast(I->getOperand(0)->getType())) && VT->isVectorTy()) { + checkVectorType(VT); + std::string simdType = SIMDType(VT); + const StoreInst *SI = cast(I); + const Value *P = SI->getPointerOperand(); + std::string PS = "temp_" + simdType + "_ptr"; + std::string VS = getValueAsStr(SI->getValueOperand()); + Code << getAdHocAssign(PS, P->getType()) << getValueAsStr(P) << ';'; + const char *store = "_store"; + if (VT->getElementType()->getPrimitiveSizeInBits() == 32) { + switch (VT->getNumElements()) { + case 1: store = "_store1"; break; + case 2: store = "_store2"; break; + case 3: store = "_store3"; break; + default: break; + } + } + Code << "SIMD_" << simdType << store << "(HEAPU8, " << PS << ", " << VS << ")"; + return true; + } else if (Operator::getOpcode(I) == Instruction::ExtractElement) { + generateExtractElementExpression(cast(I), Code); + return true; + } + } + return false; +} + +static uint64_t LSBMask(unsigned numBits) { + return numBits >= 64 ? 0xFFFFFFFFFFFFFFFFULL : (1ULL << numBits) - 1; +} + +// Given a string which contains a printed base address, print a new string +// which contains that address plus the given offset. +static std::string AddOffset(const std::string &base, int32_t Offset) { + if (base.empty()) + return itostr(Offset); + + if (Offset == 0) + return base; + + return "((" + base + ") + " + itostr(Offset) + "|0)"; +} + +// Generate code for and operator, either an Instruction or a ConstantExpr. 
+void JSWriter::generateExpression(const User *I, raw_string_ostream& Code) { + // To avoid emiting code and variables for the no-op pointer bitcasts + // and all-zero-index geps that LLVM needs to satisfy its type system, we + // call stripPointerCasts() on all values before translating them. This + // includes bitcasts whose only use is lifetime marker intrinsics. + assert(I == stripPointerCastsWithoutSideEffects(I)); + + Type *T = I->getType(); + if (T->isIntegerTy() && ((!OnlyWebAssembly && T->getIntegerBitWidth() > 32) || + ( OnlyWebAssembly && T->getIntegerBitWidth() > 64))) { + errs() << *I << "\n"; + report_fatal_error("legalization problem"); + } + + if (!generateSIMDExpression(I, Code)) switch (Operator::getOpcode(I)) { + default: { + I->dump(); + error("Invalid instruction in JSWriter::generateExpression"); + break; + } + case Instruction::Ret: { + const ReturnInst* ret = cast(I); + const Value *RV = ret->getReturnValue(); + if (StackBumped) { + Code << "STACKTOP = sp;"; + } + Code << "return"; + if (RV != NULL) { + Code << " " << getValueAsCastParenStr(RV, ASM_NONSPECIFIC | ASM_MUST_CAST); + } + break; + } + case Instruction::Br: + case Instruction::IndirectBr: + case Instruction::Switch: return; // handled while relooping + case Instruction::Unreachable: { + // Typically there should be an abort right before these, so we don't emit any code // TODO: when ASSERTIONS are on, emit abort(0) + Code << "// unreachable"; + break; + } + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr:{ + Code << getAssignIfNeeded(I); + unsigned opcode = Operator::getOpcode(I); + if (OnlyWebAssembly && I->getType()->isIntegerTy() && I->getType()->getIntegerBitWidth() == 64) { + switch (opcode) { + case Instruction::Add: Code << "i64_add(" << getValueAsStr(I->getOperand(0)) << "," << getValueAsStr(I->getOperand(1)) << ")"; break; + case Instruction::Sub: Code << "i64_sub(" << getValueAsStr(I->getOperand(0)) << "," << getValueAsStr(I->getOperand(1)) << ")"; break; + case Instruction::Mul: Code << "i64_mul(" << getValueAsStr(I->getOperand(0)) << "," << getValueAsStr(I->getOperand(1)) << ")"; break; + case Instruction::UDiv: Code << "i64_udiv(" << getValueAsStr(I->getOperand(0)) << "," << getValueAsStr(I->getOperand(1)) << ")"; break; + case Instruction::SDiv: Code << "i64_sdiv(" << getValueAsStr(I->getOperand(0)) << "," << getValueAsStr(I->getOperand(1)) << ")"; break; + case Instruction::URem: Code << "i64_urem(" << getValueAsStr(I->getOperand(0)) << "," << getValueAsStr(I->getOperand(1)) << ")"; break; + case Instruction::SRem: Code << "i64_srem(" << getValueAsStr(I->getOperand(0)) << "," << getValueAsStr(I->getOperand(1)) << ")"; break; + case Instruction::And: Code << "i64_and(" << getValueAsStr(I->getOperand(0)) << "," << getValueAsStr(I->getOperand(1)) << ")"; break; + case Instruction::Or: Code << "i64_or(" << getValueAsStr(I->getOperand(0)) << "," << getValueAsStr(I->getOperand(1)) << ")"; break; + case Instruction::Xor: Code << "i64_xor(" << getValueAsStr(I->getOperand(0)) << "," << getValueAsStr(I->getOperand(1)) << ")"; break; + case Instruction::Shl: Code << "i64_shl(" << 
getValueAsStr(I->getOperand(0)) << "," << getValueAsStr(I->getOperand(1)) << ")"; break; + case Instruction::AShr: Code << "i64_ashr(" << getValueAsStr(I->getOperand(0)) << "," << getValueAsStr(I->getOperand(1)) << ")"; break; + case Instruction::LShr: Code << "i64_lshr(" << getValueAsStr(I->getOperand(0)) << "," << getValueAsStr(I->getOperand(1)) << ")"; break; + default: error("bad wasm-i64 binary opcode"); break; + } + break; + } + switch (opcode) { + case Instruction::Add: Code << getParenCast( + getValueAsParenStr(I->getOperand(0)) + + " + " + + getValueAsParenStr(I->getOperand(1)), + I->getType() + ); break; + case Instruction::Sub: Code << getParenCast( + getValueAsParenStr(I->getOperand(0)) + + " - " + + getValueAsParenStr(I->getOperand(1)), + I->getType() + ); break; + case Instruction::Mul: Code << getIMul(I->getOperand(0), I->getOperand(1)); break; + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::URem: + case Instruction::SRem: Code << "(" << + getValueAsCastParenStr(I->getOperand(0), (opcode == Instruction::SDiv || opcode == Instruction::SRem) ? ASM_SIGNED : ASM_UNSIGNED) << + ((opcode == Instruction::UDiv || opcode == Instruction::SDiv) ? " / " : " % ") << + getValueAsCastParenStr(I->getOperand(1), (opcode == Instruction::SDiv || opcode == Instruction::SRem) ? ASM_SIGNED : ASM_UNSIGNED) << + ")&-1"; break; + case Instruction::And: Code << getValueAsStr(I->getOperand(0)) << " & " << getValueAsStr(I->getOperand(1)); break; + case Instruction::Or: Code << getValueAsStr(I->getOperand(0)) << " | " << getValueAsStr(I->getOperand(1)); break; + case Instruction::Xor: Code << getValueAsStr(I->getOperand(0)) << " ^ " << getValueAsStr(I->getOperand(1)); break; + case Instruction::Shl: { + std::string Shifted = getValueAsStr(I->getOperand(0)) + " << " + getValueAsStr(I->getOperand(1)); + if (I->getType()->getIntegerBitWidth() < 32) { + Shifted = getParenCast(Shifted, I->getType(), ASM_UNSIGNED); // remove bits that are shifted beyond the size of this value + } + Code << Shifted; + break; + } + case Instruction::AShr: + case Instruction::LShr: { + std::string Input = getValueAsStr(I->getOperand(0)); + if (I->getType()->getIntegerBitWidth() < 32) { + Input = '(' + getCast(Input, I->getType(), opcode == Instruction::AShr ? ASM_SIGNED : ASM_UNSIGNED) + ')'; // fill in high bits, as shift needs those and is done in 32-bit + } + Code << Input << (opcode == Instruction::AShr ? " >> " : " >>> ") << getValueAsStr(I->getOperand(1)); + break; + } + + case Instruction::FAdd: Code << ensureFloat(getValueAsStr(I->getOperand(0)) + " + " + getValueAsStr(I->getOperand(1)), I->getType()); break; + case Instruction::FMul: Code << ensureFloat(getValueAsStr(I->getOperand(0)) + " * " + getValueAsStr(I->getOperand(1)), I->getType()); break; + case Instruction::FDiv: Code << ensureFloat(getValueAsStr(I->getOperand(0)) + " / " + getValueAsStr(I->getOperand(1)), I->getType()); break; + case Instruction::FRem: Code << ensureFloat(getValueAsStr(I->getOperand(0)) + " % " + getValueAsStr(I->getOperand(1)), I->getType()); break; + case Instruction::FSub: + // LLVM represents an fneg(x) as -0.0 - x. + if (BinaryOperator::isFNeg(I)) { + Code << ensureFloat("-" + getValueAsStr(BinaryOperator::getFNegArgument(I)), I->getType()); + } else { + Code << ensureFloat(getValueAsStr(I->getOperand(0)) + " - " + getValueAsStr(I->getOperand(1)), I->getType()); + } + break; + default: error("bad binary opcode"); break; + } + break; + } + case Instruction::FCmp: { + unsigned predicate = isa(I) ? 
+ (unsigned)cast(I)->getPredicate() : + (unsigned)cast(I)->getPredicate(); + Code << getAssignIfNeeded(I); + switch (predicate) { + // Comparisons which are simple JS operators. + case FCmpInst::FCMP_OEQ: Code << getValueAsStr(I->getOperand(0)) << " == " << getValueAsStr(I->getOperand(1)); break; + case FCmpInst::FCMP_UNE: Code << getValueAsStr(I->getOperand(0)) << " != " << getValueAsStr(I->getOperand(1)); break; + case FCmpInst::FCMP_OGT: Code << getValueAsStr(I->getOperand(0)) << " > " << getValueAsStr(I->getOperand(1)); break; + case FCmpInst::FCMP_OGE: Code << getValueAsStr(I->getOperand(0)) << " >= " << getValueAsStr(I->getOperand(1)); break; + case FCmpInst::FCMP_OLT: Code << getValueAsStr(I->getOperand(0)) << " < " << getValueAsStr(I->getOperand(1)); break; + case FCmpInst::FCMP_OLE: Code << getValueAsStr(I->getOperand(0)) << " <= " << getValueAsStr(I->getOperand(1)); break; + + // Comparisons which are inverses of JS operators. + case FCmpInst::FCMP_UGT: + Code << "!(" << getValueAsStr(I->getOperand(0)) << " <= " << getValueAsStr(I->getOperand(1)) << ")"; + break; + case FCmpInst::FCMP_UGE: + Code << "!(" << getValueAsStr(I->getOperand(0)) << " < " << getValueAsStr(I->getOperand(1)) << ")"; + break; + case FCmpInst::FCMP_ULT: + Code << "!(" << getValueAsStr(I->getOperand(0)) << " >= " << getValueAsStr(I->getOperand(1)) << ")"; + break; + case FCmpInst::FCMP_ULE: + Code << "!(" << getValueAsStr(I->getOperand(0)) << " > " << getValueAsStr(I->getOperand(1)) << ")"; + break; + + // Comparisons which require explicit NaN checks. + case FCmpInst::FCMP_UEQ: + Code << "(" << getValueAsStr(I->getOperand(0)) << " != " << getValueAsStr(I->getOperand(0)) << ") | " << + "(" << getValueAsStr(I->getOperand(1)) << " != " << getValueAsStr(I->getOperand(1)) << ") |" << + "(" << getValueAsStr(I->getOperand(0)) << " == " << getValueAsStr(I->getOperand(1)) << ")"; + break; + case FCmpInst::FCMP_ONE: + Code << "(" << getValueAsStr(I->getOperand(0)) << " == " << getValueAsStr(I->getOperand(0)) << ") & " << + "(" << getValueAsStr(I->getOperand(1)) << " == " << getValueAsStr(I->getOperand(1)) << ") &" << + "(" << getValueAsStr(I->getOperand(0)) << " != " << getValueAsStr(I->getOperand(1)) << ")"; + break; + + // Simple NaN checks. + case FCmpInst::FCMP_ORD: Code << "(" << getValueAsStr(I->getOperand(0)) << " == " << getValueAsStr(I->getOperand(0)) << ") & " << + "(" << getValueAsStr(I->getOperand(1)) << " == " << getValueAsStr(I->getOperand(1)) << ")"; break; + case FCmpInst::FCMP_UNO: Code << "(" << getValueAsStr(I->getOperand(0)) << " != " << getValueAsStr(I->getOperand(0)) << ") | " << + "(" << getValueAsStr(I->getOperand(1)) << " != " << getValueAsStr(I->getOperand(1)) << ")"; break; + + // Simple constants. + case FCmpInst::FCMP_FALSE: Code << "0"; break; + case FCmpInst::FCMP_TRUE : Code << "1"; break; + + default: error("bad fcmp"); break; + } + break; + } + case Instruction::ICmp: { + auto predicate = isa(I) ? 
+ (CmpInst::Predicate)cast(I)->getPredicate() : + cast(I)->getPredicate(); + if (OnlyWebAssembly && I->getOperand(0)->getType()->isIntegerTy() && I->getOperand(0)->getType()->getIntegerBitWidth() == 64) { + Code << getAssignIfNeeded(I); + switch (predicate) { + case ICmpInst::ICMP_EQ: Code << "i64_eq(" << getValueAsStr(I->getOperand(0)) << "," << getValueAsStr(I->getOperand(1)) << ")"; break; + case ICmpInst::ICMP_NE: Code << "i64_ne(" << getValueAsStr(I->getOperand(0)) << "," << getValueAsStr(I->getOperand(1)) << ")"; break; + case ICmpInst::ICMP_ULE: Code << "i64_ule(" << getValueAsStr(I->getOperand(0)) << "," << getValueAsStr(I->getOperand(1)) << ")"; break; + case ICmpInst::ICMP_SLE: Code << "i64_sle(" << getValueAsStr(I->getOperand(0)) << "," << getValueAsStr(I->getOperand(1)) << ")"; break; + case ICmpInst::ICMP_UGE: Code << "i64_uge(" << getValueAsStr(I->getOperand(0)) << "," << getValueAsStr(I->getOperand(1)) << ")"; break; + case ICmpInst::ICMP_SGE: Code << "i64_sge(" << getValueAsStr(I->getOperand(0)) << "," << getValueAsStr(I->getOperand(1)) << ")"; break; + case ICmpInst::ICMP_ULT: Code << "i64_ult(" << getValueAsStr(I->getOperand(0)) << "," << getValueAsStr(I->getOperand(1)) << ")"; break; + case ICmpInst::ICMP_SLT: Code << "i64_slt(" << getValueAsStr(I->getOperand(0)) << "," << getValueAsStr(I->getOperand(1)) << ")"; break; + case ICmpInst::ICMP_UGT: Code << "i64_ugt(" << getValueAsStr(I->getOperand(0)) << "," << getValueAsStr(I->getOperand(1)) << ")"; break; + case ICmpInst::ICMP_SGT: Code << "i64_sgt(" << getValueAsStr(I->getOperand(0)) << "," << getValueAsStr(I->getOperand(1)) << ")"; break; + default: llvm_unreachable("Invalid ICmp-64 predicate"); + } + break; + } + AsmCast sign = CmpInst::isUnsigned(predicate) ? ASM_UNSIGNED : ASM_SIGNED; + Code << getAssignIfNeeded(I) << "(" << + getValueAsCastStr(I->getOperand(0), sign) << + ")"; + switch (predicate) { + case ICmpInst::ICMP_EQ: Code << "=="; break; + case ICmpInst::ICMP_NE: Code << "!="; break; + case ICmpInst::ICMP_ULE: Code << "<="; break; + case ICmpInst::ICMP_SLE: Code << "<="; break; + case ICmpInst::ICMP_UGE: Code << ">="; break; + case ICmpInst::ICMP_SGE: Code << ">="; break; + case ICmpInst::ICMP_ULT: Code << "<"; break; + case ICmpInst::ICMP_SLT: Code << "<"; break; + case ICmpInst::ICMP_UGT: Code << ">"; break; + case ICmpInst::ICMP_SGT: Code << ">"; break; + default: llvm_unreachable("Invalid ICmp predicate"); + } + Code << "(" << + getValueAsCastStr(I->getOperand(1), sign) << + ")"; + break; + } + case Instruction::Alloca: { + const AllocaInst* AI = cast(I); + + // We've done an alloca, so we'll have bumped the stack and will + // need to restore it. + // Yes, we shouldn't have to bump it for nativized vars, however + // they are included in the frame offset, so the restore is still + // needed until that is fixed. + StackBumped = true; + + if (NativizedVars.count(AI)) { + // nativized stack variable, we just need a 'var' definition + UsedVars[getJSName(AI)] = AI->getType()->getElementType(); + return; + } + + // Fixed-size entry-block allocations are allocated all at once in the + // function prologue. 
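// Editorial sketch, not part of the patch: the NaN-check lowering used for the
// scalar FCmp cases above, verified on the host. "Unordered or equal"
// (FCMP_UEQ) is emitted as (a != a) | (b != b) | (a == b), which is the same
// predicate as isnan(a) || isnan(b) || a == b.
#include <cassert>
#include <cmath>

static bool fcmpUEQ(double a, double b) {
  return (a != a) | (b != b) | (a == b);   // mirrors the emitted JS expression
}

int main() {
  double nan = std::nan("");
  assert(fcmpUEQ(1.0, 1.0));               // ordinary equality still holds
  assert(!fcmpUEQ(1.0, 2.0));
  assert(fcmpUEQ(nan, 2.0));               // any NaN operand makes UEQ true
  assert(fcmpUEQ(nan, nan));
  return 0;
}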
+ if (AI->isStaticAlloca()) { + uint64_t Offset; + if (Allocas.getFrameOffset(AI, &Offset)) { + Code << getAssign(AI); + if (Allocas.getMaxAlignment() <= STACK_ALIGN) { + Code << "sp"; + } else { + Code << "sp_a"; // aligned base of stack is different, use that + } + if (Offset != 0) { + Code << " + " << Offset << "|0"; + } + break; + } + // Otherwise, this alloca is being represented by another alloca, so + // there's nothing to print. + return; + } + + assert(AI->getAlignment() <= STACK_ALIGN); // TODO + + Type *T = AI->getAllocatedType(); + std::string Size; + uint64_t BaseSize = DL->getTypeAllocSize(T); + const Value *AS = AI->getArraySize(); + if (const ConstantInt *CI = dyn_cast(AS)) { + Size = Twine(stackAlign(BaseSize * CI->getZExtValue())).str(); + } else { + Size = stackAlignStr("((" + utostr(BaseSize) + '*' + getValueAsStr(AS) + ")|0)"); + } + Code << getAssign(AI) << "STACKTOP; " << getStackBump(Size); + break; + } + case Instruction::Load: { + const LoadInst *LI = cast(I); + const Value *P = LI->getPointerOperand(); + unsigned Alignment = LI->getAlignment(); + if (NativizedVars.count(P)) { + Code << getAssign(LI) << getValueAsStr(P); + } else { + Code << getLoad(LI, P, LI->getType(), Alignment); + } + break; + } + case Instruction::Store: { + const StoreInst *SI = cast(I); + const Value *P = SI->getPointerOperand(); + const Value *V = SI->getValueOperand(); + unsigned Alignment = SI->getAlignment(); + std::string VS = getValueAsStr(V); + if (NativizedVars.count(P)) { + Code << getValueAsStr(P) << " = " << VS; + } else { + Code << getStore(SI, P, V->getType(), VS, Alignment); + } + + Type *T = V->getType(); + if (T->isIntegerTy() && T->getIntegerBitWidth() > 32 && !OnlyWebAssembly) { + errs() << *I << "\n"; + report_fatal_error("legalization problem"); + } + break; + } + case Instruction::GetElementPtr: { + Code << getAssignIfNeeded(I); + const GEPOperator *GEP = cast(I); + gep_type_iterator GTI = gep_type_begin(GEP); + int32_t ConstantOffset = 0; + std::string text; + + // If the base is an initialized global variable, the address is just an + // integer constant, so we can fold it into the ConstantOffset directly. + const Value *Ptr = GEP->getPointerOperand()->stripPointerCasts(); + if (isa(Ptr) && cast(Ptr)->hasInitializer() && !Relocatable) { + ConstantOffset = getGlobalAddress(Ptr->getName().str()); + } else { + text = getValueAsParenStr(Ptr); + } + + GetElementPtrInst::const_op_iterator I = GEP->op_begin(); + I++; + for (GetElementPtrInst::const_op_iterator E = GEP->op_end(); + I != E; ++I, ++GTI) { + const Value *Index = *I; + if (StructType *STy = GTI.getStructTypeOrNull()) { + // For a struct, add the member offset. + unsigned FieldNo = cast(Index)->getZExtValue(); + uint32_t Offset = DL->getStructLayout(STy)->getElementOffset(FieldNo); + ConstantOffset = (uint32_t)ConstantOffset + Offset; + } else { + // For an array, add the element offset, explicitly scaled. + uint32_t ElementSize = DL->getTypeAllocSize(GTI.getIndexedType()); + if (const ConstantInt *CI = dyn_cast(Index)) { + // The index is constant. Add it to the accumulating offset. + ConstantOffset = (uint32_t)ConstantOffset + (uint32_t)CI->getSExtValue() * ElementSize; + } else { + // The index is non-constant. To avoid reassociating, which increases + // the risk of slow wraparounds, add the accumulated offset first. + text = AddOffset(text, ConstantOffset); + ConstantOffset = 0; + + // Now add the scaled dynamic index. 
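// Editorial sketch, not part of the patch: the GEP flattening above, over a
// pre-digested step list. Struct-field offsets and constant array indices fold
// into one 32-bit constant; a dynamic index first flushes the accumulated
// constant into the address string (to avoid reassociation), then appends a
// scaled term. "(idx*size)|0" stands in for whatever getIMul actually emits.
#include <cstdint>
#include <string>
#include <vector>

struct GepStep {                 // hypothetical stand-in for one resolved index
  bool IsConstant;
  int32_t ByteOffset;            // used when IsConstant
  std::string IndexExpr;         // used otherwise
  uint32_t ElemSize;
};

static std::string addOffset(const std::string &base, int32_t off) {
  if (base.empty()) return std::to_string(off);
  if (off == 0) return base;
  return "((" + base + ") + " + std::to_string(off) + "|0)";
}

static std::string flattenGep(std::string text, const std::vector<GepStep> &steps) {
  uint32_t constOff = 0;
  for (const GepStep &s : steps) {
    if (s.IsConstant) {
      constOff += (uint32_t)s.ByteOffset;            // fold into the constant
    } else {
      text = addOffset(text, (int32_t)constOff);     // flush before the dynamic part
      constOff = 0;
      std::string mul = "(" + s.IndexExpr + "*" + std::to_string(s.ElemSize) + ")|0";
      text = text.empty() ? mul : ("(" + text + " + (" + mul + ")|0)");
    }
  }
  return addOffset(text, (int32_t)constOff);
}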
+ std::string Mul = getIMul(Index, ConstantInt::get(i32, ElementSize)); + text = text.empty() ? Mul : ("(" + text + " + (" + Mul + ")|0)"); + } + } + } + // Add in the final accumulated offset. + Code << AddOffset(text, ConstantOffset); + break; + } + case Instruction::PHI: { + // handled separately - we push them back into the relooper branchings + return; + } + case Instruction::PtrToInt: { + if (OnlyWebAssembly && I->getType()->getIntegerBitWidth() == 64) { + // it is valid in LLVM IR to convert a pointer into an i64, it zexts + Code << getAssignIfNeeded(I) << "i64_zext(" << getValueAsStr(I->getOperand(0)) << ')'; + break; + } + Code << getAssignIfNeeded(I) << getValueAsStr(I->getOperand(0)); + break; + } + case Instruction::IntToPtr: { + if (OnlyWebAssembly && I->getOperand(0)->getType()->getIntegerBitWidth() == 64) { + // it is valid in LLVM IR to convert an i64 into a 32-bit pointer, it truncates + Code << getAssignIfNeeded(I) << "i64_trunc(" << getValueAsStr(I->getOperand(0)) << ')'; + break; + } + Code << getAssignIfNeeded(I) << getValueAsStr(I->getOperand(0)); + break; + } + case Instruction::Trunc: + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPTrunc: + case Instruction::FPExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::UIToFP: + case Instruction::SIToFP: { + Code << getAssignIfNeeded(I); + if (OnlyWebAssembly && + ((I->getType()->isIntegerTy() && I->getType()->getIntegerBitWidth() == 64) || + (I->getOperand(0)->getType()->isIntegerTy() && I->getOperand(0)->getType()->getIntegerBitWidth() == 64))) { + switch (Operator::getOpcode(I)) { + case Instruction::Trunc: { + unsigned outBits = I->getType()->getIntegerBitWidth(); + Code << "i64_trunc(" << getValueAsStr(I->getOperand(0)) << ')'; + if (outBits < 32) { + Code << "&" << utostr(LSBMask(outBits)); + } + break; + } + case Instruction::SExt: { + unsigned inBits = I->getOperand(0)->getType()->getIntegerBitWidth(); + std::string bits = utostr(32 - inBits); + Code << "i64_sext(" << getValueAsStr(I->getOperand(0)); + if (inBits < 32) { + Code << " << " << bits << " >> " << bits; + } + Code << ')'; + break; + } + case Instruction::ZExt: { + Code << "i64_zext(" << getValueAsCastStr(I->getOperand(0), ASM_UNSIGNED) << ')'; + break; + } + case Instruction::SIToFP: Code << (I->getType()->isFloatTy() ? "i64_s2f(" : "i64_s2d(") << getValueAsStr(I->getOperand(0)) << ')'; break; + case Instruction::UIToFP: Code << (I->getType()->isFloatTy() ? "i64_u2f(" : "i64_u2d(") << getValueAsStr(I->getOperand(0)) << ')'; break; + case Instruction::FPToSI: Code << (I->getOperand(0)->getType()->isFloatTy() ? "i64_f2s(" : "i64_d2s(") << getValueAsStr(I->getOperand(0)) << ')'; break; + case Instruction::FPToUI: Code << (I->getOperand(0)->getType()->isFloatTy() ? 
"i64_f2u(" : "i64_d2u(") << getValueAsStr(I->getOperand(0)) << ')'; break; + default: llvm_unreachable("Unreachable-i64"); + } + break; + } + switch (Operator::getOpcode(I)) { + case Instruction::Trunc: { + //unsigned inBits = V->getType()->getIntegerBitWidth(); + unsigned outBits = I->getType()->getIntegerBitWidth(); + Code << getValueAsStr(I->getOperand(0)) << "&" << utostr(LSBMask(outBits)); + break; + } + case Instruction::SExt: { + std::string bits = utostr(32 - I->getOperand(0)->getType()->getIntegerBitWidth()); + Code << getValueAsStr(I->getOperand(0)) << " << " << bits << " >> " << bits; + break; + } + case Instruction::ZExt: { + Code << getValueAsCastStr(I->getOperand(0), ASM_UNSIGNED); + break; + } + case Instruction::FPExt: { + if (PreciseF32) { + Code << "+" << getValueAsStr(I->getOperand(0)); break; + } else { + Code << getValueAsStr(I->getOperand(0)); break; + } + break; + } + case Instruction::FPTrunc: { + Code << ensureFloat(getValueAsStr(I->getOperand(0)), I->getType()); + break; + } + case Instruction::SIToFP: Code << '(' << getCast(getValueAsCastParenStr(I->getOperand(0), ASM_SIGNED), I->getType()) << ')'; break; + case Instruction::UIToFP: Code << '(' << getCast(getValueAsCastParenStr(I->getOperand(0), ASM_UNSIGNED), I->getType()) << ')'; break; + case Instruction::FPToSI: Code << '(' << getDoubleToInt(getValueAsParenStr(I->getOperand(0))) << ')'; break; + case Instruction::FPToUI: Code << '(' << getCast(getDoubleToInt(getValueAsParenStr(I->getOperand(0))), I->getType(), ASM_UNSIGNED) << ')'; break; + case Instruction::PtrToInt: Code << '(' << getValueAsStr(I->getOperand(0)) << ')'; break; + case Instruction::IntToPtr: Code << '(' << getValueAsStr(I->getOperand(0)) << ')'; break; + default: llvm_unreachable("Unreachable"); + } + break; + } + case Instruction::BitCast: { + Code << getAssignIfNeeded(I); + // Most bitcasts are no-ops for us. However, the exception is int to float and float to int + Type *InType = I->getOperand(0)->getType(); + Type *OutType = I->getType(); + std::string V = getValueAsStr(I->getOperand(0)); + if (InType->isIntegerTy() && OutType->isFloatingPointTy()) { + if (OnlyWebAssembly) { + if (InType->getIntegerBitWidth() == 64) { + Code << "i64_bc2d(" << V << ')'; + } else { + Code << "i32_bc2f(" << V << ')'; + } + break; + } + assert(InType->getIntegerBitWidth() == 32); + Code << "(HEAP32[tempDoublePtr>>2]=" << V << "," << getCast("HEAPF32[tempDoublePtr>>2]", Type::getFloatTy(TheModule->getContext())) << ")"; + } else if (OutType->isIntegerTy() && InType->isFloatingPointTy()) { + if (OnlyWebAssembly) { + if (OutType->getIntegerBitWidth() == 64) { + Code << "i64_bc2i(" << V << ')'; + } else { + Code << "i32_bc2i(" << V << ')'; + } + break; + } + assert(OutType->getIntegerBitWidth() == 32); + Code << "(HEAPF32[tempDoublePtr>>2]=" << V << "," "HEAP32[tempDoublePtr>>2]|0)"; + } else { + Code << V; + } + break; + } + case Instruction::Call: { + const CallInst *CI = cast(I); + std::string Call = handleCall(CI); + if (Call.empty()) return; + Code << Call; + break; + } + case Instruction::Select: { + Code << getAssignIfNeeded(I) << getValueAsStr(I->getOperand(0)) << " ? 
" << + getValueAsStr(I->getOperand(1)) << " : " << + getValueAsStr(I->getOperand(2)); + break; + } + case Instruction::AtomicRMW: { + const AtomicRMWInst *rmwi = cast(I); + const Value *P = rmwi->getOperand(0); + const Value *V = rmwi->getOperand(1); + std::string VS = getValueAsStr(V); + + if (EnablePthreads) { + std::string Assign = getAssign(rmwi); + std::string text; + const char *HeapName; + std::string Index = getHeapNameAndIndex(P, &HeapName); + const char *atomicFunc = 0; + switch (rmwi->getOperation()) { + case AtomicRMWInst::Xchg: atomicFunc = "exchange"; break; + case AtomicRMWInst::Add: atomicFunc = "add"; break; + case AtomicRMWInst::Sub: atomicFunc = "sub"; break; + case AtomicRMWInst::And: atomicFunc = "and"; break; + case AtomicRMWInst::Or: atomicFunc = "or"; break; + case AtomicRMWInst::Xor: atomicFunc = "xor"; break; + case AtomicRMWInst::Nand: // TODO + case AtomicRMWInst::Max: + case AtomicRMWInst::Min: + case AtomicRMWInst::UMax: + case AtomicRMWInst::UMin: + case AtomicRMWInst::BAD_BINOP: llvm_unreachable("Bad atomic operation"); + } + if (!strcmp(HeapName, "HEAPF32") || !strcmp(HeapName, "HEAPF64")) { + // TODO: If https://bugzilla.mozilla.org/show_bug.cgi?id=1131613 and https://bugzilla.mozilla.org/show_bug.cgi?id=1131624 are + // implemented, we could remove the emulation, but until then we must emulate manually. + bool fround = PreciseF32 && !strcmp(HeapName, "HEAPF32"); + Code << Assign << (fround ? "Math_fround(" : "+") << "_emscripten_atomic_" << atomicFunc << "_" << heapNameToAtomicTypeName(HeapName) << "(" << getValueAsStr(P) << ", " << VS << (fround ? "))" : ")"); break; + + // TODO: Remove the following two lines once https://bugzilla.mozilla.org/show_bug.cgi?id=1141986 is implemented! + } else if (rmwi->getOperation() == AtomicRMWInst::Xchg && !strcmp(HeapName, "HEAP32")) { + Code << Assign << "_emscripten_atomic_exchange_u32(" << getValueAsStr(P) << ", " << VS << ")|0"; break; + + } else { + Code << Assign << "(Atomics_" << atomicFunc << "(" << HeapName << ", " << Index << ", " << VS << ")|0)"; break; + } + } else { + Code << getLoad(rmwi, P, I->getType(), 0) << ';'; + // Most bitcasts are no-ops for us. 
However, the exception is int to float and float to int + switch (rmwi->getOperation()) { + case AtomicRMWInst::Xchg: Code << getStore(rmwi, P, I->getType(), VS, 0); break; + case AtomicRMWInst::Add: Code << getStore(rmwi, P, I->getType(), "((" + getJSName(I) + '+' + VS + ")|0)", 0); break; + case AtomicRMWInst::Sub: Code << getStore(rmwi, P, I->getType(), "((" + getJSName(I) + '-' + VS + ")|0)", 0); break; + case AtomicRMWInst::And: Code << getStore(rmwi, P, I->getType(), "(" + getJSName(I) + '&' + VS + ")", 0); break; + case AtomicRMWInst::Nand: Code << getStore(rmwi, P, I->getType(), "(~(" + getJSName(I) + '&' + VS + "))", 0); break; + case AtomicRMWInst::Or: Code << getStore(rmwi, P, I->getType(), "(" + getJSName(I) + '|' + VS + ")", 0); break; + case AtomicRMWInst::Xor: Code << getStore(rmwi, P, I->getType(), "(" + getJSName(I) + '^' + VS + ")", 0); break; + case AtomicRMWInst::Max: + case AtomicRMWInst::Min: + case AtomicRMWInst::UMax: + case AtomicRMWInst::UMin: + case AtomicRMWInst::BAD_BINOP: llvm_unreachable("Bad atomic operation"); + } + } + break; + } + case Instruction::Fence: + if (EnablePthreads) Code << "(Atomics_add(HEAP32, 0, 0)|0) /* fence */"; + else Code << "/* fence */"; + break; + } + + if (const Instruction *Inst = dyn_cast(I)) { + Code << ';'; + // append debug info + emitDebugInfo(Code, Inst); + Code << '\n'; + } +} + +// Checks whether to use a condition variable. We do so for switches and for indirectbrs +static const Value *considerConditionVar(const Instruction *I) { + if (const IndirectBrInst *IB = dyn_cast(I)) { + return IB->getAddress(); + } + const SwitchInst *SI = dyn_cast(I); + if (!SI) return NULL; + // otherwise, we trust LLVM switches. if they were too big or sparse, the switch expansion pass should have fixed that + return SI->getCondition(); +} + +void JSWriter::addBlock(const BasicBlock *BB, Relooper& R, LLVMToRelooperMap& LLVMToRelooper) { + std::string Code; + raw_string_ostream CodeStream(Code); + for (BasicBlock::const_iterator II = BB->begin(), E = BB->end(); + II != E; ++II) { + auto I = &*II; + if (stripPointerCastsWithoutSideEffects(I) == I) { + CurrInstruction = I; + generateExpression(I, CodeStream); + } + } + CurrInstruction = nullptr; + CodeStream.flush(); + const Value* Condition = considerConditionVar(BB->getTerminator()); + Block *Curr = new Block(Code.c_str(), Condition ? getValueAsCastStr(Condition).c_str() : NULL); + LLVMToRelooper[BB] = Curr; + R.AddBlock(Curr); +} + +void JSWriter::printFunctionBody(const Function *F) { + assert(!F->isDeclaration()); + + // Prepare relooper + Relooper::MakeOutputBuffer(1024*1024); + Relooper R; + //if (!canReloop(F)) R.SetEmulate(true); + if (F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize) || + F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize)) { + R.SetMinSize(true); + } + R.SetAsmJSMode(1); + Block *Entry = NULL; + LLVMToRelooperMap LLVMToRelooper; + + // Create relooper blocks with their contents. TODO: We could optimize + // indirectbr by emitting indexed blocks first, so their indexes + // match up with the label index. + for (Function::const_iterator I = F->begin(), BE = F->end(); + I != BE; ++I) { + auto BI = &*I; + InvokeState = 0; // each basic block begins in state 0; the previous may not have cleared it, if e.g. 
it had a throw in the middle and the rest of it was decapitated + addBlock(BI, R, LLVMToRelooper); + if (!Entry) Entry = LLVMToRelooper[BI]; + } + assert(Entry); + + // Create branchings + for (Function::const_iterator I = F->begin(), BE = F->end(); + I != BE; ++I) { + auto BI = &*I; + const TerminatorInst *TI = BI->getTerminator(); + switch (TI->getOpcode()) { + default: { + report_fatal_error("invalid branch instr " + Twine(TI->getOpcodeName())); + break; + } + case Instruction::Br: { + const BranchInst* br = cast(TI); + if (br->getNumOperands() == 3) { + BasicBlock *S0 = br->getSuccessor(0); + BasicBlock *S1 = br->getSuccessor(1); + std::string P0 = getPhiCode(&*BI, S0); + std::string P1 = getPhiCode(&*BI, S1); + LLVMToRelooper[&*BI]->AddBranchTo(LLVMToRelooper[&*S0], getValueAsStr(TI->getOperand(0)).c_str(), P0.size() > 0 ? P0.c_str() : NULL); + LLVMToRelooper[&*BI]->AddBranchTo(LLVMToRelooper[&*S1], NULL, P1.size() > 0 ? P1.c_str() : NULL); + } else if (br->getNumOperands() == 1) { + BasicBlock *S = br->getSuccessor(0); + std::string P = getPhiCode(&*BI, S); + LLVMToRelooper[&*BI]->AddBranchTo(LLVMToRelooper[&*S], NULL, P.size() > 0 ? P.c_str() : NULL); + } else { + error("Branch with 2 operands?"); + } + break; + } + case Instruction::IndirectBr: { + const IndirectBrInst* br = cast(TI); + unsigned Num = br->getNumDestinations(); + std::set Seen; // sadly llvm allows the same block to appear multiple times + bool SetDefault = false; // pick the first and make it the default, llvm gives no reasonable default here + for (unsigned i = 0; i < Num; i++) { + const BasicBlock *S = br->getDestination(i); + if (Seen.find(S) != Seen.end()) continue; + Seen.insert(S); + std::string P = getPhiCode(&*BI, S); + std::string Target; + if (!SetDefault) { + SetDefault = true; + } else { + Target = "case " + utostr(getBlockAddress(F, S)) + ": "; + } + LLVMToRelooper[&*BI]->AddBranchTo(LLVMToRelooper[&*S], Target.size() > 0 ? Target.c_str() : NULL, P.size() > 0 ? P.c_str() : NULL); + } + break; + } + case Instruction::Switch: { + const SwitchInst* SI = cast(TI); + bool UseSwitch = !!considerConditionVar(SI); + BasicBlock *DD = SI->getDefaultDest(); + std::string P = getPhiCode(&*BI, DD); + LLVMToRelooper[&*BI]->AddBranchTo(LLVMToRelooper[&*DD], NULL, P.size() > 0 ? P.c_str() : NULL); + typedef std::map BlockCondMap; + BlockCondMap BlocksToConditions; + for (SwitchInst::ConstCaseIt i = SI->case_begin(), e = SI->case_end(); i != e; ++i) { + const BasicBlock *BB = i.getCaseSuccessor(); + APInt CaseValue = i.getCaseValue()->getValue(); + std::string Curr; + if (CaseValue.getBitWidth() == 64) { + Curr = emitI64Const(CaseValue); + } else { + Curr = CaseValue.toString(10, true); + } + std::string Condition; + if (UseSwitch) { + Condition = "case " + Curr + ": "; + } else { + Condition = "(" + getValueAsCastParenStr(SI->getCondition()) + " == " + Curr + ")"; + } + BlocksToConditions[BB] = Condition + (!UseSwitch && BlocksToConditions[BB].size() > 0 ? " | " : "") + BlocksToConditions[BB]; + } + std::set alreadyProcessed; + for (SwitchInst::ConstCaseIt i = SI->case_begin(), e = SI->case_end(); i != e; ++i) { + const BasicBlock *BB = i.getCaseSuccessor(); + if (!alreadyProcessed.insert(BB).second) continue; + if (BB == DD) continue; // ok to eliminate this, default dest will get there anyhow + std::string P = getPhiCode(&*BI, BB); + LLVMToRelooper[&*BI]->AddBranchTo(LLVMToRelooper[&*BB], BlocksToConditions[BB].c_str(), P.size() > 0 ? 
P.c_str() : NULL); + } + break; + } + case Instruction::Ret: + case Instruction::Unreachable: break; + } + } + + // Calculate relooping and print + R.Calculate(Entry); + R.Render(); + + // Emit local variables + UsedVars["sp"] = i32; + unsigned MaxAlignment = Allocas.getMaxAlignment(); + if (MaxAlignment > STACK_ALIGN) { + UsedVars["sp_a"] = i32; + } + UsedVars["label"] = i32; + if (!UsedVars.empty()) { + unsigned Count = 0; + for (VarMap::const_iterator VI = UsedVars.begin(); VI != UsedVars.end(); ++VI) { + if (Count == 20) { + Out << ";\n"; + Count = 0; + } + if (Count == 0) Out << " var "; + if (Count > 0) { + Out << ", "; + } + Count++; + Out << VI->first << " = "; + switch (VI->second->getTypeID()) { + default: + llvm_unreachable("unsupported variable initializer type"); + case Type::PointerTyID: + Out << "0"; + break; + case Type::IntegerTyID: + if (VI->second->getIntegerBitWidth() == 64) { + assert(OnlyWebAssembly); + Out << "i64()"; + } else { + Out << "0"; + } + break; + case Type::FloatTyID: + if (PreciseF32) { + Out << "Math_fround(0)"; + break; + } + // otherwise fall through to double + case Type::DoubleTyID: + Out << "+0"; + break; + case Type::VectorTyID: { + VectorType *VT = cast(VI->second); + Out << "SIMD_" << SIMDType(VT) << "(0"; + + // SIMD.js has only a fixed set of SIMD types, and no arbitrary vector sizes like or , so + // codegen rounds up to the smallest appropriate size where the LLVM vector fits. + unsigned simdJsNumElements = VT->getNumElements(); + if (simdJsNumElements <= 2 && VT->getElementType()->getPrimitiveSizeInBits() > 32) simdJsNumElements = 2; + else if (simdJsNumElements <= 4 && VT->getElementType()->getPrimitiveSizeInBits() <= 32) simdJsNumElements = 4; + else if (simdJsNumElements <= 8 && VT->getElementType()->getPrimitiveSizeInBits() <= 16) simdJsNumElements = 8; + else if (simdJsNumElements <= 16 && VT->getElementType()->getPrimitiveSizeInBits() <= 8) simdJsNumElements = 16; + + for (unsigned i = 1; i < simdJsNumElements; ++i) { + Out << ",0"; + } + Out << ')'; + break; + } + } + } + Out << ";"; + nl(Out); + } + + { + static bool Warned = false; + if (!Warned && OptLevel < 2 && UsedVars.size() > 2000) { + prettyWarning() << "emitted code will contain very large numbers of local variables, which is bad for performance (build to JS with -O2 or above to avoid this - make sure to do so both on source files, and during 'linking')\n"; + Warned = true; + } + } + + // Emit stack entry + Out << " " << getAdHocAssign("sp", i32) << "STACKTOP;"; + if (uint64_t FrameSize = Allocas.getFrameSize()) { + if (MaxAlignment > STACK_ALIGN) { + // We must align this entire stack frame to something higher than the default + Out << "\n "; + Out << "sp_a = STACKTOP = (STACKTOP + " << utostr(MaxAlignment-1) << ")&-" << utostr(MaxAlignment) << ";"; + } + Out << "\n "; + Out << getStackBump(FrameSize); + } + + // Emit extern loads, if we have any + if (Relocatable) { + if (FuncRelocatableExterns.size() > 0) { + for (auto& RE : FuncRelocatableExterns) { + std::string Temp = "t$" + RE; + std::string Call = "g$" + RE; + Out << Temp + " = " + Call + "() | 0;\n"; + } + FuncRelocatableExterns.clear(); + } + } + + // Emit (relooped) code + char *buffer = Relooper::GetOutputBuffer(); + nl(Out) << buffer; + + // Ensure a final return if necessary + Type *RT = F->getFunctionType()->getReturnType(); + if (!RT->isVoidTy()) { + char *LastCurly = strrchr(buffer, '}'); + if (!LastCurly) LastCurly = buffer; + char *FinalReturn = strstr(LastCurly, "return "); + if (!FinalReturn) { + Out << 
" return " << getParenCast(getConstant(UndefValue::get(RT)), RT, ASM_NONSPECIFIC) << ";\n"; + } + } + + if (Relocatable) { + if (!F->hasInternalLinkage()) { + Exports.push_back(getJSName(F)); + } + } +} + +void JSWriter::processConstants() { + // Ensure a name for each global + for (Module::global_iterator I = TheModule->global_begin(), + E = TheModule->global_end(); I != E; ++I) { + if (I->hasInitializer()) { + if (!I->hasName()) { + // ensure a unique name + static int id = 1; + std::string newName; + while (1) { + newName = std::string("glb_") + utostr(id); + if (!TheModule->getGlobalVariable("glb_" + utostr(id))) break; + id++; + assert(id != 0); + } + I->setName(Twine(newName)); + } + } + } + // First, calculate the address of each constant + for (Module::const_global_iterator I = TheModule->global_begin(), + E = TheModule->global_end(); I != E; ++I) { + if (I->hasInitializer()) { + parseConstant(I->getName().str(), I->getInitializer(), I->getAlignment(), true); + } + } + if (WebAssembly && SideModule && StackSize > 0) { + // allocate the stack + allocateZeroInitAddress("wasm-module-stack", STACK_ALIGN, StackSize); + } + // Calculate MaxGlobalAlign, adjust final paddings, and adjust GlobalBasePadding + assert(MaxGlobalAlign == 0); + for (auto& GI : GlobalDataMap) { + int Alignment = GI.first; + if (Alignment > MaxGlobalAlign) MaxGlobalAlign = Alignment; + ensureAligned(Alignment, &GlobalDataMap[Alignment]); + } + if (int(ZeroInitSizes.size()-1) > MaxGlobalAlign) MaxGlobalAlign = ZeroInitSizes.size()-1; // highest index in ZeroInitSizes is the largest zero-init alignment + if (!Relocatable && MaxGlobalAlign > 0) { + while ((GlobalBase+GlobalBasePadding) % MaxGlobalAlign != 0) GlobalBasePadding++; + } + while (AlignedHeapStarts.size() <= (unsigned)MaxGlobalAlign) AlignedHeapStarts.push_back(0); + while (ZeroInitStarts.size() <= (unsigned)MaxGlobalAlign) ZeroInitStarts.push_back(0); + for (auto& GI : GlobalDataMap) { + int Alignment = GI.first; + int Curr = GlobalBase + GlobalBasePadding; + for (auto& GI : GlobalDataMap) { // bigger alignments show up first, smaller later + if (GI.first > Alignment) { + Curr += GI.second.size(); + } + } + AlignedHeapStarts[Alignment] = Curr; + } + + unsigned ZeroInitStart = GlobalBase + GlobalBasePadding; + for (auto& GI : GlobalDataMap) { + ZeroInitStart += GI.second.size(); + } + if (!ZeroInitSizes.empty()) { + while (ZeroInitStart & (MaxGlobalAlign-1)) ZeroInitStart++; // fully align zero init area + for (int Alignment = ZeroInitSizes.size() - 1; Alignment > 0; Alignment--) { + if (ZeroInitSizes[Alignment] == 0) continue; + assert((ZeroInitStart & (Alignment-1)) == 0); + ZeroInitStarts[Alignment] = ZeroInitStart; + ZeroInitStart += ZeroInitSizes[Alignment]; + } + } + StaticBump = ZeroInitStart; // total size of all the data section + + // Second, allocate their contents + for (Module::const_global_iterator I = TheModule->global_begin(), + E = TheModule->global_end(); I != E; ++I) { + if (I->hasInitializer()) { + parseConstant(I->getName().str(), I->getInitializer(), I->getAlignment(), false); + } + } + if (Relocatable) { + for (Module::const_global_iterator II = TheModule->global_begin(), + E = TheModule->global_end(); II != E; ++II) { + auto I = &*II; + if (I->hasInitializer() && !I->hasInternalLinkage()) { + std::string Name = I->getName().str(); + if (GlobalAddresses.find(Name) != GlobalAddresses.end()) { + std::string JSName = getJSName(I).substr(1); + if (Name == JSName) { // don't export things that have weird internal names, that C can't dlsym 
anyhow + NamedGlobals[Name] = getGlobalAddress(Name); + } + } + } + } + } +} + +void JSWriter::printFunction(const Function *F) { + ValueNames.clear(); + + // Prepare and analyze function + + UsedVars.clear(); + UniqueNum = 0; + + // When optimizing, the regular optimizer (mem2reg, SROA, GVN, and others) + // will have already taken all the opportunities for nativization. + if (OptLevel == CodeGenOpt::None) + calculateNativizedVars(F); + + // Do alloca coloring at -O1 and higher. + Allocas.analyze(*F, *DL, OptLevel != CodeGenOpt::None); + + // Emit the function + + std::string Name = F->getName(); + sanitizeGlobal(Name); + Out << "function " << Name << "("; + for (Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end(); + AI != AE; ++AI) { + if (AI != F->arg_begin()) Out << ","; + Out << getJSName(&*AI); + } + Out << ") {"; + nl(Out); + for (Function::const_arg_iterator II = F->arg_begin(), AE = F->arg_end(); + II != AE; ++II) { + auto AI = &*II; + std::string name = getJSName(AI); + Out << " " << name << " = " << getCast(name, AI->getType(), ASM_NONSPECIFIC) << ";"; + nl(Out); + } + printFunctionBody(F); + Out << "}"; + nl(Out); + + Allocas.clear(); + StackBumped = false; +} + +void JSWriter::printModuleBody() { + processConstants(); + + if (Relocatable) { + for (Module::const_alias_iterator I = TheModule->alias_begin(), E = TheModule->alias_end(); + I != E; ++I) { + if (const GlobalAlias *GA = dyn_cast(I)) { + const Value* Target = resolveFully(GA); + Aliases[getJSName(GA)] = getJSName(Target); + } + } + } + + // Emit function bodies. + nl(Out) << "// EMSCRIPTEN_START_FUNCTIONS"; nl(Out); + for (Module::const_iterator II = TheModule->begin(), E = TheModule->end(); + II != E; ++II) { + auto I = &*II; + if (!I->isDeclaration()) printFunction(I); + } + // Emit postSets, split up into smaller functions to avoid one massive one that is slow to compile (more likely to occur in dynamic linking, as more postsets) + { + const int CHUNK = 100; + int i = 0; + int chunk = 0; + int num = PostSets.size(); + do { + if (chunk == 0) { + Out << "function runPostSets() {\n"; + } else { + Out << "function runPostSets" << chunk << "() {\n"; + } + if (Relocatable) Out << " var temp = 0;\n"; // need a temp var for relocation calls, for proper validation in heap growth + int j = i + CHUNK; + if (j > num) j = num; + while (i < j) { + Out << PostSets[i] << "\n"; + i++; + } + // call the next chunk, if there is one + chunk++; + if (i < num) { + Out << " runPostSets" << chunk << "();\n"; + } + Out << "}\n"; + } while (i < num); + PostSets.clear(); + if (WebAssembly && SideModule) { + // emit the init method for a wasm side module, + // which runs postsets and global inits + // note that we can't use the wasm start mechanism, as the JS side is + // not yet ready - imagine that in the start method we call out to JS, + // then try to call back in, but we haven't yet captured the exports + // from the wasm module to their places on the JS Module object etc. 
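// --- Illustrative sketch (not part of the patch): the runPostSets emission above
// splits the post-set statements into functions of at most CHUNK entries, each
// tail-calling the next, so no single emitted JS function becomes huge and slow
// to parse and compile. The same pattern in isolation, with hypothetical names
// (emitChunkedRunners, Stmts) and a plain std::string instead of raw_ostream:
#include <algorithm>
#include <string>
#include <vector>

static std::string emitChunkedRunners(const std::vector<std::string> &Stmts,
                                      size_t ChunkSize = 100) {
  std::string Out;
  size_t i = 0, Chunk = 0;
  do {
    Out += Chunk == 0 ? "function runPostSets() {\n"
                      : "function runPostSets" + std::to_string(Chunk) + "() {\n";
    size_t End = std::min(Stmts.size(), i + ChunkSize);
    for (; i < End; ++i)
      Out += "  " + Stmts[i] + "\n";
    ++Chunk;
    if (i < Stmts.size())                       // chain to the next chunk
      Out += "  runPostSets" + std::to_string(Chunk) + "();\n";
    Out += "}\n";
  } while (i < Stmts.size());
  return Out;
}
// --- end sketch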
+ Out << "function __post_instantiate() {\n"; + if (StackSize > 0) { + Out << " STACKTOP = " << relocateGlobal(utostr(getGlobalAddress("wasm-module-stack"))) << ";\n"; + Out << " STACK_MAX = STACKTOP + " << StackSize << " | 0;\n"; + } + Out << " runPostSets();\n"; + for (auto& init : GlobalInitializers) { + Out << " " << init << "();\n"; + } + GlobalInitializers.clear(); + Out << "}\n"; + Exports.push_back("__post_instantiate"); + } + if (DeclaresNeedingTypeDeclarations.size() > 0) { + Out << "function __emscripten_dceable_type_decls() {\n"; + for (auto& Decl : DeclaresNeedingTypeDeclarations) { + std::string Call = getJSName(Decl) + "("; + bool First = true; + auto* FT = Decl->getFunctionType(); + for (auto AI = FT->param_begin(), AE = FT->param_end(); AI != AE; ++AI) { + if (First) { + First = false; + } else { + Call += ", "; + } + Call += getUndefValue(*AI); + } + Call += ")"; + Type *RT = FT->getReturnType(); + if (!RT->isVoidTy()) { + Call = getCast(Call, RT); + } + Out << " " << Call << ";\n"; + } + Out << "}\n"; + } + for (auto& Name : ExtraFunctions) { + Out << Name << '\n'; + } + } + Out << "// EMSCRIPTEN_END_FUNCTIONS\n\n"; + + if (EnablePthreads) { + Out << "if (!ENVIRONMENT_IS_PTHREAD) {\n"; + } + Out << "/* memory initializer */ allocate(["; + if (MaxGlobalAlign > 0) { + bool First = true; + for (int i = 0; i < GlobalBasePadding; i++) { + if (First) { + First = false; + } else { + Out << ","; + } + Out << "0"; + } + int Curr = MaxGlobalAlign; + while (Curr > 0) { + if (GlobalDataMap.find(Curr) == GlobalDataMap.end()) { + Curr = Curr/2; + continue; + } + HeapData* GlobalData = &GlobalDataMap[Curr]; + if (GlobalData->size() > 0) { + if (First) { + First = false; + } else { + Out << ","; + } + printCommaSeparated(*GlobalData); + } + Curr = Curr/2; + } + } + Out << "], \"i8\", ALLOC_NONE, Runtime.GLOBAL_BASE);\n"; + if (EnablePthreads) { + Out << "}\n"; + } + // Emit metadata for emcc driver + Out << "\n\n// EMSCRIPTEN_METADATA\n"; + Out << "{\n"; + + Out << "\"staticBump\": " << StaticBump << ",\n"; + + Out << "\"declares\": ["; + bool first = true; + for (Module::const_iterator I = TheModule->begin(), E = TheModule->end(); + I != E; ++I) { + if (I->isDeclaration() && !I->use_empty()) { + // Ignore intrinsics that are always no-ops or expanded into other code + // which doesn't require the intrinsic function itself to be declared. + if (I->isIntrinsic()) { + switch (I->getIntrinsicID()) { + default: break; + case Intrinsic::dbg_declare: + case Intrinsic::dbg_value: + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: + case Intrinsic::invariant_start: + case Intrinsic::invariant_end: + case Intrinsic::prefetch: + case Intrinsic::memcpy: + case Intrinsic::memset: + case Intrinsic::memmove: + case Intrinsic::expect: + case Intrinsic::flt_rounds: + continue; + } + } + // Do not report methods implemented in a call handler, unless + // they are accessed by a function pointer (in which case, we + // need the expected name to be available TODO: optimize + // that out, call handlers can declare their "function table + // name"). 
+ std::string fullName = std::string("_") + I->getName().str(); + if (CallHandlers.count(fullName) > 0) { + if (IndexedFunctions.find(fullName) == IndexedFunctions.end()) { + continue; + } + } + + if (first) { + first = false; + } else { + Out << ", "; + } + Out << "\"" << I->getName() << "\""; + } + } + for (NameSet::const_iterator I = Declares.begin(), E = Declares.end(); + I != E; ++I) { + if (first) { + first = false; + } else { + Out << ", "; + } + Out << "\"" << *I << "\""; + } + Out << "],"; + + Out << "\"redirects\": {"; + first = true; + for (StringMap::const_iterator I = Redirects.begin(), E = Redirects.end(); + I != E; ++I) { + if (first) { + first = false; + } else { + Out << ", "; + } + Out << "\"_" << I->first << "\": \"" << I->second << "\""; + } + Out << "},"; + + Out << "\"externs\": ["; + first = true; + for (NameSet::const_iterator I = Externals.begin(), E = Externals.end(); + I != E; ++I) { + if (first) { + first = false; + } else { + Out << ", "; + } + Out << "\"" << *I << "\""; + } + Out << "],"; + + Out << "\"implementedFunctions\": ["; + first = true; + for (Module::const_iterator I = TheModule->begin(), E = TheModule->end(); + I != E; ++I) { + if (!I->isDeclaration()) { + if (first) { + first = false; + } else { + Out << ", "; + } + std::string name = I->getName(); + sanitizeGlobal(name); + Out << "\"" << name << '"'; + } + } + Out << "],"; + + Out << "\"tables\": {"; + unsigned Num = FunctionTables.size(); + for (FunctionTableMap::iterator I = FunctionTables.begin(), E = FunctionTables.end(); I != E; ++I) { + Out << " \"" << I->first << "\": \"var FUNCTION_TABLE_" << I->first << " = ["; + // wasm emulated function pointers use just one table + if (!(WebAssembly && EmulatedFunctionPointers && I->first != "X")) { + FunctionTable &Table = I->second; + // ensure power of two + unsigned Size = 1; + while (Size < Table.size()) Size <<= 1; + while (Table.size() < Size) Table.push_back("0"); + for (unsigned i = 0; i < Table.size(); i++) { + Out << Table[i]; + if (i < Table.size()-1) Out << ","; + } + } + Out << "];\""; + if (--Num > 0) Out << ","; + Out << "\n"; + } + Out << "},"; + + Out << "\"initializers\": ["; + first = true; + for (unsigned i = 0; i < GlobalInitializers.size(); i++) { + if (first) { + first = false; + } else { + Out << ", "; + } + Out << "\"" << GlobalInitializers[i] << "\""; + } + Out << "],"; + + Out << "\"exports\": ["; + first = true; + for (unsigned i = 0; i < Exports.size(); i++) { + if (first) { + first = false; + } else { + Out << ", "; + } + Out << "\"" << Exports[i] << "\""; + } + Out << "],"; + + Out << "\"aliases\": {"; + first = true; + for (StringMap::const_iterator I = Aliases.begin(), E = Aliases.end(); + I != E; ++I) { + if (first) { + first = false; + } else { + Out << ", "; + } + Out << "\"" << I->first << "\": \"" << I->second << "\""; + } + Out << "},"; + + Out << "\"cantValidate\": \"" << CantValidate << "\","; + + Out << "\"simd\": " << (UsesSIMDUint8x16 || UsesSIMDInt8x16 || UsesSIMDUint16x8 || UsesSIMDInt16x8 || UsesSIMDUint32x4 || UsesSIMDInt32x4 || UsesSIMDFloat32x4 || UsesSIMDFloat64x2 ? "1" : "0") << ","; + Out << "\"simdUint8x16\": " << (UsesSIMDUint8x16 ? "1" : "0") << ","; + Out << "\"simdInt8x16\": " << (UsesSIMDInt8x16 ? "1" : "0") << ","; + Out << "\"simdUint16x8\": " << (UsesSIMDUint16x8 ? "1" : "0") << ","; + Out << "\"simdInt16x8\": " << (UsesSIMDInt16x8 ? "1" : "0") << ","; + Out << "\"simdUint32x4\": " << (UsesSIMDUint32x4 ? "1" : "0") << ","; + Out << "\"simdInt32x4\": " << (UsesSIMDInt32x4 ? 
"1" : "0") << ","; + Out << "\"simdFloat32x4\": " << (UsesSIMDFloat32x4 ? "1" : "0") << ","; + Out << "\"simdFloat64x2\": " << (UsesSIMDFloat64x2 ? "1" : "0") << ","; + Out << "\"simdBool8x16\": " << (UsesSIMDBool8x16 ? "1" : "0") << ","; + Out << "\"simdBool16x8\": " << (UsesSIMDBool16x8 ? "1" : "0") << ","; + Out << "\"simdBool32x4\": " << (UsesSIMDBool32x4 ? "1" : "0") << ","; + Out << "\"simdBool64x2\": " << (UsesSIMDBool64x2 ? "1" : "0") << ","; + + Out << "\"maxGlobalAlign\": " << utostr(MaxGlobalAlign) << ","; + + Out << "\"namedGlobals\": {"; + first = true; + for (NameIntMap::const_iterator I = NamedGlobals.begin(), E = NamedGlobals.end(); I != E; ++I) { + if (first) { + first = false; + } else { + Out << ", "; + } + Out << "\"" << I->first << "\": \"" << utostr(I->second) << "\""; + } + Out << "},"; + + Out << "\"asmConsts\": {"; + first = true; + for (auto& I : AsmConsts) { + if (first) { + first = false; + } else { + Out << ", "; + } + Out << "\"" << utostr(I.second.Id) << "\": [\"" << I.first.c_str() << "\", ["; + auto& Sigs = I.second.Sigs; + bool innerFirst = true; + for (auto& Sig : Sigs) { + if (innerFirst) { + innerFirst = false; + } else { + Out << ", "; + } + Out << "\"" << Sig << "\""; + } + Out << "]]"; + } + Out << "}"; + + if (EnableCyberDWARF) { + Out << ",\"cyberdwarf_data\": {\n"; + Out << "\"types\": {"; + + // Remove trailing comma + std::string TDD = cyberDWARFData.TypeDebugData.str().substr(0, cyberDWARFData.TypeDebugData.str().length() - 1); + // One Windows, paths can have \ separators + std::replace(TDD.begin(), TDD.end(), '\\', '/'); + Out << TDD << "}, \"type_name_map\": {"; + + std::string TNM = cyberDWARFData.TypeNameMap.str().substr(0, cyberDWARFData.TypeNameMap.str().length() - 1); + std::replace(TNM.begin(), TNM.end(), '\\', '/'); + Out << TNM << "}, \"functions\": {"; + + std::string FM = cyberDWARFData.FunctionMembers.str().substr(0, cyberDWARFData.FunctionMembers.str().length() - 1); + std::replace(FM.begin(), FM.end(), '\\', '/'); + Out << FM << "}, \"vtable_offsets\": {"; + bool first_elem = true; + for (auto VTO: cyberDWARFData.VtableOffsets) { + if (!first_elem) { + Out << ","; + } + Out << "\"" << VTO.first << "\":\"" << VTO.second << "\""; + first_elem = false; + } + Out << "}\n}"; + } + + Out << "\n}\n"; +} + +void JSWriter::parseConstant(const std::string& name, const Constant* CV, int Alignment, bool calculate) { + if (isa(CV)) + return; + if (Alignment == 0) Alignment = DEFAULT_MEM_ALIGN; + //errs() << "parsing constant " << name << " : " << Alignment << "\n"; + // TODO: we repeat some work in both calculate and emit phases here + // FIXME: use the proper optimal alignments + if (const ConstantDataSequential *CDS = + dyn_cast(CV)) { + assert(CDS->isString()); + if (calculate) { + HeapData *GlobalData = allocateAddress(name, Alignment); + StringRef Str = CDS->getAsString(); + ensureAligned(Alignment, GlobalData); + for (unsigned int i = 0; i < Str.size(); i++) { + GlobalData->push_back(Str.data()[i]); + } + } + } else if (const ConstantFP *CFP = dyn_cast(CV)) { + APFloat APF = CFP->getValueAPF(); + if (CFP->getType() == Type::getFloatTy(CFP->getContext())) { + if (calculate) { + HeapData *GlobalData = allocateAddress(name, Alignment); + union flt { float f; unsigned char b[sizeof(float)]; } flt; + flt.f = APF.convertToFloat(); + ensureAligned(Alignment, GlobalData); + for (unsigned i = 0; i < sizeof(float); ++i) { + GlobalData->push_back(flt.b[i]); + } + } + } else if (CFP->getType() == Type::getDoubleTy(CFP->getContext())) { + if 
(calculate) { + HeapData *GlobalData = allocateAddress(name, Alignment); + union dbl { double d; unsigned char b[sizeof(double)]; } dbl; + dbl.d = APF.convertToDouble(); + ensureAligned(Alignment, GlobalData); + for (unsigned i = 0; i < sizeof(double); ++i) { + GlobalData->push_back(dbl.b[i]); + } + } + } else { + assert(false && "Unsupported floating-point type"); + } + } else if (const ConstantInt *CI = dyn_cast(CV)) { + if (calculate) { + union { uint64_t i; unsigned char b[sizeof(uint64_t)]; } integer; + integer.i = *CI->getValue().getRawData(); + unsigned BitWidth = 64; // CI->getValue().getBitWidth(); + assert(BitWidth == 32 || BitWidth == 64); + HeapData *GlobalData = allocateAddress(name, Alignment); + // assuming compiler is little endian + ensureAligned(Alignment, GlobalData); + for (unsigned i = 0; i < BitWidth / 8; ++i) { + GlobalData->push_back(integer.b[i]); + } + } + } else if (isa(CV)) { + assert(false && "Unlowered ConstantPointerNull"); + } else if (isa(CV)) { + if (calculate) { + unsigned Bytes = DL->getTypeStoreSize(CV->getType()); + allocateZeroInitAddress(name, Alignment, Bytes); + } + } else if (const ConstantArray *CA = dyn_cast(CV)) { + if (calculate) { + for (Constant::const_user_iterator UI = CV->user_begin(), UE = CV->user_end(); UI != UE; ++UI) { + if ((*UI)->getName() == "llvm.used") { + // export the kept-alives + for (unsigned i = 0; i < CA->getNumOperands(); i++) { + const Constant *C = CA->getOperand(i); + if (const ConstantExpr *CE = dyn_cast(C)) { + C = CE->getOperand(0); // ignore bitcasts + } + if (isa(C)) Exports.push_back(getJSName(C)); + } + } else if ((*UI)->getName() == "llvm.global.annotations") { + // llvm.global.annotations can be ignored. + } else { + llvm_unreachable("Unexpected constant array"); + } + break; // we assume one use here + } + } + } else if (const ConstantStruct *CS = dyn_cast(CV)) { + if (name == "__init_array_start") { + // this is the global static initializer + if (calculate) { + unsigned Num = CS->getNumOperands(); + for (unsigned i = 0; i < Num; i++) { + const Value* C = CS->getOperand(i); + if (const ConstantExpr *CE = dyn_cast(C)) { + C = CE->getOperand(0); // ignore bitcasts + } + GlobalInitializers.push_back(getJSName(C)); + } + } + } else if (calculate) { + HeapData *GlobalData = allocateAddress(name, Alignment); + unsigned Bytes = DL->getTypeStoreSize(CV->getType()); + ensureAligned(Alignment, GlobalData); + for (unsigned i = 0; i < Bytes; ++i) { + GlobalData->push_back(0); + } + } else { + // Per the PNaCl abi, this must be a packed struct of a very specific type + // https://chromium.googlesource.com/native_client/pnacl-llvm/+/7287c45c13dc887cebe3db6abfa2f1080186bb97/lib/Transforms/NaCl/FlattenGlobals.cpp + assert(CS->getType()->isPacked()); + // This is the only constant where we cannot just emit everything during the first phase, 'calculate', as we may refer to other globals + unsigned Num = CS->getNumOperands(); + unsigned Offset = getRelativeGlobalAddress(name); + unsigned OffsetStart = Offset; + unsigned Absolute = getGlobalAddress(name); + + // VTable for the object + if (name.compare(0, 4, "_ZTV") == 0) { + cyberDWARFData.VtableOffsets[Absolute] = name; + } + + for (unsigned i = 0; i < Num; i++) { + const Constant* C = CS->getOperand(i); + if (isa(C)) { + unsigned Bytes = DL->getTypeStoreSize(C->getType()); + Offset += Bytes; // zeros, so just skip + } else if (const ConstantExpr *CE = dyn_cast(C)) { + const Value *V = CE->getOperand(0); + unsigned Data = 0; + if (CE->getOpcode() == Instruction::PtrToInt) { 
+ Data = getConstAsOffset(V, Absolute + Offset - OffsetStart); + } else if (CE->getOpcode() == Instruction::Add) { + V = cast(V)->getOperand(0); + Data = getConstAsOffset(V, Absolute + Offset - OffsetStart); + ConstantInt *CI = cast(CE->getOperand(1)); + Data += *CI->getValue().getRawData(); + } else { + CE->dump(); + llvm_unreachable("Unexpected constant expr kind"); + } + union { unsigned i; unsigned char b[sizeof(unsigned)]; } integer; + integer.i = Data; + HeapData& GlobalData = GlobalDataMap[Alignment]; + assert(Offset+4 <= GlobalData.size()); + ensureAligned(Alignment, GlobalData); + for (unsigned i = 0; i < 4; ++i) { + GlobalData[Offset++] = integer.b[i]; + } + } else if (const ConstantDataSequential *CDS = dyn_cast(C)) { + assert(CDS->isString()); + StringRef Str = CDS->getAsString(); + HeapData& GlobalData = GlobalDataMap[Alignment]; + assert(Offset+Str.size() <= GlobalData.size()); + ensureAligned(Alignment, GlobalData); + for (unsigned int i = 0; i < Str.size(); i++) { + GlobalData[Offset++] = Str.data()[i]; + } + } else { + C->dump(); + llvm_unreachable("Unexpected constant kind"); + } + } + } + } else if (isa(CV)) { + assert(false && "Unlowered ConstantVector"); + } else if (isa(CV)) { + assert(false && "Unlowered BlockAddress"); + } else if (const ConstantExpr *CE = dyn_cast(CV)) { + if (name == "__init_array_start") { + // this is the global static initializer + if (calculate) { + const Value *V = CE->getOperand(0); + GlobalInitializers.push_back(getJSName(V)); + // is the func + } + } else if (name == "__fini_array_start") { + // nothing to do + } else { + // a global equal to a ptrtoint of some function, so a 32-bit integer for us + if (calculate) { + HeapData *GlobalData = allocateAddress(name, Alignment); + ensureAligned(Alignment, GlobalData); + for (unsigned i = 0; i < 4; ++i) { + GlobalData->push_back(0); + } + } else { + unsigned Data = 0; + + // Deconstruct lowered getelementptrs. + if (CE->getOpcode() == Instruction::Add) { + Data = cast(CE->getOperand(1))->getZExtValue(); + CE = cast(CE->getOperand(0)); + } + const Value *V = CE; + if (CE->getOpcode() == Instruction::PtrToInt) { + V = CE->getOperand(0); + } + + // Deconstruct getelementptrs. 
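// --- Illustrative sketch (not part of the patch): the 32-bit writes in
// parseConstant serialize values into the byte-wise HeapData through a
// union { unsigned i; unsigned char b[4]; }, relying on the host being
// little-endian (as the "assuming compiler is little endian" comment above
// notes). An endian-explicit equivalent of that single step, with a
// hypothetical helper name (pushLittleEndian32) and std::vector standing in
// for the backend's HeapData container:
#include <cstdint>
#include <vector>

static void pushLittleEndian32(std::vector<unsigned char> &Data, uint32_t V) {
  for (unsigned Shift = 0; Shift < 32; Shift += 8)
    Data.push_back(static_cast<unsigned char>((V >> Shift) & 0xff));
}
// --- end sketch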
+ int64_t BaseOffset; + V = GetPointerBaseWithConstantOffset(V, BaseOffset, *DL); + Data += (uint64_t)BaseOffset; + + Data += getConstAsOffset(V, getGlobalAddress(name)); + union { unsigned i; unsigned char b[sizeof(unsigned)]; } integer; + integer.i = Data; + unsigned Offset = getRelativeGlobalAddress(name); + HeapData& GlobalData = GlobalDataMap[Alignment]; + assert(Offset+4 <= GlobalData.size()); + ensureAligned(Alignment, GlobalData); + for (unsigned i = 0; i < 4; ++i) { + GlobalData[Offset++] = integer.b[i]; + } + } + } + } else if (isa(CV)) { + assert(false && "Unlowered UndefValue"); + } else { + CV->dump(); + assert(false && "Unsupported constant kind"); + } +} + +std::string JSWriter::generateDebugRecordForVar(Metadata *MD) { + // void shows up as nullptr for Metadata + if (!MD) { + cyberDWARFData.IndexedMetadata[0] = 0; + return "\"0\""; + } + if (cyberDWARFData.IndexedMetadata.find(MD) == cyberDWARFData.IndexedMetadata.end()) { + cyberDWARFData.IndexedMetadata[MD] = cyberDWARFData.MetadataNum++; + } + else { + return "\"" + utostr(cyberDWARFData.IndexedMetadata[MD]) + "\""; + } + + std::string VarIDForJSON = "\"" + utostr(cyberDWARFData.IndexedMetadata[MD]) + "\""; + + if (DIBasicType *BT = dyn_cast(MD)) { + cyberDWARFData.TypeDebugData << VarIDForJSON << ":" + << "[0,\"" + << BT->getName().str() + << "\"," + << BT->getEncoding() + << "," + << BT->getOffsetInBits() + << "," + << BT->getSizeInBits() + << "],"; + } + else if (MDString *MDS = dyn_cast(MD)) { + cyberDWARFData.TypeDebugData << VarIDForJSON << ":" + << "[10,\"" << MDS->getString().str() << "\"],"; + } + else if (DIDerivedType *DT = dyn_cast(MD)) { + if (DT->getRawBaseType() && isa(DT->getRawBaseType())) { + auto MDS = cast(DT->getRawBaseType()); + cyberDWARFData.TypeDebugData << VarIDForJSON << ":" + << "[1, \"" + << DT->getName().str() + << "\"," + << DT->getTag() + << ",\"" + << MDS->getString().str() + << "\"," + << DT->getOffsetInBits() + << "," + << DT->getSizeInBits() << "],"; + } + else { + if (cyberDWARFData.IndexedMetadata.find(DT->getRawBaseType()) == cyberDWARFData.IndexedMetadata.end()) { + generateDebugRecordForVar(DT->getRawBaseType()); + } + + cyberDWARFData.TypeDebugData << VarIDForJSON << ":" + << "[1, \"" + << DT->getName().str() + << "\"," + << DT->getTag() + << "," + << cyberDWARFData.IndexedMetadata[DT->getRawBaseType()] + << "," + << DT->getOffsetInBits() + << "," + << DT->getSizeInBits() << "],"; + } + } + else if (DICompositeType *CT = dyn_cast(MD)) { + + if (CT->getIdentifier().str() != "") { + if (CT->isForwardDecl()) { + cyberDWARFData.TypeNameMap << "\"" << "fd_" << CT->getIdentifier().str() << "\":" << VarIDForJSON << ","; + } else { + cyberDWARFData.TypeNameMap << "\"" << CT->getIdentifier().str() << "\":" << VarIDForJSON << ","; + } + } + + // Pull in debug info for any used elements before emitting ours + for (auto e : CT->getElements()) { + generateDebugRecordForVar(e); + } + + // Build our base type, if we have one (arrays) + if (cyberDWARFData.IndexedMetadata.find(CT->getRawBaseType()) == cyberDWARFData.IndexedMetadata.end()) { + generateDebugRecordForVar(CT->getRawBaseType()); + } + + cyberDWARFData.TypeDebugData << VarIDForJSON << ":" + << "[2, \"" + << CT->getName().str() + << "\"," + << CT->getTag() + << "," + << cyberDWARFData.IndexedMetadata[CT->getRawBaseType()] + << "," + << CT->getOffsetInBits() + << "," + << CT->getSizeInBits() + << ",\"" + << CT->getIdentifier().str() + << "\",["; + + bool first_elem = true; + for (auto e : CT->getElements()) { + auto *vx = dyn_cast(e); + 
if ((vx && vx->isStaticMember()) || isa(e)) + continue; + if (!first_elem) { + cyberDWARFData.TypeDebugData << ","; + } + first_elem = false; + cyberDWARFData.TypeDebugData << generateDebugRecordForVar(e); + } + + cyberDWARFData.TypeDebugData << "]],"; + + } + else if (DISubroutineType *ST = dyn_cast(MD)) { + cyberDWARFData.TypeDebugData << VarIDForJSON << ":" + << "[3," << ST->getTag() << "],"; + } + else if (DISubrange *SR = dyn_cast(MD)) { + cyberDWARFData.TypeDebugData << VarIDForJSON << ":" + << "[4," << SR->getCount() << "],"; + } + else if (DISubprogram *SP = dyn_cast(MD)) { + cyberDWARFData.TypeDebugData << VarIDForJSON << ":" + << "[5,\"" << SP->getName().str() << "\"],"; + } + else if (DIEnumerator *E = dyn_cast(MD)) { + cyberDWARFData.TypeDebugData << VarIDForJSON << ":" + << "[6,\"" << E->getName().str() << "\"," << E->getValue() << "],"; + } + else { + //MD->dump(); + } + + return VarIDForJSON; +} + +void JSWriter::buildCyberDWARFData() { + for (auto &F : TheModule->functions()) { + auto MD = F.getMetadata("dbg"); + if (MD) { + auto *SP = cast(MD); + + if (SP->getLinkageName() != "") { + cyberDWARFData.FunctionMembers << "\"" << SP->getLinkageName().str() << "\":{"; + } + else { + cyberDWARFData.FunctionMembers << "\"" << SP->getName().str() << "\":{"; + } + bool first_elem = true; + for (auto V : SP->getVariables()) { + auto RT = V->getRawType(); + if (!first_elem) { + cyberDWARFData.FunctionMembers << ","; + } + first_elem = false; + cyberDWARFData.FunctionMembers << "\"" << V->getName().str() << "\":" << generateDebugRecordForVar(RT); + } + cyberDWARFData.FunctionMembers << "},"; + } + } + + // Need to dump any types under each compilation unit's retained types + auto CUs = TheModule->getNamedMetadata("llvm.dbg.cu"); + + for (auto CUi : CUs->operands()) { + auto CU = cast(CUi); + auto RT = CU->getRetainedTypes(); + for (auto RTi : RT) { + generateDebugRecordForVar(RTi); + } + } +} + +// nativization + +void JSWriter::calculateNativizedVars(const Function *F) { + NativizedVars.clear(); + + for (Function::const_iterator I = F->begin(), BE = F->end(); I != BE; ++I) { + auto BI = &*I; + for (BasicBlock::const_iterator II = BI->begin(), E = BI->end(); II != E; ++II) { + const Instruction *I = &*II; + if (const AllocaInst *AI = dyn_cast(I)) { + if (AI->getAllocatedType()->isVectorTy()) continue; // we do not nativize vectors, we rely on the LLVM optimizer to avoid load/stores on them + if (AI->getAllocatedType()->isAggregateType()) continue; // we do not nativize aggregates either + // this is on the stack. 
if its address is never used nor escaped, we can nativize it + bool Fail = false; + for (Instruction::const_user_iterator UI = I->user_begin(), UE = I->user_end(); UI != UE && !Fail; ++UI) { + const Instruction *U = dyn_cast(*UI); + if (!U) { Fail = true; break; } // not an instruction, not cool + switch (U->getOpcode()) { + case Instruction::Load: break; // load is cool + case Instruction::Store: { + if (U->getOperand(0) == I) Fail = true; // store *of* it is not cool; store *to* it is fine + break; + } + default: { Fail = true; break; } // anything that is "not" "cool", is "not cool" + } + } + if (!Fail) NativizedVars.insert(I); + } + } + } +} + +// special analyses + +bool JSWriter::canReloop(const Function *F) { + return true; +} + +// main entry + +void JSWriter::printCommaSeparated(const HeapData data) { + for (HeapData::const_iterator I = data.begin(); + I != data.end(); ++I) { + if (I != data.begin()) { + Out << ","; + } + Out << (int)*I; + } +} + +void JSWriter::printProgram(const std::string& fname, + const std::string& mName) { + printModule(fname,mName); +} + +void JSWriter::printModule(const std::string& fname, + const std::string& mName) { + printModuleBody(); +} + +bool JSWriter::runOnModule(Module &M) { + TheModule = &M; + DL = &M.getDataLayout(); + i32 = Type::getInt32Ty(M.getContext()); + + // sanity checks on options + assert(Relocatable ? GlobalBase == 0 : true); + assert(Relocatable ? EmulatedFunctionPointers : true); + + // Build debug data first, so that inline metadata can reuse the indicies + if (EnableCyberDWARF) + buildCyberDWARFData(); + + setupCallHandlers(); + + printProgram("", ""); + + return false; +} + +char JSWriter::ID = 0; + +class CheckTriple : public ModulePass { +public: + static char ID; + CheckTriple() : ModulePass(ID) {} + bool runOnModule(Module &M) override { + if (M.getTargetTriple() != "asmjs-unknown-emscripten") { + prettyWarning() << "incorrect target triple '" << M.getTargetTriple() << "' (did you use emcc/em++ on all source files and not clang directly?)\n"; + } + return false; + } +}; + +char CheckTriple::ID; + +Pass *createCheckTriplePass() { + return new CheckTriple(); +} + +//===----------------------------------------------------------------------===// +// External Interface declaration +//===----------------------------------------------------------------------===// + +bool JSTargetMachine::addPassesToEmitFile( + PassManagerBase &PM, raw_pwrite_stream &Out, CodeGenFileType FileType, + bool DisableVerify, AnalysisID StartBefore, + AnalysisID StartAfter, AnalysisID StopBefore, AnalysisID StopAfter, + MachineFunctionInitializer *MFInitializer) { + assert(FileType == TargetMachine::CGFT_AssemblyFile); + + PM.add(createCheckTriplePass()); + + if (NoExitRuntime) { + PM.add(createNoExitRuntimePass()); + // removing atexits opens up globalopt/globaldce opportunities + PM.add(createGlobalOptimizerPass()); + PM.add(createGlobalDCEPass()); + } + + // PNaCl legalization + { + PM.add(createStripDanglingDISubprogramsPass()); + if (EnableSjLjEH) { + // This comes before ExpandTls because it introduces references to + // a TLS variable, __pnacl_eh_stack. This comes before + // InternalizePass because it assumes various variables (including + // __pnacl_eh_stack) have not been internalized yet. + PM.add(createPNaClSjLjEHPass()); + } else if (EnableEmCxxExceptions) { + PM.add(createLowerEmExceptionsPass()); + } else { + // LowerInvoke prevents use of C++ exception handling by removing + // references to BasicBlocks which handle exceptions. 
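// --- Illustrative sketch (not part of the patch): calculateNativizedVars above
// marks an alloca as nativizable (emittable as a plain JS local rather than a
// stack slot) only when every user is a load from it or a store *to* it, so its
// address never escapes; vector and aggregate allocas are skipped up front.
// The core escape check in isolation, with a hypothetical name
// (addressNeverEscapes):
#include "llvm/IR/Instructions.h"

static bool addressNeverEscapes(const llvm::AllocaInst *AI) {
  for (const llvm::User *U : AI->users()) {
    if (llvm::isa<llvm::LoadInst>(U))
      continue;                                   // reads of the slot are fine
    if (const auto *SI = llvm::dyn_cast<llvm::StoreInst>(U))
      if (SI->getPointerOperand() == AI)
        continue;                                 // stores *to* the slot are fine
    return false;               // any other use (or storing the address) escapes
  }
  return true;
}
// --- end sketch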
+ PM.add(createLowerInvokePass()); + } + // Run CFG simplification passes for a few reasons: + // (1) Landingpad blocks can be made unreachable by LowerInvoke + // when EnableSjLjEH is not enabled, so clean those up to ensure + // there are no landingpad instructions in the stable ABI. + // (2) Unreachable blocks can have strange properties like self-referencing + // instructions, so remove them. + PM.add(createCFGSimplificationPass()); + + PM.add(createLowerEmSetjmpPass()); + + // Expand out computed gotos (indirectbr and blockaddresses) into switches. + PM.add(createExpandIndirectBrPass()); + + // ExpandStructRegs must be run after ExpandVarArgs so that struct-typed + // "va_arg" instructions have been removed. + PM.add(createExpandVarArgsPass()); + + // Convert struct reg function params to struct* byval. This needs to be + // before ExpandStructRegs so it has a chance to rewrite aggregates from + // function arguments and returns into something ExpandStructRegs can expand. + PM.add(createSimplifyStructRegSignaturesPass()); + + // TODO(mtrofin) Remove the following and only run it as a post-opt pass once + // the following bug is fixed. + // https://code.google.com/p/nativeclient/issues/detail?id=3857 + PM.add(createExpandStructRegsPass()); + + PM.add(createExpandCtorsPass()); + + if (EnableEmAsyncify) + PM.add(createLowerEmAsyncifyPass()); + + // ExpandStructRegs must be run after ExpandArithWithOverflow to expand out + // the insertvalue instructions that ExpandArithWithOverflow introduces. + PM.add(createExpandArithWithOverflowPass()); + + // We place ExpandByVal after optimization passes because some byval + // arguments can be expanded away by the ArgPromotion pass. Leaving + // in "byval" during optimization also allows some dead stores to be + // eliminated, because "byval" is a stronger constraint than what + // ExpandByVal expands it to. + PM.add(createExpandByValPass()); + + PM.add(createPromoteI1OpsPass()); + + // We should not place arbitrary passes after ExpandConstantExpr + // because they might reintroduce ConstantExprs. + PM.add(createExpandConstantExprPass()); + // The following pass inserts GEPs, it must precede ExpandGetElementPtr. It + // also creates vector loads and stores, the subsequent pass cleans them up to + // fix their alignment. + PM.add(createConstantInsertExtractElementIndexPass()); + + // Optimization passes and ExpandByVal introduce + // memset/memcpy/memmove intrinsics with a 64-bit size argument. + // This pass converts those arguments to 32-bit. + PM.add(createCanonicalizeMemIntrinsicsPass()); + + // ConstantMerge cleans up after passes such as GlobalizeConstantVectors. It + // must run before the FlattenGlobals pass because FlattenGlobals loses + // information that otherwise helps ConstantMerge do a good job. + PM.add(createConstantMergePass()); + // FlattenGlobals introduces ConstantExpr bitcasts of globals which + // are expanded out later. ReplacePtrsWithInts also creates some + // ConstantExprs, and it locally creates an ExpandConstantExprPass + // to clean both of these up. + PM.add(createFlattenGlobalsPass()); + + // The type legalization passes (ExpandLargeIntegers and PromoteIntegers) do + // not handle constexprs and create GEPs, so they go between those passes. + PM.add(createExpandLargeIntegersPass()); + PM.add(createPromoteIntegersPass()); + // Rewrite atomic and volatile instructions with intrinsic calls. 
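// --- Illustrative sketch (not part of the patch): the RewriteAtomics pass added
// just below lowers LLVM atomic/volatile memory operations into calls to the
// llvm.nacl.atomic.* intrinsics declared in Intrinsics.td. Roughly, an atomic
// i32 load of a pointer becomes a call built like this, with the memory order
// passed as a small integer constant defined by the PNaCl ABI. The helper name
// (emitNaClAtomicLoad32) is hypothetical; this is a sketch, not the pass's
// actual code:
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"

static llvm::Value *emitNaClAtomicLoad32(llvm::IRBuilder<> &B, llvm::Module &M,
                                         llvm::Value *Ptr, llvm::Value *Order) {
  llvm::Function *F = llvm::Intrinsic::getDeclaration(
      &M, llvm::Intrinsic::nacl_atomic_load, {B.getInt32Ty()});
  // Emits: call i32 @llvm.nacl.atomic.load.i32(i32* Ptr, i32 Order)
  return B.CreateCall(F, {Ptr, Order});
}
// --- end sketch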
+ PM.add(createRewriteAtomicsPass()); + + PM.add(createSimplifyAllocasPass()); + + // The atomic cmpxchg instruction returns a struct, and is rewritten to an + // intrinsic as a post-opt pass, we therefore need to expand struct regs. + PM.add(createExpandStructRegsPass()); + + // Eliminate simple dead code that the post-opt passes could have created. + PM.add(createDeadCodeEliminationPass()); + } + // end PNaCl legalization + + PM.add(createExpandInsertExtractElementPass()); + + if (!OnlyWebAssembly) { + // if only wasm, then we can emit i64s, otherwise they must be lowered + PM.add(createExpandI64Pass()); + } else { + // only wasm, and for now no atomics there, so just lower them out + PM.add(createLowerAtomicPass()); + } + + CodeGenOpt::Level OptLevel = getOptLevel(); + + // When optimizing, there shouldn't be any opportunities for SimplifyAllocas + // because the regular optimizer should have taken them all (GVN, and possibly + // also SROA). + if (OptLevel == CodeGenOpt::None) + PM.add(createEmscriptenSimplifyAllocasPass()); + + PM.add(createEmscriptenRemoveLLVMAssumePass()); + PM.add(createEmscriptenExpandBigSwitchesPass()); + + PM.add(new JSWriter(Out, OptLevel)); + + return false; +} diff --git a/lib/Target/JSBackend/JSTargetMachine.cpp b/lib/Target/JSBackend/JSTargetMachine.cpp new file mode 100644 index 000000000000..2ae3dd6f6a92 --- /dev/null +++ b/lib/Target/JSBackend/JSTargetMachine.cpp @@ -0,0 +1,48 @@ +//===-- JSTargetMachine.cpp - Define TargetMachine for the JS -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the JS specific subclass of TargetMachine. +// +//===----------------------------------------------------------------------===// + +#include "JSTargetMachine.h" +#include "JSTargetTransformInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Support/TargetRegistry.h" +using namespace llvm; + +extern const llvm::SubtargetFeatureKV JSSubTypeKV[] = { + { "asmjs", "Select the asmjs processor", { }, { } } +}; + +static const llvm::SubtargetInfoKV JSProcSchedModels[] = { + { "asmjs", &MCSchedModel::GetDefaultSchedModel() } +}; + +JSSubtarget::JSSubtarget(const TargetMachine& TM, const Triple &TT) : + TargetSubtargetInfo(TT, "asmjs", "asmjs", None, makeArrayRef(JSSubTypeKV, 1), JSProcSchedModels, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr), + TL(TM) + {} + + +JSTargetMachine::JSTargetMachine(const Target &T, const Triple &TT, + StringRef CPU, StringRef FS, const TargetOptions &Options, + Optional& RM, CodeModel::Model CM, + CodeGenOpt::Level OL) + : LLVMTargetMachine(T, "e-p:32:32-i64:64-v128:32:128-n32-S128", TT, + CPU, FS, Options, Reloc::Static, CM, OL), + ST(*this, TT) { +} + +TargetIRAnalysis JSTargetMachine::getTargetIRAnalysis() { + return TargetIRAnalysis([this](const Function &F) { + return TargetTransformInfo(JSTTIImpl(this, F)); + }); +} + diff --git a/lib/Target/JSBackend/JSTargetMachine.h b/lib/Target/JSBackend/JSTargetMachine.h new file mode 100644 index 000000000000..8efe93f8a15a --- /dev/null +++ b/lib/Target/JSBackend/JSTargetMachine.h @@ -0,0 +1,72 @@ +//===-- JSTargetMachine.h - TargetMachine for the JS Backend ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===---------------------------------------------------------------------===// +// +// This file declares the TargetMachine that is used by the JS/asm.js/ +// emscripten backend. +// +//===---------------------------------------------------------------------===// + +#ifndef JSTARGETMACHINE_H +#define JSTARGETMACHINE_H + +#include "JS.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include "llvm/Target/TargetLowering.h" + +namespace llvm { + +class formatted_raw_ostream; + +class JSTargetLowering : public TargetLowering { +public: + explicit JSTargetLowering(const TargetMachine& TM) : TargetLowering(TM) {} +}; + +class JSSubtarget : public TargetSubtargetInfo { + JSTargetLowering TL; + +public: + JSSubtarget(const TargetMachine& TM, const Triple &TT); + + const TargetLowering *getTargetLowering() const override { + return &TL; + } +}; + +class JSTargetMachine : public LLVMTargetMachine { + const JSSubtarget ST; + +public: + JSTargetMachine(const Target &T, const Triple &TT, + StringRef CPU, StringRef FS, const TargetOptions &Options, + Optional& RM, CodeModel::Model CM, + CodeGenOpt::Level OL); + + bool addPassesToEmitFile( + PassManagerBase &PM, raw_pwrite_stream &Out, CodeGenFileType FileType, + bool DisableVerify = true, AnalysisID StartBefore = nullptr, + AnalysisID StartAfter = nullptr, AnalysisID StopBefore = nullptr, + AnalysisID StopAfter = nullptr, + MachineFunctionInitializer *MFInitializer = nullptr) override; + + TargetIRAnalysis getTargetIRAnalysis() override; + + const TargetSubtargetInfo *getJSSubtargetImpl() const { + return &ST; + } + + const JSSubtarget *getSubtargetImpl(const Function &F) const override { + return &ST; + } +}; + +} // End llvm namespace + +#endif diff --git a/lib/Target/JSBackend/JSTargetTransformInfo.cpp b/lib/Target/JSBackend/JSTargetTransformInfo.cpp new file mode 100644 index 000000000000..fe9a4d4414e4 --- /dev/null +++ b/lib/Target/JSBackend/JSTargetTransformInfo.cpp @@ -0,0 +1,121 @@ +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// \file +// This file implements a TargetTransformInfo analysis pass specific to the +// JS target machine. It uses the target's detailed information to provide +// more precise answers to certain TTI queries, while letting the target +// independent and default TTI implementations handle the rest. +// +//===----------------------------------------------------------------------===// + +#include "JSTargetTransformInfo.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/CodeGen/BasicTTIImpl.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/CostTable.h" +#include "llvm/Target/TargetLowering.h" +using namespace llvm; + +#define DEBUG_TYPE "JStti" + +void JSTTIImpl::getUnrollingPreferences(Loop *L, + TTI::UnrollingPreferences &UP) { + // We generally don't want a lot of unrolling. + UP.Partial = false; + UP.Runtime = false; +} + +unsigned JSTTIImpl::getNumberOfRegisters(bool Vector) { + if (Vector) return 16; // like NEON, x86_64, etc. + + return 8; // like x86, thumb, etc. 
+} + +unsigned JSTTIImpl::getRegisterBitWidth(bool Vector) { + if (Vector) { + return 128; + } + + return 32; +} + +static const unsigned Nope = 65536; + +// Certain types are fine, but some vector types must be avoided at all Costs. +static bool isOkType(Type *Ty) { + if (VectorType *VTy = dyn_cast(Ty)) { + if (VTy->getNumElements() != 4 || !(VTy->getElementType()->isIntegerTy(1) || + VTy->getElementType()->isIntegerTy(32) || + VTy->getElementType()->isFloatTy())) { + return false; + } + } + return true; +} + +unsigned JSTTIImpl::getArithmeticInstrCost( + unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, + TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, + TTI::OperandValueProperties Opd2PropInfo, + ArrayRef Args) { + + unsigned Cost = BasicTTIImplBase::getArithmeticInstrCost(Opcode, Ty, Opd1Info, + Opd2Info, Opd1PropInfo, + Opd2PropInfo, Args); + + if (!isOkType(Ty)) + return Nope; + + if (VectorType *VTy = dyn_cast(Ty)) { + switch (Opcode) { + case Instruction::LShr: + case Instruction::AShr: + case Instruction::Shl: + // SIMD.js' shifts are currently only ByScalar. + if (Opd2Info != TTI::OK_UniformValue && Opd2Info != TTI::OK_UniformConstantValue) + Cost = Cost * VTy->getNumElements() + 100; + break; + } + } + return Cost; +} + +unsigned JSTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { + if (!isOkType(Val)) + return Nope; + + unsigned Cost = BasicTTIImplBase::getVectorInstrCost(Opcode, Val, Index); + + // SIMD.js' insert/extract currently only take constant indices. + if (Index == -1u) + return Cost + 100; + + return Cost; +} + + +unsigned JSTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + unsigned AddressSpace) { + if (!isOkType(Src)) + return Nope; + + return BasicTTIImplBase::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace); +} + +unsigned JSTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { + if (!isOkType(Src) || !isOkType(Dst)) + return Nope; + + return BasicTTIImplBase::getCastInstrCost(Opcode, Dst, Src); +} + diff --git a/lib/Target/JSBackend/JSTargetTransformInfo.h b/lib/Target/JSBackend/JSTargetTransformInfo.h new file mode 100644 index 000000000000..638a28667f70 --- /dev/null +++ b/lib/Target/JSBackend/JSTargetTransformInfo.h @@ -0,0 +1,97 @@ +//===-- JSTargetTransformInfo.h - JS specific TTI -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file a TargetTransformInfo::Concept conforming object specific to the +/// JS target machine. It uses the target's detailed information to +/// provide more precise answers to certain TTI queries, while letting the +/// target independent and default TTI implementations handle the rest. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_JS_JSTARGETTRANSFORMINFO_H +#define LLVM_LIB_TARGET_JS_JSTARGETTRANSFORMINFO_H + +#include "JSTargetMachine.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/BasicTTIImpl.h" +#include "llvm/Target/TargetLowering.h" + +namespace llvm { + +class JSTTIImpl : public BasicTTIImplBase { + typedef BasicTTIImplBase BaseT; + typedef TargetTransformInfo TTI; + friend BaseT; + + const TargetSubtargetInfo *ST; + const TargetLoweringBase *TLI; + + const TargetSubtargetInfo *getST() const { return ST; } + const TargetLoweringBase *getTLI() const { return TLI; } + +public: + explicit JSTTIImpl(const JSTargetMachine *TM, const Function &F) + : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), + TLI(ST->getTargetLowering()) {} + + // Provide value semantics. MSVC requires that we spell all of these out. + JSTTIImpl(const JSTTIImpl &Arg) + : BaseT(static_cast(Arg)), ST(Arg.ST), TLI(Arg.TLI) {} + JSTTIImpl(JSTTIImpl &&Arg) + : BaseT(std::move(static_cast(Arg))), ST(std::move(Arg.ST)), + TLI(std::move(Arg.TLI)) {} +/* + JSTTIImpl &operator=(const JSTTIImpl &RHS) { + BaseT::operator=(static_cast(RHS)); + ST = RHS.ST; + TLI = RHS.TLI; + return *this; + } + JSTTIImpl &operator=(JSTTIImpl &&RHS) { + BaseT::operator=(std::move(static_cast(RHS))); + ST = std::move(RHS.ST); + TLI = std::move(RHS.TLI); + return *this; + } +*/ + + bool hasBranchDivergence() { return true; } + + void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP); + + TTI::PopcntSupportKind getPopcntSupport( + unsigned TyWidth) { + assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); + // Hopefully we'll get popcnt in ES7, but for now, we just have software. + return TargetTransformInfo::PSK_Software; + } + + unsigned getNumberOfRegisters(bool Vector); + + unsigned getRegisterBitWidth(bool Vector); + + unsigned getArithmeticInstrCost( + unsigned Opcode, Type *Ty, + TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, + TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, + TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, + TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, + ArrayRef Args = {}); + + unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); + + unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, + unsigned AddressSpace); + + unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src); +}; + +} // end namespace llvm + +#endif diff --git a/lib/Target/JSBackend/LLVMBuild.txt b/lib/Target/JSBackend/LLVMBuild.txt new file mode 100644 index 000000000000..850253e539fa --- /dev/null +++ b/lib/Target/JSBackend/LLVMBuild.txt @@ -0,0 +1,31 @@ +;===- ./lib/Target/JSBackend/LLVMBuild.txt --------------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. 
+; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[common] +subdirectories = MCTargetDesc NaCl TargetInfo + +[component_0] +type = TargetGroup +name = JSBackend +parent = Target + +[component_1] +type = Library +name = JSBackendCodeGen +parent = JSBackend +required_libraries = Analysis CodeGen Core IPO JSBackendInfo JSBackendDesc MC PNaClTransforms Scalar Support SelectionDAG Target TransformUtils +add_to_library_groups = JSBackend diff --git a/lib/Target/JSBackend/MCTargetDesc/CMakeLists.txt b/lib/Target/JSBackend/MCTargetDesc/CMakeLists.txt new file mode 100644 index 000000000000..81c5eadef6a7 --- /dev/null +++ b/lib/Target/JSBackend/MCTargetDesc/CMakeLists.txt @@ -0,0 +1,6 @@ +add_llvm_library(LLVMJSBackendDesc + JSBackendMCTargetDesc.cpp + ) + +# Hack: we need to include 'main' target directory to grab private headers +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_BINARY_DIR}/..) diff --git a/lib/Target/JSBackend/MCTargetDesc/JSBackendMCTargetDesc.cpp b/lib/Target/JSBackend/MCTargetDesc/JSBackendMCTargetDesc.cpp new file mode 100644 index 000000000000..01b225ee4e3c --- /dev/null +++ b/lib/Target/JSBackend/MCTargetDesc/JSBackendMCTargetDesc.cpp @@ -0,0 +1,22 @@ +//===-- JSBackendMCTargetDesc.cpp - JS Backend Target Descriptions --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides asm.js specific target descriptions. +// +//===----------------------------------------------------------------------===// + +#include "JSBackendMCTargetDesc.h" +#include "llvm/Support/TargetRegistry.h" +using namespace llvm; + +// Force static initialization. +extern "C" void LLVMInitializeJSBackendTargetMC() { + // nothing to register +} + diff --git a/lib/Target/JSBackend/MCTargetDesc/JSBackendMCTargetDesc.h b/lib/Target/JSBackend/MCTargetDesc/JSBackendMCTargetDesc.h new file mode 100644 index 000000000000..c98a55df83ba --- /dev/null +++ b/lib/Target/JSBackend/MCTargetDesc/JSBackendMCTargetDesc.h @@ -0,0 +1,25 @@ +//===- JSBackendMCTargetDesc.h - JS Backend Target Descriptions -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides asm.js specific target descriptions. +// +//===----------------------------------------------------------------------===// + +#ifndef JSBACKENDMCTARGETDESC_H +#define JSBACKENDMCTARGETDESC_H + +#include "llvm/Support/TargetRegistry.h" + +namespace llvm { + +extern Target TheJSBackendTarget; + +} // End llvm namespace + +#endif diff --git a/lib/Target/JSBackend/MCTargetDesc/LLVMBuild.txt b/lib/Target/JSBackend/MCTargetDesc/LLVMBuild.txt new file mode 100644 index 000000000000..b7f3e6d89a00 --- /dev/null +++ b/lib/Target/JSBackend/MCTargetDesc/LLVMBuild.txt @@ -0,0 +1,24 @@ +;===- ./lib/Target/JSBackend/MCTargetDesc/LLVMBuild.txt --------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. 
+; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = JSBackendDesc +parent = JSBackend +required_libraries = MC Support JSBackendInfo +add_to_library_groups = JSBackend + diff --git a/lib/Target/JSBackend/NaCl/AddPNaClExternalDecls.cpp b/lib/Target/JSBackend/NaCl/AddPNaClExternalDecls.cpp new file mode 100644 index 000000000000..871a834b79b6 --- /dev/null +++ b/lib/Target/JSBackend/NaCl/AddPNaClExternalDecls.cpp @@ -0,0 +1,85 @@ +//===- AddPNaClExternalDecls.cpp - Add decls for PNaCl external functions -===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass adds function declarations for external functions used by PNaCl. +// These externals are implemented in native libraries and calls to them are +// created as part of the translation process. +// +// Running this pass is a precondition for running ResolvePNaClIntrinsics. They +// are separate because one is a ModulePass and the other is a FunctionPass. +// +//===----------------------------------------------------------------------===// + +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/NaClAtomicIntrinsics.h" +#include "llvm/IR/Type.h" +#include "llvm/Pass.h" +#include "llvm/Transforms/NaCl.h" + +using namespace llvm; + +namespace { + // This is a module pass because it adds declarations to the module. + class AddPNaClExternalDecls : public ModulePass { + public: + static char ID; + AddPNaClExternalDecls() : ModulePass(ID) { + initializeAddPNaClExternalDeclsPass(*PassRegistry::getPassRegistry()); + } + + virtual bool runOnModule(Module &M); + }; +} + +bool AddPNaClExternalDecls::runOnModule(Module &M) { + // Add declarations for a pre-defined set of external functions to the module. + // The function names must match the functions implemented in native code (in + // pnacl/support). The function types must match the types of the LLVM + // intrinsics. + // We expect these declarations not to exist in the module before this pass + // runs, but don't assert it; it will be handled by the ABI verifier. + LLVMContext &C = M.getContext(); + M.getOrInsertFunction("setjmp", + // return type + Type::getInt32Ty(C), + // arguments + Type::getInt8Ty(C)->getPointerTo(), + NULL); + M.getOrInsertFunction("longjmp", + // return type + Type::getVoidTy(C), + // arguments + Type::getInt8Ty(C)->getPointerTo(), + Type::getInt32Ty(C), + NULL); + + // Add Intrinsic declarations needed by ResolvePNaClIntrinsics up front. 
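+  // Specifically: nacl.setjmp, nacl.longjmp, every overload of the
+  // nacl.atomic.* family, and nacl.atomic.is.lock.free.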
+ Intrinsic::getDeclaration(&M, Intrinsic::nacl_setjmp); + Intrinsic::getDeclaration(&M, Intrinsic::nacl_longjmp); + NaCl::AtomicIntrinsics AI(C); + NaCl::AtomicIntrinsics::View V = AI.allIntrinsicsAndOverloads(); + for (NaCl::AtomicIntrinsics::View::iterator I = V.begin(), E = V.end(); + I != E; ++I) { + I->getDeclaration(&M); + } + Intrinsic::getDeclaration(&M, Intrinsic::nacl_atomic_is_lock_free); + + return true; +} + +char AddPNaClExternalDecls::ID = 0; +INITIALIZE_PASS(AddPNaClExternalDecls, "add-pnacl-external-decls", + "Add declarations of external functions used by PNaCl", + false, false) + +ModulePass *llvm::createAddPNaClExternalDeclsPass() { + return new AddPNaClExternalDecls(); +} diff --git a/lib/Target/JSBackend/NaCl/BackendCanonicalize.cpp b/lib/Target/JSBackend/NaCl/BackendCanonicalize.cpp new file mode 100644 index 000000000000..de9852336539 --- /dev/null +++ b/lib/Target/JSBackend/NaCl/BackendCanonicalize.cpp @@ -0,0 +1,360 @@ +//===- BackendCanonicalize.cpp --------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Clean up some toolchain-side PNaCl ABI simplification passes. These passes +// allow PNaCl to have a simple and stable ABI, but they sometimes lead to +// harder-to-optimize code. This is desirable because LLVM's definition of +// "canonical" evolves over time, meaning that PNaCl's simple ABI can stay +// simple yet still take full advantage of LLVM's backend by having this pass +// massage the code into something that the backend prefers handling. +// +// It currently: +// - Re-generates shufflevector (not part of the PNaCl ABI) from insertelement / +// extractelement combinations. This is done by duplicating some of +// instcombine's implementation, and ignoring optimizations that should +// already have taken place. +// - Re-materializes constant loads, especially of vectors. This requires doing +// constant folding through bitcasts. +// +// The pass also performs limited DCE on instructions it knows to be dead, +// instead of performing a full global DCE. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/ConstantFolding.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Operator.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/Pass.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Transforms/NaCl.h" +#include "llvm/Transforms/Utils/Local.h" + +using namespace llvm; + +// ============================================================================= +// TODO(jfb) The following functions are as-is from instcombine. Make them +// reusable instead. + +/// CollectSingleShuffleElements - If V is a shuffle of values that ONLY returns +/// elements from either LHS or RHS, return the shuffle mask and true. +/// Otherwise, return false. 
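+///
+/// (Mask encoding, as a rough sketch: each Mask element is an i32 constant;
+/// values 0..NumElts-1 select the corresponding lane of LHS, and values
+/// NumElts..2*NumElts-1 select a lane of RHS, matching shufflevector's
+/// operand numbering.)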
+static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS, + SmallVectorImpl &Mask) { + assert(LHS->getType() == RHS->getType() && + "Invalid CollectSingleShuffleElements"); + unsigned NumElts = V->getType()->getVectorNumElements(); + + if (isa(V)) { + Mask.assign(NumElts, UndefValue::get(Type::getInt32Ty(V->getContext()))); + return true; + } + + if (V == LHS) { + for (unsigned i = 0; i != NumElts; ++i) + Mask.push_back(ConstantInt::get(Type::getInt32Ty(V->getContext()), i)); + return true; + } + + if (V == RHS) { + for (unsigned i = 0; i != NumElts; ++i) + Mask.push_back(ConstantInt::get(Type::getInt32Ty(V->getContext()), + i+NumElts)); + return true; + } + + if (InsertElementInst *IEI = dyn_cast(V)) { + // If this is an insert of an extract from some other vector, include it. + Value *VecOp = IEI->getOperand(0); + Value *ScalarOp = IEI->getOperand(1); + Value *IdxOp = IEI->getOperand(2); + + if (!isa(IdxOp)) + return false; + unsigned InsertedIdx = cast(IdxOp)->getZExtValue(); + + if (isa(ScalarOp)) { // inserting undef into vector. + // We can handle this if the vector we are inserting into is + // transitively ok. + if (CollectSingleShuffleElements(VecOp, LHS, RHS, Mask)) { + // If so, update the mask to reflect the inserted undef. + Mask[InsertedIdx] = UndefValue::get(Type::getInt32Ty(V->getContext())); + return true; + } + } else if (ExtractElementInst *EI = dyn_cast(ScalarOp)){ + if (isa(EI->getOperand(1))) { + unsigned ExtractedIdx = + cast(EI->getOperand(1))->getZExtValue(); + unsigned NumLHSElts = LHS->getType()->getVectorNumElements(); + + // This must be extracting from either LHS or RHS. + if (EI->getOperand(0) == LHS || EI->getOperand(0) == RHS) { + // We can handle this if the vector we are inserting into is + // transitively ok. + if (CollectSingleShuffleElements(VecOp, LHS, RHS, Mask)) { + // If so, update the mask to reflect the inserted value. + if (EI->getOperand(0) == LHS) { + Mask[InsertedIdx % NumElts] = + ConstantInt::get(Type::getInt32Ty(V->getContext()), + ExtractedIdx); + } else { + assert(EI->getOperand(0) == RHS); + Mask[InsertedIdx % NumElts] = + ConstantInt::get(Type::getInt32Ty(V->getContext()), + ExtractedIdx + NumLHSElts); + } + return true; + } + } + } + } + } + + return false; +} + +/// We are building a shuffle to create V, which is a sequence of insertelement, +/// extractelement pairs. If PermittedRHS is set, then we must either use it or +/// not rely on the second vector source. Return a std::pair containing the +/// left and right vectors of the proposed shuffle (or 0), and set the Mask +/// parameter as required. +/// +/// Note: we intentionally don't try to fold earlier shuffles since they have +/// often been chosen carefully to be efficiently implementable on the target. +typedef std::pair ShuffleOps; + +static ShuffleOps CollectShuffleElements(Value *V, + SmallVectorImpl &Mask, + Value *PermittedRHS) { + assert(V->getType()->isVectorTy() && "Invalid shuffle!"); + unsigned NumElts = cast(V->getType())->getNumElements(); + + if (isa(V)) { + Mask.assign(NumElts, UndefValue::get(Type::getInt32Ty(V->getContext()))); + return std::make_pair( + PermittedRHS ? UndefValue::get(PermittedRHS->getType()) : V, nullptr); + } + + if (isa(V)) { + Mask.assign(NumElts, ConstantInt::get(Type::getInt32Ty(V->getContext()),0)); + return std::make_pair(V, nullptr); + } + + if (InsertElementInst *IEI = dyn_cast(V)) { + // If this is an insert of an extract from some other vector, include it. 
+ Value *VecOp = IEI->getOperand(0); + Value *ScalarOp = IEI->getOperand(1); + Value *IdxOp = IEI->getOperand(2); + + if (ExtractElementInst *EI = dyn_cast(ScalarOp)) { + if (isa(EI->getOperand(1)) && isa(IdxOp)) { + unsigned ExtractedIdx = + cast(EI->getOperand(1))->getZExtValue(); + unsigned InsertedIdx = cast(IdxOp)->getZExtValue(); + + // Either the extracted from or inserted into vector must be RHSVec, + // otherwise we'd end up with a shuffle of three inputs. + if (EI->getOperand(0) == PermittedRHS || PermittedRHS == nullptr) { + Value *RHS = EI->getOperand(0); + ShuffleOps LR = CollectShuffleElements(VecOp, Mask, RHS); + assert(LR.second == nullptr || LR.second == RHS); + + if (LR.first->getType() != RHS->getType()) { + // We tried our best, but we can't find anything compatible with RHS + // further up the chain. Return a trivial shuffle. + for (unsigned i = 0; i < NumElts; ++i) + Mask[i] = ConstantInt::get(Type::getInt32Ty(V->getContext()), i); + return std::make_pair(V, nullptr); + } + + unsigned NumLHSElts = RHS->getType()->getVectorNumElements(); + Mask[InsertedIdx % NumElts] = + ConstantInt::get(Type::getInt32Ty(V->getContext()), + NumLHSElts+ExtractedIdx); + return std::make_pair(LR.first, RHS); + } + + if (VecOp == PermittedRHS) { + // We've gone as far as we can: anything on the other side of the + // extractelement will already have been converted into a shuffle. + unsigned NumLHSElts = + EI->getOperand(0)->getType()->getVectorNumElements(); + for (unsigned i = 0; i != NumElts; ++i) + Mask.push_back(ConstantInt::get( + Type::getInt32Ty(V->getContext()), + i == InsertedIdx ? ExtractedIdx : NumLHSElts + i)); + return std::make_pair(EI->getOperand(0), PermittedRHS); + } + + // If this insertelement is a chain that comes from exactly these two + // vectors, return the vector and the effective shuffle. + if (EI->getOperand(0)->getType() == PermittedRHS->getType() && + CollectSingleShuffleElements(IEI, EI->getOperand(0), PermittedRHS, + Mask)) + return std::make_pair(EI->getOperand(0), PermittedRHS); + } + } + } + + // Otherwise, can't do anything fancy. Return an identity vector. + for (unsigned i = 0; i != NumElts; ++i) + Mask.push_back(ConstantInt::get(Type::getInt32Ty(V->getContext()), i)); + return std::make_pair(V, nullptr); +} + +// ============================================================================= + + +namespace { + +class BackendCanonicalize : public FunctionPass, + public InstVisitor { +public: + static char ID; // Pass identification, replacement for typeid + BackendCanonicalize() : FunctionPass(ID), DL(0), TLI(0) { + initializeBackendCanonicalizePass(*PassRegistry::getPassRegistry()); + } + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); + FunctionPass::getAnalysisUsage(AU); + } + + virtual bool runOnFunction(Function &F); + + // InstVisitor implementation. Unhandled instructions stay as-is. + bool visitInstruction(Instruction &I) { return false; } + bool visitInsertElementInst(InsertElementInst &IE); + bool visitBitCastInst(BitCastInst &C); + bool visitLoadInst(LoadInst &L); + +private: + const DataLayout *DL; + const TargetLibraryInfo *TLI; + + // List of instructions that are now obsolete, and should be DCE'd. + typedef SmallVector KillList; + KillList Kill; + + /// Helper that constant folds an instruction. + bool visitConstantFoldableInstruction(Instruction *I); + + /// Empty the kill list, making sure that all other dead instructions + /// up the chain (but in the current basic block) also get killed. 
+ static void emptyKillList(KillList &Kill); +}; + +} // anonymous namespace + +char BackendCanonicalize::ID = 0; +INITIALIZE_PASS(BackendCanonicalize, "backend-canonicalize", + "Canonicalize PNaCl bitcode for LLVM backends", false, false) + +bool BackendCanonicalize::runOnFunction(Function &F) { + bool Modified = false; + DL = &F.getParent()->getDataLayout(); + TLI = &getAnalysis().getTLI(); + + for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) + for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE; ++BI) + Modified |= visit(&*BI); + emptyKillList(Kill); + return Modified; +} + +// This function is *almost* as-is from instcombine, avoiding silly +// cases that should already have been optimized. +bool BackendCanonicalize::visitInsertElementInst(InsertElementInst &IE) { + Value *ScalarOp = IE.getOperand(1); + Value *IdxOp = IE.getOperand(2); + + // If the inserted element was extracted from some other vector, and if the + // indexes are constant, try to turn this into a shufflevector operation. + if (ExtractElementInst *EI = dyn_cast(ScalarOp)) { + if (isa(EI->getOperand(1)) && isa(IdxOp)) { + unsigned NumInsertVectorElts = IE.getType()->getNumElements(); + unsigned NumExtractVectorElts = + EI->getOperand(0)->getType()->getVectorNumElements(); + unsigned ExtractedIdx = + cast(EI->getOperand(1))->getZExtValue(); + unsigned InsertedIdx = cast(IdxOp)->getZExtValue(); + + if (ExtractedIdx >= NumExtractVectorElts) // Out of range extract. + return false; + + if (InsertedIdx >= NumInsertVectorElts) // Out of range insert. + return false; + + // If this insertelement isn't used by some other insertelement, turn it + // (and any insertelements it points to), into one big shuffle. + if (!IE.hasOneUse() || !isa(IE.user_back())) { + typedef SmallVector MaskT; + MaskT Mask; + Value *LHS, *RHS; + std::tie(LHS, RHS) = CollectShuffleElements(&IE, Mask, nullptr); + if (!RHS) + RHS = UndefValue::get(LHS->getType()); + // We now have a shuffle of LHS, RHS, Mask. + + if (isa(LHS) && !isa(RHS)) { + // Canonicalize shufflevector to always have undef on the RHS, + // and adjust the mask. + std::swap(LHS, RHS); + for (MaskT::iterator I = Mask.begin(), E = Mask.end(); I != E; ++I) { + unsigned Idx = cast(*I)->getZExtValue(); + unsigned NewIdx = Idx >= NumInsertVectorElts + ? Idx - NumInsertVectorElts + : Idx + NumInsertVectorElts; + *I = ConstantInt::get(Type::getInt32Ty(RHS->getContext()), NewIdx); + } + } + + IRBuilder<> IRB(&IE); + IE.replaceAllUsesWith( + IRB.CreateShuffleVector(LHS, RHS, ConstantVector::get(Mask))); + // The chain of now-dead insertelement / extractelement + // instructions can be deleted. 
+ Kill.push_back(&IE); + + return true; + } + } + } + + return false; +} + +bool BackendCanonicalize::visitBitCastInst(BitCastInst &B) { + return visitConstantFoldableInstruction(&B); +} + +bool BackendCanonicalize::visitLoadInst(LoadInst &L) { + return visitConstantFoldableInstruction(&L); +} + +bool BackendCanonicalize::visitConstantFoldableInstruction(Instruction *I) { + if (Constant *Folded = ConstantFoldInstruction(I, *DL, TLI)) { + I->replaceAllUsesWith(Folded); + Kill.push_back(I); + return true; + } + return false; +} + +void BackendCanonicalize::emptyKillList(KillList &Kill) { + while (!Kill.empty()) + RecursivelyDeleteTriviallyDeadInstructions(Kill.pop_back_val()); +} + +FunctionPass *llvm::createBackendCanonicalizePass() { + return new BackendCanonicalize(); +} diff --git a/lib/Target/JSBackend/NaCl/CMakeLists.txt b/lib/Target/JSBackend/NaCl/CMakeLists.txt new file mode 100644 index 000000000000..53dad6fcd13b --- /dev/null +++ b/lib/Target/JSBackend/NaCl/CMakeLists.txt @@ -0,0 +1,55 @@ +include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) + +add_llvm_library(LLVMPNaClTransforms + AddPNaClExternalDecls.cpp + BackendCanonicalize.cpp + CanonicalizeMemIntrinsics.cpp + CleanupUsedGlobalsMetadata.cpp + ConstantInsertExtractElementIndex.cpp + ExceptionInfoWriter.cpp + ExpandArithWithOverflow.cpp + ExpandByVal.cpp + ExpandConstantExpr.cpp + ExpandCtors.cpp + ExpandGetElementPtr.cpp + ExpandIndirectBr.cpp + ExpandLargeIntegers.cpp + ExpandShuffleVector.cpp + ExpandSmallArguments.cpp + ExpandStructRegs.cpp + ExpandTls.cpp + ExpandTlsConstantExpr.cpp + ExpandUtils.cpp + ExpandVarArgs.cpp + FixVectorLoadStoreAlignment.cpp + FlattenGlobals.cpp + SimplifiedFuncTypeMap.cpp + GlobalCleanup.cpp + GlobalizeConstantVectors.cpp + InsertDivideCheck.cpp + InternalizeUsedGlobals.cpp + NormalizeAlignment.cpp + PNaClSjLjEH.cpp + PromoteI1Ops.cpp + PromoteIntegers.cpp + RemoveAsmMemory.cpp + ReplacePtrsWithInts.cpp + ResolvePNaClIntrinsics.cpp + RewriteAtomics.cpp + RewriteLLVMIntrinsics.cpp + RewritePNaClLibraryCalls.cpp + SimplifyAllocas.cpp + SimplifyStructRegSignatures.cpp + StripAttributes.cpp + StripMetadata.cpp + # Emscripten files: + ExpandI64.cpp + ExpandInsertExtractElement.cpp + LowerEmAsyncify.cpp + LowerEmExceptionsPass.cpp + LowerEmSetjmp.cpp + NoExitRuntime.cpp + # Emscripten files end. + ) + +add_dependencies(LLVMPNaClTransforms intrinsics_gen) diff --git a/lib/Target/JSBackend/NaCl/CanonicalizeMemIntrinsics.cpp b/lib/Target/JSBackend/NaCl/CanonicalizeMemIntrinsics.cpp new file mode 100644 index 000000000000..1acde98d322a --- /dev/null +++ b/lib/Target/JSBackend/NaCl/CanonicalizeMemIntrinsics.cpp @@ -0,0 +1,100 @@ +//===- CanonicalizeMemIntrinsics.cpp - Make memcpy's "len" arg consistent--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass canonicalizes uses of the llvm.memset, llvm.memcpy and +// llvm.memmove intrinsics so that the variants with 64-bit "len" +// arguments aren't used, and the 32-bit variants are used instead. +// +// This means the PNaCl translator won't need to handle two versions +// of each of these intrinsics, and it won't need to do any implicit +// truncations from 64-bit to 32-bit. 
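+//
+// Roughly, the rewrite looks like this (sketch; trailing intrinsic arguments
+// omitted):
+//
+//   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %d, i8* %s, i64 %len, ...)
+//
+// becomes
+//
+//   %mem_len_truncate = trunc i64 %len to i32
+//   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %d, i8* %s, i32 %mem_len_truncate, ...)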
+// +//===----------------------------------------------------------------------===// + +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Transforms/NaCl.h" + +using namespace llvm; + +namespace { + // This is a ModulePass because that makes it easier to find all + // uses of intrinsics efficiently. + class CanonicalizeMemIntrinsics : public ModulePass { + public: + static char ID; // Pass identification, replacement for typeid + CanonicalizeMemIntrinsics() : ModulePass(ID) { + initializeCanonicalizeMemIntrinsicsPass(*PassRegistry::getPassRegistry()); + } + + virtual bool runOnModule(Module &M); + }; +} + +char CanonicalizeMemIntrinsics::ID = 0; +INITIALIZE_PASS(CanonicalizeMemIntrinsics, "canonicalize-mem-intrinsics", + "Make memcpy() et al's \"len\" argument consistent", + false, false) + +static bool expandIntrinsic(Module *M, Intrinsic::ID ID) { + SmallVector Types; + Types.push_back(Type::getInt8PtrTy(M->getContext())); + if (ID != Intrinsic::memset) + Types.push_back(Type::getInt8PtrTy(M->getContext())); + unsigned LengthTypePos = Types.size(); + Types.push_back(Type::getInt64Ty(M->getContext())); + + std::string OldName = Intrinsic::getName(ID, Types); + Function *OldIntrinsic = M->getFunction(OldName); + if (!OldIntrinsic) + return false; + + Types[LengthTypePos] = Type::getInt32Ty(M->getContext()); + Function *NewIntrinsic = Intrinsic::getDeclaration(M, ID, Types); + + SmallVector Calls; + for (User *U : OldIntrinsic->users()) { + if (CallInst *Call = dyn_cast(U)) + Calls.push_back(Call); + else + report_fatal_error("CanonicalizeMemIntrinsics: Taking the address of an " + "intrinsic is not allowed: " + + OldName); + } + + for (CallInst *Call : Calls) { + // This temporarily leaves Call non-well-typed. + Call->setCalledFunction(NewIntrinsic); + // Truncate the "len" argument. No overflow check. + IRBuilder<> Builder(Call); + Value *Length = Builder.CreateTrunc(Call->getArgOperand(2), + Type::getInt32Ty(M->getContext()), + "mem_len_truncate"); + Call->setArgOperand(2, Length); + } + + OldIntrinsic->eraseFromParent(); + return true; +} + +bool CanonicalizeMemIntrinsics::runOnModule(Module &M) { + bool Changed = false; + Changed |= expandIntrinsic(&M, Intrinsic::memset); + Changed |= expandIntrinsic(&M, Intrinsic::memcpy); + Changed |= expandIntrinsic(&M, Intrinsic::memmove); + return Changed; +} + +ModulePass *llvm::createCanonicalizeMemIntrinsicsPass() { + return new CanonicalizeMemIntrinsics(); +} diff --git a/lib/Target/JSBackend/NaCl/CleanupUsedGlobalsMetadata.cpp b/lib/Target/JSBackend/NaCl/CleanupUsedGlobalsMetadata.cpp new file mode 100644 index 000000000000..a0e88effddfc --- /dev/null +++ b/lib/Target/JSBackend/NaCl/CleanupUsedGlobalsMetadata.cpp @@ -0,0 +1,48 @@ +//===- CleanupUsedGlobalsMetadata.cpp - Cleanup llvm.used -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +// ===---------------------------------------------------------------------===// +// +// Remove llvm.used metadata. 
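+// (Concretely, the pass erases the @llvm.used global variable from the module
+// if one is present; nothing else is modified.)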
+// +//===----------------------------------------------------------------------===// + +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Transforms/NaCl.h" + +using namespace llvm; + +namespace { +class CleanupUsedGlobalsMetadata : public ModulePass { +public: + static char ID; + CleanupUsedGlobalsMetadata() : ModulePass(ID) { + initializeCleanupUsedGlobalsMetadataPass(*PassRegistry::getPassRegistry()); + } + bool runOnModule(Module &M) override; +}; +} + +char CleanupUsedGlobalsMetadata::ID = 0; +INITIALIZE_PASS(CleanupUsedGlobalsMetadata, "cleanup-used-globals-metadata", + "Removes llvm.used metadata.", false, false) + +bool CleanupUsedGlobalsMetadata::runOnModule(Module &M) { + bool Modified = false; + + if (auto *GV = M.getNamedGlobal("llvm.used")) { + GV->eraseFromParent(); + Modified = true; + } + + return Modified; +} + +ModulePass *llvm::createCleanupUsedGlobalsMetadataPass() { + return new CleanupUsedGlobalsMetadata(); +} \ No newline at end of file diff --git a/lib/Target/JSBackend/NaCl/ConstantInsertExtractElementIndex.cpp b/lib/Target/JSBackend/NaCl/ConstantInsertExtractElementIndex.cpp new file mode 100644 index 000000000000..743cada62233 --- /dev/null +++ b/lib/Target/JSBackend/NaCl/ConstantInsertExtractElementIndex.cpp @@ -0,0 +1,180 @@ +//===- ConstantInsertExtractElementIndex.cpp - Insert/Extract element -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Transform all InsertElement and ExtractElement with non-constant or +// out-of-bounds indices into either in-bounds constant accesses or +// stack accesses. This moves all undefined behavior to the stack, +// making InsertElement and ExtractElement well-defined. +// +//===----------------------------------------------------------------------===// + +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Transforms/NaCl.h" + +#include + +using namespace llvm; + +namespace { +class ConstantInsertExtractElementIndex : public BasicBlockPass { +public: + static char ID; // Pass identification, replacement for typeid + ConstantInsertExtractElementIndex() : BasicBlockPass(ID), M(0), DL(0) { + initializeConstantInsertExtractElementIndexPass( + *PassRegistry::getPassRegistry()); + } + using BasicBlockPass::doInitialization; + bool doInitialization(Module &Mod) override { + M = &Mod; + return false; // Unchanged. + } + bool runOnBasicBlock(BasicBlock &BB) override; + +private: + typedef SmallVector Instructions; + const Module *M; + const DataLayout *DL; + + void findNonConstantInsertExtractElements( + const BasicBlock &BB, Instructions &OutOfRangeConstantIndices, + Instructions &NonConstantVectorIndices) const; + void fixOutOfRangeConstantIndices(BasicBlock &BB, + const Instructions &Instrs) const; + void fixNonConstantVectorIndices(BasicBlock &BB, + const Instructions &Instrs) const; +}; + +/// Number of elements in a vector instruction. +unsigned vectorNumElements(const Instruction *I) { + return cast(I->getOperand(0)->getType())->getNumElements(); +} + +/// Get the index of an InsertElement or ExtractElement instruction, or null. 
+Value *getInsertExtractElementIdx(const Instruction *I) { + switch (I->getOpcode()) { + default: return NULL; + case Instruction::InsertElement: return I->getOperand(2); + case Instruction::ExtractElement: return I->getOperand(1); + } +} + +/// Set the index of an InsertElement or ExtractElement instruction. +void setInsertExtractElementIdx(Instruction *I, Value *NewIdx) { + switch (I->getOpcode()) { + default: + llvm_unreachable( + "expected instruction to be InsertElement or ExtractElement"); + case Instruction::InsertElement: I->setOperand(2, NewIdx); break; + case Instruction::ExtractElement: I->setOperand(1, NewIdx); break; + } +} +} // anonymous namespace + +char ConstantInsertExtractElementIndex::ID = 0; +INITIALIZE_PASS( + ConstantInsertExtractElementIndex, "constant-insert-extract-element-index", + "Force insert and extract vector element to always be in bounds", false, + false) + +void ConstantInsertExtractElementIndex::findNonConstantInsertExtractElements( + const BasicBlock &BB, Instructions &OutOfRangeConstantIndices, + Instructions &NonConstantVectorIndices) const { + for (BasicBlock::const_iterator BBI = BB.begin(), BBE = BB.end(); BBI != BBE; + ++BBI) { + const Instruction *I = &*BBI; + if (Value *Idx = getInsertExtractElementIdx(I)) { + if (ConstantInt *CI = dyn_cast(Idx)) { + if (!CI->getValue().ult(vectorNumElements(I))) + OutOfRangeConstantIndices.push_back(const_cast(I)); + } else + NonConstantVectorIndices.push_back(const_cast(I)); + } + } +} + +void ConstantInsertExtractElementIndex::fixOutOfRangeConstantIndices( + BasicBlock &BB, const Instructions &Instrs) const { + for (Instructions::const_iterator IB = Instrs.begin(), IE = Instrs.end(); + IB != IE; ++IB) { + Instruction *I = *IB; + const APInt &Idx = + cast(getInsertExtractElementIdx(I))->getValue(); + APInt NumElements = APInt(Idx.getBitWidth(), vectorNumElements(I)); + APInt NewIdx = Idx.urem(NumElements); + setInsertExtractElementIdx(I, ConstantInt::get(M->getContext(), NewIdx)); + } +} + +void ConstantInsertExtractElementIndex::fixNonConstantVectorIndices( + BasicBlock &BB, const Instructions &Instrs) const { + for (Instructions::const_iterator IB = Instrs.begin(), IE = Instrs.end(); + IB != IE; ++IB) { + Instruction *I = *IB; + Value *Vec = I->getOperand(0); + Value *Idx = getInsertExtractElementIdx(I); + VectorType *VecTy = cast(Vec->getType()); + Type *ElemTy = VecTy->getElementType(); + unsigned ElemAlign = DL->getPrefTypeAlignment(ElemTy); + unsigned VecAlign = std::max(ElemAlign, DL->getPrefTypeAlignment(VecTy)); + + IRBuilder<> IRB(I); + AllocaInst *Alloca = IRB.CreateAlloca( + ElemTy, ConstantInt::get(Type::getInt32Ty(M->getContext()), + vectorNumElements(I))); + Alloca->setAlignment(VecAlign); + Value *AllocaAsVec = IRB.CreateBitCast(Alloca, VecTy->getPointerTo()); + IRB.CreateAlignedStore(Vec, AllocaAsVec, Alloca->getAlignment()); + Value *GEP = IRB.CreateGEP(Alloca, Idx); + + Value *Res; + switch (I->getOpcode()) { + default: + llvm_unreachable("expected InsertElement or ExtractElement"); + case Instruction::InsertElement: + IRB.CreateAlignedStore(I->getOperand(1), GEP, ElemAlign); + Res = IRB.CreateAlignedLoad(AllocaAsVec, Alloca->getAlignment()); + break; + case Instruction::ExtractElement: + Res = IRB.CreateAlignedLoad(GEP, ElemAlign); + break; + } + + I->replaceAllUsesWith(Res); + I->eraseFromParent(); + } +} + +bool ConstantInsertExtractElementIndex::runOnBasicBlock(BasicBlock &BB) { + bool Changed = false; + if (!DL) + DL = &BB.getParent()->getParent()->getDataLayout(); + Instructions 
OutOfRangeConstantIndices; + Instructions NonConstantVectorIndices; + + findNonConstantInsertExtractElements(BB, OutOfRangeConstantIndices, + NonConstantVectorIndices); + if (!OutOfRangeConstantIndices.empty()) { + Changed = true; + fixOutOfRangeConstantIndices(BB, OutOfRangeConstantIndices); + } + if (!NonConstantVectorIndices.empty()) { + Changed = true; + fixNonConstantVectorIndices(BB, NonConstantVectorIndices); + } + return Changed; +} + +BasicBlockPass *llvm::createConstantInsertExtractElementIndexPass() { + return new ConstantInsertExtractElementIndex(); +} diff --git a/lib/Target/JSBackend/NaCl/ExceptionInfoWriter.cpp b/lib/Target/JSBackend/NaCl/ExceptionInfoWriter.cpp new file mode 100644 index 000000000000..0596d92c29df --- /dev/null +++ b/lib/Target/JSBackend/NaCl/ExceptionInfoWriter.cpp @@ -0,0 +1,291 @@ +//===- ExceptionInfoWriter.cpp - Generate C++ exception info for PNaCl-----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// The ExceptionInfoWriter class converts the clauses of a +// "landingpad" instruction into data tables stored in global +// variables. These tables are interpreted by PNaCl's C++ runtime +// library (either libsupc++ or libcxxabi), which is linked into a +// pexe. +// +// This is similar to the lowering that the LLVM backend does to +// convert landingpad clauses into ".gcc_except_table" sections. The +// difference is that ExceptionInfoWriter is an IR-to-IR +// transformation that runs on the PNaCl user toolchain side. The +// format it produces is not part of PNaCl's stable ABI; the PNaCl +// translator and LLVM backend do not know about this format. +// +// Encoding: +// +// A landingpad instruction contains a list of clauses. +// ExceptionInfoWriter encodes each clause as a 32-bit "clause ID". A +// clause is one of the following forms: +// +// 1) "catch i8* @ExcType" +// * This clause means that the landingpad should be entered if +// the C++ exception being thrown has type @ExcType (or a +// subtype of @ExcType). @ExcType is a pointer to the +// std::type_info object (an RTTI object) for the C++ exception +// type. +// * Clang generates this for a "catch" block in the C++ source. +// * @ExcType is NULL for "catch (...)" (catch-all) blocks. +// * This is encoded as the "type ID" for @ExcType, defined below, +// which is a positive integer. +// +// 2) "filter [i8* @ExcType1, ..., i8* @ExcTypeN]" +// * This clause means that the landingpad should be entered if +// the C++ exception being thrown *doesn't* match any of the +// types in the list (which are again specified as +// std::type_info pointers). +// * Clang uses this to implement C++ exception specifications, e.g. +// void foo() throw(ExcType1, ..., ExcTypeN) { ... } +// * This is encoded as the filter ID, X, where X < 0, and +// &__pnacl_eh_filter_table[-X-1] points to a 0-terminated +// array of integer "type IDs". +// +// 3) "cleanup" +// * This means that the landingpad should always be entered. +// * Clang uses this for calling objects' destructors. +// * This is encoded as 0. +// * The runtime may treat "cleanup" differently from "catch i8* +// null" (a catch-all). In C++, if an unhandled exception +// occurs, the language runtime may abort execution without +// running any destructors. 
The runtime may implement this by +// searching for a matching non-"cleanup" clause, and aborting +// if it does not find one, before entering any landingpad +// blocks. +// +// The "type ID" for a type @ExcType is a 1-based index into the array +// __pnacl_eh_type_table[]. That is, the type ID is a value X such +// that __pnacl_eh_type_table[X-1] == @ExcType, and X >= 1. +// +// ExceptionInfoWriter generates the following data structures: +// +// struct action_table_entry { +// int32_t clause_id; +// uint32_t next_clause_list_id; +// }; +// +// // Represents singly linked lists of clauses. +// extern const struct action_table_entry __pnacl_eh_action_table[]; +// +// // Allows std::type_infos to be represented using small integer IDs. +// extern std::type_info *const __pnacl_eh_type_table[]; +// +// // Used to represent type arrays for "filter" clauses. +// extern const uint32_t __pnacl_eh_filter_table[]; +// +// A "clause list ID" is either: +// * 0, representing the empty list; or +// * an index into __pnacl_eh_action_table[] with 1 added, which +// specifies a node in the clause list. +// +// Example: +// +// std::type_info *const __pnacl_eh_type_table[] = { +// // defines type ID 1 == ExcA and clause ID 1 == "catch ExcA" +// &typeinfo(ExcA), +// // defines type ID 2 == ExcB and clause ID 2 == "catch ExcB" +// &typeinfo(ExcB), +// // defines type ID 3 == ExcC and clause ID 3 == "catch ExcC" +// &typeinfo(ExcC), +// }; +// +// const uint32_t __pnacl_eh_filter_table[] = { +// 1, // refers to ExcA; defines clause ID -1 as "filter [ExcA, ExcB]" +// 2, // refers to ExcB; defines clause ID -2 as "filter [ExcB]" +// 0, // list terminator; defines clause ID -3 as "filter []" +// 3, // refers to ExcC; defines clause ID -4 as "filter [ExcC]" +// 0, // list terminator; defines clause ID -5 as "filter []" +// }; +// +// const struct action_table_entry __pnacl_eh_action_table[] = { +// // defines clause list ID 1: +// { +// -4, // "filter [ExcC]" +// 0, // end of list (no more actions) +// }, +// // defines clause list ID 2: +// { +// -1, // "filter [ExcA, ExcB]" +// 1, // else go to clause list ID 1 +// }, +// // defines clause list ID 3: +// { +// 2, // "catch ExcB" +// 2, // else go to clause list ID 2 +// }, +// // defines clause list ID 4: +// { +// 1, // "catch ExcA" +// 3, // else go to clause list ID 3 +// }, +// }; +// +// So if a landingpad contains the clause list: +// [catch ExcA, +// catch ExcB, +// filter [ExcA, ExcB], +// filter [ExcC]] +// then this can be represented as clause list ID 4 using the tables above. +// +// The C++ runtime library checks the clauses in order to decide +// whether to enter the landingpad. If a clause matches, the +// landingpad BasicBlock is passed the clause ID. The landingpad code +// can use the clause ID to decide which C++ catch() block (if any) to +// execute. +// +// The purpose of these exception tables is to keep code sizes +// relatively small. The landingpad code only needs to check a small +// integer clause ID, rather than having to call a function to check +// whether the C++ exception matches a type. +// +// ExceptionInfoWriter's encoding corresponds loosely to the format of +// GCC's .gcc_except_table sections. One difference is that +// ExceptionInfoWriter writes fixed-width 32-bit integers, whereas +// .gcc_except_table uses variable-length LEB128 encodings. We could +// switch to LEB128 to save space in the future. 
+// +//===----------------------------------------------------------------------===// + +#include "ExceptionInfoWriter.h" +#include "llvm/IR/Constants.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +ExceptionInfoWriter::ExceptionInfoWriter(LLVMContext *Context): + Context(Context) { + Type *I32 = Type::getInt32Ty(*Context); + Type *Fields[] = { I32, I32 }; + ActionTableEntryTy = StructType::create(Fields, "action_table_entry"); +} + +unsigned ExceptionInfoWriter::getIDForExceptionType(Value *ExcTy) { + Constant *ExcTyConst = dyn_cast(ExcTy); + if (!ExcTyConst) + report_fatal_error("Exception type not a constant"); + + // Reuse existing ID if one has already been assigned. + TypeTableIDMapType::iterator Iter = TypeTableIDMap.find(ExcTyConst); + if (Iter != TypeTableIDMap.end()) + return Iter->second; + + unsigned Index = TypeTableData.size() + 1; + TypeTableIDMap[ExcTyConst] = Index; + TypeTableData.push_back(ExcTyConst); + return Index; +} + +unsigned ExceptionInfoWriter::getIDForClauseListNode( + unsigned ClauseID, unsigned NextClauseListID) { + // Reuse existing ID if one has already been assigned. + ActionTableEntry Key(ClauseID, NextClauseListID); + ActionTableIDMapType::iterator Iter = ActionTableIDMap.find(Key); + if (Iter != ActionTableIDMap.end()) + return Iter->second; + + Type *I32 = Type::getInt32Ty(*Context); + Constant *Fields[] = { ConstantInt::get(I32, ClauseID), + ConstantInt::get(I32, NextClauseListID) }; + Constant *Entry = ConstantStruct::get(ActionTableEntryTy, Fields); + + // Add 1 so that the empty list can be represented as 0. + unsigned ClauseListID = ActionTableData.size() + 1; + ActionTableIDMap[Key] = ClauseListID; + ActionTableData.push_back(Entry); + return ClauseListID; +} + +unsigned ExceptionInfoWriter::getIDForFilterClause(Value *Filter) { + unsigned FilterClauseID = -(FilterTableData.size() + 1); + Type *I32 = Type::getInt32Ty(*Context); + ArrayType *ArrayTy = dyn_cast(Filter->getType()); + if (!ArrayTy) + report_fatal_error("Landingpad filter clause is not of array type"); + unsigned FilterLength = ArrayTy->getNumElements(); + // Don't try the dyn_cast if the FilterLength is zero, because Array + // could be a zeroinitializer. + if (FilterLength > 0) { + ConstantArray *Array = dyn_cast(Filter); + if (!Array) + report_fatal_error("Landingpad filter clause is not a ConstantArray"); + for (unsigned I = 0; I < FilterLength; ++I) { + unsigned TypeID = getIDForExceptionType(Array->getOperand(I)); + assert(TypeID > 0); + FilterTableData.push_back(ConstantInt::get(I32, TypeID)); + } + } + // Add array terminator. + FilterTableData.push_back(ConstantInt::get(I32, 0)); + return FilterClauseID; +} + +unsigned ExceptionInfoWriter::getIDForLandingPadClauseList(LandingPadInst *LP) { + unsigned NextClauseListID = 0; // ID for empty list. + + if (LP->isCleanup()) { + // Add cleanup clause at the end of the list. 
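+    // ("cleanup" is encoded as clause ID 0; see the encoding notes at the top
+    // of this file.)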
+ NextClauseListID = getIDForClauseListNode(0, NextClauseListID); + } + + for (int I = (int) LP->getNumClauses() - 1; I >= 0; --I) { + unsigned ClauseID; + if (LP->isCatch(I)) { + ClauseID = getIDForExceptionType(LP->getClause(I)); + } else if (LP->isFilter(I)) { + ClauseID = getIDForFilterClause(LP->getClause(I)); + } else { + report_fatal_error("Unknown kind of landingpad clause"); + } + assert(ClauseID > 0); + NextClauseListID = getIDForClauseListNode(ClauseID, NextClauseListID); + } + + return NextClauseListID; +} + +static void defineArray(Module *M, const char *Name, + const SmallVectorImpl &Elements, + Type *ElementType) { + ArrayType *ArrayTy = ArrayType::get(ElementType, Elements.size()); + Constant *ArrayData = ConstantArray::get(ArrayTy, Elements); + GlobalVariable *OldGlobal = M->getGlobalVariable(Name); + if (OldGlobal) { + if (OldGlobal->hasInitializer()) { + report_fatal_error(std::string("Variable ") + Name + + " already has an initializer"); + } + Constant *NewGlobal = new GlobalVariable( + *M, ArrayTy, /* isConstant= */ true, + GlobalValue::InternalLinkage, ArrayData); + NewGlobal->takeName(OldGlobal); + OldGlobal->replaceAllUsesWith(ConstantExpr::getBitCast( + NewGlobal, OldGlobal->getType())); + OldGlobal->eraseFromParent(); + } else { + if (Elements.size() > 0) { + // This warning could happen for a program that does not link + // against the C++ runtime libraries. Such a program might + // contain "invoke" instructions but never throw any C++ + // exceptions. + errs() << "Warning: Variable " << Name << " not referenced\n"; + } + } +} + +void ExceptionInfoWriter::defineGlobalVariables(Module *M) { + defineArray(M, "__pnacl_eh_type_table", TypeTableData, + Type::getInt8PtrTy(M->getContext())); + + defineArray(M, "__pnacl_eh_action_table", ActionTableData, + ActionTableEntryTy); + + defineArray(M, "__pnacl_eh_filter_table", FilterTableData, + Type::getInt32Ty(M->getContext())); +} diff --git a/lib/Target/JSBackend/NaCl/ExceptionInfoWriter.h b/lib/Target/JSBackend/NaCl/ExceptionInfoWriter.h new file mode 100644 index 000000000000..dadaaf76158c --- /dev/null +++ b/lib/Target/JSBackend/NaCl/ExceptionInfoWriter.h @@ -0,0 +1,71 @@ +//===-- ExceptionInfoWriter.h - Generate C++ exception info------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef TRANSFORMS_NACL_EXCEPTIONINFOWRITER_H +#define TRANSFORMS_NACL_EXCEPTIONINFOWRITER_H + +#include "llvm/ADT/DenseMap.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" + +namespace llvm { + +// The ExceptionInfoWriter class converts the clauses of a +// "landingpad" instruction into data tables stored in global +// variables, which are interpreted by PNaCl's C++ runtime library. +// See ExceptionInfoWriter.cpp for a full description. +class ExceptionInfoWriter { + LLVMContext *Context; + StructType *ActionTableEntryTy; + + // Data for populating __pnacl_eh_type_table[], which is an array of + // std::type_info* pointers. Each of these pointers represents a + // C++ exception type. + SmallVector TypeTableData; + // Mapping from std::type_info* pointer to type ID (index in + // TypeTableData). + typedef DenseMap TypeTableIDMapType; + TypeTableIDMapType TypeTableIDMap; + + // Data for populating __pnacl_eh_action_table[], which is an array + // of pairs. 
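+  // (Each entry pairs a clause ID with the ID of the next node in the clause
+  // list, i.e. the <clause_id, next_clause_list_id> fields built by
+  // getIDForClauseListNode().)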
+ SmallVector ActionTableData; + // Pair of (clause_id, clause_list_id). + typedef std::pair ActionTableEntry; + // Mapping from (clause_id, clause_list_id) to clause_id (index in + // ActionTableData). + typedef DenseMap ActionTableIDMapType; + ActionTableIDMapType ActionTableIDMap; + + // Data for populating __pnacl_eh_filter_table[], which is an array + // of integers. + SmallVector FilterTableData; + + // Get the interned ID for an action. + unsigned getIDForClauseListNode(unsigned ClauseID, unsigned NextClauseListID); + + // Get the clause ID for a "filter" clause. + unsigned getIDForFilterClause(Value *Filter); + +public: + explicit ExceptionInfoWriter(LLVMContext *Context); + + // Get the interned type ID (a small integer) for a C++ exception type. + unsigned getIDForExceptionType(Value *Ty); + + // Get the clause list ID for a landingpad's clause list. + unsigned getIDForLandingPadClauseList(LandingPadInst *LP); + + // Add the exception info tables to the module. + void defineGlobalVariables(Module *M); +}; + +} + +#endif diff --git a/lib/Target/JSBackend/NaCl/ExpandArithWithOverflow.cpp b/lib/Target/JSBackend/NaCl/ExpandArithWithOverflow.cpp new file mode 100644 index 000000000000..4adcd74b7a02 --- /dev/null +++ b/lib/Target/JSBackend/NaCl/ExpandArithWithOverflow.cpp @@ -0,0 +1,234 @@ +//===- ExpandArithWithOverflow.cpp - Expand out uses of *.with.overflow----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// The llvm.*.with.overflow.*() intrinsics are awkward for PNaCl support because +// they return structs, and we want to omit struct types from IR in PNaCl's +// stable ABI. +// +// However, llvm.{umul,uadd}.with.overflow.*() are used by Clang to implement an +// overflow check for C++'s new[] operator, and {sadd,ssub} are used by +// ubsan. This pass expands out these uses so that PNaCl does not have to +// support *.with.overflow as part of PNaCl's stable ABI. 
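+//
+// For the simplest case (unsigned add with two non-constant operands), the
+// expansion is roughly:
+//
+//   %r = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
+//
+// becomes
+//
+//   %r.arith = add i32 %a, %b
+//   %r.overflow = icmp ult i32 %r.arith, %a
+//
+// with the two results then reassembled into a struct for the call's users.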
+// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/APInt.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/NaCl.h" + +#define DEBUG_TYPE "expand-arith-with-overflow" + +using namespace llvm; + +namespace { +class ExpandArithWithOverflow : public ModulePass { +public: + static char ID; + ExpandArithWithOverflow() : ModulePass(ID) { + initializeExpandArithWithOverflowPass(*PassRegistry::getPassRegistry()); + } + virtual bool runOnModule(Module &M); +}; +} + +char ExpandArithWithOverflow::ID = 0; +INITIALIZE_PASS(ExpandArithWithOverflow, "expand-arith-with-overflow", + "Expand out some uses of *.with.overflow intrinsics", false, + false) + +enum class ExpandArith { Add, Sub, Mul }; +static const ExpandArith ExpandArithOps[] = {ExpandArith::Add, ExpandArith::Sub, + ExpandArith::Mul}; + +static Intrinsic::ID getID(ExpandArith Op, bool Signed) { + static const Intrinsic::ID IDs[][2] = { + // Unsigned Signed + /* Add */ {Intrinsic::uadd_with_overflow, Intrinsic::sadd_with_overflow}, + /* Sub */ {Intrinsic::usub_with_overflow, Intrinsic::ssub_with_overflow}, + /* Mul */ {Intrinsic::umul_with_overflow, Intrinsic::smul_with_overflow}, + }; + return IDs[(size_t)Op][Signed]; +} + +static Instruction::BinaryOps getOpcode(ExpandArith Op) { + static const Instruction::BinaryOps Opcodes[] = { + Instruction::Add, Instruction::Sub, Instruction::Mul, + }; + return Opcodes[(size_t)Op]; +} + +static Value *CreateInsertValue(IRBuilder<> *IRB, Value *StructVal, + unsigned Index, Value *Field, + Instruction *BasedOn) { + SmallVector EVIndexes(1, Index); + return IRB->CreateInsertValue(StructVal, Field, EVIndexes, + BasedOn->getName() + ".insert"); +} + +static bool Expand(Module *M, unsigned Bits, ExpandArith Op, bool Signed) { + IntegerType *IntTy = IntegerType::get(M->getContext(), Bits); + SmallVector Types(1, IntTy); + Function *Intrinsic = + M->getFunction(Intrinsic::getName(getID(Op, Signed), Types)); + if (!Intrinsic) + return false; + + SmallVector Calls; + for (User *U : Intrinsic->users()) + if (CallInst *Call = dyn_cast(U)) { + Calls.push_back(Call); + } else { + errs() << "User: " << *U << "\n"; + report_fatal_error("ExpandArithWithOverflow: Taking the address of a " + "*.with.overflow intrinsic is not allowed"); + } + + for (CallInst *Call : Calls) { + DEBUG(dbgs() << "Expanding " << *Call << "\n"); + + StringRef Name = Call->getName(); + Value *LHS; + Value *RHS; + Value *NonConstOperand; + ConstantInt *ConstOperand; + bool hasConstOperand; + + if (ConstantInt *C = dyn_cast(Call->getArgOperand(0))) { + LHS = ConstOperand = C; + RHS = NonConstOperand = Call->getArgOperand(1); + hasConstOperand = true; + } else if (ConstantInt *C = dyn_cast(Call->getArgOperand(1))) { + LHS = NonConstOperand = Call->getArgOperand(0); + RHS = ConstOperand = C; + hasConstOperand = true; + } else { + LHS = Call->getArgOperand(0); + RHS = Call->getArgOperand(1); + hasConstOperand = false; + } + + IRBuilder<> IRB(Call); + Value *ArithResult = + IRB.CreateBinOp(getOpcode(Op), LHS, RHS, Name + ".arith"); + Value *OverflowResult; + + if (ExpandArith::Mul == Op && hasConstOperand && + ConstOperand->getValue() == 0) { + // Mul by zero never overflows but can divide by zero. 
+ OverflowResult = ConstantInt::getFalse(M->getContext()); + } else if (hasConstOperand && !Signed && ExpandArith::Sub != Op) { + // Unsigned add & mul with a constant operand can be optimized. + uint64_t ArgMax = + (ExpandArith::Mul == Op + ? APInt::getMaxValue(Bits).udiv(ConstOperand->getValue()) + : APInt::getMaxValue(Bits) - ConstOperand->getValue()) + .getLimitedValue(); + OverflowResult = + IRB.CreateICmp(CmpInst::ICMP_UGT, NonConstOperand, + ConstantInt::get(IntTy, ArgMax), Name + ".overflow"); + } else if (ExpandArith::Mul == Op) { + // Dividing the result by one of the operands should yield the other + // operand if there was no overflow. Note that this division can't + // overflow (signed division of INT_MIN / -1 overflows but can't occur + // here), but it could divide by 0 in which case we instead divide by 1 + // (this case didn't overflow). + // + // FIXME: This approach isn't optimal because it's better to perform a + // wider multiplication and mask off the result, or perform arithmetic on + // the component pieces. + auto DivOp = Signed ? Instruction::SDiv : Instruction::UDiv; + auto DenomIsZero = + IRB.CreateICmp(CmpInst::ICMP_EQ, RHS, + ConstantInt::get(RHS->getType(), 0), Name + ".iszero"); + auto Denom = + IRB.CreateSelect(DenomIsZero, ConstantInt::get(RHS->getType(), 1), + RHS, Name + ".denom"); + auto Div = IRB.CreateBinOp(DivOp, ArithResult, Denom, Name + ".div"); + OverflowResult = IRB.CreateSelect( + DenomIsZero, ConstantInt::getFalse(M->getContext()), + IRB.CreateICmp(CmpInst::ICMP_NE, Div, LHS, Name + ".same"), + Name + ".overflow"); + } else { + if (!Signed) { + switch (Op) { + case ExpandArith::Add: + // Overflow occurs if unsigned x+y < x (or y). We only need to compare + // with one of them because this is unsigned arithmetic: on overflow + // the result is smaller than both inputs, and when there's no + // overflow the result is greater than both inputs. + OverflowResult = IRB.CreateICmp(CmpInst::ICMP_ULT, ArithResult, LHS, + Name + ".overflow"); + break; + case ExpandArith::Sub: + // Overflow occurs if x < y. + OverflowResult = + IRB.CreateICmp(CmpInst::ICMP_ULT, LHS, RHS, Name + ".overflow"); + break; + case ExpandArith::Mul: // This is handled above. + llvm_unreachable("Unsigned variable saturating multiplication"); + } + } else { + // In the signed case, we care if the sum is >127 or <-128. When looked + // at as an unsigned number, that is precisely when the sum is >= 128. + Value *PositiveTemp = IRB.CreateBinOp( + Instruction::Add, LHS, + ConstantInt::get(IntTy, APInt::getSignedMinValue(Bits) + + (ExpandArith::Sub == Op ? 1 : 0)), + Name + ".postemp"); + Value *NegativeTemp = IRB.CreateBinOp( + Instruction::Add, LHS, + ConstantInt::get(IntTy, APInt::getSignedMaxValue(Bits) + + (ExpandArith::Sub == Op ? 1 : 0)), + Name + ".negtemp"); + Value *PositiveCheck = IRB.CreateICmp(CmpInst::ICMP_SLT, ArithResult, + PositiveTemp, Name + ".poscheck"); + Value *NegativeCheck = IRB.CreateICmp(CmpInst::ICMP_SGT, ArithResult, + NegativeTemp, Name + ".negcheck"); + Value *IsPositive = + IRB.CreateICmp(CmpInst::ICMP_SGE, LHS, ConstantInt::get(IntTy, 0), + Name + ".ispos"); + OverflowResult = IRB.CreateSelect(IsPositive, PositiveCheck, + NegativeCheck, Name + ".select"); + } + } + + // Construct the struct result. 
+ Value *NewStruct = UndefValue::get(Call->getType()); + NewStruct = CreateInsertValue(&IRB, NewStruct, 0, ArithResult, Call); + NewStruct = CreateInsertValue(&IRB, NewStruct, 1, OverflowResult, Call); + Call->replaceAllUsesWith(NewStruct); + Call->eraseFromParent(); + } + + Intrinsic->eraseFromParent(); + return true; +} + +static const unsigned MaxBits = 64; + +bool ExpandArithWithOverflow::runOnModule(Module &M) { + bool Modified = false; + for (ExpandArith Op : ExpandArithOps) + for (int Signed = false; Signed <= true; ++Signed) + for (unsigned Bits = 8; Bits <= MaxBits; Bits <<= 1) + Modified |= Expand(&M, Bits, Op, Signed); + return Modified; +} + +ModulePass *llvm::createExpandArithWithOverflowPass() { + return new ExpandArithWithOverflow(); +} diff --git a/lib/Target/JSBackend/NaCl/ExpandByVal.cpp b/lib/Target/JSBackend/NaCl/ExpandByVal.cpp new file mode 100644 index 000000000000..7022f8e6e355 --- /dev/null +++ b/lib/Target/JSBackend/NaCl/ExpandByVal.cpp @@ -0,0 +1,206 @@ +//===- ExpandByVal.cpp - Expand out use of "byval" and "sret" attributes---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass expands out by-value passing of structs as arguments and +// return values. In LLVM IR terms, it expands out the "byval" and +// "sret" function argument attributes. +// +// The semantics of the "byval" attribute are that the callee function +// gets a private copy of the pointed-to argument that it is allowed +// to modify. In implementing this, we have a choice between making +// the caller responsible for making the copy or making the callee +// responsible for making the copy. We choose the former, because +// this matches how the normal native calling conventions work, and +// because it often allows the caller to write struct contents +// directly into the stack slot that it passes the callee, without an +// additional copy. +// +// Note that this pass does not attempt to modify functions that pass +// structs by value without using "byval" or "sret", such as: +// +// define %struct.X @func() ; struct return +// define void @func(%struct.X %arg) ; struct arg +// +// The pass only handles functions such as: +// +// define void @func(%struct.X* sret %result_buffer) ; struct return +// define void @func(%struct.X* byval %ptr_to_arg) ; struct arg +// +// This is because PNaCl Clang generates the latter and not the former. +// +//===----------------------------------------------------------------------===// + +#include "llvm/IR/Attributes.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Transforms/NaCl.h" + +using namespace llvm; + +namespace { + // This is a ModulePass so that it can strip attributes from + // declared functions as well as defined functions. 
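+  //
+  // Sketch of the effect on a single call site (types and alignments are
+  // illustrative):
+  //
+  //   call void @f(%struct.X* byval align 4 %p)
+  //
+  // becomes, roughly:
+  //
+  //   %p.byval_copy = alloca %struct.X, align 4   ; in the caller's entry block
+  //   call void @llvm.memcpy.p0i8.p0i8.i64(...)   ; copy *%p into the alloca
+  //   call void @f(%struct.X* %p.byval_copy)      ; byval attribute dropped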
+ class ExpandByVal : public ModulePass { + public: + static char ID; // Pass identification, replacement for typeid + ExpandByVal() : ModulePass(ID) { + initializeExpandByValPass(*PassRegistry::getPassRegistry()); + } + + virtual bool runOnModule(Module &M); + }; +} + +char ExpandByVal::ID = 0; +INITIALIZE_PASS(ExpandByVal, "expand-byval", + "Expand out by-value passing of structs", + false, false) + +// removeAttribute() currently does not work on Attribute::Alignment +// (it fails with an assertion error), so we have to take a more +// convoluted route to removing this attribute by recreating the +// AttributeSet. +AttributeSet RemoveAttrs(LLVMContext &Context, AttributeSet Attrs) { + SmallVector AttrList; + for (unsigned Slot = 0; Slot < Attrs.getNumSlots(); ++Slot) { + unsigned Index = Attrs.getSlotIndex(Slot); + AttrBuilder AB; + for (AttributeSet::iterator Attr = Attrs.begin(Slot), E = Attrs.end(Slot); + Attr != E; ++Attr) { + if (Attr->isEnumAttribute() && + Attr->getKindAsEnum() != Attribute::ByVal && + Attr->getKindAsEnum() != Attribute::StructRet) { + AB.addAttribute(*Attr); + } + // IR semantics require that ByVal implies NoAlias. However, IR + // semantics do not require StructRet to imply NoAlias. For + // example, a global variable address can be passed as a + // StructRet argument, although Clang does not do so and Clang + // explicitly adds NoAlias to StructRet arguments. + if (Attr->isEnumAttribute() && + Attr->getKindAsEnum() == Attribute::ByVal) { + AB.addAttribute(Attribute::get(Context, Attribute::NoAlias)); + } + } + AttrList.push_back(AttributeSet::get(Context, Index, AB)); + } + return AttributeSet::get(Context, AttrList); +} + +// ExpandCall() can take a CallInst or an InvokeInst. It returns +// whether the instruction was modified. +template +static bool ExpandCall(DataLayout *DL, InstType *Call) { + bool Modify = false; + AttributeSet Attrs = Call->getAttributes(); + for (unsigned ArgIdx = 0; ArgIdx < Call->getNumArgOperands(); ++ArgIdx) { + unsigned AttrIdx = ArgIdx + 1; + + if (Attrs.hasAttribute(AttrIdx, Attribute::StructRet)) + Modify = true; + + if (Attrs.hasAttribute(AttrIdx, Attribute::ByVal)) { + Modify = true; + + Value *ArgPtr = Call->getArgOperand(ArgIdx); + Type *ArgType = ArgPtr->getType()->getPointerElementType(); + ConstantInt *ArgSize = ConstantInt::get( + Call->getContext(), APInt(64, DL->getTypeStoreSize(ArgType))); + // In principle, using the alignment from the argument attribute + // should be enough. However, Clang is not emitting this + // attribute for PNaCl. LLVM alloca instructions do not use the + // ABI alignment of the type, so this must be specified + // explicitly. + // See https://code.google.com/p/nativeclient/issues/detail?id=3403 + // + // Note that the parameter may have no alignment, but we have + // more useful information from the type which we can use here + // -- 0 in the parameter means no alignment is specified there, + // so it has default alignment, but in memcpy 0 means + // pessimistic alignment, the same as 1. + unsigned Alignment = + std::max(Attrs.getParamAlignment(AttrIdx), + DL->getABITypeAlignment(ArgType)); + + // Make a copy of the byval argument. 
+ Instruction *CopyBuf = new AllocaInst(ArgType, 0, Alignment, + ArgPtr->getName() + ".byval_copy"); + Function *Func = Call->getParent()->getParent(); + Func->getEntryBlock().getInstList().push_front(CopyBuf); + IRBuilder<> Builder(Call); + Builder.CreateLifetimeStart(CopyBuf, ArgSize); + // Using the argument's alignment attribute for the memcpy + // should be OK because the LLVM Language Reference says that + // the alignment attribute specifies "the alignment of the stack + // slot to form and the known alignment of the pointer specified + // to the call site". + Instruction *MemCpy = Builder.CreateMemCpy(CopyBuf, ArgPtr, ArgSize, + Alignment); + MemCpy->setDebugLoc(Call->getDebugLoc()); + + Call->setArgOperand(ArgIdx, CopyBuf); + + // Mark the argument copy as unused using llvm.lifetime.end. + if (isa(Call)) { + BasicBlock::iterator It = BasicBlock::iterator(Call); + Builder.SetInsertPoint(&*(++It)); + Builder.CreateLifetimeEnd(CopyBuf, ArgSize); + } else if (InvokeInst *Invoke = dyn_cast(Call)) { + Builder.SetInsertPoint(&*Invoke->getNormalDest()->getFirstInsertionPt()); + Builder.CreateLifetimeEnd(CopyBuf, ArgSize); + Builder.SetInsertPoint(&*Invoke->getUnwindDest()->getFirstInsertionPt()); + Builder.CreateLifetimeEnd(CopyBuf, ArgSize); + } + } + } + if (Modify) { + Call->setAttributes(RemoveAttrs(Call->getContext(), Attrs)); + + if (CallInst *CI = dyn_cast(Call)) { + // This is no longer a tail call because the callee references + // memory alloca'd by the caller. + CI->setTailCall(false); + } + } + return Modify; +} + +bool ExpandByVal::runOnModule(Module &M) { + bool Modified = false; + DataLayout DL(&M); + + for (Module::iterator Func = M.begin(), E = M.end(); Func != E; ++Func) { + AttributeSet NewAttrs = RemoveAttrs(Func->getContext(), + Func->getAttributes()); + Modified |= (NewAttrs != Func->getAttributes()); + Func->setAttributes(NewAttrs); + + for (Function::iterator BB = Func->begin(), E = Func->end(); + BB != E; ++BB) { + for (BasicBlock::iterator Inst = BB->begin(), E = BB->end(); + Inst != E; ++Inst) { + if (CallInst *Call = dyn_cast(Inst)) { + Modified |= ExpandCall(&DL, Call); + } else if (InvokeInst *Call = dyn_cast(Inst)) { + Modified |= ExpandCall(&DL, Call); + } + } + } + } + + return Modified; +} + +ModulePass *llvm::createExpandByValPass() { + return new ExpandByVal(); +} diff --git a/lib/Target/JSBackend/NaCl/ExpandConstantExpr.cpp b/lib/Target/JSBackend/NaCl/ExpandConstantExpr.cpp new file mode 100644 index 000000000000..82287ef90c06 --- /dev/null +++ b/lib/Target/JSBackend/NaCl/ExpandConstantExpr.cpp @@ -0,0 +1,121 @@ +//===- ExpandConstantExpr.cpp - Convert ConstantExprs to Instructions------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass expands out ConstantExprs into Instructions. +// +// Note that this only converts ConstantExprs that are referenced by +// Instructions. It does not convert ConstantExprs that are used as +// initializers for global variables. +// +// This simplifies the language so that the PNaCl translator does not +// need to handle ConstantExprs as part of a stable wire format for +// PNaCl. 
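+// For example (an illustrative sketch; the value names are made up), an
+// operand such as
+//   %r = add i64 %x, ptrtoint (i32* @g to i64)
+// has its ConstantExpr pulled out into a separate instruction:
+//   %expanded = ptrtoint i32* @g to i64
+//   %r = add i64 %x, %expanded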
+// +//===----------------------------------------------------------------------===// + +#include + +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/Pass.h" +#include "llvm/Transforms/NaCl.h" + +using namespace llvm; + +static bool expandInstruction(Instruction *Inst); + +namespace { + // This is a FunctionPass because our handling of PHI nodes means + // that our modifications may cross BasicBlocks. + struct ExpandConstantExpr : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + ExpandConstantExpr() : FunctionPass(ID) { + initializeExpandConstantExprPass(*PassRegistry::getPassRegistry()); + } + + virtual bool runOnFunction(Function &Func); + }; +} + +char ExpandConstantExpr::ID = 0; +INITIALIZE_PASS(ExpandConstantExpr, "expand-constant-expr", + "Expand out ConstantExprs into Instructions", + false, false) + +static Value *expandConstantExpr(Instruction *InsertPt, ConstantExpr *Expr) { + Instruction *NewInst = Expr->getAsInstruction(); + NewInst->insertBefore(InsertPt); + NewInst->setName("expanded"); + expandInstruction(NewInst); + return NewInst; +} + +// XXX Emscripten: Utilities for illegal expressions. +static bool isIllegal(Type *T) { + if (!T->isIntegerTy()) return false; + unsigned Bits = T->getIntegerBitWidth(); + // we need to expand out not just 64-bit and larger values, but also i24s, so PromoteIntegers can process them + return Bits != 1 && Bits != 8 && Bits != 16 && Bits != 32; +} +static bool ContainsIllegalTypes(const Value *Expr) { + if (isIllegal(Expr->getType())) + return true; + if (const User *U = dyn_cast(Expr)) { + for (User::const_op_iterator I = U->op_begin(), E = U->op_end(); I != E; ++I) { + if (Constant *C = dyn_cast(*I)) { + if (!isa(C) && ContainsIllegalTypes(C)) { + return true; + } + } + } + } + return false; +} + +static bool expandInstruction(Instruction *Inst) { + // A landingpad can only accept ConstantExprs, so it should remain + // unmodified. + if (isa(Inst)) + return false; + + bool Modified = false; + for (unsigned OpNum = 0; OpNum < Inst->getNumOperands(); OpNum++) { + if (ConstantExpr *Expr = + dyn_cast(Inst->getOperand(OpNum))) { + // XXX Emscripten: Only do the expansion of the expression contains + // illegal types, for now, since we can handle legal ConstantExprs + // in the backend directly. + if (ContainsIllegalTypes(Expr)) { + Modified = true; + Use *U = &Inst->getOperandUse(OpNum); + PhiSafeReplaceUses(U, expandConstantExpr(PhiSafeInsertPt(U), Expr)); + } + } + } + return Modified; +} + +bool ExpandConstantExpr::runOnFunction(Function &Func) { + bool Modified = false; + for (llvm::Function::iterator BB = Func.begin(), E = Func.end(); + BB != E; + ++BB) { + for (BasicBlock::InstListType::iterator Inst = BB->begin(), E = BB->end(); + Inst != E; + ++Inst) { + Modified |= expandInstruction(&*Inst); + } + } + return Modified; +} + +FunctionPass *llvm::createExpandConstantExprPass() { + return new ExpandConstantExpr(); +} diff --git a/lib/Target/JSBackend/NaCl/ExpandCtors.cpp b/lib/Target/JSBackend/NaCl/ExpandCtors.cpp new file mode 100644 index 000000000000..97398870b400 --- /dev/null +++ b/lib/Target/JSBackend/NaCl/ExpandCtors.cpp @@ -0,0 +1,154 @@ +//===- ExpandCtors.cpp - Convert ctors/dtors to concrete arrays -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This pass converts LLVM's special symbols llvm.global_ctors and +// llvm.global_dtors to concrete arrays, __init_array_start/end and +// __fini_array_start/end, that are usable by a C library. +// +// This pass sorts the contents of global_ctors/dtors according to the +// priority values they contain and removes the priority values. +// +//===----------------------------------------------------------------------===// + +#include + +#include "llvm/Pass.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/TypeBuilder.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/NaCl.h" + +using namespace llvm; + +namespace { + struct ExpandCtors : public ModulePass { + static char ID; // Pass identification, replacement for typeid + ExpandCtors() : ModulePass(ID) { + initializeExpandCtorsPass(*PassRegistry::getPassRegistry()); + } + + virtual bool runOnModule(Module &M); + }; +} + +char ExpandCtors::ID = 0; +INITIALIZE_PASS(ExpandCtors, "nacl-expand-ctors", + "Hook up constructor and destructor arrays to libc", + false, false) + +static void setGlobalVariableValue(Module &M, const char *Name, + Constant *Value) { + if (GlobalVariable *Var = M.getNamedGlobal(Name)) { + if (Var->hasInitializer()) { + report_fatal_error(std::string("Variable ") + Name + + " already has an initializer"); + } + Var->replaceAllUsesWith(ConstantExpr::getBitCast(Value, Var->getType())); + Var->eraseFromParent(); + } +} + +struct FuncArrayEntry { + uint64_t priority; + Constant *func; +}; + +static bool compareEntries(FuncArrayEntry Entry1, FuncArrayEntry Entry2) { + return Entry1.priority < Entry2.priority; +} + +static void readFuncList(GlobalVariable *Array, std::vector *Funcs) { + if (!Array->hasInitializer()) + return; + Constant *Init = Array->getInitializer(); + ArrayType *Ty = dyn_cast(Init->getType()); + if (!Ty) { + errs() << "Initializer: " << *Array->getInitializer() << "\n"; + report_fatal_error("ExpandCtors: Initializer is not of array type"); + } + if (Ty->getNumElements() == 0) + return; + ConstantArray *InitList = dyn_cast(Init); + if (!InitList) { + errs() << "Initializer: " << *Array->getInitializer() << "\n"; + report_fatal_error("ExpandCtors: Unexpected initializer ConstantExpr"); + } + std::vector FuncsToSort; + for (unsigned Index = 0; Index < InitList->getNumOperands(); ++Index) { + ConstantStruct *CS = cast(InitList->getOperand(Index)); + FuncArrayEntry Entry; + Entry.priority = cast(CS->getOperand(0))->getZExtValue(); + Entry.func = CS->getOperand(1); + FuncsToSort.push_back(Entry); + } + + std::sort(FuncsToSort.begin(), FuncsToSort.end(), compareEntries); + for (std::vector::iterator Iter = FuncsToSort.begin(); + Iter != FuncsToSort.end(); + ++Iter) { + Funcs->push_back(Iter->func); + } +} + +static void defineFuncArray(Module &M, const char *LlvmArrayName, + const char *StartSymbol, + const char *EndSymbol) { + std::vector Funcs; + + GlobalVariable *Array = M.getNamedGlobal(LlvmArrayName); + if (Array) { + readFuncList(Array, &Funcs); + // No code should be referencing global_ctors/global_dtors, + // because this symbol is internal to LLVM. 
+ Array->eraseFromParent(); + } + + Type *FuncTy = FunctionType::get(Type::getVoidTy(M.getContext()), false); + Type *FuncPtrTy = FuncTy->getPointerTo(); + ArrayType *ArrayTy = ArrayType::get(FuncPtrTy, Funcs.size()); + GlobalVariable *NewArray = + new GlobalVariable(M, ArrayTy, /* isConstant= */ true, + GlobalValue::InternalLinkage, + ConstantArray::get(ArrayTy, Funcs)); + setGlobalVariableValue(M, StartSymbol, NewArray); + // We do this last so that LLVM gives NewArray the name + // "__{init,fini}_array_start" without adding any suffixes to + // disambiguate from the original GlobalVariable's name. This is + // not essential -- it just makes the output easier to understand + // when looking at symbols for debugging. + NewArray->setName(StartSymbol); + + // We replace "__{init,fini}_array_end" with the address of the end + // of NewArray. This removes the name "__{init,fini}_array_end" + // from the output, which is not ideal for debugging. Ideally we + // would convert "__{init,fini}_array_end" to being a GlobalAlias + // that points to the end of the array. However, unfortunately LLVM + // does not generate correct code when a GlobalAlias contains a + // GetElementPtr ConstantExpr. + Constant *NewArrayEnd = + ConstantExpr::getGetElementPtr(ArrayTy, NewArray, + ConstantInt::get(M.getContext(), + APInt(32, 1))); + setGlobalVariableValue(M, EndSymbol, NewArrayEnd); +} + +bool ExpandCtors::runOnModule(Module &M) { + defineFuncArray(M, "llvm.global_ctors", + "__init_array_start", "__init_array_end"); + defineFuncArray(M, "llvm.global_dtors", + "__fini_array_start", "__fini_array_end"); + return true; +} + +ModulePass *llvm::createExpandCtorsPass() { + return new ExpandCtors(); +} diff --git a/lib/Target/JSBackend/NaCl/ExpandGetElementPtr.cpp b/lib/Target/JSBackend/NaCl/ExpandGetElementPtr.cpp new file mode 100644 index 000000000000..bd576866d090 --- /dev/null +++ b/lib/Target/JSBackend/NaCl/ExpandGetElementPtr.cpp @@ -0,0 +1,151 @@ +//===- ExpandGetElementPtr.cpp - Expand GetElementPtr into arithmetic------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass expands out GetElementPtr instructions into ptrtoint, +// inttoptr and arithmetic instructions. +// +// This simplifies the language so that the PNaCl translator does not +// need to handle GetElementPtr and struct types as part of a stable +// wire format for PNaCl. +// +// Note that we drop the "inbounds" attribute of GetElementPtr. 
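+// For example (an illustrative sketch, assuming 32-bit pointers and
+// %struct.S = type { i32, i32 }, so that field 1 is at byte offset 4):
+//   %f = getelementptr inbounds %struct.S, %struct.S* %p, i32 0, i32 1
+// becomes roughly
+//   %gep_int = ptrtoint %struct.S* %p to i32
+//   %gep = add i32 %gep_int, 4
+//   %f = inttoptr i32 %gep to i32*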
+// +//===----------------------------------------------------------------------===// + +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/Pass.h" +#include "llvm/Transforms/NaCl.h" + +using namespace llvm; + +namespace { + class ExpandGetElementPtr : public BasicBlockPass { + public: + static char ID; // Pass identification, replacement for typeid + ExpandGetElementPtr() : BasicBlockPass(ID) { + initializeExpandGetElementPtrPass(*PassRegistry::getPassRegistry()); + } + + virtual bool runOnBasicBlock(BasicBlock &BB); + }; +} + +char ExpandGetElementPtr::ID = 0; +INITIALIZE_PASS(ExpandGetElementPtr, "expand-getelementptr", + "Expand out GetElementPtr instructions into arithmetic", + false, false) + +static Value *CastToPtrSize(Value *Val, Instruction *InsertPt, + const DebugLoc &Debug, Type *PtrType) { + unsigned ValSize = Val->getType()->getIntegerBitWidth(); + unsigned PtrSize = PtrType->getIntegerBitWidth(); + if (ValSize == PtrSize) + return Val; + Instruction *Inst; + if (ValSize > PtrSize) { + Inst = new TruncInst(Val, PtrType, "gep_trunc", InsertPt); + } else { + // GEP indexes must be sign-extended. + Inst = new SExtInst(Val, PtrType, "gep_sext", InsertPt); + } + Inst->setDebugLoc(Debug); + return Inst; +} + +static void FlushOffset(Instruction **Ptr, uint64_t *CurrentOffset, + Instruction *InsertPt, const DebugLoc &Debug, + Type *PtrType) { + if (*CurrentOffset) { + *Ptr = BinaryOperator::Create(Instruction::Add, *Ptr, + ConstantInt::get(PtrType, *CurrentOffset), + "gep", InsertPt); + (*Ptr)->setDebugLoc(Debug); + *CurrentOffset = 0; + } +} + +static void ExpandGEP(GetElementPtrInst *GEP, DataLayout *DL, Type *PtrType) { + const DebugLoc &Debug = GEP->getDebugLoc(); + Instruction *Ptr = new PtrToIntInst(GEP->getPointerOperand(), PtrType, + "gep_int", GEP); + Ptr->setDebugLoc(Debug); + + Type *CurrentTy = GEP->getPointerOperand()->getType(); + // We do some limited constant folding ourselves. An alternative + // would be to generate verbose, unfolded output (e.g. multiple + // adds; adds of zero constants) and use a later pass such as + // "-instcombine" to clean that up. However, "-instcombine" can + // reintroduce GetElementPtr instructions. 
+ uint64_t CurrentOffset = 0; + + for (GetElementPtrInst::op_iterator Op = GEP->op_begin() + 1; + Op != GEP->op_end(); + ++Op) { + Value *Index = *Op; + if (StructType *StTy = dyn_cast(CurrentTy)) { + uint64_t Field = cast(Op)->getZExtValue(); + CurrentTy = StTy->getElementType(Field); + CurrentOffset += DL->getStructLayout(StTy)->getElementOffset(Field); + } else { + CurrentTy = cast(CurrentTy)->getElementType(); + uint64_t ElementSize = DL->getTypeAllocSize(CurrentTy); + if (ConstantInt *C = dyn_cast(Index)) { + CurrentOffset += C->getSExtValue() * ElementSize; + } else { + FlushOffset(&Ptr, &CurrentOffset, GEP, Debug, PtrType); + Index = CastToPtrSize(Index, GEP, Debug, PtrType); + if (ElementSize != 1) { + Index = CopyDebug( + BinaryOperator::Create(Instruction::Mul, Index, + ConstantInt::get(PtrType, ElementSize), + "gep_array", GEP), + GEP); + } + Ptr = BinaryOperator::Create(Instruction::Add, Ptr, + Index, "gep", GEP); + Ptr->setDebugLoc(Debug); + } + } + } + FlushOffset(&Ptr, &CurrentOffset, GEP, Debug, PtrType); + + assert(CurrentTy == GEP->getResultElementType()); + Instruction *Result = new IntToPtrInst(Ptr, GEP->getType(), "", GEP); + Result->setDebugLoc(Debug); + Result->takeName(GEP); + GEP->replaceAllUsesWith(Result); + GEP->eraseFromParent(); +} + +bool ExpandGetElementPtr::runOnBasicBlock(BasicBlock &BB) { + bool Modified = false; + DataLayout DL(BB.getParent()->getParent()); + Type *PtrType = DL.getIntPtrType(BB.getContext()); + + for (BasicBlock::InstListType::iterator Iter = BB.begin(); + Iter != BB.end(); ) { + Instruction *Inst = &*Iter++; + if (GetElementPtrInst *GEP = dyn_cast(Inst)) { + Modified = true; + ExpandGEP(GEP, &DL, PtrType); + } + } + return Modified; +} + +BasicBlockPass *llvm::createExpandGetElementPtrPass() { + return new ExpandGetElementPtr(); +} diff --git a/lib/Target/JSBackend/NaCl/ExpandI64.cpp b/lib/Target/JSBackend/NaCl/ExpandI64.cpp new file mode 100644 index 000000000000..e7921b10879c --- /dev/null +++ b/lib/Target/JSBackend/NaCl/ExpandI64.cpp @@ -0,0 +1,1235 @@ +//===- ExpandI64.cpp - Expand i64 and wider integer types -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===------------------------------------------------------------------===// +// +// This pass expands and lowers all operations on integers i64 and wider +// into 32-bit operations that can be handled by JS in a natural way. +// +// 64-bit variables become pairs of 2 32-bit variables, for the low and +// high 32 bit chunks. This happens for both registers and function +// arguments. Function return values become a return of the low 32 bits +// and a store of the high 32-bits in tempRet0, a global helper variable. +// Larger values become more chunks of 32 bits. Currently we require that +// types be a multiple of 32 bits. +// +// Many operations then become simple pairs of operations, for example +// bitwise AND becomes and AND of each 32-bit chunk. More complex operations +// like addition are lowered into calls into library support code in +// Emscripten (i64Add for example). 
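+// For example (an illustrative sketch; the chunk names are made up), a 64-bit
+// addition such as
+//   %r = add i64 %a, %b
+// is conceptually lowered to operations on i32 chunks:
+//   %r_low = call i32 @i64Add(i32 %a_low, i32 %a_high, i32 %b_low, i32 %b_high)
+//   %r_high = call i32 @getHigh32()
+// where i64Add and getHigh32 are the Emscripten library helpers set up in
+// ensureFuncs() below.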
+// +//===------------------------------------------------------------------===// + +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Transforms/NaCl.h" +#include "llvm/Transforms/Utils/Local.h" +#include +#include + +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +namespace { + + struct PhiBlockChange { + BasicBlock *DD, *SwitchBB, *NewBB; + }; + + typedef SmallVector ChunksVec; + typedef std::map SplitsMap; + + typedef SmallVector PHIVec; + typedef SmallVector DeadVec; + + // This is a ModulePass because the pass recreates functions in + // order to expand i64 arguments to pairs of i32s. + class ExpandI64 : public ModulePass { + bool Changed; + const DataLayout *DL; + Module *TheModule; + + SplitsMap Splits; // old illegal value to new insts + PHIVec Phis; + std::vector PhiBlockChanges; + + // If the function has an illegal return or argument, create a legal version + void ensureLegalFunc(Function *F); + + // If a function is illegal, remove it + void removeIllegalFunc(Function *F); + + // splits an illegal instruction into 32-bit chunks. We do + // not yet have the values yet, as they depend on other + // splits, so store the parts in Splits, for FinalizeInst. + bool splitInst(Instruction *I); + + // For an illegal value, returns the split out chunks + // representing the low and high parts, that splitInst + // generated. 
+ // The value can also be a constant, in which case we just + // split it, or a function argument, in which case we + // map to the proper legalized new arguments + // + // @param AllowUnreachable It is possible for phi nodes + // to refer to unreachable blocks, + // which our traversal never + // reaches; this flag lets us + // ignore those - otherwise, + // not finding chunks is fatal + ChunksVec getChunks(Value *V, bool AllowUnreachable=false); + + Function *Add, *Sub, *Mul, *SDiv, *UDiv, *SRem, *URem, *LShr, *AShr, *Shl, *GetHigh, *SetHigh, *FtoILow, *FtoIHigh, *DtoILow, *DtoIHigh, *SItoF, *UItoF, *SItoD, *UItoD, *BItoD, *BDtoILow, *BDtoIHigh; + + Function *AtomicAdd, *AtomicSub, *AtomicAnd, *AtomicOr, *AtomicXor; + + void ensureFuncs(); + unsigned getNumChunks(Type *T); + + public: + static char ID; + ExpandI64() : ModulePass(ID) { + initializeExpandI64Pass(*PassRegistry::getPassRegistry()); + + Add = Sub = Mul = SDiv = UDiv = SRem = URem = LShr = AShr = Shl = GetHigh = SetHigh = AtomicAdd = AtomicSub = AtomicAnd = AtomicOr = AtomicXor = NULL; + } + + virtual bool runOnModule(Module &M); + }; +} + +char ExpandI64::ID = 0; +INITIALIZE_PASS(ExpandI64, "expand-illegal-ints", + "Expand and lower illegal >i32 operations into 32-bit chunks", + false, false) + +// Utilities + +static Instruction *CopyDebug(Instruction *NewInst, Instruction *Original) { + NewInst->setDebugLoc(Original->getDebugLoc()); + return NewInst; +} + +static bool isIllegal(Type *T) { + return T->isIntegerTy() && T->getIntegerBitWidth() > 32; +} + +static FunctionType *getLegalizedFunctionType(FunctionType *FT) { + SmallVector ArgTypes; // XXX + int Num = FT->getNumParams(); + for (int i = 0; i < Num; i++) { + Type *T = FT->getParamType(i); + if (!isIllegal(T)) { + ArgTypes.push_back(T); + } else { + Type *i32 = Type::getInt32Ty(FT->getContext()); + ArgTypes.push_back(i32); + ArgTypes.push_back(i32); + } + } + Type *RT = FT->getReturnType(); + Type *NewRT; + if (!isIllegal(RT)) { + NewRT = RT; + } else { + NewRT = Type::getInt32Ty(FT->getContext()); + } + return FunctionType::get(NewRT, ArgTypes, false); +} + +// Implementation of ExpandI64 + +static bool okToRemainIllegal(Function *F) { + StringRef Name = F->getName(); + if (Name == "llvm.dbg.value") return true; + + // XXX EMSCRIPTEN: These take an i64 immediate argument; since they're not + // real instructions, we don't need to legalize them. + if (Name == "llvm.lifetime.start") return true; + if (Name == "llvm.lifetime.end") return true; + if (Name == "llvm.invariant.start") return true; + if (Name == "llvm.invariant.end") return true; + + return false; +} + +unsigned ExpandI64::getNumChunks(Type *T) { + unsigned Num = DL->getTypeSizeInBits(T); + return (Num + 31) / 32; +} + +static bool isLegalFunctionType(FunctionType *FT) { + if (isIllegal(FT->getReturnType())) { + return false; + } + + int Num = FT->getNumParams(); + for (int i = 0; i < Num; i++) { + if (isIllegal(FT->getParamType(i))) { + return false; + } + } + + return true; +} + +static bool isLegalInstruction(const Instruction *I) { + if (isIllegal(I->getType())) { + return false; + } + + for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) { + if (isIllegal(I->getOperand(i)->getType())) { + return false; + } + } + + return true; +} + +// We can't use RecreateFunction because we need to handle +// function and argument attributes specially. 
+static Function *RecreateFunctionLegalized(Function *F, FunctionType *NewType) { + Function *NewFunc = Function::Create(NewType, F->getLinkage()); + + AttributeSet Attrs = F->getAttributes(); + AttributeSet FnAttrs = Attrs.getFnAttributes(); + + // Legalizing the return value is done by storing part of the value into + // static storage. Subsequent analysis will see this as a memory access, + // so we can no longer claim to be readonly or readnone. + if (isIllegal(F->getReturnType())) { + FnAttrs = FnAttrs.removeAttribute(F->getContext(), + AttributeSet::FunctionIndex, + Attribute::ReadOnly); + FnAttrs = FnAttrs.removeAttribute(F->getContext(), + AttributeSet::FunctionIndex, + Attribute::ReadNone); + } + + NewFunc->addAttributes(AttributeSet::FunctionIndex, FnAttrs); + NewFunc->addAttributes(AttributeSet::ReturnIndex, Attrs.getRetAttributes()); + Function::arg_iterator AI = F->arg_begin(); + + // We need to recreate the attribute set, with the right indexes + AttributeSet NewAttrs; + unsigned NumArgs = F->arg_size(); + for (unsigned i = 1, j = 1; i < NumArgs+1; i++, j++, AI++) { + if (isIllegal(AI->getType())) { + j++; + continue; + } + if (!Attrs.hasAttributes(i)) continue; + AttributeSet ParamAttrs = Attrs.getParamAttributes(i); + AttrBuilder AB; + unsigned NumSlots = ParamAttrs.getNumSlots(); + for (unsigned k = 0; k < NumSlots; k++) { + for (AttributeSet::iterator I = ParamAttrs.begin(k), E = ParamAttrs.end(k); I != E; I++) { + AB.addAttribute(*I); + } + } + NewFunc->addAttributes(j, AttributeSet::get(F->getContext(), j, AB)); + } + + F->getParent()->getFunctionList().insert(F->getIterator(), NewFunc); + NewFunc->takeName(F); + NewFunc->getBasicBlockList().splice(NewFunc->begin(), + F->getBasicBlockList()); + F->replaceAllUsesWith( + ConstantExpr::getBitCast(NewFunc, + F->getFunctionType()->getPointerTo())); + return NewFunc; +} + +void ExpandI64::ensureLegalFunc(Function *F) { + if (okToRemainIllegal(F)) return; + + FunctionType *FT = F->getFunctionType(); + if (isLegalFunctionType(FT)) return; + + Changed = true; + Function *NF = RecreateFunctionLegalized(F, getLegalizedFunctionType(FT)); + std::string Name = NF->getName(); + if (strncmp(Name.c_str(), "llvm.", 5) == 0) { + // this is an intrinsic, and we are changing its signature, which will annoy LLVM, so rename + const size_t len = Name.size(); + SmallString<256> NewName; + NewName.resize(len); + for (unsigned i = 0; i < len; i++) { + NewName[i] = Name[i] != '.' ? 
Name[i] : '_'; + } + NF->setName(Twine(NewName)); + } + + // Move and update arguments + for (Function::arg_iterator Arg = F->arg_begin(), E = F->arg_end(), NewArg = NF->arg_begin(); + Arg != E; ++Arg) { + if (Arg->getType() == NewArg->getType()) { + NewArg->takeName(&*Arg); + Arg->replaceAllUsesWith(&*NewArg); + NewArg++; + } else { + // This was legalized + ChunksVec &Chunks = Splits[&*Arg]; + int Num = getNumChunks(Arg->getType()); + assert(Num == 2); + for (int i = 0; i < Num; i++) { + Chunks.push_back(&*NewArg); + if (NewArg->hasName()) Chunks[i]->setName(NewArg->getName() + "$" + utostr(i)); + NewArg++; + } + } + } +} + +void ExpandI64::removeIllegalFunc(Function *F) { + if (okToRemainIllegal(F)) return; + + FunctionType *FT = F->getFunctionType(); + if (!isLegalFunctionType(FT)) { + F->eraseFromParent(); + } +} + +bool ExpandI64::splitInst(Instruction *I) { + Type *i32 = Type::getInt32Ty(I->getContext()); + Type *i32P = i32->getPointerTo(); + Type *i64 = Type::getInt64Ty(I->getContext()); + Value *Zero = Constant::getNullValue(i32); + + ChunksVec &Chunks = Splits[I]; + + switch (I->getOpcode()) { + case Instruction::GetElementPtr: { + GetElementPtrInst *GEP = cast(I); + SmallVector NewOps; + for (unsigned i = 1, e = I->getNumOperands(); i != e; ++i) { + Value *Op = I->getOperand(i); + if (isIllegal(Op->getType())) { + // Truncate the operand down to one chunk. + NewOps.push_back(getChunks(Op)[0]); + } else { + NewOps.push_back(Op); + } + } + Value *NewGEP = CopyDebug(GetElementPtrInst::Create(GEP->getSourceElementType(), GEP->getPointerOperand(), NewOps, "", GEP), GEP); + Chunks.push_back(NewGEP); + I->replaceAllUsesWith(NewGEP); + break; + } + case Instruction::SExt: { + ChunksVec InputChunks; + Value *Op = I->getOperand(0); + if (isIllegal(Op->getType())) { + InputChunks = getChunks(Op); + } else { + InputChunks.push_back(Op); + } + + for (unsigned i = 0, e = InputChunks.size(); i != e; ++i) { + Value *Input = InputChunks[i]; + + Type *T = Input->getType(); + Value *Chunk; + if (T->getIntegerBitWidth() < 32) { + Chunk = CopyDebug(new SExtInst(Input, i32, "", I), I); + } else { + assert(T->getIntegerBitWidth() == 32); + Chunk = Input; + } + Chunks.push_back(Chunk); + } + + Instruction *Check = CopyDebug(new ICmpInst(I, ICmpInst::ICMP_SLT, Chunks.back(), Zero), I); + int Num = getNumChunks(I->getType()); + for (int i = Chunks.size(); i < Num; i++) { + Instruction *High = CopyDebug(new SExtInst(Check, i32, "", I), I); + Chunks.push_back(High); + } + break; + } + case Instruction::PtrToInt: + case Instruction::ZExt: { + Value *Op = I->getOperand(0); + ChunksVec InputChunks; + if (I->getOpcode() == Instruction::PtrToInt) { + InputChunks.push_back(CopyDebug(new PtrToIntInst(Op, i32, "", I), I)); + } else if (isIllegal(Op->getType())) { + InputChunks = getChunks(Op); + } else { + InputChunks.push_back(Op); + } + + for (unsigned i = 0, e = InputChunks.size(); i != e; ++i) { + Value *Input = InputChunks[i]; + Type *T = Input->getType(); + + Value *Chunk; + if (T->getIntegerBitWidth() < 32) { + Chunk = CopyDebug(new ZExtInst(Input, i32, "", I), I); + } else { + assert(T->getIntegerBitWidth() == 32); + Chunk = Input; + } + Chunks.push_back(Chunk); + } + + int Num = getNumChunks(I->getType()); + for (int i = Chunks.size(); i < Num; i++) { + Chunks.push_back(Zero); + } + break; + } + case Instruction::IntToPtr: + case Instruction::Trunc: { + unsigned Num = getNumChunks(I->getType()); + unsigned NumBits = DL->getTypeSizeInBits(I->getType()); + ChunksVec InputChunks = 
getChunks(I->getOperand(0)); + for (unsigned i = 0; i < Num; i++) { + Value *Input = InputChunks[i]; + + Value *Chunk; + if (NumBits < 32) { + Chunk = CopyDebug(new TruncInst(Input, IntegerType::get(I->getContext(), NumBits), "", I), I); + NumBits = 0; + } else { + Chunk = Input; + NumBits -= 32; + } + if (I->getOpcode() == Instruction::IntToPtr) { + assert(i == 0); + Chunk = CopyDebug(new IntToPtrInst(Chunk, I->getType(), "", I), I); + } + Chunks.push_back(Chunk); + } + if (!isIllegal(I->getType())) { + assert(Chunks.size() == 1); + I->replaceAllUsesWith(Chunks[0]); + } + break; + } + case Instruction::Load: { + LoadInst *LI = cast(I); + Instruction *AI = CopyDebug(new PtrToIntInst(LI->getPointerOperand(), i32, "", I), I); + int Num = getNumChunks(I->getType()); + for (int i = 0; i < Num; i++) { + Instruction *Add = i == 0 ? AI : CopyDebug(BinaryOperator::Create(Instruction::Add, AI, ConstantInt::get(i32, 4*i), "", I), I); + Instruction *Ptr = CopyDebug(new IntToPtrInst(Add, i32P, "", I), I); + LoadInst *Chunk = new LoadInst(Ptr, "", I); CopyDebug(Chunk, I); + Chunk->setAlignment(MinAlign(LI->getAlignment() == 0 ? + DL->getABITypeAlignment(LI->getType()) : + LI->getAlignment(), + 4*i)); + Chunk->setVolatile(LI->isVolatile()); + Chunk->setOrdering(LI->getOrdering()); + Chunk->setSynchScope(LI->getSynchScope()); + Chunks.push_back(Chunk); + } + break; + } + case Instruction::Store: { + StoreInst *SI = cast(I); + Instruction *AI = CopyDebug(new PtrToIntInst(SI->getPointerOperand(), i32, "", I), I); + ChunksVec InputChunks = getChunks(SI->getValueOperand()); + int Num = InputChunks.size(); + for (int i = 0; i < Num; i++) { + Instruction *Add = i == 0 ? AI : CopyDebug(BinaryOperator::Create(Instruction::Add, AI, ConstantInt::get(i32, 4*i), "", I), I); + Instruction *Ptr = CopyDebug(new IntToPtrInst(Add, i32P, "", I), I); + StoreInst *Chunk = new StoreInst(InputChunks[i], Ptr, I); + Chunk->setAlignment(MinAlign(SI->getAlignment() == 0 ? 
+ DL->getABITypeAlignment(SI->getValueOperand()->getType()) : + SI->getAlignment(), + 4*i)); + Chunk->setVolatile(SI->isVolatile()); + Chunk->setOrdering(SI->getOrdering()); + Chunk->setSynchScope(SI->getSynchScope()); + CopyDebug(Chunk, I); + } + break; + } + case Instruction::Ret: { + assert(I->getOperand(0)->getType() == i64); + ChunksVec InputChunks = getChunks(I->getOperand(0)); + ensureFuncs(); + SmallVector Args; + Args.push_back(InputChunks[1]); + CopyDebug(CallInst::Create(SetHigh, Args, "", I), I); + CopyDebug(ReturnInst::Create(I->getContext(), InputChunks[0], I), I); + break; + } + case Instruction::Add: + case Instruction::Sub: + case Instruction::Mul: + case Instruction::SDiv: + case Instruction::UDiv: + case Instruction::SRem: + case Instruction::URem: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::Shl: { + ChunksVec LeftChunks = getChunks(I->getOperand(0)); + ChunksVec RightChunks = getChunks(I->getOperand(1)); + unsigned Num = getNumChunks(I->getType()); + if (Num == 2) { + ensureFuncs(); + Value *Low = NULL, *High = NULL; + Function *F = NULL; + switch (I->getOpcode()) { + case Instruction::Add: F = Add; break; + case Instruction::Sub: F = Sub; break; + case Instruction::Mul: F = Mul; break; + case Instruction::SDiv: F = SDiv; break; + case Instruction::UDiv: F = UDiv; break; + case Instruction::SRem: F = SRem; break; + case Instruction::URem: F = URem; break; + case Instruction::AShr: F = AShr; break; + case Instruction::LShr: { + if (ConstantInt *CI = dyn_cast(I->getOperand(1))) { + unsigned Shifts = CI->getZExtValue(); + if (Shifts == 32) { + Low = LeftChunks[1]; + High = Zero; + break; + } + } + F = LShr; + break; + } + case Instruction::Shl: { + if (ConstantInt *CI = dyn_cast(I->getOperand(1))) { + const APInt &Shifts = CI->getValue(); + if (Shifts == 32) { + Low = Zero; + High = LeftChunks[0]; + break; + } + } + F = Shl; + break; + } + default: assert(0); + } + if (F) { + // use a library call, no special optimization was found + SmallVector Args; + Args.push_back(LeftChunks[0]); + Args.push_back(LeftChunks[1]); + Args.push_back(RightChunks[0]); + Args.push_back(RightChunks[1]); + Low = CopyDebug(CallInst::Create(F, Args, "", I), I); + High = CopyDebug(CallInst::Create(GetHigh, "", I), I); + } + Chunks.push_back(Low); + Chunks.push_back(High); + } else { + // more than 64 bits. 
handle simple shifts for lshr and shl + assert(I->getOpcode() == Instruction::LShr || I->getOpcode() == Instruction::AShr || I->getOpcode() == Instruction::Shl); + ConstantInt *CI = cast(I->getOperand(1)); + unsigned Shifts = CI->getZExtValue(); + unsigned Fraction = Shifts % 32; + Constant *Frac = ConstantInt::get(i32, Fraction); + Constant *Comp = ConstantInt::get(i32, 32 - Fraction); + Instruction::BinaryOps Opcode, Reverse; + unsigned ShiftChunks, Dir; + Value *TopFiller = Zero; + if (I->getOpcode() == Instruction::Shl) { + Opcode = Instruction::Shl; + Reverse = Instruction::LShr; + ShiftChunks = -(Shifts/32); + Dir = -1; + } else { + Opcode = Instruction::LShr; + Reverse = Instruction::Shl; + ShiftChunks = Shifts/32; + Dir = 1; + if (I->getOpcode() == Instruction::AShr) { + Value *Cond = CopyDebug(new ICmpInst(I, ICmpInst::ICMP_SLT, LeftChunks[LeftChunks.size()-1], Zero), I); + TopFiller = CopyDebug(SelectInst::Create(Cond, ConstantInt::get(i32, -1), Zero, "", I), I); + } + } + for (unsigned i = 0; i < Num; i++) { + Value *L; + if (i + ShiftChunks < LeftChunks.size()) { + L = LeftChunks[i + ShiftChunks]; + } else { + L = Zero; + } + + Value *H; + if (i + ShiftChunks + Dir < LeftChunks.size()) { + H = LeftChunks[i + ShiftChunks + Dir]; + } else { + H = TopFiller; + } + + // shifted the fractional amount + if (Frac != Zero && L != Zero) { + if (Fraction == 32) { + L = Zero; + } else { + L = CopyDebug(BinaryOperator::Create(Opcode, L, Frac, "", I), I); + } + } + // shifted the complement-fractional amount to the other side + if (Comp != Zero && H != Zero) { + if (Fraction == 0) { + H = TopFiller; + } else { + H = CopyDebug(BinaryOperator::Create(Reverse, H, Comp, "", I), I); + } + } + + // Or the parts together. Since we may have zero, try to fold it away. + if (Value *V = SimplifyBinOp(Instruction::Or, L, H, *DL)) { + Chunks.push_back(V); + } else { + Chunks.push_back(CopyDebug(BinaryOperator::Create(Instruction::Or, L, H, "", I), I)); + } + } + } + break; + } + case Instruction::ICmp: { + ICmpInst *CE = cast(I); + ICmpInst::Predicate Pred = CE->getPredicate(); + ChunksVec LeftChunks = getChunks(I->getOperand(0)); + ChunksVec RightChunks = getChunks(I->getOperand(1)); + switch (Pred) { + case ICmpInst::ICMP_EQ: + case ICmpInst::ICMP_NE: { + ICmpInst::Predicate PartPred; // the predicate to use on each of the parts + llvm::Instruction::BinaryOps CombineOp; // the predicate to use to combine the subcomparisons + int Num = LeftChunks.size(); + if (Pred == ICmpInst::ICMP_EQ) { + PartPred = ICmpInst::ICMP_EQ; + CombineOp = Instruction::And; + } else { + PartPred = ICmpInst::ICMP_NE; + CombineOp = Instruction::Or; + } + // first combine 0 and 1. then combine that with 2, etc. + Value *Combined = NULL; + for (int i = 0; i < Num; i++) { + Value *Cmp = CopyDebug(new ICmpInst(I, PartPred, LeftChunks[i], RightChunks[i]), I); + Combined = !Combined ? 
Cmp : CopyDebug(BinaryOperator::Create(CombineOp, Combined, Cmp, "", I), I); + } + I->replaceAllUsesWith(Combined); + break; + } + case ICmpInst::ICMP_ULT: + case ICmpInst::ICMP_SLT: + case ICmpInst::ICMP_UGT: + case ICmpInst::ICMP_SGT: + case ICmpInst::ICMP_ULE: + case ICmpInst::ICMP_SLE: + case ICmpInst::ICMP_UGE: + case ICmpInst::ICMP_SGE: { + if (ConstantInt *CI = dyn_cast(I->getOperand(1))) { + if (CI->getZExtValue() == 0 && Pred == ICmpInst::ICMP_SLT) { + // strict < 0 is easy to do, even on non-i64, just the sign bit matters + Instruction *NewInst = new ICmpInst(I, ICmpInst::ICMP_SLT, LeftChunks[LeftChunks.size()-1], Zero); + CopyDebug(NewInst, I); + I->replaceAllUsesWith(NewInst); + return true; + } + } + Type *T = I->getOperand(0)->getType(); + assert(T->isIntegerTy() && T->getIntegerBitWidth() % 32 == 0); + int NumChunks = getNumChunks(T); + assert(NumChunks >= 2); + ICmpInst::Predicate StrictPred = Pred; + ICmpInst::Predicate UnsignedPred = Pred; + switch (Pred) { + case ICmpInst::ICMP_ULE: StrictPred = ICmpInst::ICMP_ULT; break; + case ICmpInst::ICMP_UGE: StrictPred = ICmpInst::ICMP_UGT; break; + case ICmpInst::ICMP_SLE: StrictPred = ICmpInst::ICMP_SLT; UnsignedPred = ICmpInst::ICMP_ULE; break; + case ICmpInst::ICMP_SGE: StrictPred = ICmpInst::ICMP_SGT; UnsignedPred = ICmpInst::ICMP_UGE; break; + case ICmpInst::ICMP_SLT: UnsignedPred = ICmpInst::ICMP_ULT; break; + case ICmpInst::ICMP_SGT: UnsignedPred = ICmpInst::ICMP_UGT; break; + case ICmpInst::ICMP_ULT: break; + case ICmpInst::ICMP_UGT: break; + default: assert(0); + } + // general pattern is + // a,b,c < A,B,C => c < C || (c == C && b < B) || (c == C && b == B && a < A) + Instruction *Final = CopyDebug(new ICmpInst(I, StrictPred, LeftChunks[NumChunks-1], RightChunks[NumChunks-1]), I); + for (int i = NumChunks-2; i >= 0; i--) { + Instruction *Curr = CopyDebug(new ICmpInst(I, UnsignedPred, LeftChunks[i], RightChunks[i]), I); + for (int j = NumChunks-1; j > i; j--) { + Instruction *Temp = CopyDebug(new ICmpInst(I, ICmpInst::ICMP_EQ, LeftChunks[j], RightChunks[j]), I); + Curr = CopyDebug(BinaryOperator::Create(Instruction::And, Temp, Curr, "", I), I); + } + Final = CopyDebug(BinaryOperator::Create(Instruction::Or, Final, Curr, "", I), I); + } + I->replaceAllUsesWith(Final); + break; + } + default: assert(0); + } + break; + } + case Instruction::Select: { + SelectInst *SI = cast(I); + Value *Cond = SI->getCondition(); + ChunksVec TrueChunks = getChunks(SI->getTrueValue()); + ChunksVec FalseChunks = getChunks(SI->getFalseValue()); + unsigned Num = getNumChunks(I->getType()); + for (unsigned i = 0; i < Num; i++) { + Instruction *Part = CopyDebug(SelectInst::Create(Cond, TrueChunks[i], FalseChunks[i], "", I), I); + Chunks.push_back(Part); + } + break; + } + case Instruction::PHI: { + PHINode *Parent = cast(I); + int Num = getNumChunks(I->getType()); + int PhiNum = Parent->getNumIncomingValues(); + for (int i = 0; i < Num; i++) { + Instruction *P = CopyDebug(PHINode::Create(i32, PhiNum, "", I), I); + Chunks.push_back(P); + } + // PHI node operands may not be translated yet; we'll handle them at the end. 
+ Phis.push_back(Parent); + break; + } + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + BinaryOperator *BO = cast(I); + ChunksVec LeftChunks = getChunks(BO->getOperand(0)); + ChunksVec RightChunks = getChunks(BO->getOperand(1)); + int Num = getNumChunks(BO->getType()); + for (int i = 0; i < Num; i++) { + // If there's a constant operand, it's likely enough that one of the + // chunks will be a trivial operation, so it's worth calling + // SimplifyBinOp here. + if (Value *V = SimplifyBinOp(BO->getOpcode(), LeftChunks[i], RightChunks[i], *DL)) { + Chunks.push_back(V); + } else { + Chunks.push_back(CopyDebug(BinaryOperator::Create(BO->getOpcode(), LeftChunks[i], RightChunks[i], "", BO), BO)); + } + } + break; + } + case Instruction::Call: { + CallInst *CI = cast(I); + Function *F = CI->getCalledFunction(); + if (F) { + // EM_ASMs should not have i64s as arguments + if (F->getName().startswith("emscripten_asm_const")) { + report_fatal_error("EM_ASM should not receive i64s as inputs, they are not valid in JS"); + } + assert(okToRemainIllegal(F)); + return false; + } + Value *CV = CI->getCalledValue(); + FunctionType *OFT = NULL; + if (ConstantExpr *CE = dyn_cast(CV)) { + assert(CE); + OFT = cast(cast(CE->getType())->getElementType()); + Constant *C = CE->getOperand(0); + if (CE->getOpcode() == Instruction::BitCast) { + CV = ConstantExpr::getBitCast(C, getLegalizedFunctionType(OFT)->getPointerTo()); + } else if (CE->getOpcode() == Instruction::IntToPtr) { + CV = ConstantExpr::getIntToPtr(C, getLegalizedFunctionType(OFT)->getPointerTo()); + } else { + llvm_unreachable("Bad CE in i64 Call"); + } + } else { + // this is a function pointer call + OFT = cast(cast(CV->getType())->getElementType()); + // we need to add a bitcast + CV = new BitCastInst(CV, getLegalizedFunctionType(OFT)->getPointerTo(), "", I); + } + // create a call with space for legal args + SmallVector Args; // XXX + int Num = OFT->getNumParams(); + for (int i = 0; i < Num; i++) { + Type *T = OFT->getParamType(i); + if (!isIllegal(T)) { + Args.push_back(CI->getArgOperand(i)); + } else { + assert(T == i64); + ChunksVec ArgChunks = getChunks(CI->getArgOperand(i)); + Args.push_back(ArgChunks[0]); + Args.push_back(ArgChunks[1]); + } + } + Instruction *L = CopyDebug(CallInst::Create(CV, Args, "", I), I); + Instruction *H = NULL; + // legalize return value as well, if necessary + if (isIllegal(I->getType())) { + assert(I->getType() == i64); + ensureFuncs(); + H = CopyDebug(CallInst::Create(GetHigh, "", I), I); + Chunks.push_back(L); + Chunks.push_back(H); + } else { + I->replaceAllUsesWith(L); + } + break; + } + case Instruction::FPToUI: + case Instruction::FPToSI: { + assert(I->getType() == i64); + ensureFuncs(); + SmallVector Args; + Value *Input = I->getOperand(0); + Args.push_back(Input); + Instruction *L, *H; + if (Input->getType()->isFloatTy()) { + L = CopyDebug(CallInst::Create(FtoILow, Args, "", I), I); + H = CopyDebug(CallInst::Create(FtoIHigh, Args, "", I), I); + } else { + L = CopyDebug(CallInst::Create(DtoILow, Args, "", I), I); + H = CopyDebug(CallInst::Create(DtoIHigh, Args, "", I), I); + } + Chunks.push_back(L); + Chunks.push_back(H); + break; + } + case Instruction::BitCast: { + if (I->getType() == Type::getDoubleTy(TheModule->getContext())) { + // fall through to itofp + } else if (I->getOperand(0)->getType() == Type::getDoubleTy(TheModule->getContext())) { + // double to i64 + assert(I->getType() == i64); + ensureFuncs(); + SmallVector Args; + Args.push_back(I->getOperand(0)); + Instruction *L 
= CopyDebug(CallInst::Create(BDtoILow, Args, "", I), I); + Instruction *H = CopyDebug(CallInst::Create(BDtoIHigh, Args, "", I), I); + Chunks.push_back(L); + Chunks.push_back(H); + break; + } else if (isa(I->getOperand(0)->getType()) && !isa(I->getType())) { + unsigned NumElts = getNumChunks(I->getType()); + VectorType *IVTy = VectorType::get(i32, NumElts); + Instruction *B = CopyDebug(new BitCastInst(I->getOperand(0), IVTy, "", I), I); + for (unsigned i = 0; i < NumElts; ++i) { + Constant *Idx = ConstantInt::get(i32, i); + Instruction *Ext = CopyDebug(ExtractElementInst::Create(B, Idx, "", I), I); + Chunks.push_back(Ext); + } + break; + } else { + // no-op bitcast + assert(I->getType() == I->getOperand(0)->getType() && "possible hint: optimize with -O0 or -O2+, and not -O1"); + Chunks = getChunks(I->getOperand(0)); + break; + } + } + case Instruction::SIToFP: + case Instruction::UIToFP: { + assert(I->getOperand(0)->getType() == i64); + ensureFuncs(); + ChunksVec InputChunks = getChunks(I->getOperand(0)); + Function *F; + switch (I->getOpcode()) { + case Instruction::SIToFP: F = I->getType() == Type::getDoubleTy(TheModule->getContext()) ? SItoD : SItoF; break; + case Instruction::UIToFP: F = I->getType() == Type::getDoubleTy(TheModule->getContext()) ? UItoD : UItoF; break; + case Instruction::BitCast: { + assert(I->getType() == Type::getDoubleTy(TheModule->getContext())); + F = BItoD; + break; + } + default: assert(0); + } + Instruction *D = CopyDebug(CallInst::Create(F, InputChunks, "", I), I); + I->replaceAllUsesWith(D); + break; + } + case Instruction::Switch: { + assert(I->getOperand(0)->getType() == i64); + ChunksVec InputChunks = getChunks(I->getOperand(0)); + + // do a switch on the lower 32 bits, into a different basic block for each target, then do a branch in each of those on the high 32 bits + SwitchInst* SI = cast(I); + BasicBlock *DD = SI->getDefaultDest(); + BasicBlock *SwitchBB = I->getParent(); + Function *F = SwitchBB->getParent(); + + unsigned NumItems = SI->getNumCases(); + SwitchInst *LowSI = SwitchInst::Create(InputChunks[0], DD, NumItems, I); // same default destination: if lower bits do not match, go straight to default + CopyDebug(LowSI, I); + + typedef std::pair Pair; + typedef std::vector Vec; // vector of pairs of high 32 bits, basic block + typedef std::map Map; // maps low 32 bits to their Vec info + Map Groups; // (as two 64-bit values in the switch may share their lower bits) + + for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); i != e; ++i) { + BasicBlock *BB = i.getCaseSuccessor(); + uint64_t Bits = i.getCaseValue()->getZExtValue(); + uint32_t LowBits = (uint32_t)Bits; + uint32_t HighBits = (uint32_t)(Bits >> 32); + Vec& V = Groups[LowBits]; + V.push_back(Pair(HighBits, BB)); + } + + unsigned Counter = 0; + BasicBlock *InsertPoint = SwitchBB; + + for (Map::iterator GI = Groups.begin(); GI != Groups.end(); GI++) { + uint32_t LowBits = GI->first; + Vec &V = GI->second; + + BasicBlock *NewBB = BasicBlock::Create(F->getContext(), "switch64_" + utostr(Counter++), F); + NewBB->moveAfter(InsertPoint); + InsertPoint = NewBB; + LowSI->addCase(cast(ConstantInt::get(i32, LowBits)), NewBB); + + /*if (V.size() == 1) { + // just one option, create a branch + Instruction *CheckHigh = CopyDebug(new ICmpInst(*NewBB, ICmpInst::ICMP_EQ, InputChunks[1], ConstantInt::get(i32, V[0]->first)), I); + Split.ToFix.push_back(CheckHigh); + CopyDebug(BranchInst::Create(V[0]->second, DD, CheckHigh, NewBB), I); + } else {*/ + + // multiple options, create a switch - we 
could also optimize and make an icmp/branch if just one, as in commented code above + SwitchInst *HighSI = SwitchInst::Create(InputChunks[1], DD, V.size(), NewBB); // same default destination: if lower bits do not match, go straight to default + for (unsigned i = 0; i < V.size(); i++) { + BasicBlock *BB = V[i].second; + HighSI->addCase(cast(ConstantInt::get(i32, V[i].first)), BB); + // fix phis, we used to go SwitchBB->BB, but now go SwitchBB->NewBB->BB, so we look like we arrived from NewBB. Replace the phi from the + // now unneeded SwitchBB to the new BB + // We cannot do this here right now, as phis we encounter may be in the middle of processing (empty), so we queue these. + for (BasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) { + PHINode *Phi = dyn_cast(I); + if (!Phi) break; + PhiBlockChange Change; + Change.DD = BB; + Change.SwitchBB = SwitchBB; + Change.NewBB = NewBB; + PhiBlockChanges.push_back(Change); + break; // we saw a phi on this BB, and pushed a Change + } + } + + // We used to go SwitchBB->DD, but now go SwitchBB->NewBB->DD, fix that like with BB above. However here we do not replace, + // as the switch BB is still possible to arrive from - we can arrive at the default if either the lower bits were wrong (we + // arrive from the switchBB) or from the NewBB if the high bits were wrong. + PhiBlockChange Change; + Change.DD = DD; + Change.SwitchBB = SwitchBB; + Change.NewBB = NewBB; + PhiBlockChanges.push_back(Change); + } + break; + } + case Instruction::AtomicRMW: { + const AtomicRMWInst *rmwi = cast(I); + ChunksVec Chunks32Bit = getChunks(I->getOperand(1)); + unsigned Num = getNumChunks(I->getType()); + assert(Num == 2 && "Only know how to handle 32-bit and 64-bit AtomicRMW instructions!"); + ensureFuncs(); + Value *Low = NULL, *High = NULL; + Function *F = NULL; + switch (rmwi->getOperation()) { + case AtomicRMWInst::Add: F = AtomicAdd; break; + case AtomicRMWInst::Sub: F = AtomicSub; break; + case AtomicRMWInst::And: F = AtomicAnd; break; + case AtomicRMWInst::Or: F = AtomicOr; break; + case AtomicRMWInst::Xor: F = AtomicXor; break; + case AtomicRMWInst::Xchg: + case AtomicRMWInst::Nand: + case AtomicRMWInst::Max: + case AtomicRMWInst::Min: + case AtomicRMWInst::UMax: + case AtomicRMWInst::UMin: + default: llvm_unreachable("Bad atomic operation"); + } + SmallVector Args; + Args.push_back(new BitCastInst(I->getOperand(0), Type::getInt8PtrTy(TheModule->getContext()), "", I)); + Args.push_back(Chunks32Bit[0]); + Args.push_back(Chunks32Bit[1]); + Low = CopyDebug(CallInst::Create(F, Args, "", I), I); + High = CopyDebug(CallInst::Create(GetHigh, "", I), I); + Chunks.push_back(Low); + Chunks.push_back(High); + break; + } + case Instruction::AtomicCmpXchg: { + assert(0 && "64-bit compare-and-exchange (__sync_bool_compare_and_swap & __sync_val_compare_and_swap) are not supported! Please directly call emscripten_atomic_cas_u64() instead in order to emulate!"); + break; + } + default: { + I->dump(); + assert(0 && "some i64 thing we can't legalize yet. 
possible hint: optimize with -O0 or -O2+, and not -O1"); + } + } + + return true; +} + +ChunksVec ExpandI64::getChunks(Value *V, bool AllowUnreachable) { + assert(isIllegal(V->getType())); + + unsigned Num = getNumChunks(V->getType()); + Type *i32 = Type::getInt32Ty(V->getContext()); + + if (isa(V)) + return ChunksVec(Num, UndefValue::get(i32)); + + if (Constant *C = dyn_cast(V)) { + ChunksVec Chunks; + for (unsigned i = 0; i < Num; i++) { + Constant *Count = ConstantInt::get(C->getType(), i * 32); + Constant *NewC = ConstantExpr::getTrunc(ConstantExpr::getLShr(C, Count), i32); + TargetLibraryInfo *TLI = 0; // TODO + if (ConstantExpr *NewCE = dyn_cast(NewC)) { + if (Constant *FoldedC = ConstantFoldConstant(NewCE, *DL, TLI)) { + NewC = FoldedC; + } + } + + Chunks.push_back(NewC); + } + return Chunks; + } + + if (Splits.find(V) == Splits.end()) { + if (AllowUnreachable) + return ChunksVec(Num, UndefValue::get(i32)); + errs() << *V << "\n"; + report_fatal_error("could not find chunks for illegal value"); + } + assert(Splits[V].size() == Num); + return Splits[V]; +} + +void ExpandI64::ensureFuncs() { + if (Add != NULL) return; + + Type *i32 = Type::getInt32Ty(TheModule->getContext()); + + SmallVector ThreeArgTypes; + ThreeArgTypes.push_back(Type::getInt8PtrTy(TheModule->getContext())); + ThreeArgTypes.push_back(i32); + ThreeArgTypes.push_back(i32); + FunctionType *ThreeFunc = FunctionType::get(i32, ThreeArgTypes, false); + + AtomicAdd = TheModule->getFunction("_emscripten_atomic_fetch_and_add_u64"); + if (!AtomicAdd) { + AtomicAdd = Function::Create(ThreeFunc, GlobalValue::ExternalLinkage, + "_emscripten_atomic_fetch_and_add_u64", TheModule); + } + AtomicSub = TheModule->getFunction("_emscripten_atomic_fetch_and_sub_u64"); + if (!AtomicSub) { + AtomicSub = Function::Create(ThreeFunc, GlobalValue::ExternalLinkage, + "_emscripten_atomic_fetch_and_sub_u64", TheModule); + } + AtomicAnd = TheModule->getFunction("_emscripten_atomic_fetch_and_and_u64"); + if (!AtomicAnd) { + AtomicAnd = Function::Create(ThreeFunc, GlobalValue::ExternalLinkage, + "_emscripten_atomic_fetch_and_and_u64", TheModule); + } + AtomicOr = TheModule->getFunction("_emscripten_atomic_fetch_and_or_u64"); + if (!AtomicOr) { + AtomicOr = Function::Create(ThreeFunc, GlobalValue::ExternalLinkage, + "_emscripten_atomic_fetch_and_or_u64", TheModule); + } + AtomicXor = TheModule->getFunction("_emscripten_atomic_fetch_and_xor_u64"); + if (!AtomicXor) { + AtomicXor = Function::Create(ThreeFunc, GlobalValue::ExternalLinkage, + "_emscripten_atomic_fetch_and_xor_u64", TheModule); + } + + SmallVector FourArgTypes; + FourArgTypes.push_back(i32); + FourArgTypes.push_back(i32); + FourArgTypes.push_back(i32); + FourArgTypes.push_back(i32); + FunctionType *FourFunc = FunctionType::get(i32, FourArgTypes, false); + + Add = Function::Create(FourFunc, GlobalValue::ExternalLinkage, + "i64Add", TheModule); + Sub = Function::Create(FourFunc, GlobalValue::ExternalLinkage, + "i64Subtract", TheModule); + Mul = Function::Create(FourFunc, GlobalValue::ExternalLinkage, + "__muldi3", TheModule); + SDiv = Function::Create(FourFunc, GlobalValue::ExternalLinkage, + "__divdi3", TheModule); + UDiv = Function::Create(FourFunc, GlobalValue::ExternalLinkage, + "__udivdi3", TheModule); + SRem = Function::Create(FourFunc, GlobalValue::ExternalLinkage, + "__remdi3", TheModule); + URem = Function::Create(FourFunc, GlobalValue::ExternalLinkage, + "__uremdi3", TheModule); + LShr = Function::Create(FourFunc, GlobalValue::ExternalLinkage, + "bitshift64Lshr", TheModule); + AShr 
= Function::Create(FourFunc, GlobalValue::ExternalLinkage, + "bitshift64Ashr", TheModule); + Shl = Function::Create(FourFunc, GlobalValue::ExternalLinkage, + "bitshift64Shl", TheModule); + + if (!(GetHigh = TheModule->getFunction("getHigh32"))) { + SmallVector GetHighArgTypes; + FunctionType *GetHighFunc = FunctionType::get(i32, GetHighArgTypes, false); + GetHigh = Function::Create(GetHighFunc, GlobalValue::ExternalLinkage, + "getHigh32", TheModule); + } + + Type *V = Type::getVoidTy(TheModule->getContext()); + + SmallVector SetHighArgTypes; + SetHighArgTypes.push_back(i32); + FunctionType *SetHighFunc = FunctionType::get(V, SetHighArgTypes, false); + SetHigh = Function::Create(SetHighFunc, GlobalValue::ExternalLinkage, + "setHigh32", TheModule); + + Type *Double = Type::getDoubleTy(TheModule->getContext()); + Type *Float = Type::getFloatTy(TheModule->getContext()); + + SmallVector FtoITypes; + FtoITypes.push_back(Float); + FunctionType *FtoIFunc = FunctionType::get(i32, FtoITypes, false); + + SmallVector DtoITypes; + DtoITypes.push_back(Double); + FunctionType *DtoIFunc = FunctionType::get(i32, DtoITypes, false); + + FtoILow = Function::Create(FtoIFunc, GlobalValue::ExternalLinkage, + "FtoILow", TheModule); + FtoIHigh = Function::Create(FtoIFunc, GlobalValue::ExternalLinkage, + "FtoIHigh", TheModule); + DtoILow = Function::Create(DtoIFunc, GlobalValue::ExternalLinkage, + "DtoILow", TheModule); + DtoIHigh = Function::Create(DtoIFunc, GlobalValue::ExternalLinkage, + "DtoIHigh", TheModule); + BDtoILow = Function::Create(DtoIFunc, GlobalValue::ExternalLinkage, + "BDtoILow", TheModule); + BDtoIHigh = Function::Create(DtoIFunc, GlobalValue::ExternalLinkage, + "BDtoIHigh", TheModule); + + SmallVector ItoTypes; + ItoTypes.push_back(i32); + ItoTypes.push_back(i32); + + FunctionType *ItoFFunc = FunctionType::get(Float, ItoTypes, false); + SItoF = Function::Create(ItoFFunc, GlobalValue::ExternalLinkage, + "SItoF", TheModule); + UItoF = Function::Create(ItoFFunc, GlobalValue::ExternalLinkage, + "UItoF", TheModule); + + FunctionType *ItoDFunc = FunctionType::get(Double, ItoTypes, false); + SItoD = Function::Create(ItoDFunc, GlobalValue::ExternalLinkage, + "SItoD", TheModule); + UItoD = Function::Create(ItoDFunc, GlobalValue::ExternalLinkage, + "UItoD", TheModule); + + BItoD = Function::Create(ItoDFunc, GlobalValue::ExternalLinkage, + "BItoD", TheModule); +} + +bool ExpandI64::runOnModule(Module &M) { + TheModule = &M; + DL = &M.getDataLayout(); + Splits.clear(); + Changed = false; + + // pre pass - legalize functions + for (Module::iterator Iter = M.begin(), E = M.end(); Iter != E; ) { + Function *Func = &*Iter++; + ensureLegalFunc(Func); + } + + // first pass - split + DeadVec Dead; + for (Module::iterator Iter = M.begin(), E = M.end(); Iter != E; ++Iter) { + Function *Func = &*Iter; + if (Func->isDeclaration()) { + continue; + } + + // Walk the body of the function. We use reverse postorder so that we visit + // all operands of an instruction before the instruction itself. The + // exception to this is PHI nodes, which we put on a list and handle below. + ReversePostOrderTraversal RPOT(Func); + for (ReversePostOrderTraversal::rpo_iterator RI = RPOT.begin(), + RE = RPOT.end(); RI != RE; ++RI) { + BasicBlock *BB = *RI; + for (BasicBlock::iterator Iter = BB->begin(), E = BB->end(); + Iter != E; ) { + Instruction *I = &*Iter++; + if (!isLegalInstruction(I)) { + if (splitInst(I)) { + Changed = true; + Dead.push_back(I); + } + } + } + } + + // Fix up PHI node operands. 
+ while (!Phis.empty()) { + PHINode *PN = Phis.pop_back_val(); + ChunksVec OutputChunks = getChunks(PN); + for (unsigned j = 0, je = PN->getNumIncomingValues(); j != je; ++j) { + Value *Op = PN->getIncomingValue(j); + ChunksVec InputChunks = getChunks(Op, true); + for (unsigned k = 0, ke = OutputChunks.size(); k != ke; ++k) { + PHINode *NewPN = cast(OutputChunks[k]); + NewPN->addIncoming(InputChunks[k], PN->getIncomingBlock(j)); + } + } + PN->dropAllReferences(); + } + + // Delete instructions which were replaced. We do this after the full walk + // of the instructions so that all uses are replaced first. + while (!Dead.empty()) { + Instruction *D = Dead.pop_back_val(); + D->eraseFromParent(); + } + + // Apply basic block changes to phis, now that phis are all processed (and illegal phis erased) + for (unsigned i = 0; i < PhiBlockChanges.size(); i++) { + PhiBlockChange &Change = PhiBlockChanges[i]; + for (BasicBlock::iterator I = Change.DD->begin(); I != Change.DD->end(); ++I) { + PHINode *Phi = dyn_cast(I); + if (!Phi) break; + int Index = Phi->getBasicBlockIndex(Change.SwitchBB); + assert(Index >= 0); + Phi->addIncoming(Phi->getIncomingValue(Index), Change.NewBB); + } + } + PhiBlockChanges.clear(); + + // We only visited blocks found by a DFS walk from the entry, so we haven't + // visited any unreachable blocks, and they may still contain illegal + // instructions at this point. Being unreachable, they can simply be deleted. + removeUnreachableBlocks(*Func); + } + + // post pass - clean up illegal functions that were legalized. We do this + // after the full walk of the functions so that all uses are replaced first. + for (Module::iterator Iter = M.begin(), E = M.end(); Iter != E; ) { + Function *Func = &*Iter++; + removeIllegalFunc(Func); + } + + return Changed; +} + +ModulePass *llvm::createExpandI64Pass() { + return new ExpandI64(); +} diff --git a/lib/Target/JSBackend/NaCl/ExpandIndirectBr.cpp b/lib/Target/JSBackend/NaCl/ExpandIndirectBr.cpp new file mode 100644 index 000000000000..974f0dfee16f --- /dev/null +++ b/lib/Target/JSBackend/NaCl/ExpandIndirectBr.cpp @@ -0,0 +1,152 @@ +//===- ExpandIndirectBr.cpp - Expand out indirectbr and blockaddress-------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass expands out indirectbr instructions and blockaddress +// ConstantExprs, which are not currently supported in PNaCl's stable +// ABI. indirectbr is used to implement computed gotos (a GNU +// extension to C). This pass replaces indirectbr instructions with +// switch instructions. +// +// The resulting use of switches might not be as fast as the original +// indirectbrs. If you are compiling a program that has a +// compile-time option for using computed gotos, it's possible that +// the program will run faster with the option turned off than with +// using computed gotos + ExpandIndirectBr (for example, if the +// program does extra work to take advantage of computed gotos). 
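+//
+// As a rough sketch (hypothetical IR; the labels and IDs below are invented
+// for illustration), a computed goto such as
+//
+//   indirectbr i8* %target, [label %bb1, label %bb2]
+//
+// is rewritten into a ptrtoint of %target plus a switch over small integer
+// label IDs (numbering starts at 1; the default block is unreachable):
+//
+//   %id = ptrtoint i8* %target to i32
+//   switch i32 %id, label %indirectbr_default [ i32 1, label %bb1
+//                                               i32 2, label %bb2 ]
+//
+// The matching blockaddress constants are replaced by the same IDs, encoded
+// as inttoptr constant expressions.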
+// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Transforms/NaCl.h" + +using namespace llvm; + +namespace { + // This is a ModulePass so that it can expand out blockaddress + // ConstantExprs inside global variable initializers. + class ExpandIndirectBr : public ModulePass { + public: + static char ID; // Pass identification, replacement for typeid + ExpandIndirectBr() : ModulePass(ID) { + initializeExpandIndirectBrPass(*PassRegistry::getPassRegistry()); + } + + virtual bool runOnModule(Module &M); + }; +} + +char ExpandIndirectBr::ID = 0; +INITIALIZE_PASS(ExpandIndirectBr, "expand-indirectbr", + "Expand out indirectbr and blockaddress (computed gotos)", + false, false) + +static bool convertFunction(Function *Func) { + bool Changed = false; + IntegerType *I32 = Type::getInt32Ty(Func->getContext()); + + // Skip zero in case programs treat a null pointer as special. + uint32_t NextNum = 1; + DenseMap LabelNums; + BasicBlock *DefaultBB = NULL; + + // Replace each indirectbr with a switch. + // + // If there are multiple indirectbr instructions in the function, + // this could be expensive. While an indirectbr is usually + // converted to O(1) machine instructions, the switch we generate + // here will be O(n) in the number of target labels. + // + // However, Clang usually generates just a single indirectbr per + // function anyway when compiling C computed gotos. + // + // We could try to generate one switch to handle all the indirectbr + // instructions in the function, but that would be complicated to + // implement given that variables that are live at one indirectbr + // might not be live at others. + for (llvm::Function::iterator BB = Func->begin(), E = Func->end(); + BB != E; ++BB) { + if (IndirectBrInst *Br = dyn_cast(BB->getTerminator())) { + Changed = true; + + if (!DefaultBB) { + DefaultBB = BasicBlock::Create(Func->getContext(), + "indirectbr_default", Func); + new UnreachableInst(Func->getContext(), DefaultBB); + } + + // An indirectbr can list the same target block multiple times. + // Keep track of the basic blocks we've handled to avoid adding + // the same case multiple times. + DenseSet BlocksSeen; + + Value *Cast = new PtrToIntInst(Br->getAddress(), I32, + "indirectbr_cast", Br); + unsigned Count = Br->getNumSuccessors(); + SwitchInst *Switch = SwitchInst::Create(Cast, DefaultBB, Count, Br); + for (unsigned I = 0; I < Count; ++I) { + BasicBlock *Dest = Br->getSuccessor(I); + if (!BlocksSeen.insert(Dest).second) { + // Remove duplicated entries from phi nodes. + for (BasicBlock::iterator Inst = Dest->begin(); ; ++Inst) { + PHINode *Phi = dyn_cast(Inst); + if (!Phi) + break; + Phi->removeIncomingValue(Br->getParent()); + } + continue; + } + ConstantInt *Val; + if (LabelNums.count(Dest) == 0) { + Val = ConstantInt::get(I32, NextNum++); + LabelNums[Dest] = Val; + + BlockAddress *BA = BlockAddress::get(Func, Dest); + Value *ValAsPtr = ConstantExpr::getIntToPtr(Val, BA->getType()); + BA->replaceAllUsesWith(ValAsPtr); + BA->destroyConstant(); + } else { + Val = LabelNums[Dest]; + } + Switch->addCase(Val, Br->getSuccessor(I)); + } + Br->eraseFromParent(); + } + } + + // If there are any blockaddresses that are never used by an + // indirectbr, replace them with dummy values. 
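+  // The dummy value is inttoptr(-1), which keeps such stray blockaddress
+  // constants well-typed; with every indirectbr in the function already
+  // rewritten, there is no valid target left for them anyway.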
+ SmallVector Users(Func->user_begin(), Func->user_end()); + for (auto U : Users) { + if (BlockAddress *BA = dyn_cast(U)) { + Changed = true; + Value *DummyVal = ConstantExpr::getIntToPtr(ConstantInt::get(I32, ~0L), + BA->getType()); + BA->replaceAllUsesWith(DummyVal); + BA->destroyConstant(); + } + } + return Changed; +} + +bool ExpandIndirectBr::runOnModule(Module &M) { + bool Changed = false; + for (Module::iterator Func = M.begin(), E = M.end(); Func != E; ++Func) { + Changed |= convertFunction(&*Func); + } + return Changed; +} + +ModulePass *llvm::createExpandIndirectBrPass() { + return new ExpandIndirectBr(); +} diff --git a/lib/Target/JSBackend/NaCl/ExpandInsertExtractElement.cpp b/lib/Target/JSBackend/NaCl/ExpandInsertExtractElement.cpp new file mode 100644 index 000000000000..7c1c88004be2 --- /dev/null +++ b/lib/Target/JSBackend/NaCl/ExpandInsertExtractElement.cpp @@ -0,0 +1,100 @@ +//==- ExpandInsertExtractElement.cpp - Expand vector insert and extract -=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===------------------------------------------------------------------===// +// +// This pass expands insertelement and extractelement instructions with +// variable indices, which SIMD.js doesn't natively support yet. +// +//===------------------------------------------------------------------===// + +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Transforms/NaCl.h" +#include "llvm/Transforms/Utils/Local.h" +#include +#include + +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +namespace { + + class ExpandInsertExtractElement : public FunctionPass { + bool Changed; + + public: + static char ID; + ExpandInsertExtractElement() : FunctionPass(ID) { + initializeExpandInsertExtractElementPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override; + }; +} + +char ExpandInsertExtractElement::ID = 0; +INITIALIZE_PASS(ExpandInsertExtractElement, "expand-insert-extract-elements", + "Expand and lower insert and extract element operations", + false, false) + +// Utilities + +bool ExpandInsertExtractElement::runOnFunction(Function &F) { + Changed = false; + + Instruction *Entry = &*F.getEntryBlock().begin(); + Type *Int32 = Type::getInt32Ty(F.getContext()); + Constant *Zero = ConstantInt::get(Int32, 0); + for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) { + Instruction *Inst = &*I++; + + if (InsertElementInst *III = dyn_cast(Inst)) { + if (isa(III->getOperand(2))) + continue; + + Type *AllocaTy = III->getType(); + Instruction *A = new AllocaInst(AllocaTy, 0, "", Entry); + CopyDebug(new StoreInst(III->getOperand(0), A, III), III); + + Value *Idxs[] = { Zero, III->getOperand(2) }; + Instruction *B = CopyDebug( + GetElementPtrInst::Create(AllocaTy, A, Idxs, "", III), III); + CopyDebug(new StoreInst(III->getOperand(1), B, III), III); + + Instruction *L = CopyDebug(new LoadInst(A, "", III), III); + III->replaceAllUsesWith(L); + III->eraseFromParent(); + } else if (ExtractElementInst *EII = dyn_cast(Inst)) { + if (isa(EII->getOperand(1))) + continue; + + Type *AllocaTy = EII->getOperand(0)->getType(); + Instruction *A = new AllocaInst(AllocaTy, 0, "", Entry); + CopyDebug(new StoreInst(EII->getOperand(0), A, EII), EII); + + Value 
*Idxs[] = { Zero, EII->getOperand(1) }; + Instruction *B = CopyDebug( + GetElementPtrInst::Create(AllocaTy, A, Idxs, "", EII), EII); + Instruction *L = CopyDebug(new LoadInst(B, "", EII), EII); + EII->replaceAllUsesWith(L); + EII->eraseFromParent(); + } + } + + return Changed; +} + +FunctionPass *llvm::createExpandInsertExtractElementPass() { + return new ExpandInsertExtractElement(); +} diff --git a/lib/Target/JSBackend/NaCl/ExpandLargeIntegers.cpp b/lib/Target/JSBackend/NaCl/ExpandLargeIntegers.cpp new file mode 100644 index 000000000000..495d1f9e6d46 --- /dev/null +++ b/lib/Target/JSBackend/NaCl/ExpandLargeIntegers.cpp @@ -0,0 +1,674 @@ +//===- ExpandLargeIntegers.cpp - Expand illegal integers for PNaCl ABI ----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +// A limited set of transformations to expand illegal-sized int types. +// +//===----------------------------------------------------------------------===// +// +// Legal sizes for the purposes of expansion are anything 64 bits or less. +// Operations on large integers are split into operations on smaller-sized +// integers. The low parts should always be powers of 2, but the high parts may +// not be. A subsequent pass can promote those. For now this pass only intends +// to support the uses generated by clang, which is basically just for large +// bitfields. +// +// Limitations: +// 1) It can't change function signatures or global variables. +// 3) Doesn't support mul, div/rem, switch. +// 4) Doesn't handle arrays or structs (or GEPs) with illegal types. +// 5) Doesn't handle constant expressions (it also doesn't produce them, so it +// can run after ExpandConstantExpr). +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/NaCl.h" + +using namespace llvm; + +#define DEBUG_TYPE "nacl-expand-ints" + +// Break instructions up into no larger than 64-bit chunks. +static const unsigned kChunkBits = 64; +static const unsigned kChunkBytes = kChunkBits / CHAR_BIT; + +namespace { +class ExpandLargeIntegers : public FunctionPass { +public: + static char ID; + ExpandLargeIntegers() : FunctionPass(ID) { + initializeExpandLargeIntegersPass(*PassRegistry::getPassRegistry()); + } + bool runOnFunction(Function &F) override; +}; + +template struct LoHiPair { + T Lo, Hi; + LoHiPair() : Lo(), Hi() {} + LoHiPair(T Lo, T Hi) : Lo(Lo), Hi(Hi) {} +}; +template struct LoHiBitTriple { + T Lo, Hi, Bit; + LoHiBitTriple() : Lo(), Hi(), Bit() {} + LoHiBitTriple(T Lo, T Hi, T Bit) : Lo(Lo), Hi(Hi), Bit(Bit) {} +}; +typedef LoHiPair TypePair; +typedef LoHiPair ValuePair; +typedef LoHiPair AlignPair; +typedef LoHiBitTriple ValueTriple; + +// Information needed to patch a phi node which forward-references a value. 
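+// A phi can reference a value whose defining instruction appears later in
+// the reverse-postorder walk (typically across a loop back edge). Such a phi
+// is created with undef placeholders for that operand, recorded here, and
+// patched by patchForwardPHIs() once every value has been expanded.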
+struct ForwardPHI { + Value *Val; + PHINode *Lo, *Hi; + unsigned ValueNumber; + ForwardPHI(Value *Val, PHINode *Lo, PHINode *Hi, unsigned ValueNumber) + : Val(Val), Lo(Lo), Hi(Hi), ValueNumber(ValueNumber) {} +}; +} + +char ExpandLargeIntegers::ID = 0; +INITIALIZE_PASS(ExpandLargeIntegers, "nacl-expand-ints", + "Expand integer types that are illegal in PNaCl", false, false) + +#define DIE_IF(COND, VAL, MSG) \ + do { \ + if (COND) { \ + errs() << "Unsupported: " << *(VAL) << '\n'; \ + report_fatal_error( \ + MSG " not yet supported for integer types larger than 64 bits"); \ + } \ + } while (0) + +static bool isLegalBitSize(unsigned Bits) { + assert(Bits && "Can't have zero-size integers"); + return Bits <= kChunkBits; +} + +static TypePair getExpandedIntTypes(Type *Ty) { + unsigned BitWidth = Ty->getIntegerBitWidth(); + assert(!isLegalBitSize(BitWidth)); + return {IntegerType::get(Ty->getContext(), kChunkBits), + IntegerType::get(Ty->getContext(), BitWidth - kChunkBits)}; +} + +// Return true if Val is an int which should be converted. +static bool shouldConvert(const Value *Val) { + Type *Ty = Val->getType(); + if (IntegerType *ITy = dyn_cast(Ty)) + return !isLegalBitSize(ITy->getBitWidth()); + return false; +} + +// Return a pair of constants expanded from C. +static ValuePair expandConstant(Constant *C) { + assert(shouldConvert(C)); + TypePair ExpandedTypes = getExpandedIntTypes(C->getType()); + if (isa(C)) { + return {UndefValue::get(ExpandedTypes.Lo), + UndefValue::get(ExpandedTypes.Hi)}; + } else if (ConstantInt *CInt = dyn_cast(C)) { + Constant *ShiftAmt = ConstantInt::get( + CInt->getType(), ExpandedTypes.Lo->getBitWidth(), false); + return {ConstantExpr::getTrunc(CInt, ExpandedTypes.Lo), + ConstantExpr::getTrunc(ConstantExpr::getLShr(CInt, ShiftAmt), + ExpandedTypes.Hi)}; + } + DIE_IF(true, C, "Constant value"); +} + +template +static AlignPair getAlign(const DataLayout &DL, T *I, Type *PrefAlignTy) { + unsigned LoAlign = I->getAlignment(); + if (LoAlign == 0) + LoAlign = DL.getPrefTypeAlignment(PrefAlignTy); + unsigned HiAlign = MinAlign(LoAlign, kChunkBytes); + return {LoAlign, HiAlign}; +} + +static ValuePair createBit(IRBuilder<> *IRB, const BinaryOperator *Binop, + const ValuePair &Lhs, const ValuePair &Rhs, + const TypePair &Tys, const StringRef &Name) { + auto Op = Binop->getOpcode(); + Value *Lo = IRB->CreateBinOp(Op, Lhs.Lo, Rhs.Lo, Twine(Name, ".lo")); + Value *Hi = IRB->CreateBinOp(Op, Lhs.Hi, Rhs.Hi, Twine(Name, ".hi")); + return {Lo, Hi}; +} + +static ValuePair createShl(IRBuilder<> *IRB, const BinaryOperator *Binop, + const ValuePair &Lhs, const ValuePair &Rhs, + const TypePair &Tys, const StringRef &Name) { + ConstantInt *ShlAmount = dyn_cast(Rhs.Lo); + // TODO(dschuff): Expansion of variable-sized shifts isn't supported + // because the behavior depends on whether the shift amount is less than + // the size of the low part of the expanded type, and I haven't yet + // figured out a way to do it for variable-sized shifts without splitting + // the basic block. I don't believe it's actually necessary for + // bitfields. Likewise for LShr below. + DIE_IF(!ShlAmount, Binop, "Expansion of variable-sized shifts"); + unsigned ShiftAmount = ShlAmount->getZExtValue(); + if (ShiftAmount >= Binop->getType()->getIntegerBitWidth()) + ShiftAmount = 0; // Undefined behavior. 
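+  // Worked example (hypothetical widths and shift amounts): for an i96 that
+  // is split into a 64-bit low part and a 32-bit high part,
+  //   shl i96 %x, 8   ->  lo' = lo << 8
+  //                       hi' = trunc(lo >> 56) | (hi << 8)
+  //   shl i96 %x, 70  ->  lo' = 0
+  //                       hi' = trunc(lo) << 6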
+ unsigned HiBits = Tys.Hi->getIntegerBitWidth(); + // |<------------Hi---------->|<-------Lo------>| + // | | | + // +--------+--------+--------+--------+--------+ + // |abcdefghijklmnopqrstuvwxyz|ABCDEFGHIJKLMNOPQ| + // +--------+--------+--------+--------+--------+ + // Possible shifts: + // |efghijklmnopqrstuvwxyzABCD|EFGHIJKLMNOPQ0000| Some Lo into Hi. + // |vwxyzABCDEFGHIJKLMNOPQ0000|00000000000000000| Lo is 0, keep some Hi. + // |DEFGHIJKLMNOPQ000000000000|00000000000000000| Lo is 0, no Hi left. + Value *Lo, *Hi; + if (ShiftAmount < kChunkBits) { + Lo = IRB->CreateShl(Lhs.Lo, ShiftAmount, Twine(Name, ".lo")); + Hi = + IRB->CreateZExtOrTrunc(IRB->CreateLShr(Lhs.Lo, kChunkBits - ShiftAmount, + Twine(Name, ".lo.shr")), + Tys.Hi, Twine(Name, ".lo.ext")); + } else { + Lo = ConstantInt::get(Tys.Lo, 0); + Hi = IRB->CreateShl( + IRB->CreateZExtOrTrunc(Lhs.Lo, Tys.Hi, Twine(Name, ".lo.ext")), + ShiftAmount - kChunkBits, Twine(Name, ".lo.shl")); + } + if (ShiftAmount < HiBits) + Hi = IRB->CreateOr( + Hi, IRB->CreateShl(Lhs.Hi, ShiftAmount, Twine(Name, ".hi.shl")), + Twine(Name, ".or")); + return {Lo, Hi}; +} + +static ValuePair createShr(IRBuilder<> *IRB, const BinaryOperator *Binop, + const ValuePair &Lhs, const ValuePair &Rhs, + const TypePair &Tys, const StringRef &Name) { + auto Op = Binop->getOpcode(); + ConstantInt *ShrAmount = dyn_cast(Rhs.Lo); + // TODO(dschuff): Expansion of variable-sized shifts isn't supported + // because the behavior depends on whether the shift amount is less than + // the size of the low part of the expanded type, and I haven't yet + // figured out a way to do it for variable-sized shifts without splitting + // the basic block. I don't believe it's actually necessary for bitfields. + DIE_IF(!ShrAmount, Binop, "Expansion of variable-sized shifts"); + bool IsArith = Op == Instruction::AShr; + unsigned ShiftAmount = ShrAmount->getZExtValue(); + if (ShiftAmount >= Binop->getType()->getIntegerBitWidth()) + ShiftAmount = 0; // Undefined behavior. + unsigned HiBitWidth = Tys.Hi->getIntegerBitWidth(); + // |<--Hi-->|<-------Lo------>| + // | | | + // +--------+--------+--------+ + // |abcdefgh|ABCDEFGHIJKLMNOPQ| + // +--------+--------+--------+ + // Possible shifts (0 is sign when doing AShr): + // |0000abcd|defgABCDEFGHIJKLM| Some Hi into Lo. + // |00000000|00abcdefgABCDEFGH| Hi is 0, keep some Lo. + // |00000000|000000000000abcde| Hi is 0, no Lo left. + Value *Lo, *Hi; + if (ShiftAmount < kChunkBits) { + Lo = IRB->CreateShl( + IsArith + ? IRB->CreateSExtOrTrunc(Lhs.Hi, Tys.Lo, Twine(Name, ".hi.ext")) + : IRB->CreateZExtOrTrunc(Lhs.Hi, Tys.Lo, Twine(Name, ".hi.ext")), + kChunkBits - ShiftAmount, Twine(Name, ".hi.shl")); + Lo = IRB->CreateOr( + Lo, IRB->CreateLShr(Lhs.Lo, ShiftAmount, Twine(Name, ".lo.shr")), + Twine(Name, ".lo")); + } else { + Lo = IRB->CreateBinOp(Op, Lhs.Hi, + ConstantInt::get(Tys.Hi, ShiftAmount - kChunkBits), + Twine(Name, ".hi.shr")); + Lo = IsArith ? IRB->CreateSExtOrTrunc(Lo, Tys.Lo, Twine(Name, ".lo.ext")) + : IRB->CreateZExtOrTrunc(Lo, Tys.Lo, Twine(Name, ".lo.ext")); + } + if (ShiftAmount < HiBitWidth) { + Hi = IRB->CreateBinOp(Op, Lhs.Hi, ConstantInt::get(Tys.Hi, ShiftAmount), + Twine(Name, ".hi")); + } else { + Hi = IsArith ? 
IRB->CreateAShr(Lhs.Hi, HiBitWidth - 1, Twine(Name, ".hi")) + : ConstantInt::get(Tys.Hi, 0); + } + return {Lo, Hi}; +} + +static Value *createCarry(IRBuilder<> *IRB, Value *Lhs, Value *Rhs, + Value *Added, Type *Ty, const StringRef &Name) { + return IRB->CreateZExt( + IRB->CreateICmpULT( + Added, + IRB->CreateSelect(IRB->CreateICmpULT(Lhs, Rhs, Twine(Name, ".cmp")), + Rhs, Lhs, Twine(Name, ".limit")), + Twine(Name, ".overflowed")), + Ty, Twine(Name, ".carry")); +} + +static ValueTriple createAdd(IRBuilder<> *IRB, const ValuePair &Lhs, + const ValuePair &Rhs, const TypePair &Tys, + const StringRef &Name, Type *HiCarryTy) { + auto Op = Instruction::Add; + // Don't propagate NUW/NSW to the lo operation: it can overflow. + Value *Lo = IRB->CreateBinOp(Op, Lhs.Lo, Rhs.Lo, Twine(Name, ".lo")); + Value *LoCarry = createCarry(IRB, Lhs.Lo, Rhs.Lo, Lo, Tys.Hi, Name); + // TODO(jfb) The hi operation could be tagged with NUW/NSW. + Value *HiAdd = IRB->CreateBinOp(Op, Lhs.Hi, Rhs.Hi, Twine(Name, ".hi")); + Value *Hi = IRB->CreateBinOp(Op, HiAdd, LoCarry, Twine(Name, ".carried")); + Value *HiCarry = HiCarryTy + ? createCarry(IRB, Lhs.Hi, Rhs.Hi, Hi, HiCarryTy, Name) + : nullptr; + return {Lo, Hi, HiCarry}; +} + +static ValuePair createSub(IRBuilder<> *IRB, const ValuePair &Lhs, + const ValuePair &Rhs, const TypePair &Tys, + const StringRef &Name) { + auto Op = Instruction::Sub; + Value *Borrowed = IRB->CreateSExt( + IRB->CreateICmpULT(Lhs.Lo, Rhs.Lo, Twine(Name, ".borrow")), Tys.Hi, + Twine(Name, ".borrowing")); + Value *Lo = IRB->CreateBinOp(Op, Lhs.Lo, Rhs.Lo, Twine(Name, ".lo")); + Value *Hi = + IRB->CreateBinOp(Instruction::Add, + IRB->CreateBinOp(Op, Lhs.Hi, Rhs.Hi, Twine(Name, ".hi")), + Borrowed, Twine(Name, ".borrowed")); + return {Lo, Hi}; +} + +static Value *createICmpEquality(IRBuilder<> *IRB, CmpInst::Predicate Pred, + const ValuePair &Lhs, const ValuePair &Rhs, + const StringRef &Name) { + assert(Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE); + Value *Lo = IRB->CreateICmp(Pred, Lhs.Lo, Rhs.Lo, Twine(Name, ".lo")); + Value *Hi = IRB->CreateICmp(Pred, Lhs.Hi, Rhs.Hi, Twine(Name, ".hi")); + return IRB->CreateBinOp( + Instruction::And, Lo, Hi, + Twine(Name, Pred == CmpInst::ICMP_EQ ? 
".eq" : ".ne")); +} + +static Value *createICmp(IRBuilder<> *IRB, const ICmpInst *ICmp, + const ValuePair &Lhs, const ValuePair &Rhs, + const TypePair &Tys, const StringRef &Name) { + auto Pred = ICmp->getPredicate(); + switch (Pred) { + case CmpInst::ICMP_EQ: + case CmpInst::ICMP_NE: + return createICmpEquality(IRB, ICmp->getPredicate(), Lhs, Rhs, Name); + + case CmpInst::ICMP_UGT: // C == 1 and Z == 0 + case CmpInst::ICMP_UGE: // C == 1 + case CmpInst::ICMP_ULT: // C == 0 and Z == 0 + case CmpInst::ICMP_ULE: // C == 0 + { + Value *Carry = createAdd(IRB, Lhs, Rhs, Tys, Name, ICmp->getType()).Bit; + if (Pred == CmpInst::ICMP_ULT || Pred == CmpInst::ICMP_ULE) + Carry = IRB->CreateNot(Carry, Name); + if (Pred == CmpInst::ICMP_UGT || Pred == CmpInst::ICMP_ULT) + Carry = IRB->CreateBinOp( + Instruction::And, Carry, + createICmpEquality(IRB, CmpInst::ICMP_EQ, Lhs, Rhs, Name), Name); + return Carry; + } + + case CmpInst::ICMP_SGT: // N == V and Z == 0 + case CmpInst::ICMP_SGE: // N == V + case CmpInst::ICMP_SLT: // N != V + case CmpInst::ICMP_SLE: // N != V or Z == 1 + DIE_IF(true, ICmp, "Signed comparisons"); + default: + llvm_unreachable("Invalid integer comparison"); + } +} + +static ValuePair createLoad(IRBuilder<> *IRB, const DataLayout &DL, + LoadInst *Load) { + DIE_IF(!Load->isSimple(), Load, "Volatile and atomic loads"); + Value *Op = Load->getPointerOperand(); + TypePair Tys = getExpandedIntTypes(Load->getType()); + AlignPair Align = getAlign(DL, Load, Load->getType()); + Value *Loty = IRB->CreateBitCast(Op, Tys.Lo->getPointerTo(), + Twine(Op->getName(), ".loty")); + Value *Lo = + IRB->CreateAlignedLoad(Loty, Align.Lo, Twine(Load->getName(), ".lo")); + Value *HiAddr = + IRB->CreateConstGEP1_32(Loty, 1, Twine(Op->getName(), ".hi.gep")); + Value *HiTy = IRB->CreateBitCast(HiAddr, Tys.Hi->getPointerTo(), + Twine(Op->getName(), ".hity")); + Value *Hi = + IRB->CreateAlignedLoad(HiTy, Align.Hi, Twine(Load->getName(), ".hi")); + return {Lo, Hi}; +} + +static ValuePair createStore(IRBuilder<> *IRB, const DataLayout &DL, + StoreInst *Store, const ValuePair &StoreVals) { + DIE_IF(!Store->isSimple(), Store, "Volatile and atomic stores"); + Value *Ptr = Store->getPointerOperand(); + TypePair Tys = getExpandedIntTypes(Store->getValueOperand()->getType()); + AlignPair Align = getAlign(DL, Store, Store->getValueOperand()->getType()); + Value *Loty = IRB->CreateBitCast(Ptr, Tys.Lo->getPointerTo(), + Twine(Ptr->getName(), ".loty")); + Value *Lo = IRB->CreateAlignedStore(StoreVals.Lo, Loty, Align.Lo); + Value *HiAddr = + IRB->CreateConstGEP1_32(Loty, 1, Twine(Ptr->getName(), ".hi.gep")); + Value *HiTy = IRB->CreateBitCast(HiAddr, Tys.Hi->getPointerTo(), + Twine(Ptr->getName(), ".hity")); + Value *Hi = IRB->CreateAlignedStore(StoreVals.Hi, HiTy, Align.Hi); + return {Lo, Hi}; +} + +namespace { +// Holds the state for converting/replacing values. We visit instructions in +// reverse post-order, phis are therefore the only instructions which can be +// visited before the value they use. +class ConversionState { +public: + // Return the expanded values for Val. + ValuePair getConverted(Value *Val) { + assert(shouldConvert(Val)); + // Directly convert constants. 
+ if (Constant *C = dyn_cast(Val)) + return expandConstant(C); + if (RewrittenIllegals.count(Val)) { + ValuePair Found = RewrittenIllegals[Val]; + if (RewrittenLegals.count(Found.Lo)) + Found.Lo = RewrittenLegals[Found.Lo]; + if (RewrittenLegals.count(Found.Hi)) + Found.Hi = RewrittenLegals[Found.Hi]; + return Found; + } + errs() << "Value: " << *Val << "\n"; + report_fatal_error("Expanded value not found in map"); + } + + // Returns whether a converted value has been recorded. This is only useful + // for phi instructions: they can be encountered before the incoming + // instruction, whereas RPO order guarantees that other instructions always + // use converted values. + bool hasConverted(Value *Val) { + assert(shouldConvert(Val)); + return dyn_cast(Val) || RewrittenIllegals.count(Val); + } + + // Record a forward phi, temporarily setting it to use Undef. This will be + // patched up at the end of RPO. + ValuePair recordForwardPHI(Value *Val, PHINode *Lo, PHINode *Hi, + unsigned ValueNumber) { + DEBUG(dbgs() << "\tRecording as forward PHI\n"); + ForwardPHIs.push_back(ForwardPHI(Val, Lo, Hi, ValueNumber)); + return {UndefValue::get(Lo->getType()), UndefValue::get(Hi->getType())}; + } + + void recordConverted(Instruction *From, const ValuePair &To) { + DEBUG(dbgs() << "\tTo: " << *To.Lo << "\n"); + DEBUG(dbgs() << "\tAnd: " << *To.Hi << "\n"); + ToErase.push_back(From); + RewrittenIllegals[From] = To; + } + + // Replace the uses of From with To, give From's name to To, and mark To for + // deletion. + void recordConverted(Instruction *From, Value *To) { + assert(!shouldConvert(From)); + DEBUG(dbgs() << "\tTo: " << *To << "\n"); + ToErase.push_back(From); + // From does not produce an illegal value, update its users in place. + From->replaceAllUsesWith(To); + To->takeName(From); + RewrittenLegals[From] = To; + } + + void recordToErase(Instruction *TE) { + ToErase.push_back(TE); + } + + void patchForwardPHIs() { + DEBUG(if (!ForwardPHIs.empty()) dbgs() << "Patching forward PHIs:\n"); + for (ForwardPHI &F : ForwardPHIs) { + ValuePair Ops = getConverted(F.Val); + F.Lo->setIncomingValue(F.ValueNumber, Ops.Lo); + F.Hi->setIncomingValue(F.ValueNumber, Ops.Hi); + DEBUG(dbgs() << "\t" << *F.Lo << "\n\t" << *F.Hi << "\n"); + } + } + + void eraseReplacedInstructions() { + for (Instruction *I : ToErase) + I->dropAllReferences(); + for (Instruction *I : ToErase) + I->eraseFromParent(); + } + +private: + // Maps illegal values to their new converted lo/hi values. + DenseMap RewrittenIllegals; + // Maps legal values to their new converted value. + DenseMap RewrittenLegals; + // Illegal values which have already been converted, will be erased. + SmallVector ToErase; + // PHIs which were encountered but had forward references. They need to get + // patched up after RPO traversal. + SmallVector ForwardPHIs; +}; +} // Anonymous namespace + +static void convertInstruction(Instruction *Inst, ConversionState &State, + const DataLayout &DL) { + DEBUG(dbgs() << "Expanding Large Integer: " << *Inst << "\n"); + // Set the insert point *after* Inst, so that any instructions inserted here + // will be visited again. That allows iterative expansion of types > i128. 
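+  // For instance (widths chosen for illustration): an i192 value is first
+  // split into an i64 low part and an i128 high part; the freshly created
+  // i128 instructions are inserted after Inst, so the walk reaches them and
+  // splits them again into i64 + i64.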
+ BasicBlock::iterator InsertPos(Inst); + IRBuilder<> IRB(&*++InsertPos); + StringRef Name = Inst->getName(); + + if (PHINode *Phi = dyn_cast(Inst)) { + unsigned N = Phi->getNumIncomingValues(); + TypePair OpTys = getExpandedIntTypes(Phi->getIncomingValue(0)->getType()); + PHINode *Lo = IRB.CreatePHI(OpTys.Lo, N, Twine(Name + ".lo")); + PHINode *Hi = IRB.CreatePHI(OpTys.Hi, N, Twine(Name + ".hi")); + for (unsigned I = 0; I != N; ++I) { + Value *InVal = Phi->getIncomingValue(I); + BasicBlock *InBB = Phi->getIncomingBlock(I); + // If the value hasn't already been converted then this is a + // forward-reference PHI which needs to be patched up after RPO traversal. + ValuePair Ops = State.hasConverted(InVal) + ? State.getConverted(InVal) + : State.recordForwardPHI(InVal, Lo, Hi, I); + Lo->addIncoming(Ops.Lo, InBB); + Hi->addIncoming(Ops.Hi, InBB); + } + State.recordConverted(Phi, {Lo, Hi}); + + } else if (ZExtInst *ZExt = dyn_cast(Inst)) { + Value *Operand = ZExt->getOperand(0); + Type *OpTy = Operand->getType(); + TypePair Tys = getExpandedIntTypes(Inst->getType()); + Value *Lo, *Hi; + if (OpTy->getIntegerBitWidth() <= kChunkBits) { + Lo = IRB.CreateZExt(Operand, Tys.Lo, Twine(Name, ".lo")); + Hi = ConstantInt::get(Tys.Hi, 0); + } else { + ValuePair Ops = State.getConverted(Operand); + Lo = Ops.Lo; + Hi = IRB.CreateZExt(Ops.Hi, Tys.Hi, Twine(Name, ".hi")); + } + State.recordConverted(ZExt, {Lo, Hi}); + + } else if (TruncInst *Trunc = dyn_cast(Inst)) { + Value *Operand = Trunc->getOperand(0); + assert(shouldConvert(Operand) && "TruncInst is expandable but not its op"); + ValuePair Ops = State.getConverted(Operand); + if (!shouldConvert(Inst)) { + Value *NewInst = IRB.CreateTrunc(Ops.Lo, Trunc->getType(), Name); + State.recordConverted(Trunc, NewInst); + } else { + TypePair Tys = getExpandedIntTypes(Trunc->getType()); + assert(Tys.Lo == getExpandedIntTypes(Operand->getType()).Lo); + Value *Lo = Ops.Lo; + Value *Hi = IRB.CreateTrunc(Ops.Hi, Tys.Hi, Twine(Name, ".hi")); + State.recordConverted(Trunc, {Lo, Hi}); + } + + } else if (BinaryOperator *Binop = dyn_cast(Inst)) { + ValuePair Lhs = State.getConverted(Binop->getOperand(0)); + ValuePair Rhs = State.getConverted(Binop->getOperand(1)); + TypePair Tys = getExpandedIntTypes(Binop->getType()); + ValuePair Conv; + switch (Binop->getOpcode()) { + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + Conv = createBit(&IRB, Binop, Lhs, Rhs, Tys, Name); + break; + case Instruction::Shl: + Conv = createShl(&IRB, Binop, Lhs, Rhs, Tys, Name); + break; + case Instruction::AShr: + case Instruction::LShr: + Conv = createShr(&IRB, Binop, Lhs, Rhs, Tys, Name); + break; + case Instruction::Add: { + ValueTriple VT = + createAdd(&IRB, Lhs, Rhs, Tys, Name, /*HiCarryTy=*/nullptr); + Conv = {VT.Lo, VT.Hi}; // Ignore Hi carry. 
+ } break; + case Instruction::Sub: + Conv = createSub(&IRB, Lhs, Rhs, Tys, Name); + break; + default: + DIE_IF(true, Binop, "Binary operator type"); + } + State.recordConverted(Binop, Conv); + + } else if (ICmpInst *ICmp = dyn_cast(Inst)) { + ValuePair Lhs = State.getConverted(ICmp->getOperand(0)); + ValuePair Rhs = State.getConverted(ICmp->getOperand(1)); + TypePair Tys = getExpandedIntTypes(ICmp->getOperand(0)->getType()); + State.recordConverted(ICmp, createICmp(&IRB, ICmp, Lhs, Rhs, Tys, Name)); + + } else if (LoadInst *Load = dyn_cast(Inst)) { + State.recordConverted(Load, createLoad(&IRB, DL, Load)); + + } else if (StoreInst *Store = dyn_cast(Inst)) { + ValuePair StoreVals = State.getConverted(Store->getValueOperand()); + State.recordConverted(Store, createStore(&IRB, DL, Store, StoreVals)); + + } else if (SelectInst *Select = dyn_cast(Inst)) { + Value *Cond = Select->getCondition(); + ValuePair True = State.getConverted(Select->getTrueValue()); + ValuePair False = State.getConverted(Select->getFalseValue()); + Value *Lo = IRB.CreateSelect(Cond, True.Lo, False.Lo, Twine(Name, ".lo")); + Value *Hi = IRB.CreateSelect(Cond, True.Hi, False.Hi, Twine(Name, ".hi")); + State.recordConverted(Select, {Lo, Hi}); + + } else if (BitCastInst *BitCast = dyn_cast(Inst)) { + // XXX EMSCRIPTEN handle bitcast <4 x i32|float> or <2 x double> to i128 + Value *Input = BitCast->getOperand(0); + if (!Input->getType()->isVectorTy()) { + return; // we can't do anything for it, but see below on trivial casts to i128 and back, it might get handled there + } + VectorType *VT = cast(Input->getType()); + Type *ET = VT->getElementType(); + + // handle trivial casts to i128 and immediately back + if (BitCast->hasOneUse()) { + User* U = *BitCast->user_begin(); + if (BitCastInst *UserBitCast = dyn_cast(U)) { + if (UserBitCast->getType()->isVectorTy()) { + Value* Direct = Input; + if (VT != UserBitCast->getType()) { + Direct = IRB.CreateBitCast(Direct, UserBitCast->getType(), Twine(Name, "dcast")); + } + State.recordToErase(BitCast); + State.recordConverted(UserBitCast, Direct); + return; + } + } + } + + Type *I32 = Type::getInt32Ty(VT->getContext()); + + if (VT->getNumElements() == 4) { + assert(ET->isIntegerTy(32) || ET->isFloatTy()); + if (ET->isFloatTy()) { + Input = IRB.CreateBitCast(Input, VectorType::get(I32, 4), Twine(Name, "toint")); + } + } else if (VT->getNumElements() == 2) { + assert(ET->isDoubleTy()); + Input = IRB.CreateBitCast(Input, VectorType::get(I32, 4), Twine(Name, "toint")); + } else { + DIE_IF(true, Inst, "BitCast Instruction"); + } + + Value *P0 = IRB.CreateExtractElement(Input, ConstantInt::get(I32, 0), Twine(Name, ".p0")); + Value *P1 = IRB.CreateExtractElement(Input, ConstantInt::get(I32, 1), Twine(Name, ".p1")); + Value *P2 = IRB.CreateExtractElement(Input, ConstantInt::get(I32, 2), Twine(Name, ".p2")); + Value *P3 = IRB.CreateExtractElement(Input, ConstantInt::get(I32, 3), Twine(Name, ".p3")); + + Type *I64 = Type::getInt64Ty(VT->getContext()); + P0 = IRB.CreateZExt(P0, I64, Twine(Name, ".p0.64")); + P1 = IRB.CreateZExt(P1, I64, Twine(Name, ".p1.64")); + P2 = IRB.CreateZExt(P2, I64, Twine(Name, ".p2.64")); + P3 = IRB.CreateZExt(P3, I64, Twine(Name, ".p3.64")); + + Value *Lo = IRB.CreateBinOp(Instruction::BinaryOps::Or, P0, IRB.CreateBinOp(Instruction::BinaryOps::Shl, P1, ConstantInt::get(I64, 32), Twine(Name, ".mid.lo")), Twine(Name, ".lo")); + Value *Hi = IRB.CreateBinOp(Instruction::BinaryOps::Or, P2, IRB.CreateBinOp(Instruction::BinaryOps::Shl, P3, ConstantInt::get(I64, 32), 
Twine(Name, ".mid.hi")), Twine(Name, ".hi")); + State.recordConverted(BitCast, {Lo, Hi}); + + } else { + DIE_IF(true, Inst, "Instruction"); + } +} + +bool ExpandLargeIntegers::runOnFunction(Function &F) { + // Don't support changing the function arguments. Illegal function arguments + // should not be generated by clang. + for (const Argument &Arg : F.args()) + if (shouldConvert(&Arg)) + report_fatal_error("Function " + F.getName() + + " has illegal integer argument"); + + // TODO(jfb) This should loop to handle nested forward PHIs. + + ConversionState State; + DataLayout DL(F.getParent()); + bool Modified = false; + ReversePostOrderTraversal RPOT(&F); + for (ReversePostOrderTraversal::rpo_iterator FI = RPOT.begin(), + FE = RPOT.end(); + FI != FE; ++FI) { + BasicBlock *BB = *FI; + for (Instruction &I : *BB) { + // Only attempt to convert an instruction if its result or any of its + // operands are illegal. + bool ShouldConvert = shouldConvert(&I); + for (Value *Op : I.operands()) + ShouldConvert |= shouldConvert(Op); + if (ShouldConvert) { + convertInstruction(&I, State, DL); + Modified = true; + } + } + } + State.patchForwardPHIs(); + State.eraseReplacedInstructions(); + return Modified; +} + +FunctionPass *llvm::createExpandLargeIntegersPass() { + return new ExpandLargeIntegers(); +} diff --git a/lib/Target/JSBackend/NaCl/ExpandShuffleVector.cpp b/lib/Target/JSBackend/NaCl/ExpandShuffleVector.cpp new file mode 100644 index 000000000000..7212668216bd --- /dev/null +++ b/lib/Target/JSBackend/NaCl/ExpandShuffleVector.cpp @@ -0,0 +1,110 @@ +//===- ExpandShuffleVector.cpp - shufflevector to {insert/extract}element -===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Replace all shufflevector instructions by insertelement / extractelement. +// BackendCanonicalize is able to reconstruct the shufflevector. +// +//===----------------------------------------------------------------------===// + +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/Pass.h" +#include "llvm/Transforms/NaCl.h" + +using namespace llvm; + +namespace { +class ExpandShuffleVector : public BasicBlockPass { +public: + static char ID; // Pass identification, replacement for typeid + ExpandShuffleVector() : BasicBlockPass(ID), M(0) { + initializeExpandShuffleVectorPass(*PassRegistry::getPassRegistry()); + } + using BasicBlockPass::doInitialization; + bool doInitialization(Module &Mod) override { + M = &Mod; + return false; // Unchanged. 
+ } + bool runOnBasicBlock(BasicBlock &BB) override; + +private: + const Module *M; + void Expand(ShuffleVectorInst *Shuf, Type *Int32); +}; +} + +char ExpandShuffleVector::ID = 0; +INITIALIZE_PASS( + ExpandShuffleVector, "expand-shufflevector", + "Expand shufflevector instructions into insertelement and extractelement", + false, false) + +void ExpandShuffleVector::Expand(ShuffleVectorInst *Shuf, Type *Int32) { + Value *L = Shuf->getOperand(0); + Value *R = Shuf->getOperand(1); + assert(L->getType() == R->getType()); + VectorType *SrcVecTy = cast(L->getType()); + VectorType *DstVecTy = Shuf->getType(); + Type *ElemTy = DstVecTy->getElementType(); + SmallVector Mask = Shuf->getShuffleMask(); + unsigned NumSrcElems = SrcVecTy->getNumElements(); + unsigned NumDstElems = Mask.size(); + + // Start with an undefined vector, extract each element from either L + // or R according to the Mask, and insert it into contiguous element + // locations in the result vector. + // + // The sources for shufflevector must have the same type but the + // destination could be a narrower or wider vector with the same + // element type. + Instruction *ExtractLoc = Shuf; + Value *Res = UndefValue::get(DstVecTy); + for (unsigned Elem = 0; Elem != NumDstElems; ++Elem) { + bool IsUndef = + 0 > Mask[Elem] || static_cast(Mask[Elem]) >= NumSrcElems * 2; + bool IsL = static_cast(Mask[Elem]) < NumSrcElems; + Value *From = IsL ? L : R; + int Adjustment = IsL ? 0 : NumSrcElems; + Constant *ExtractIdx = ConstantInt::get(Int32, Mask[Elem] - Adjustment); + Constant *InsertIdx = ConstantInt::get(Int32, Elem); + Value *ElemToInsert = IsUndef ? UndefValue::get(ElemTy) + : (Value *)ExtractElementInst::Create( + From, ExtractIdx, "", ExtractLoc); + Res = InsertElementInst::Create(Res, ElemToInsert, InsertIdx, "", Shuf); + if (ExtractLoc == Shuf) + // All the extracts should be added just before the first insert we added. + ExtractLoc = cast(Res); + } + + Shuf->replaceAllUsesWith(Res); + Shuf->eraseFromParent(); +} + +bool ExpandShuffleVector::runOnBasicBlock(BasicBlock &BB) { + Type *Int32 = Type::getInt32Ty(M->getContext()); + typedef SmallVector Instructions; + Instructions Shufs; + + for (BasicBlock::iterator BBI = BB.begin(); BBI != BB.end(); ++BBI) + if (ShuffleVectorInst *S = dyn_cast(&*BBI)) + Shufs.push_back(S); + + for (Instructions::iterator S = Shufs.begin(), E = Shufs.end(); S != E; ++S) + Expand(*S, Int32); + + return !Shufs.empty(); +} + +BasicBlockPass *llvm::createExpandShuffleVectorPass() { + return new ExpandShuffleVector(); +} diff --git a/lib/Target/JSBackend/NaCl/ExpandSmallArguments.cpp b/lib/Target/JSBackend/NaCl/ExpandSmallArguments.cpp new file mode 100644 index 000000000000..a2c58034568d --- /dev/null +++ b/lib/Target/JSBackend/NaCl/ExpandSmallArguments.cpp @@ -0,0 +1,250 @@ +//===- ExpandSmallArguments.cpp - Expand out arguments smaller than i32----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// LLVM IR allows function return types and argument types such as +// "zeroext i8" and "signext i8". 
The Language Reference says that +// zeroext "indicates to the code generator that the parameter or +// return value should be zero-extended to the extent required by the +// target's ABI (which is usually 32-bits, but is 8-bits for a i1 on +// x86-64) by the caller (for a parameter) or the callee (for a return +// value)". +// +// This can lead to non-portable behaviour when calling functions +// without C prototypes or with wrong C prototypes. +// +// In order to remove this non-portability from PNaCl, and to simplify +// the language that the PNaCl translator accepts, the +// ExpandSmallArguments pass widens integer arguments and return types +// to be at least 32 bits. The pass inserts explicit cast +// instructions (ZExtInst/SExtInst/TruncInst) as needed. +// +// The pass chooses between ZExtInst and SExtInst widening based on +// whether a "signext" attribute is present. However, in principle +// the pass could always use zero-extension, because the extent to +// which either zero-extension or sign-extension is done is up to the +// target ABI, which is up to PNaCl to specify. +// +//===----------------------------------------------------------------------===// + +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Transforms/NaCl.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +using namespace llvm; + +namespace { + // This is a ModulePass because the pass recreates functions in + // order to change their arguments' types. + class ExpandSmallArguments : public ModulePass { + public: + static char ID; // Pass identification, replacement for typeid + ExpandSmallArguments() : ModulePass(ID) { + initializeExpandSmallArgumentsPass(*PassRegistry::getPassRegistry()); + } + + virtual bool runOnModule(Module &M); + }; +} + +char ExpandSmallArguments::ID = 0; +INITIALIZE_PASS(ExpandSmallArguments, "expand-small-arguments", + "Expand function arguments to be at least 32 bits in size", + false, false) + +// Returns the normalized version of the given argument/return type. +static Type *NormalizeType(Type *Ty) { + if (IntegerType *IntTy = dyn_cast(Ty)) { + if (IntTy->getBitWidth() < 32) { + return IntegerType::get(Ty->getContext(), 32); + } + } + return Ty; +} + +// Returns the normalized version of the given function type. +static FunctionType *NormalizeFunctionType(FunctionType *FTy) { + if (FTy->isVarArg()) { + report_fatal_error( + "ExpandSmallArguments does not handle varargs functions"); + } + SmallVector ArgTypes; + for (unsigned I = 0; I < FTy->getNumParams(); ++I) { + ArgTypes.push_back(NormalizeType(FTy->getParamType(I))); + } + return FunctionType::get(NormalizeType(FTy->getReturnType()), + ArgTypes, false); +} + +// Convert the given function to use normalized argument/return types. +static bool ConvertFunction(Function *Func) { + FunctionType *FTy = Func->getFunctionType(); + FunctionType *NFTy = NormalizeFunctionType(FTy); + if (NFTy == FTy) + return false; // No change needed. + Function *NewFunc = RecreateFunction(Func, NFTy); + + // Move the arguments across to the new function. 
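+  // For example (hypothetical signature): an "i8 zeroext %c" parameter is
+  // widened to "i32 %c" in the new function, and a
+  //   %c.arg_trunc = trunc i32 %c to i8
+  // is inserted at the entry block so existing uses keep the narrow type.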
+ for (Function::arg_iterator I = Func->arg_begin(), E = Func->arg_end(), + NewI = NewFunc->arg_begin(); + I != E; ++I, ++NewI) { + auto Arg = &*I; + auto NewArg = &*NewI; + NewArg->takeName(Arg); + if (Arg->getType() == NewArg->getType()) { + Arg->replaceAllUsesWith(NewArg); + } else { + Instruction *Trunc = new TruncInst( + NewArg, Arg->getType(), NewArg->getName() + ".arg_trunc", + &*NewFunc->getEntryBlock().getFirstInsertionPt()); + Arg->replaceAllUsesWith(Trunc); + } + } + + if (FTy->getReturnType() != NFTy->getReturnType()) { + // Fix up return instructions. + Instruction::CastOps CastType = + Func->getAttributes().hasAttribute(0, Attribute::SExt) ? + Instruction::SExt : Instruction::ZExt; + for (Function::iterator BB = NewFunc->begin(), E = NewFunc->end(); + BB != E; + ++BB) { + for (BasicBlock::iterator Iter = BB->begin(), E = BB->end(); + Iter != E; ) { + Instruction *Inst = &*Iter++; + if (ReturnInst *Ret = dyn_cast(Inst)) { + Value *Ext = CopyDebug( + CastInst::Create(CastType, Ret->getReturnValue(), + NFTy->getReturnType(), + Ret->getReturnValue()->getName() + ".ret_ext", + Ret), + Ret); + CopyDebug(ReturnInst::Create(Ret->getContext(), Ext, Ret), Ret); + Ret->eraseFromParent(); + } + } + } + } + + Func->eraseFromParent(); + return true; +} + +// Convert the given call to use normalized argument/return types. +template static bool ConvertCall(T *Call, Pass *P) { + // Don't try to change calls to intrinsics. + if (isa(Call)) + return false; + FunctionType *FTy = cast( + Call->getCalledValue()->getType()->getPointerElementType()); + FunctionType *NFTy = NormalizeFunctionType(FTy); + if (NFTy == FTy) + return false; // No change needed. + + // Convert arguments. + SmallVector Args; + for (unsigned I = 0; I < Call->getNumArgOperands(); ++I) { + Value *Arg = Call->getArgOperand(I); + if (NFTy->getParamType(I) != FTy->getParamType(I)) { + Instruction::CastOps CastType = + Call->getAttributes().hasAttribute(I + 1, Attribute::SExt) ? 
+ Instruction::SExt : Instruction::ZExt; + Arg = CopyDebug(CastInst::Create(CastType, Arg, NFTy->getParamType(I), + "arg_ext", Call), Call); + } + Args.push_back(Arg); + } + Value *CastFunc = + CopyDebug(new BitCastInst(Call->getCalledValue(), NFTy->getPointerTo(), + Call->getName() + ".arg_cast", Call), Call); + Value *Result = NULL; + if (CallInst *OldCall = dyn_cast(Call)) { + CallInst *NewCall = CopyDebug(CallInst::Create(CastFunc, Args, "", OldCall), + OldCall); + NewCall->takeName(OldCall); + NewCall->setAttributes(OldCall->getAttributes()); + NewCall->setCallingConv(OldCall->getCallingConv()); + NewCall->setTailCall(OldCall->isTailCall()); + Result = NewCall; + + if (FTy->getReturnType() != NFTy->getReturnType()) { + Result = CopyDebug(new TruncInst(NewCall, FTy->getReturnType(), + NewCall->getName() + ".ret_trunc", Call), + Call); + } + } else if (InvokeInst *OldInvoke = dyn_cast(Call)) { + BasicBlock *Parent = OldInvoke->getParent(); + BasicBlock *NormalDest = OldInvoke->getNormalDest(); + BasicBlock *UnwindDest = OldInvoke->getUnwindDest(); + + if (FTy->getReturnType() != NFTy->getReturnType()) { + if (BasicBlock *SplitDest = SplitCriticalEdge(Parent, NormalDest)) { + NormalDest = SplitDest; + } + } + + InvokeInst *New = CopyDebug(InvokeInst::Create(CastFunc, NormalDest, + UnwindDest, Args, + "", OldInvoke), + OldInvoke); + New->takeName(OldInvoke); + + if (FTy->getReturnType() != NFTy->getReturnType()) { + Result = CopyDebug(new TruncInst(New, FTy->getReturnType(), + New->getName() + ".ret_trunc", + NormalDest->getTerminator()), + OldInvoke); + } else { + Result = New; + } + + New->setAttributes(OldInvoke->getAttributes()); + New->setCallingConv(OldInvoke->getCallingConv()); + } + Call->replaceAllUsesWith(Result); + Call->eraseFromParent(); + return true; +} + +bool ExpandSmallArguments::runOnModule(Module &M) { + bool Changed = false; + for (Module::iterator Iter = M.begin(), E = M.end(); Iter != E; ) { + Function *Func = &*Iter++; + // Don't try to change intrinsic declarations because intrinsics + // will continue to have non-normalized argument types. For + // example, memset() takes an i8 argument. It shouldn't matter + // whether we modify the types of other function declarations, but + // we don't expect to see non-intrinsic function declarations in a + // PNaCl pexe. + if (Func->empty()) + continue; + + for (Function::iterator BB = Func->begin(), E = Func->end(); BB != E; + ++BB) { + for (BasicBlock::iterator Iter = BB->begin(), E = BB->end(); Iter != E;) { + Instruction *Inst = &*Iter++; + if (CallInst *Call = dyn_cast(Inst)) { + Changed |= ConvertCall(Call, this); + } else if (InvokeInst *Invoke = dyn_cast(Inst)) { + Changed |= ConvertCall(Invoke, this); + } + } + } + + Changed |= ConvertFunction(Func); + } + return Changed; +} + +ModulePass *llvm::createExpandSmallArgumentsPass() { + return new ExpandSmallArguments(); +} diff --git a/lib/Target/JSBackend/NaCl/ExpandStructRegs.cpp b/lib/Target/JSBackend/NaCl/ExpandStructRegs.cpp new file mode 100644 index 000000000000..ca1fb9db5da6 --- /dev/null +++ b/lib/Target/JSBackend/NaCl/ExpandStructRegs.cpp @@ -0,0 +1,655 @@ +//===- ExpandStructRegs.cpp - Expand out variables with struct type--------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass expands out some uses of LLVM variables +// (a.k.a. 
registers) of struct type. It replaces loads and stores of +// structs with separate loads and stores of the structs' fields. The +// motivation is to omit struct types from PNaCl's stable ABI. +// +// ExpandStructRegs does not yet handle all possible uses of struct +// values. It is intended to handle the uses that Clang and the SROA +// pass generate. Clang generates struct loads and stores, along with +// extractvalue instructions, in its implementation of C++ method +// pointers, and the SROA pass sometimes converts this code to using +// insertvalue instructions too. +// +// ExpandStructRegs does not handle: +// +// * Array types. +// * Function types containing arguments or return values of struct +// type without the "byval" or "sret" attributes. Since by-value +// struct-passing generally uses "byval"/"sret", this does not +// matter. +// +// Other limitations: +// +// * ExpandStructRegs does not attempt to use memcpy() where that +// might be more appropriate than copying fields individually. +// * ExpandStructRegs does not preserve the contents of padding +// between fields when copying structs. However, the contents of +// padding fields are not defined anyway. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/NaCl.h" + +#define DEBUG_TYPE "expand-struct-regs" + +using namespace llvm; + +namespace { +struct ExpandStructRegs : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + ExpandStructRegs() : FunctionPass(ID) { + initializeExpandStructRegsPass(*PassRegistry::getPassRegistry()); + } + + virtual bool runOnFunction(Function &F); +}; +} + +char ExpandStructRegs::ID = 0; +INITIALIZE_PASS(ExpandStructRegs, "expand-struct-regs", + "Expand out variables with struct types", false, false) + +static bool DoAnotherPass(Type *Ty) { + return isa(Ty) || isa(Ty); +} +static bool DoAnotherPass(Value *V) { return DoAnotherPass(V->getType()); } + +static bool SplitUpPHINode(PHINode *Phi) { + StructType *STy = cast(Phi->getType()); + + Value *NewStruct = UndefValue::get(STy); + Instruction *NewStructInsertPt = &*Phi->getParent()->getFirstInsertionPt(); + + bool NeedsAnotherPass = false; + + // Create a separate PHINode for each struct field. + for (unsigned Index = 0; Index < STy->getNumElements(); ++Index) { + SmallVector EVIndexes; + EVIndexes.push_back(Index); + + Type *ElemTy = STy->getElementType(Index); + NeedsAnotherPass = NeedsAnotherPass || DoAnotherPass(ElemTy); + + PHINode *NewPhi = PHINode::Create(ElemTy, Phi->getNumIncomingValues(), + Phi->getName() + ".index", Phi); + CopyDebug(NewPhi, Phi); + for (unsigned PhiIndex = 0; PhiIndex < Phi->getNumIncomingValues(); + ++PhiIndex) { + BasicBlock *IncomingBB = Phi->getIncomingBlock(PhiIndex); + Value *EV = CopyDebug( + ExtractValueInst::Create(Phi->getIncomingValue(PhiIndex), EVIndexes, + Phi->getName() + ".extract", + IncomingBB->getTerminator()), + Phi); + NewPhi->addIncoming(EV, IncomingBB); + } + + // Reconstruct the original struct value. 
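+    // The rebuilt struct is a chain of insertvalue instructions; it exists
+    // so the original phi's users can be redirected below, and extractvalue
+    // users of it can later be folded away by ExpandExtractValue.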
+ NewStruct = CopyDebug(InsertValueInst::Create(NewStruct, NewPhi, EVIndexes, + Phi->getName() + ".insert", + NewStructInsertPt), + Phi); + } + Phi->replaceAllUsesWith(NewStruct); + Phi->eraseFromParent(); + + return NeedsAnotherPass; +} + +static bool SplitUpSelect(SelectInst *Select) { + StructType *STy = cast(Select->getType()); + Value *NewStruct = UndefValue::get(STy); + + bool NeedsAnotherPass = false; + // Create a separate SelectInst for each struct field. + for (unsigned Index = 0; Index < STy->getNumElements(); ++Index) { + SmallVector EVIndexes; + EVIndexes.push_back(Index); + + Value *TrueVal = CopyDebug( + ExtractValueInst::Create(Select->getTrueValue(), EVIndexes, + Select->getName() + ".extract", Select), + Select); + Value *FalseVal = CopyDebug( + ExtractValueInst::Create(Select->getFalseValue(), EVIndexes, + Select->getName() + ".extract", Select), + Select); + Value *NewSelect = + CopyDebug(SelectInst::Create(Select->getCondition(), TrueVal, FalseVal, + Select->getName() + ".index", Select), + Select); + + NeedsAnotherPass = NeedsAnotherPass || DoAnotherPass(NewSelect); + + // Reconstruct the original struct value. + NewStruct = CopyDebug( + InsertValueInst::Create(NewStruct, NewSelect, EVIndexes, + Select->getName() + ".insert", Select), + Select); + } + Select->replaceAllUsesWith(NewStruct); + Select->eraseFromParent(); + + return NeedsAnotherPass; +} + +template +static void ProcessLoadOrStoreAttrs(InstType *Dest, InstType *Src, + StructType* STy, const unsigned Index, + const DataLayout *DL) { + CopyDebug(Dest, Src); + Dest->setVolatile(Src->isVolatile()); + if (Src->isAtomic()) { + errs() << "Use: " << *Src << "\n"; + report_fatal_error("Atomic struct loads/stores not supported"); + } + + if (!Src->getAlignment()) { + return; + } + + const StructLayout *SL = DL->getStructLayout(STy); + const unsigned Alignment = Src->getAlignment(); + Dest->setAlignment(MinAlign(Alignment, SL->getElementOffset(Index))); +} + +template +static void ProcessArrayLoadOrStoreAttrs(InstType *Dest, InstType *Src, + ArrayType* ATy, const unsigned Index, + const DataLayout *DL) { + CopyDebug(Dest, Src); + Dest->setVolatile(Src->isVolatile()); + if (Src->isAtomic()) { + errs() << "Use: " << *Src << "\n"; + report_fatal_error("Atomic struct loads/stores not supported"); + } + + if (!Src->getAlignment()) { + return; + } + + const unsigned Alignment = Src->getAlignment(); + Dest->setAlignment(MinAlign(Alignment, Index * DL->getTypeSizeInBits(ATy->getElementType()))); +} + +static bool SplitUpStore(StoreInst *Store, const DataLayout *DL) { + StructType *STy = cast(Store->getValueOperand()->getType()); + + bool NeedsAnotherPass = false; + // Create a separate store instruction for each struct field. 
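+  // E.g. (hypothetical types) storing a { i32, float } value becomes two
+  // getelementptrs (indices 0,0 and 0,1), two extractvalues and two scalar
+  // stores; when the original store carried an explicit alignment, each new
+  // store gets MinAlign(original alignment, field offset) instead.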
+ for (unsigned Index = 0; Index < STy->getNumElements(); ++Index) { + SmallVector Indexes; + Indexes.push_back(ConstantInt::get(Store->getContext(), APInt(32, 0))); + Indexes.push_back(ConstantInt::get(Store->getContext(), APInt(32, Index))); + Value *GEP = + CopyDebug(GetElementPtrInst::Create( + STy, + Store->getPointerOperand(), Indexes, + Store->getPointerOperand()->getName() + ".index", Store), + Store); + NeedsAnotherPass = + NeedsAnotherPass || DoAnotherPass(GEP->getType()->getContainedType(0)); + + SmallVector EVIndexes; + EVIndexes.push_back(Index); + Value *Field = ExtractValueInst::Create(Store->getValueOperand(), EVIndexes, + "", Store); + StoreInst *NewStore = new StoreInst(Field, GEP, Store); + ProcessLoadOrStoreAttrs(NewStore, Store, STy, Index, DL); + } + Store->eraseFromParent(); + + return NeedsAnotherPass; +} + +static bool SplitUpLoad(LoadInst *Load, const DataLayout *DL) { + StructType *STy = cast(Load->getType()); + Value *NewStruct = UndefValue::get(STy); + + bool NeedsAnotherPass = false; + // Create a separate load instruction for each struct field. + for (unsigned Index = 0; Index < STy->getNumElements(); ++Index) { + SmallVector Indexes; + Indexes.push_back(ConstantInt::get(Load->getContext(), APInt(32, 0))); + Indexes.push_back(ConstantInt::get(Load->getContext(), APInt(32, Index))); + Value *GEP = + CopyDebug(GetElementPtrInst::Create(STy, + Load->getPointerOperand(), Indexes, + Load->getName() + ".index", Load), + Load); + LoadInst *NewLoad = new LoadInst(GEP, Load->getName() + ".field", Load); + + NeedsAnotherPass = NeedsAnotherPass || DoAnotherPass(NewLoad); + ProcessLoadOrStoreAttrs(NewLoad, Load, STy, Index, DL); + + // Reconstruct the struct value. + SmallVector EVIndexes; + EVIndexes.push_back(Index); + NewStruct = + CopyDebug(InsertValueInst::Create(NewStruct, NewLoad, EVIndexes, + Load->getName() + ".insert", Load), + Load); + } + Load->replaceAllUsesWith(NewStruct); + Load->eraseFromParent(); + + return NeedsAnotherPass; +} + +static bool SplitUpArrayStore(StoreInst *Store, const DataLayout *DL) { + ArrayType *ATy = cast(Store->getValueOperand()->getType()); + + bool NeedsAnotherPass = false; + // Create a separate store instruction for each array field. + for (unsigned Index = 0; Index < ATy->getNumElements(); ++Index) { + SmallVector Indexes; + Indexes.push_back(ConstantInt::get(Store->getContext(), APInt(32, 0))); + Indexes.push_back(ConstantInt::get(Store->getContext(), APInt(32, Index))); + Value *GEP = + CopyDebug(GetElementPtrInst::Create( + ATy, + Store->getPointerOperand(), Indexes, + Store->getPointerOperand()->getName() + ".index", Store), + Store); + NeedsAnotherPass = + NeedsAnotherPass || DoAnotherPass(GEP->getType()->getContainedType(0)); + + SmallVector EVIndexes; + EVIndexes.push_back(Index); + Value *Field = ExtractValueInst::Create(Store->getValueOperand(), EVIndexes, + "", Store); + StoreInst *NewStore = new StoreInst(Field, GEP, Store); + ProcessArrayLoadOrStoreAttrs(NewStore, Store, ATy, Index, DL); + } + Store->eraseFromParent(); + + return NeedsAnotherPass; +} + +static bool SplitUpArrayLoad(LoadInst *Load, const DataLayout *DL) { + ArrayType *ATy = cast(Load->getType()); + Value *NewArray = UndefValue::get(ATy); + + bool NeedsAnotherPass = false; + // Create a separate load instruction for each array field. 
+ for (unsigned Index = 0; Index < ATy->getNumElements(); ++Index) { + SmallVector Indexes; + Indexes.push_back(ConstantInt::get(Load->getContext(), APInt(32, 0))); + Indexes.push_back(ConstantInt::get(Load->getContext(), APInt(32, Index))); + Value *GEP = + CopyDebug(GetElementPtrInst::Create(ATy, + Load->getPointerOperand(), Indexes, + Load->getName() + ".index", Load), + Load); + LoadInst *NewLoad = new LoadInst(GEP, Load->getName() + ".field", Load); + + NeedsAnotherPass = NeedsAnotherPass || DoAnotherPass(NewLoad); + ProcessArrayLoadOrStoreAttrs(NewLoad, Load, ATy, Index, DL); + + // Reconstruct the array value. + SmallVector EVIndexes; + EVIndexes.push_back(Index); + NewArray = + CopyDebug(InsertValueInst::Create(NewArray, NewLoad, EVIndexes, + Load->getName() + ".insert", Load), + Load); + } + Load->replaceAllUsesWith(NewArray); + Load->eraseFromParent(); + + return NeedsAnotherPass; +} + +static bool SplitUpArraySelect(SelectInst *Select) { + ArrayType *ATy = cast(Select->getType()); + Value *NewArray = UndefValue::get(ATy); + + bool NeedsAnotherPass = false; + // Create a separate select instruction for each array field. + for (unsigned Index = 0; Index < ATy->getNumElements(); ++Index) { + SmallVector EVIndexes; + EVIndexes.push_back(Index); + Value *TrueValue = ExtractValueInst::Create(Select->getTrueValue(), + EVIndexes, "", Select); + Value *FalseValue = ExtractValueInst::Create(Select->getFalseValue(), + EVIndexes, "", Select); + SelectInst *NewSelect = SelectInst::Create(Select->getCondition(), + TrueValue, FalseValue, + Select->getName(), + Select); + NeedsAnotherPass = NeedsAnotherPass || DoAnotherPass(NewSelect); + + // Reconstruct the struct value. + SmallVector IVIndexes; + IVIndexes.push_back(Index); + NewArray = + CopyDebug(InsertValueInst::Create(NewArray, NewSelect, IVIndexes, + Select->getName() + ".insert", Select), + Select); + } + Select->replaceAllUsesWith(NewArray); + Select->eraseFromParent(); + + return NeedsAnotherPass; +} + +static bool SplitUpArrayPHINode(PHINode *Phi) { + ArrayType *ATy = cast(Phi->getType()); + + Value *NewArray = UndefValue::get(ATy); + Instruction *NewArrayInsertPt = &*Phi->getParent()->getFirstInsertionPt(); + + bool NeedsAnotherPass = false; + + // Create a separate PHINode for each array field. + for (unsigned Index = 0; Index < ATy->getNumElements(); ++Index) { + SmallVector EVIndexes; + EVIndexes.push_back(Index); + + Type *ElemTy = ATy->getElementType(); + NeedsAnotherPass = NeedsAnotherPass || DoAnotherPass(ElemTy); + + PHINode *NewPhi = PHINode::Create(ElemTy, Phi->getNumIncomingValues(), + Phi->getName() + ".index", Phi); + CopyDebug(NewPhi, Phi); + for (unsigned PhiIndex = 0; PhiIndex < Phi->getNumIncomingValues(); + ++PhiIndex) { + BasicBlock *IncomingBB = Phi->getIncomingBlock(PhiIndex); + Value *EV = CopyDebug( + ExtractValueInst::Create(Phi->getIncomingValue(PhiIndex), EVIndexes, + Phi->getName() + ".extract", + IncomingBB->getTerminator()), + Phi); + NewPhi->addIncoming(EV, IncomingBB); + } + + // Reconstruct the original array value. 
+ NewArray = CopyDebug(InsertValueInst::Create(NewArray, NewPhi, EVIndexes, + Phi->getName() + ".insert", + NewArrayInsertPt), + Phi); + } + Phi->replaceAllUsesWith(NewArray); + Phi->eraseFromParent(); + + return NeedsAnotherPass; +} + +static bool ExpandExtractValue(ExtractValueInst *EV, + SmallVectorImpl *ToErase) { + // Search for the insertvalue instruction that inserts the struct field + // referenced by this extractvalue instruction, excluding CmpXchg which + // returns a struct and is handled by RewriteAtomics. + Value *StructVal = EV->getAggregateOperand(); + Value *ResultField = nullptr; + + // The current depth of the search. It's impossible to backtrack in our search + // tree (all prior (not in the CFG sense) extractvalues will already be + // expanded), so this variable is never reset to zero. + size_t EVIndex = 0; + + // Some intrinsics and cmpxchg returns struct vals and this pass can't do + // anything but ignore them. + if (isa(StructVal) || isa(StructVal)) + return false; + + for (;;) { + DEBUG(dbgs() << "Expanding struct value: " << *StructVal << "\n"); + + if (InsertValueInst *IV = dyn_cast(StructVal)) { + + size_t IVIndex = 0; + for (; EVIndex < EV->getIndices().size() && + IVIndex < IV->getIndices().size(); + ++IVIndex, ++EVIndex) { + + const bool Equal = + (EV->getIndices()[EVIndex] == IV->getIndices()[IVIndex]); + + if (IVIndex + 1 == IV->getIndices().size() && Equal) { + if (EVIndex + 1 == EV->getIndices().size()) { + // Exact match. We break out of all loops and ResultField will + // replace EV. + ResultField = IV->getInsertedValueOperand(); + } else { + // We've found a match, but haven't reached the end of EV's indexes. + // We continue looping through the outermost loop, and search for + // indices on the next level down (ie we increment EVIndex). + // This branch is common when encountering nested insertvalues; for + // example: + // ```llvm + // %1 = insertvalue { i32 } undef, i32 1, 0 + // %2 = insertvalue { { i32 } } %1, { i32 } %1, 0 + // %3 = extractvalue { { i32 } } %2, 0, 0 + // ``` + StructVal = IV->getInsertedValueOperand(); + ++EVIndex; + } + break; + } else if (!Equal) { + // No match. Try the next struct value in the chain. + // For example: + // ```llvm + // %1 = insertvalue { i32, i32, i32 } undef, i32 5, 0 + // %2 = insertvalue { i32, i32, i32 } %1, i32 10, 1 + // %3 = insertvalue { i32, i32, i32 } %2, i32 15, 2 + // %4 = extractvalue { i32, i32, i32 } %3, 0 + // ``` + // In this case, to expand %4, this branch will hit insertvalues %3 + // and %2 before + // it finds the solution, %1. + StructVal = IV->getAggregateOperand(); + break; + } + + // One last case worth mentioning: + // ```llvm + // %aa = alloca { i32 } + // %a = insertvalue { i32 } undef, i32 1, 0 + // %b = insertvalue { { i32 } } undef, { i32 } %a, 0 + // %c = extractvalue { { i32 } } %b, 0 + // store { i32 } %c, { i32 }* %aa + // ``` + // In the case of %c, the condition of our inner loop will be false, and + // we will fall into (EVIndex == EV->getIndices().size()) + // Note that in this case, SplitStore will have inserted an extra + // extractvalue and GEP: + // ```llvm + // %aa = alloca { i32 } + // %a = insertvalue { i32 } undef, i32 1, 0 + // %b = insertvalue { { i32 } } undef, { i32 } %a, 0 + // %c.extractval = extractvalue { i32 } %a, 0 + // %aa.index = getelementptr { i32 }* %aa, i32 0, i32 0 + // store i32 %c, i32* %aa.index + // ``` + } + if (ResultField) { + // \O/ We're done with this ExtractValueInst! 
+ break; + } else if (EVIndex == EV->getIndices().size()) { + // We've found an insertvalue that inserts at one or more levels deeper + // than this extractvalue. For example (borrowed from the tests), where + // %h is EV && %e is IV: + // ```llvm + // %e = insertvalue { { { i32, i64 } }, i64 } undef, { i32, i64 } %b, 0, 0 + // %h = extractvalue { { { i32, i64 } }, i64 } %e, 0 + // ; later on.. + // %1 = extractvalue { { i32, i64 } } %h, 0 + // ``` + // This expands to: + // ```llvm + // %e = insertvalue { { { i32, i64 } }, i64 } undef, { i32, i64 } %b, 0, 0 + // %1 = insertvalue { { i32, i64 } } undef, { i32, i64 } %b, 0 + // %h = extractvalue { { { i32, i64 } }, i64 } %e, 0 + // %2 = extractvalue { { i32, i64 } } %h, 0 + // ``` + // Then, outside the outer loop, %h is deleted: + // ```llvm + // %e = insertvalue { { { i32, i64 } }, i64 } undef, { i32, i64 } %b, 0, 0 + // %1 = insertvalue { { i32, i64 } } undef, { i32, i64 } %b, 0 + // %2 = extractvalue { { i32, i64 } } %1, 0 + // ``` + // %2 will be expanded at a later point. + // This branch used the second index in %e to create %1 (because %2 && + // %e's first indices where equal). + // + // Additionally, it's impossible to not change StructVal && not hit this + // branch (but the reverse is not true!). + + SmallVector Indices(IV->getIndices().begin() + IVIndex, + IV->getIndices().end()); + + InsertValueInst *Insert = InsertValueInst::Create( + UndefValue::get(EV->getType()), IV->getInsertedValueOperand(), + Indices, "", EV); + ToErase->push_back(Insert); + ResultField = CopyDebug(Insert, EV); + break; + } + + // At this point, StructVal must be changed. + } else if (Constant *C = dyn_cast(StructVal)) { + SmallVector Indices(EV->getIndices().begin() + EVIndex, + EV->getIndices().end()); + ResultField = ConstantExpr::getExtractValue(C, Indices); + break; + } else if (isa(StructVal)) { + ResultField = StructVal; + break; + } else { + errs() << "Value: " << *StructVal << "\n"; + report_fatal_error("Unrecognized struct value"); + } + } + + assert(ResultField); // Failsafe. + EV->replaceAllUsesWith(ResultField); + EV->eraseFromParent(); + return true; +} + +static bool ExpandExtractValues(Function &Func, bool Finalize) { + bool Changed = false; + + SmallVector ToErase; + // Expand out all the extractvalue instructions. Also collect up + // the insertvalue instructions for later deletion so that we do not + // need to make extra passes across the whole function. + + for (auto &BB : Func) { + for (BasicBlock::iterator Iter = BB.begin(), E = BB.end(); Iter != E;) { + Instruction *Inst = &*Iter++; + if (ExtractValueInst *EV = dyn_cast(Inst)) { + Changed |= ExpandExtractValue(EV, &ToErase); + } else if (isa(Inst)) { + ToErase.push_back(Inst); + Changed = true; + } + } + } + + if (Finalize) { + // Delete the insertvalue instructions. These can reference each + // other, so we must do dropAllReferences() before doing + // eraseFromParent(), otherwise we will try to erase instructions + // that are still referenced. + for (Instruction *I : ToErase) { + I->dropAllReferences(); + } + + for (Instruction *I : ToErase) { + I->eraseFromParent(); + } + } + + return Changed; +} + +bool ExpandStructRegs::runOnFunction(Function &Func) { + bool Changed = false; + const DataLayout *DL = &Func.getParent()->getDataLayout(); + + auto SplitUpInstructions = [&]() { + bool NeedsAnotherPass; + do { + NeedsAnotherPass = false; + // Split up aggregate loads, stores and phi nodes into operations on + // scalar types. 
This inserts extractvalue and insertvalue + // instructions which we will expand out later. + for (Function::iterator BB = Func.begin(), E = Func.end(); BB != E; ++BB) { + for (BasicBlock::iterator Iter = BB->begin(), E = BB->end(); Iter != E;) { + Instruction *Inst = &*Iter++; + if (StoreInst *Store = dyn_cast(Inst)) { + if (Store->getValueOperand()->getType()->isStructTy()) { + NeedsAnotherPass |= SplitUpStore(Store, DL); + Changed = true; + } else if (Store->getValueOperand()->getType()->isArrayTy()) { + NeedsAnotherPass |= SplitUpArrayStore(Store, DL); + Changed = true; + } + } else if (LoadInst *Load = dyn_cast(Inst)) { + if (Load->getType()->isStructTy()) { + NeedsAnotherPass |= SplitUpLoad(Load, DL); + Changed = true; + } else if (Load->getType()->isArrayTy()) { + NeedsAnotherPass |= SplitUpArrayLoad(Load, DL); + Changed = true; + } + } else if (PHINode *Phi = dyn_cast(Inst)) { + if (Phi->getType()->isStructTy()) { + NeedsAnotherPass |= SplitUpPHINode(Phi); + Changed = true; + }else if (Phi->getType()->isArrayTy()) { + NeedsAnotherPass |= SplitUpArrayPHINode(Phi); + Changed = true; + } + } else if (SelectInst *Select = dyn_cast(Inst)) { + if (Select->getType()->isStructTy()) { + NeedsAnotherPass |= SplitUpSelect(Select); + Changed = true; + } else if (Select->getType()->isArrayTy()) { + NeedsAnotherPass |= SplitUpArraySelect(Select); + Changed = true; + } + } + } + } + } while (NeedsAnotherPass); + }; + + SplitUpInstructions(); + Changed |= ExpandExtractValues(Func, false); + + if (Changed) { + // insertvalues that receive insertvalues may require additional splitting + // and expansion. + // TODO: do we need an arbitrary amount of such passes? + SplitUpInstructions(); + ExpandExtractValues(Func, true); + } + + return Changed; +} + +FunctionPass *llvm::createExpandStructRegsPass() { + return new ExpandStructRegs(); +} diff --git a/lib/Target/JSBackend/NaCl/ExpandTls.cpp b/lib/Target/JSBackend/NaCl/ExpandTls.cpp new file mode 100644 index 000000000000..b254672ea2a1 --- /dev/null +++ b/lib/Target/JSBackend/NaCl/ExpandTls.cpp @@ -0,0 +1,336 @@ +//===- ExpandTls.cpp - Convert TLS variables to a concrete layout----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass expands out uses of thread-local (TLS) variables into +// more primitive operations. +// +// A reference to the address of a TLS variable is expanded into code +// which gets the current thread's thread pointer using +// @llvm.nacl.read.tp() and adds a fixed offset. +// +// This pass allocates the offsets (relative to the thread pointer) +// that will be used for TLS variables. It sets up the global +// variables __tls_template_start, __tls_template_end etc. to contain +// a template for initializing TLS variables' values for each thread. +// This is a task normally performed by the linker in ELF systems. 
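+//
+// As an illustrative sketch (not part of the original comment; the type
+// name %tls_struct_ty and index N are placeholders), a load through a
+// TLS variable's address such as
+//
+//   %val = load i32, i32* @tls_var
+//
+// ends up, after this pass and ExpandTlsConstantExpr, roughly as:
+//
+//   %tls_raw = call i8* @llvm.nacl.read.tp()
+//   %tls = bitcast i8* %tls_raw to %tls_struct_ty*
+//   %field = getelementptr %tls_struct_ty, %tls_struct_ty* %tls,
+//            i32 -1, i32 0, i32 N
+//   %val = load i32, i32* %field
+//
+// where N is the variable's index in the TLS template and i32 -1
+// reflects the x86-style layout in which the TLS data sits below the
+// thread pointer.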
+// +//===----------------------------------------------------------------------===// + +#include + +#include "llvm/Pass.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" +#include "llvm/Transforms/NaCl.h" + +using namespace llvm; + +namespace { + struct VarInfo { + GlobalVariable *TlsVar; + bool IsBss; // Whether variable is in zero-intialized part of template + int TemplateIndex; + }; + + class PassState { + public: + PassState(Module *M): M(M), DL(M), Offset(0), Alignment(1) {} + + Module *M; + DataLayout DL; + uint64_t Offset; + // 'Alignment' is the maximum variable alignment seen so far, in + // bytes. After visiting all TLS variables, this is the overall + // alignment required for the TLS template. + uint32_t Alignment; + }; + + class ExpandTls : public ModulePass { + public: + static char ID; // Pass identification, replacement for typeid + ExpandTls() : ModulePass(ID) { + initializeExpandTlsPass(*PassRegistry::getPassRegistry()); + } + + virtual bool runOnModule(Module &M); + }; +} + +char ExpandTls::ID = 0; +INITIALIZE_PASS(ExpandTls, "nacl-expand-tls", + "Expand out TLS variables and fix TLS variable layout", + false, false) + +static void setGlobalVariableValue(Module &M, const char *Name, + Constant *Value) { + if (GlobalVariable *Var = M.getNamedGlobal(Name)) { + if (Var->hasInitializer()) { + report_fatal_error(std::string("Variable ") + Name + + " already has an initializer"); + } + Var->replaceAllUsesWith(ConstantExpr::getBitCast(Value, Var->getType())); + Var->eraseFromParent(); + } +} + +// Insert alignment padding into the TLS template. +static void padToAlignment(PassState *State, + std::vector *FieldTypes, + std::vector *FieldValues, + unsigned Alignment) { + if ((State->Offset & (Alignment - 1)) != 0) { + unsigned PadSize = Alignment - (State->Offset & (Alignment - 1)); + Type *i8 = Type::getInt8Ty(State->M->getContext()); + Type *PadType = ArrayType::get(i8, PadSize); + FieldTypes->push_back(PadType); + if (FieldValues) + FieldValues->push_back(Constant::getNullValue(PadType)); + State->Offset += PadSize; + } + if (State->Alignment < Alignment) { + State->Alignment = Alignment; + } +} + +static void addVarToTlsTemplate(PassState *State, + std::vector *FieldTypes, + std::vector *FieldValues, + GlobalVariable *TlsVar) { + unsigned Alignment = State->DL.getPreferredAlignment(TlsVar); + padToAlignment(State, FieldTypes, FieldValues, Alignment); + + FieldTypes->push_back(TlsVar->getType()->getElementType()); + if (FieldValues) + FieldValues->push_back(TlsVar->getInitializer()); + State->Offset += + State->DL.getTypeAllocSize(TlsVar->getType()->getElementType()); +} + +static StructType *buildTlsTemplate(Module &M, std::vector *TlsVars) { + std::vector FieldBssTypes; + std::vector FieldInitTypes; + std::vector FieldInitValues; + PassState State(&M); + + for (Module::global_iterator GV = M.global_begin(); + GV != M.global_end(); + ++GV) { + if (GV->isThreadLocal()) { + if (!GV->hasInitializer()) { + // Since this is a whole-program transformation, "extern" TLS + // variables are not allowed at this point. 
+ report_fatal_error(std::string("TLS variable without an initializer: ") + + GV->getName()); + } + if (!GV->getInitializer()->isNullValue()) { + addVarToTlsTemplate(&State, &FieldInitTypes, + &FieldInitValues, &*GV); + VarInfo Info; + Info.TlsVar = &*GV; + Info.IsBss = false; + Info.TemplateIndex = FieldInitTypes.size() - 1; + TlsVars->push_back(Info); + } + } + } + // Handle zero-initialized TLS variables in a second pass, because + // these should follow non-zero-initialized TLS variables. + for (Module::global_iterator GV = M.global_begin(); + GV != M.global_end(); + ++GV) { + if (GV->isThreadLocal() && GV->getInitializer()->isNullValue()) { + addVarToTlsTemplate(&State, &FieldBssTypes, NULL, &*GV); + VarInfo Info; + Info.TlsVar = &*GV; + Info.IsBss = true; + Info.TemplateIndex = FieldBssTypes.size() - 1; + TlsVars->push_back(Info); + } + } + // Add final alignment padding so that + // (struct tls_struct *) __nacl_read_tp() - 1 + // gives the correct, aligned start of the TLS variables given the + // x86-style layout we are using. This requires some more bytes to + // be memset() to zero at runtime. This wastage doesn't seem + // important gives that we're not trying to optimize packing by + // reordering to put similarly-aligned variables together. + padToAlignment(&State, &FieldBssTypes, NULL, State.Alignment); + + // We create the TLS template structs as "packed" because we insert + // alignment padding ourselves, and LLVM's implicit insertion of + // padding would interfere with ours. tls_bss_template can start at + // a non-aligned address immediately following the last field in + // tls_init_template. + StructType *InitTemplateType = + StructType::create(M.getContext(), "tls_init_template"); + InitTemplateType->setBody(FieldInitTypes, /*isPacked=*/true); + StructType *BssTemplateType = + StructType::create(M.getContext(), "tls_bss_template"); + BssTemplateType->setBody(FieldBssTypes, /*isPacked=*/true); + + StructType *TemplateType = StructType::create(M.getContext(), "tls_struct"); + SmallVector TemplateTopFields; + TemplateTopFields.push_back(InitTemplateType); + TemplateTopFields.push_back(BssTemplateType); + TemplateType->setBody(TemplateTopFields, /*isPacked=*/true); + PointerType *TemplatePtrType = PointerType::get(TemplateType, 0); + + // We define the following symbols, which are the same as those + // defined by NaCl's original customized binutils linker scripts: + // __tls_template_start + // __tls_template_tdata_end + // __tls_template_end + // We also define __tls_template_alignment, which was not defined by + // the original linker scripts. 
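+  //
+  // Illustrative layout of the resulting template (a sketch, not part of
+  // the original comment):
+  //
+  //   __tls_template_start      -> start of tls_init_template
+  //                                (non-zero initializers, the "tdata" part)
+  //   __tls_template_tdata_end  -> end of tls_init_template, start of
+  //                                tls_bss_template (zero-initialized part)
+  //   __tls_template_end        -> end of tls_bss_template
+  //   __tls_template_alignment  -> an i32 holding the overall template
+  //                                alignment computed above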
+ + const char *StartSymbol = "__tls_template_start"; + Constant *TemplateData = ConstantStruct::get(InitTemplateType, + FieldInitValues); + GlobalVariable *TemplateDataVar = + new GlobalVariable(M, InitTemplateType, /*isConstant=*/true, + GlobalValue::InternalLinkage, TemplateData); + setGlobalVariableValue(M, StartSymbol, TemplateDataVar); + TemplateDataVar->setName(StartSymbol); + + Constant *TdataEnd = ConstantExpr::getGetElementPtr( + InitTemplateType, + TemplateDataVar, + ConstantInt::get(M.getContext(), APInt(32, 1))); + setGlobalVariableValue(M, "__tls_template_tdata_end", TdataEnd); + + Constant *TotalEnd = ConstantExpr::getGetElementPtr( + TemplateType, + ConstantExpr::getBitCast(TemplateDataVar, TemplatePtrType), + ConstantInt::get(M.getContext(), APInt(32, 1))); + setGlobalVariableValue(M, "__tls_template_end", TotalEnd); + + const char *AlignmentSymbol = "__tls_template_alignment"; + Type *i32 = Type::getInt32Ty(M.getContext()); + GlobalVariable *AlignmentVar = new GlobalVariable( + M, i32, /*isConstant=*/true, + GlobalValue::InternalLinkage, + ConstantInt::get(M.getContext(), APInt(32, State.Alignment))); + setGlobalVariableValue(M, AlignmentSymbol, AlignmentVar); + AlignmentVar->setName(AlignmentSymbol); + + return TemplateType; +} + +static void rewriteTlsVars(Module &M, std::vector *TlsVars, + StructType *TemplateType) { + // Set up the intrinsic that reads the thread pointer. + Function *ReadTpFunc = Intrinsic::getDeclaration(&M, Intrinsic::nacl_read_tp); + + for (std::vector::iterator VarInfo = TlsVars->begin(); + VarInfo != TlsVars->end(); + ++VarInfo) { + GlobalVariable *Var = VarInfo->TlsVar; + while (Var->hasNUsesOrMore(1)) { + Use *U = &*Var->use_begin(); + Instruction *InsertPt = PhiSafeInsertPt(U); + Value *RawThreadPtr = CallInst::Create(ReadTpFunc, "tls_raw", InsertPt); + Value *TypedThreadPtr = new BitCastInst( + RawThreadPtr, TemplateType->getPointerTo(), "tls_struct", InsertPt); + SmallVector Indexes; + // We use -1 because we use the x86-style TLS layout in which + // the TLS data is stored at addresses below the thread pointer. + // This is largely because a check in nacl_irt_thread_create() + // in irt/irt_thread.c requires the thread pointer to be a + // self-pointer on x86-32. + // TODO(mseaborn): I intend to remove that check because it is + // non-portable. In the mean time, we want PNaCl pexes to work + // in older Chromium releases when translated to nexes. + Indexes.push_back(ConstantInt::get( + M.getContext(), APInt(32, -1))); + Indexes.push_back(ConstantInt::get( + M.getContext(), APInt(32, VarInfo->IsBss ? 1 : 0))); + Indexes.push_back(ConstantInt::get( + M.getContext(), APInt(32, VarInfo->TemplateIndex))); + Value *TlsField = GetElementPtrInst::Create( + TemplateType, TypedThreadPtr, Indexes, "field", InsertPt); + PhiSafeReplaceUses(U, TlsField); + } + VarInfo->TlsVar->eraseFromParent(); + } +} + +static void replaceFunction(Module &M, const char *Name, Value *NewFunc) { + if (Function *Func = M.getFunction(Name)) { + if (Func->hasLocalLinkage()) + return; + if (!Func->isDeclaration()) + report_fatal_error(std::string("Function already defined: ") + Name); + Func->replaceAllUsesWith(NewFunc); + Func->eraseFromParent(); + } +} + +// Provide fixed definitions for NaCl's TLS layout functions, +// __nacl_tp_*(). We adopt the x86-style layout: ExpandTls will +// output a program that uses the x86-style layout wherever it runs. 
+// +// This overrides the architecture-specific definitions of +// __nacl_tp_*() that PNaCl's native support code makes available to +// non-ABI-stable code. +static void defineTlsLayoutFunctions(Module &M) { + Type *i32 = Type::getInt32Ty(M.getContext()); + SmallVector ArgTypes; + ArgTypes.push_back(i32); + FunctionType *FuncType = FunctionType::get(i32, ArgTypes, /*isVarArg=*/false); + Function *NewFunc; + BasicBlock *BB; + + // Define the function as follows: + // uint32_t __nacl_tp_tdb_offset(uint32_t tdb_size) { + // return 0; + // } + // This means the thread pointer points to the TDB. + NewFunc = Function::Create(FuncType, GlobalValue::InternalLinkage, + "nacl_tp_tdb_offset", &M); + BB = BasicBlock::Create(M.getContext(), "entry", NewFunc); + ReturnInst::Create(M.getContext(), + ConstantInt::get(M.getContext(), APInt(32, 0)), BB); + replaceFunction(M, "__nacl_tp_tdb_offset", NewFunc); + + // Define the function as follows: + // uint32_t __nacl_tp_tls_offset(uint32_t tls_size) { + // return -tls_size; + // } + // This means the TLS variables are stored below the thread pointer. + NewFunc = Function::Create(FuncType, GlobalValue::InternalLinkage, + "nacl_tp_tls_offset", &M); + BB = BasicBlock::Create(M.getContext(), "entry", NewFunc); + Value *Arg = &*NewFunc->arg_begin(); + Arg->setName("size"); + Value *Result = BinaryOperator::CreateNeg(Arg, "result", BB); + ReturnInst::Create(M.getContext(), Result, BB); + replaceFunction(M, "__nacl_tp_tls_offset", NewFunc); +} + +bool ExpandTls::runOnModule(Module &M) { + ModulePass *Pass = createExpandTlsConstantExprPass(); + Pass->runOnModule(M); + delete Pass; + + std::vector TlsVars; + StructType *TemplateType = buildTlsTemplate(M, &TlsVars); + rewriteTlsVars(M, &TlsVars, TemplateType); + + defineTlsLayoutFunctions(M); + + return true; +} + +ModulePass *llvm::createExpandTlsPass() { + return new ExpandTls(); +} diff --git a/lib/Target/JSBackend/NaCl/ExpandTlsConstantExpr.cpp b/lib/Target/JSBackend/NaCl/ExpandTlsConstantExpr.cpp new file mode 100644 index 000000000000..7426ce68641e --- /dev/null +++ b/lib/Target/JSBackend/NaCl/ExpandTlsConstantExpr.cpp @@ -0,0 +1,107 @@ +//===- ExpandTlsConstantExpr.cpp - Convert ConstantExprs to Instructions---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass is a helper used by the ExpandTls pass. +// +// LLVM treats the address of a TLS variable as a ConstantExpr. This +// is arguably a bug because the address of a TLS variable is *not* a +// constant: it varies between threads. +// +// See http://llvm.org/bugs/show_bug.cgi?id=14353 +// +// This is also a problem for the ExpandTls pass, which wants to use +// replaceUsesOfWith() to replace each TLS variable with an +// Instruction sequence that calls @llvm.nacl.read.tp(). This doesn't +// work if the TLS variable is used inside other ConstantExprs, +// because ConstantExprs are interned and are not associated with any +// function, whereas each Instruction must be part of a function. +// +// To fix that problem, this pass converts ConstantExprs that +// reference TLS variables into Instructions. 
+// +// For example, this use of a 'ptrtoint' ConstantExpr: +// +// ret i32 ptrtoint (i32* @tls_var to i32) +// +// is converted into this 'ptrtoint' Instruction: +// +// %expanded = ptrtoint i32* @tls_var to i32 +// ret i32 %expanded +// +//===----------------------------------------------------------------------===// + +#include + +#include "llvm/Pass.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Transforms/NaCl.h" + +using namespace llvm; + +namespace { + class ExpandTlsConstantExpr : public ModulePass { + public: + static char ID; // Pass identification, replacement for typeid + ExpandTlsConstantExpr() : ModulePass(ID) { + initializeExpandTlsConstantExprPass(*PassRegistry::getPassRegistry()); + } + + virtual bool runOnModule(Module &M); + }; +} + +char ExpandTlsConstantExpr::ID = 0; +INITIALIZE_PASS(ExpandTlsConstantExpr, "nacl-expand-tls-constant-expr", + "Eliminate ConstantExpr references to TLS variables", + false, false) + +// This removes ConstantExpr references to the given Constant. +static void expandConstExpr(Constant *Expr) { + // First, ensure that ConstantExpr references to Expr are converted + // to Instructions so that we can modify them. + for (Use &U : Expr->uses()) + if (ConstantExpr *CE = dyn_cast(U.getUser())) + expandConstExpr(CE); + Expr->removeDeadConstantUsers(); + + if (ConstantExpr *CE = dyn_cast(Expr)) { + while (Expr->hasNUsesOrMore(1)) { + Use *U = &*Expr->use_begin(); + Instruction *NewInst = CE->getAsInstruction(); + NewInst->insertBefore(PhiSafeInsertPt(U)); + NewInst->setName("expanded"); + PhiSafeReplaceUses(U, NewInst); + } + } +} + +bool ExpandTlsConstantExpr::runOnModule(Module &M) { + for (Module::alias_iterator Iter = M.alias_begin(); + Iter != M.alias_end(); ) { + GlobalAlias *GA = &*Iter++; + if (GA->isThreadDependent()) { + GA->replaceAllUsesWith(GA->getAliasee()); + GA->eraseFromParent(); + } + } + for (Module::global_iterator Global = M.global_begin(); + Global != M.global_end(); + ++Global) { + if (Global->isThreadLocal()) { + expandConstExpr(&*Global); + } + } + return true; +} + +ModulePass *llvm::createExpandTlsConstantExprPass() { + return new ExpandTlsConstantExpr(); +} diff --git a/lib/Target/JSBackend/NaCl/ExpandUtils.cpp b/lib/Target/JSBackend/NaCl/ExpandUtils.cpp new file mode 100644 index 000000000000..96ec40d87f04 --- /dev/null +++ b/lib/Target/JSBackend/NaCl/ExpandUtils.cpp @@ -0,0 +1,58 @@ +//===-- ExpandUtils.cpp - Helper functions for expansion passes -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/NaCl.h" + +using namespace llvm; + +Instruction *llvm::PhiSafeInsertPt(Use *U) { + Instruction *InsertPt = cast(U->getUser()); + if (PHINode *PN = dyn_cast(InsertPt)) { + // We cannot insert instructions before a PHI node, so insert + // before the incoming block's terminator. This could be + // suboptimal if the terminator is a conditional. 
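+    //
+    // Illustrative sketch (not part of the original comment):
+    // ```llvm
+    //   bb1:
+    //     br label %merge
+    //   merge:
+    //     %p = phi i32 [ %x, %bb1 ], [ %y, %bb2 ]
+    // ```
+    // For the use of %x in %p, the returned insert point is the 'br'
+    // terminating %bb1, since nothing may be inserted between the phi
+    // nodes at the start of %merge.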
+ InsertPt = PN->getIncomingBlock(*U)->getTerminator(); + } + return InsertPt; +} + +void llvm::PhiSafeReplaceUses(Use *U, Value *NewVal) { + User *UR = U->getUser(); + if (PHINode *PN = dyn_cast(UR)) { + // A PHI node can have multiple incoming edges from the same + // block, in which case all these edges must have the same + // incoming value. + BasicBlock *BB = PN->getIncomingBlock(*U); + for (unsigned I = 0; I < PN->getNumIncomingValues(); ++I) { + if (PN->getIncomingBlock(I) == BB) + PN->setIncomingValue(I, NewVal); + } + } else { + UR->replaceUsesOfWith(U->get(), NewVal); + } +} + +Function *llvm::RecreateFunction(Function *Func, FunctionType *NewType) { + Function *NewFunc = Function::Create(NewType, Func->getLinkage()); + NewFunc->copyAttributesFrom(Func); + Func->getParent()->getFunctionList().insert(Func->getIterator(), NewFunc); + NewFunc->takeName(Func); + NewFunc->getBasicBlockList().splice(NewFunc->begin(), + Func->getBasicBlockList()); + Func->replaceAllUsesWith( + ConstantExpr::getBitCast(NewFunc, + Func->getFunctionType()->getPointerTo())); + return NewFunc; +} diff --git a/lib/Target/JSBackend/NaCl/ExpandVarArgs.cpp b/lib/Target/JSBackend/NaCl/ExpandVarArgs.cpp new file mode 100644 index 000000000000..0afddae79de0 --- /dev/null +++ b/lib/Target/JSBackend/NaCl/ExpandVarArgs.cpp @@ -0,0 +1,324 @@ +//===- ExpandVarArgs.cpp - Expand out variable argument function calls-----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass expands out all use of variable argument functions. +// +// This pass replaces a varargs function call with a function call in +// which a pointer to the variable arguments is passed explicitly. +// The callee explicitly allocates space for the variable arguments on +// the stack using "alloca". +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Triple.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Transforms/NaCl.h" + +using namespace llvm; + +namespace { +class ExpandVarArgs : public ModulePass { +public: + static char ID; + ExpandVarArgs() : ModulePass(ID) { + initializeExpandVarArgsPass(*PassRegistry::getPassRegistry()); + } + virtual bool runOnModule(Module &M); +}; +} + +char ExpandVarArgs::ID = 0; +INITIALIZE_PASS(ExpandVarArgs, "expand-varargs", + "Expand out variable argument function definitions and calls", + false, false) + +static bool isEmscriptenJSArgsFunc(Module *M, StringRef Name) { + // TODO(jfb) Make these intrinsics in clang and remove the assert: these + // intrinsics should only exist for Emscripten. + bool isEmscriptenSpecial = Name.equals("emscripten_asm_const_int") || + Name.equals("emscripten_asm_const_double") || + Name.equals("emscripten_landingpad") || + Name.equals("emscripten_resume"); + assert(isEmscriptenSpecial ? Triple(M->getTargetTriple()).isOSEmscripten() + : true); + return isEmscriptenSpecial; +} + +static bool ExpandVarArgFunc(Module *M, Function *Func) { + if (Func->isDeclaration() && Func->use_empty()) + return false; // No point in doing any work. 
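+
+  // Illustrative sketch of the rewrite performed below (not from the
+  // original source): a definition such as
+  // ```llvm
+  //   define i32 @f(i32 %n, ...)
+  // ```
+  // is recreated as
+  // ```llvm
+  //   define i32 @f(i32 %n, i8* noalias %varargs)
+  // ```
+  // and each llvm.va_start in the body becomes a store of %varargs into
+  // the va_list.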
+ + if (isEmscriptenJSArgsFunc(M, Func->getName())) + return false; + + Type *PtrType = Type::getInt8PtrTy(Func->getContext()); + + FunctionType *FTy = Func->getFunctionType(); + SmallVector Params(FTy->param_begin(), FTy->param_end()); + Params.push_back(PtrType); + FunctionType *NFTy = + FunctionType::get(FTy->getReturnType(), Params, /*isVarArg=*/false); + Function *NewFunc = RecreateFunction(Func, NFTy); + + // Declare the new argument as "noalias". + NewFunc->setAttributes(Func->getAttributes().addAttribute( + Func->getContext(), FTy->getNumParams() + 1, Attribute::NoAlias)); + + // Move the arguments across to the new function. + auto NewArg = NewFunc->arg_begin(); + for (Argument &Arg : Func->args()) { + Arg.replaceAllUsesWith(&*NewArg); + NewArg->takeName(&Arg); + ++NewArg; + } + // The last argument is the new `i8 * noalias %varargs`. + NewArg->setName("varargs"); + + Func->eraseFromParent(); + + // Expand out uses of llvm.va_start in this function. + for (BasicBlock &BB : *NewFunc) { + for (auto BI = BB.begin(), BE = BB.end(); BI != BE;) { + Instruction *I = &*BI++; + if (auto *VAS = dyn_cast(I)) { + IRBuilder<> IRB(VAS); + Value *Cast = IRB.CreateBitCast(VAS->getArgList(), + PtrType->getPointerTo(), "arglist"); + IRB.CreateStore(&*NewArg, Cast); + VAS->eraseFromParent(); + } + } + } + + return true; +} + +static void ExpandVAArgInst(VAArgInst *Inst, DataLayout *DL) { + Type *IntPtrTy = DL->getIntPtrType(Inst->getContext()); + auto *One = ConstantInt::get(IntPtrTy, 1); + IRBuilder<> IRB(Inst); + auto *ArgList = IRB.CreateBitCast( + Inst->getPointerOperand(), + Inst->getType()->getPointerTo()->getPointerTo(), "arglist"); + + // The caller spilled all of the va_args onto the stack in an unpacked + // struct. Each va_arg load from that struct needs to realign the element to + // its target-appropriate alignment in the struct in order to jump over + // padding that may have been in-between arguments. Do this with ConstantExpr + // to ensure good code gets generated, following the same approach as + // Support/MathExtras.h:alignAddr: + // ((uintptr_t)Addr + Alignment - 1) & ~(uintptr_t)(Alignment - 1) + // This assumes the alignment of the type is a power of 2 (or 1, in which case + // no realignment occurs). + auto *Ptr = IRB.CreateLoad(ArgList, "arglist_current"); + auto *AlignOf = ConstantExpr::getIntegerCast( + ConstantExpr::getAlignOf(Inst->getType()), IntPtrTy, /*isSigned=*/false); + auto *AlignMinus1 = ConstantExpr::getNUWSub(AlignOf, One); + auto *NotAlignMinus1 = IRB.CreateNot(AlignMinus1); + auto *CurrentPtr = IRB.CreateIntToPtr( + IRB.CreateAnd( + IRB.CreateNUWAdd(IRB.CreatePtrToInt(Ptr, IntPtrTy), AlignMinus1), + NotAlignMinus1), + Ptr->getType()); + + auto *Result = IRB.CreateLoad(CurrentPtr, "va_arg"); + Result->takeName(Inst); + + // Update the va_list to point to the next argument. + Value *Indexes[] = {One}; + auto *Next = IRB.CreateInBoundsGEP(CurrentPtr, Indexes, "arglist_next"); + IRB.CreateStore(Next, ArgList); + + Inst->replaceAllUsesWith(Result); + Inst->eraseFromParent(); +} + +static void ExpandVAEnd(VAEndInst *VAE) { + // va_end() is a no-op in this implementation. + VAE->eraseFromParent(); +} + +static void ExpandVACopyInst(VACopyInst *Inst) { + // va_list may have more space reserved, but we only need to + // copy a single pointer. 
+ Type *PtrTy = Type::getInt8PtrTy(Inst->getContext())->getPointerTo(); + IRBuilder<> IRB(Inst); + auto *Src = IRB.CreateBitCast(Inst->getSrc(), PtrTy, "vacopy_src"); + auto *Dest = IRB.CreateBitCast(Inst->getDest(), PtrTy, "vacopy_dest"); + auto *CurrentPtr = IRB.CreateLoad(Src, "vacopy_currentptr"); + IRB.CreateStore(CurrentPtr, Dest); + Inst->eraseFromParent(); +} + +// ExpandVarArgCall() converts a CallInst or InvokeInst to expand out +// of varargs. It returns whether the module was modified. +template +static bool ExpandVarArgCall(Module *M, InstType *Call, DataLayout *DL) { + FunctionType *FuncType = cast( + Call->getCalledValue()->getType()->getPointerElementType()); + if (!FuncType->isFunctionVarArg()) + return false; + if (auto *F = dyn_cast(Call->getCalledValue())) + if (isEmscriptenJSArgsFunc(M, F->getName())) + return false; + + Function *F = Call->getParent()->getParent(); + LLVMContext &Ctx = M->getContext(); + + SmallVector Attrs; + Attrs.push_back(Call->getAttributes().getFnAttributes()); + Attrs.push_back(Call->getAttributes().getRetAttributes()); + + // Split argument list into fixed and variable arguments. + SmallVector FixedArgs; + SmallVector VarArgs; + SmallVector VarArgsTypes; + for (unsigned I = 0, E = FuncType->getNumParams(); I < E; ++I) { + FixedArgs.push_back(Call->getArgOperand(I)); + // AttributeSets use 1-based indexing. + Attrs.push_back(Call->getAttributes().getParamAttributes(I + 1)); + } + for (unsigned I = FuncType->getNumParams(), E = Call->getNumArgOperands(); + I < E; ++I) { + Value *ArgVal = Call->getArgOperand(I); + VarArgs.push_back(ArgVal); + bool isByVal = Call->getAttributes().hasAttribute(I + 1, Attribute::ByVal); + // For "byval" arguments we must dereference the pointer. + VarArgsTypes.push_back(isByVal ? ArgVal->getType()->getPointerElementType() + : ArgVal->getType()); + } + if (VarArgsTypes.size() == 0) { + // Some buggy code (e.g. 176.gcc in Spec2k) uses va_arg on an + // empty argument list, which gives undefined behaviour in C. To + // work around such programs, we create a dummy varargs buffer on + // the stack even though there are no arguments to put in it. + // This allows va_arg to read an undefined value from the stack + // rather than crashing by reading from an uninitialized pointer. + // An alternative would be to pass a null pointer to catch the + // invalid use of va_arg. + VarArgsTypes.push_back(Type::getInt32Ty(Ctx)); + } + + // Create struct type for packing variable arguments into. + StructType *VarArgsTy = StructType::get(Ctx, VarArgsTypes); + + // Allocate space for the variable argument buffer. Do this at the + // start of the function so that we don't leak space if the function + // is called in a loop. + IRBuilder<> IRB(&*F->getEntryBlock().getFirstInsertionPt()); + auto *Buf = IRB.CreateAlloca(VarArgsTy, nullptr, "vararg_buffer"); + + // Call llvm.lifetime.start/end intrinsics to indicate that Buf is + // only used for the duration of the function call, so that the + // stack space can be reused elsewhere. + auto LifetimeStart = Intrinsic::getDeclaration(M, Intrinsic::lifetime_start); + auto LifetimeEnd = Intrinsic::getDeclaration(M, Intrinsic::lifetime_end); + auto *I8Ptr = Type::getInt8Ty(Ctx)->getPointerTo(); + auto *BufPtr = IRB.CreateBitCast(Buf, I8Ptr, "vararg_lifetime_bitcast"); + auto *BufSize = + ConstantInt::get(Ctx, APInt(64, DL->getTypeAllocSize(VarArgsTy))); + IRB.CreateCall(LifetimeStart, {BufSize, BufPtr}); + + // Copy variable arguments into buffer. 
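+  // Illustrative sketch (not from the original source): a call such as
+  // ```llvm
+  //   call i32 (i8*, ...) @printf(i8* %fmt, i32 %x, double %y)
+  // ```
+  // ends up, roughly, as
+  // ```llvm
+  //   %vararg_buffer = alloca { i32, double }
+  //   ; ...store %x and %y into %vararg_buffer (the loop below)...
+  //   %vararg_func = bitcast i32 (i8*, ...)* @printf
+  //                  to i32 (i8*, { i32, double }*)*
+  //   call i32 %vararg_func(i8* %fmt, { i32, double }* %vararg_buffer)
+  // ```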
+ int Index = 0; + IRB.SetInsertPoint(Call); + for (Value *Arg : VarArgs) { + Value *Indexes[] = {ConstantInt::get(Ctx, APInt(32, 0)), + ConstantInt::get(Ctx, APInt(32, Index))}; + Value *Ptr = IRB.CreateInBoundsGEP(Buf, Indexes, "vararg_ptr"); + bool isByVal = Call->getAttributes().hasAttribute( + FuncType->getNumParams() + Index + 1, Attribute::ByVal); + if (isByVal) + IRB.CreateMemCpy(Ptr, Arg, DL->getTypeAllocSize( + Arg->getType()->getPointerElementType()), + /*Align=*/1); + else + IRB.CreateStore(Arg, Ptr); + ++Index; + } + + // Cast function to new type to add our extra pointer argument. + SmallVector ArgTypes(FuncType->param_begin(), + FuncType->param_end()); + ArgTypes.push_back(VarArgsTy->getPointerTo()); + FunctionType *NFTy = FunctionType::get(FuncType->getReturnType(), ArgTypes, + /*isVarArg=*/false); + Value *CastFunc = IRB.CreateBitCast(Call->getCalledValue(), + NFTy->getPointerTo(), "vararg_func"); + + // Create the converted function call. + FixedArgs.push_back(Buf); + Instruction *NewCall; + if (auto *C = dyn_cast(Call)) { + auto *N = IRB.CreateCall(CastFunc, FixedArgs); + N->setAttributes(AttributeSet::get(Ctx, Attrs)); + NewCall = N; + IRB.CreateCall(LifetimeEnd, {BufSize, BufPtr}); + } else if (auto *C = dyn_cast(Call)) { + auto *N = IRB.CreateInvoke(CastFunc, C->getNormalDest(), C->getUnwindDest(), + FixedArgs, C->getName()); + N->setAttributes(AttributeSet::get(Ctx, Attrs)); + (IRBuilder<>(&*C->getNormalDest()->getFirstInsertionPt())) + .CreateCall(LifetimeEnd, {BufSize, BufPtr}); + (IRBuilder<>(&*C->getUnwindDest()->getFirstInsertionPt())) + .CreateCall(LifetimeEnd, {BufSize, BufPtr}); + NewCall = N; + } else { + llvm_unreachable("not a call/invoke"); + } + + NewCall->takeName(Call); + Call->replaceAllUsesWith(NewCall); + Call->eraseFromParent(); + + return true; +} + +bool ExpandVarArgs::runOnModule(Module &M) { + bool Changed = false; + DataLayout DL(&M); + + for (auto MI = M.begin(), ME = M.end(); MI != ME;) { + Function *F = &*MI++; + for (BasicBlock &BB : *F) { + for (auto BI = BB.begin(), BE = BB.end(); BI != BE;) { + Instruction *I = &*BI++; + if (auto *VI = dyn_cast(I)) { + Changed = true; + ExpandVAArgInst(VI, &DL); + } else if (auto *VAE = dyn_cast(I)) { + Changed = true; + ExpandVAEnd(VAE); + } else if (auto *VAC = dyn_cast(I)) { + Changed = true; + ExpandVACopyInst(VAC); + } else if (auto *Call = dyn_cast(I)) { + Changed |= ExpandVarArgCall(&M, Call, &DL); + } else if (auto *Call = dyn_cast(I)) { + Changed |= ExpandVarArgCall(&M, Call, &DL); + } + } + } + + if (F->isVarArg()) + Changed |= ExpandVarArgFunc(&M, F); + } + + return Changed; +} + +ModulePass *llvm::createExpandVarArgsPass() { return new ExpandVarArgs(); } diff --git a/lib/Target/JSBackend/NaCl/FixVectorLoadStoreAlignment.cpp b/lib/Target/JSBackend/NaCl/FixVectorLoadStoreAlignment.cpp new file mode 100644 index 000000000000..5a7a4998eaf4 --- /dev/null +++ b/lib/Target/JSBackend/NaCl/FixVectorLoadStoreAlignment.cpp @@ -0,0 +1,264 @@ +//===- FixVectorLoadStoreAlignment.cpp - Vector load/store alignment ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Fix vector load/store alignment by: +// - Leaving as-is if the alignment is equal to the vector's element width. 
+// - Reducing the alignment to vector's element width if it's greater and the +// current alignment is a factor of the element alignment. +// - Scalarizing if the alignment is smaller than the element-wise alignment. +// +// Volatile vector load/store are handled the same, and can therefore be broken +// up as allowed by C/C++. +// +// TODO(jfb) Atomic accesses cause errors at compile-time. This could be +// implemented as a call to the C++ runtime, since 128-bit atomics +// aren't usually lock-free. +// +//===----------------------------------------------------------------------===// + +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/NaCl.h" + +using namespace llvm; + +namespace { +class FixVectorLoadStoreAlignment : public BasicBlockPass { +public: + static char ID; // Pass identification, replacement for typeid + FixVectorLoadStoreAlignment() : BasicBlockPass(ID), M(0), DL(0) { + initializeFixVectorLoadStoreAlignmentPass(*PassRegistry::getPassRegistry()); + } + using BasicBlockPass::doInitialization; + bool doInitialization(Module &Mod) override { + M = &Mod; + return false; // Unchanged. + } + bool runOnBasicBlock(BasicBlock &BB) override; + +private: + typedef SmallVector Instructions; + const Module *M; + const DataLayout *DL; + + /// Some sub-classes of Instruction have a non-virtual function + /// indicating which operand is the pointer operand. This template + /// function returns the pointer operand's type, and requires that + /// InstTy have a getPointerOperand function. + template + static PointerType *pointerOperandType(const InstTy *I) { + return cast(I->getPointerOperand()->getType()); + } + + /// Similar to pointerOperandType, this template function checks + /// whether the pointer operand is a pointer to a vector type. + template + static bool pointerOperandIsVectorPointer(const Instruction *I) { + return pointerOperandType(cast(I))->getElementType()->isVectorTy(); + } + + /// Returns true if one of the Instruction's operands is a pointer to + /// a vector type. This is more general than the above and assumes we + /// don't know which Instruction type is provided. + static bool hasVectorPointerOperand(const Instruction *I) { + for (User::const_op_iterator IB = I->op_begin(), IE = I->op_end(); IB != IE; + ++IB) + if (PointerType *PtrTy = dyn_cast((*IB)->getType())) + if (isa(PtrTy->getElementType())) + return true; + return false; + } + + /// Vectors are expected to be element-aligned. If they are, leave as-is; if + /// the alignment is too much then narrow the alignment (when possible); + /// otherwise return false. + template + static bool tryFixVectorAlignment(const DataLayout *DL, Instruction *I) { + InstTy *LoadStore = cast(I); + VectorType *VecTy = + cast(pointerOperandType(LoadStore)->getElementType()); + Type *ElemTy = VecTy->getElementType(); + uint64_t ElemBitSize = DL->getTypeSizeInBits(ElemTy); + uint64_t ElemByteSize = ElemBitSize / CHAR_BIT; + uint64_t CurrentByteAlign = LoadStore->getAlignment(); + bool isABIAligned = CurrentByteAlign == 0; + uint64_t VecABIByteAlign = DL->getABITypeAlignment(VecTy); + CurrentByteAlign = isABIAligned ? VecABIByteAlign : CurrentByteAlign; + + if (CHAR_BIT * ElemByteSize != ElemBitSize) + return false; // Minimum byte-size elements. 
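+
+    // Illustrative examples (not from the original source), for a
+    // <4 x i32> access whose element width is 4 bytes:
+    //   align 16, 8 or 4 -> rewritten below to align 4 (element-aligned)
+    //   align 2 or 1     -> returns false, caller scalarizes the access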
+ if (MinAlign(ElemByteSize, CurrentByteAlign) == ElemByteSize) { + // Element-aligned, or compatible over-aligned. Keep element-aligned. + LoadStore->setAlignment(ElemByteSize); + return true; + } + return false; // Under-aligned. + } + + void visitVectorLoadStore(BasicBlock &BB, Instructions &Loads, + Instructions &Stores) const; + void scalarizeVectorLoadStore(BasicBlock &BB, const Instructions &Loads, + const Instructions &Stores) const; +}; +} // anonymous namespace + +char FixVectorLoadStoreAlignment::ID = 0; +INITIALIZE_PASS(FixVectorLoadStoreAlignment, "fix-vector-load-store-alignment", + "Ensure vector load/store have element-size alignment", + false, false) + +void FixVectorLoadStoreAlignment::visitVectorLoadStore( + BasicBlock &BB, Instructions &Loads, Instructions &Stores) const { + for (BasicBlock::iterator BBI = BB.begin(), BBE = BB.end(); BBI != BBE; + ++BBI) { + Instruction *I = &*BBI; + // The following list of instructions is based on mayReadOrWriteMemory. + switch (I->getOpcode()) { + case Instruction::Load: + if (pointerOperandIsVectorPointer(I)) { + if (cast(I)->isAtomic()) + report_fatal_error("unhandled: atomic vector store"); + if (!tryFixVectorAlignment(DL, I)) + Loads.push_back(I); + } + break; + case Instruction::Store: + if (pointerOperandIsVectorPointer(I)) { + if (cast(I)->isAtomic()) + report_fatal_error("unhandled: atomic vector store"); + if (!tryFixVectorAlignment(DL, I)) + Stores.push_back(I); + } + break; + case Instruction::Alloca: + case Instruction::Fence: + case Instruction::VAArg: + // Leave these memory operations as-is, even when they deal with + // vectors. + break; + case Instruction::Call: + case Instruction::Invoke: + // Call/invoke don't touch memory per-se, leave them as-is. + break; + case Instruction::AtomicCmpXchg: + if (pointerOperandIsVectorPointer(I)) + report_fatal_error( + "unhandled: atomic compare and exchange operation on vector"); + break; + case Instruction::AtomicRMW: + if (pointerOperandIsVectorPointer(I)) + report_fatal_error("unhandled: atomic RMW operation on vector"); + break; + default: + if (I->mayReadOrWriteMemory() && hasVectorPointerOperand(I)) { + errs() << "Not handled: " << *I << '\n'; + report_fatal_error( + "unexpected: vector operations which may read/write memory"); + } + break; + } + } +} + +void FixVectorLoadStoreAlignment::scalarizeVectorLoadStore( + BasicBlock &BB, const Instructions &Loads, + const Instructions &Stores) const { + for (Instructions::const_iterator IB = Loads.begin(), IE = Loads.end(); + IB != IE; ++IB) { + LoadInst *VecLoad = cast(*IB); + VectorType *LoadedVecTy = + cast(pointerOperandType(VecLoad)->getElementType()); + Type *ElemTy = LoadedVecTy->getElementType(); + + // The base of the vector is as aligned as the vector load (where + // zero means ABI alignment for the vector), whereas subsequent + // elements are as aligned as the base+offset can be. + unsigned BaseAlign = VecLoad->getAlignment() + ? VecLoad->getAlignment() + : DL->getABITypeAlignment(LoadedVecTy); + unsigned ElemAllocSize = DL->getTypeAllocSize(ElemTy); + + // Fill in the vector element by element. 
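+    //
+    // Illustrative sketch (not from the original source): an
+    // under-aligned load such as
+    // ```llvm
+    //   %v = load <4 x i32>, <4 x i32>* %p, align 1
+    // ```
+    // turns into four i32 loads at the alignment each base+offset
+    // allows (here align 1), combined with insertelement into a
+    // <4 x i32> value that replaces %v.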
+ IRBuilder<> IRB(VecLoad); + Value *Loaded = UndefValue::get(LoadedVecTy); + Value *Base = + IRB.CreateBitCast(VecLoad->getPointerOperand(), ElemTy->getPointerTo()); + + for (unsigned Elem = 0, NumElems = LoadedVecTy->getNumElements(); + Elem != NumElems; ++Elem) { + unsigned Align = MinAlign(BaseAlign, ElemAllocSize * Elem); + Value *GEP = IRB.CreateConstInBoundsGEP1_32(ElemTy, Base, Elem); + LoadInst *LoadedElem = + IRB.CreateAlignedLoad(GEP, Align, VecLoad->isVolatile()); + LoadedElem->setSynchScope(VecLoad->getSynchScope()); + Loaded = IRB.CreateInsertElement( + Loaded, LoadedElem, + ConstantInt::get(Type::getInt32Ty(M->getContext()), Elem)); + } + + VecLoad->replaceAllUsesWith(Loaded); + VecLoad->eraseFromParent(); + } + + for (Instructions::const_iterator IB = Stores.begin(), IE = Stores.end(); + IB != IE; ++IB) { + StoreInst *VecStore = cast(*IB); + Value *StoredVec = VecStore->getValueOperand(); + VectorType *StoredVecTy = cast(StoredVec->getType()); + Type *ElemTy = StoredVecTy->getElementType(); + + unsigned BaseAlign = VecStore->getAlignment() + ? VecStore->getAlignment() + : DL->getABITypeAlignment(StoredVecTy); + unsigned ElemAllocSize = DL->getTypeAllocSize(ElemTy); + + // Fill in the vector element by element. + IRBuilder<> IRB(VecStore); + Value *Base = IRB.CreateBitCast(VecStore->getPointerOperand(), + ElemTy->getPointerTo()); + + for (unsigned Elem = 0, NumElems = StoredVecTy->getNumElements(); + Elem != NumElems; ++Elem) { + unsigned Align = MinAlign(BaseAlign, ElemAllocSize * Elem); + Value *GEP = IRB.CreateConstInBoundsGEP1_32(ElemTy, Base, Elem); + Value *ElemToStore = IRB.CreateExtractElement( + StoredVec, ConstantInt::get(Type::getInt32Ty(M->getContext()), Elem)); + StoreInst *StoredElem = IRB.CreateAlignedStore(ElemToStore, GEP, Align, + VecStore->isVolatile()); + StoredElem->setSynchScope(VecStore->getSynchScope()); + } + + VecStore->eraseFromParent(); + } +} + +bool FixVectorLoadStoreAlignment::runOnBasicBlock(BasicBlock &BB) { + bool Changed = false; + if (!DL) + DL = &BB.getParent()->getParent()->getDataLayout(); + Instructions Loads; + Instructions Stores; + visitVectorLoadStore(BB, Loads, Stores); + if (!(Loads.empty() && Stores.empty())) { + Changed = true; + scalarizeVectorLoadStore(BB, Loads, Stores); + } + return Changed; +} + +BasicBlockPass *llvm::createFixVectorLoadStoreAlignmentPass() { + return new FixVectorLoadStoreAlignment(); +} diff --git a/lib/Target/JSBackend/NaCl/FlattenGlobals.cpp b/lib/Target/JSBackend/NaCl/FlattenGlobals.cpp new file mode 100644 index 000000000000..94da2e1c32ba --- /dev/null +++ b/lib/Target/JSBackend/NaCl/FlattenGlobals.cpp @@ -0,0 +1,546 @@ +//===- FlattenGlobals.cpp - Flatten global variable initializers-----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass converts initializers for global variables into a +// flattened normal form which removes nested struct types and +// simplifies ConstantExprs. +// +// In this normal form, an initializer is either a SimpleElement or a +// CompoundElement. 
+// +// A SimpleElement is one of the following: +// +// 1) An i8 array literal or zeroinitializer: +// +// [SIZE x i8] c"DATA" +// [SIZE x i8] zeroinitializer +// +// 2) A reference to a GlobalValue (a function or global variable) +// with an optional 32-bit byte offset added to it (the addend): +// +// ptrtoint (TYPE* @GLOBAL to i32) +// add (i32 ptrtoint (TYPE* @GLOBAL to i32), i32 ADDEND) +// +// We use ptrtoint+add rather than bitcast+getelementptr because +// the constructor for getelementptr ConstantExprs performs +// constant folding which introduces more complex getelementptrs, +// and it is hard to check that they follow a normal form. +// +// For completeness, the pass also allows a BlockAddress as well as +// a GlobalValue here, although BlockAddresses are currently not +// allowed in the PNaCl ABI, so this should not be considered part +// of the normal form. +// +// A CompoundElement is a unnamed, packed struct containing only +// SimpleElements. +// +// Limitations: +// +// LLVM IR allows ConstantExprs that calculate the difference between +// two globals' addresses. FlattenGlobals rejects these because Clang +// does not generate these and because ELF does not support such +// relocations in general. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/NaCl.h" + +using namespace llvm; + +namespace { + + // Defines a (non-constant) handle that records a use of a + // constant. Used to make sure a relocation, within flattened global + // variable initializers, does not get destroyed when method + // removeDeadConstantUsers gets called. For simplicity, rather than + // defining a new (non-constant) construct, we use a return + // instruction as the handle. + typedef ReturnInst RelocUserType; + + // Define map from a relocation, appearing in the flattened global variable + // initializers, to it's corresponding use handle. + typedef DenseMap RelocMapType; + + // Define the list to hold the list of global variables being flattened. + struct FlattenedGlobal; + typedef std::vector FlattenedGlobalsVectorType; + + // Returns the corresponding relocation, for the given user handle. + Constant *getRelocUseConstant(RelocUserType *RelocUser) { + return cast(RelocUser->getReturnValue()); + } + + // The state associated with flattening globals of a module. + struct FlattenGlobalsState { + /// The module being flattened. + Module &M; + /// The data layout to be used. + DataLayout DL; + /// The relocations (within the original global variable initializers) + /// that must be kept. + RelocMapType RelocMap; + /// The list of global variables that are being flattened. + FlattenedGlobalsVectorType FlattenedGlobalsVector; + /// True if the module was modified during the "flatten globals" pass. + bool Modified; + /// The type model of a byte. + Type *ByteType; + /// The type model of the integer pointer type. + Type *IntPtrType; + /// The size of the pointer type. 
+ unsigned PtrSize; + + explicit FlattenGlobalsState(Module &M) + : M(M), DL(&M), RelocMap(), + Modified(false), + ByteType(Type::getInt8Ty(M.getContext())), + IntPtrType(DL.getIntPtrType(M.getContext())), + PtrSize(DL.getPointerSize()) + {} + + ~FlattenGlobalsState() { + // Remove added user handles. + for (RelocMapType::iterator + I = RelocMap.begin(), E = RelocMap.end(); I != E; ++I) { + delete I->second; + } + // Remove flatteners for global varaibles. + DeleteContainerPointers(FlattenedGlobalsVector); + } + + /// Collect Global variables whose initializers should be + /// flattened. Creates corresponding flattened initializers (if + /// applicable), and creates uninitialized replacement global + /// variables. + void flattenGlobalsWithInitializers(); + + /// Remove initializers from original global variables, and + /// then remove the portions of the initializers that are + /// no longer used. + void removeDeadInitializerConstants(); + + // Replace the original global variables with their flattened + // global variable counterparts. + void replaceGlobalsWithFlattenedGlobals(); + + // Builds and installs initializers for flattened global + // variables, based on the flattened initializers of the + // corresponding original global variables. + void installFlattenedGlobalInitializers(); + + // Returns the user handle associated with the reloc, so that it + // won't be deleted during the flattening process. + RelocUserType *getRelocUserHandle(Constant *Reloc) { + RelocUserType *RelocUser = RelocMap[Reloc]; + if (RelocUser == NULL) { + RelocUser = ReturnInst::Create(M.getContext(), Reloc); + RelocMap[Reloc] = RelocUser; + } + return RelocUser; + } + }; + + // A FlattenedConstant represents a global variable initializer that + // has been flattened and may be converted into the normal form. + class FlattenedConstant { + FlattenGlobalsState &State; + + // A flattened global variable initializer is represented as: + // 1) an array of bytes; + unsigned BufSize; + uint8_t *Buf; + // XXX EMSCRIPTEN: There used to be a BufEnd here. No more. + + // 2) an array of relocations. + class Reloc { + private: + unsigned RelOffset; // Offset at which the relocation is to be applied. 
+ RelocUserType *RelocUser; + public: + + unsigned getRelOffset() const { return RelOffset; } + Constant *getRelocUse() const { return getRelocUseConstant(RelocUser); } + Reloc(FlattenGlobalsState &State, unsigned RelOffset, Constant *NewVal) + : RelOffset(RelOffset), RelocUser(State.getRelocUserHandle(NewVal)) {} + + explicit Reloc(const Reloc &R) + : RelOffset(R.RelOffset), RelocUser(R.RelocUser) {} + + void operator=(const Reloc &R) { + RelOffset = R.RelOffset; + RelocUser = R.RelocUser; + } + }; + typedef SmallVector RelocArray; + RelocArray Relocs; + + const DataLayout &getDataLayout() const { return State.DL; } + + Module &getModule() const { return State.M; } + + Type *getIntPtrType() const { return State.IntPtrType; } + + Type *getByteType() const { return State.ByteType; } + + unsigned getPtrSize() const { return State.PtrSize; } + + void putAtDest(Constant *Value, uint8_t *Dest); + + Constant *dataSlice(unsigned StartPos, unsigned EndPos) const { + return ConstantDataArray::get( + getModule().getContext(), + ArrayRef(Buf + StartPos, Buf + EndPos)); + } + + Type *dataSliceType(unsigned StartPos, unsigned EndPos) const { + return ArrayType::get(getByteType(), EndPos - StartPos); + } + + public: + FlattenedConstant(FlattenGlobalsState &State, Constant *Value): + State(State), + BufSize(getDataLayout().getTypeAllocSize(Value->getType())), + Buf(new uint8_t[BufSize]) { + memset(Buf, 0, BufSize); + putAtDest(Value, Buf); + } + + ~FlattenedConstant() { + delete[] Buf; + } + + // Returns the corresponding flattened initializer. + Constant *getAsNormalFormConstant() const; + + // Returns the type of the corresponding flattened initializer; + Type *getAsNormalFormType() const; + + }; + + // Structure used to flatten a global variable. + struct FlattenedGlobal { + // The state of the flatten globals pass. + FlattenGlobalsState &State; + // The global variable to flatten. + GlobalVariable *Global; + // The replacement global variable, if known. + GlobalVariable *NewGlobal; + // True if Global has an initializer. + bool HasInitializer; + // The flattened initializer, if the initializer would not just be + // filled with zeroes. + FlattenedConstant *FlatConst; + // The type of GlobalType, when used in an initializer. + Type *GlobalType; + // The size of the initializer. + uint64_t Size; + public: + FlattenedGlobal(FlattenGlobalsState &State, GlobalVariable *Global) + : State(State), + Global(Global), + NewGlobal(NULL), + HasInitializer(Global->hasInitializer()), + FlatConst(NULL), + GlobalType(Global->getType()->getPointerElementType()), + Size(GlobalType->isSized() + ? getDataLayout().getTypeAllocSize(GlobalType) : 0) { + Type *NewType = NULL; + if (HasInitializer) { + if (Global->getInitializer()->isNullValue()) { + // Special case of NullValue. As an optimization, for large + // BSS variables, avoid allocating a buffer that would only be filled + // with zeros. + NewType = ArrayType::get(getByteType(), Size); + } else { + FlatConst = new FlattenedConstant(State, Global->getInitializer()); + NewType = FlatConst->getAsNormalFormType(); + } + } else { + NewType = ArrayType::get(getByteType(), Size); + } + NewGlobal = new GlobalVariable(getModule(), NewType, + Global->isConstant(), + Global->getLinkage(), + NULL, "", Global, + Global->getThreadLocalMode()); + NewGlobal->copyAttributesFrom(Global); + if (NewGlobal->getAlignment() == 0 && GlobalType->isSized()) + NewGlobal->setAlignment(getDataLayout(). 
+ getPrefTypeAlignment(GlobalType)); + NewGlobal->setExternallyInitialized(Global->isExternallyInitialized()); + NewGlobal->takeName(Global); + } + + ~FlattenedGlobal() { + delete FlatConst; + } + + const DataLayout &getDataLayout() const { return State.DL; } + + Module &getModule() const { return State.M; } + + Type *getByteType() const { return State.ByteType; } + + // Removes the original initializer from the global variable to be + // flattened, if applicable. + void removeOriginalInitializer() { + if (HasInitializer) Global->setInitializer(NULL); + } + + // Replaces the original global variable with the corresponding + // flattened global variable. + void replaceGlobalWithFlattenedGlobal() { + Global->replaceAllUsesWith( + ConstantExpr::getBitCast(NewGlobal, Global->getType())); + Global->eraseFromParent(); + } + + // Installs flattened initializers to the corresponding flattened + // global variable. + void installFlattenedInitializer() { + if (HasInitializer) { + Constant *NewInit = NULL; + if (FlatConst == NULL) { + // Special case of NullValue. + NewInit = ConstantAggregateZero::get(ArrayType::get(getByteType(), + Size)); + } else { + NewInit = FlatConst->getAsNormalFormConstant(); + } + NewGlobal->setInitializer(NewInit); + } + } + }; + + class FlattenGlobals : public ModulePass { + public: + static char ID; // Pass identification, replacement for typeid + FlattenGlobals() : ModulePass(ID) { + initializeFlattenGlobalsPass(*PassRegistry::getPassRegistry()); + } + + virtual bool runOnModule(Module &M); + }; +} + +static void ExpandConstant(const DataLayout *DL, Constant *Val, + Constant **ResultGlobal, uint64_t *ResultOffset) { + if (isa(Val) || isa(Val)) { + *ResultGlobal = Val; + *ResultOffset = 0; + } else if (isa(Val)) { + *ResultGlobal = NULL; + *ResultOffset = 0; + } else if (ConstantInt *CI = dyn_cast(Val)) { + *ResultGlobal = NULL; + *ResultOffset = CI->getZExtValue(); + } else if (ConstantExpr *CE = dyn_cast(Val)) { + ExpandConstant(DL, CE->getOperand(0), ResultGlobal, ResultOffset); + if (CE->getOpcode() == Instruction::GetElementPtr) { + auto *PtrTy = cast(CE->getOperand(0)->getType()); + SmallVector Indexes(CE->op_begin() + 1, CE->op_end()); + *ResultOffset += DL->getIndexedOffsetInType(PtrTy->getElementType(), + Indexes); + } else if (CE->getOpcode() == Instruction::BitCast || + CE->getOpcode() == Instruction::IntToPtr) { + // Nothing more to do. + } else if (CE->getOpcode() == Instruction::PtrToInt) { + if (Val->getType()->getIntegerBitWidth() < DL->getPointerSizeInBits()) { + errs() << "Not handled: " << *CE << "\n"; + report_fatal_error("FlattenGlobals: a ptrtoint that truncates " + "a pointer is not allowed"); + } + } else { + errs() << "Not handled: " << *CE << "\n"; + report_fatal_error( + std::string("FlattenGlobals: ConstantExpr opcode not handled: ") + + CE->getOpcodeName()); + } + } else { + errs() << "Not handled: " << *Val << "\n"; + report_fatal_error("FlattenGlobals: Constant type not handled for reloc"); + } +} + +void FlattenedConstant::putAtDest(Constant *Val, uint8_t *Dest) { + uint64_t ValSize = getDataLayout().getTypeAllocSize(Val->getType()); + assert(Dest + ValSize <= Buf + BufSize); + if (isa(Val) || + isa(Val) || + isa(Val)) { + // The buffer is already zero-initialized. 
+ } else if (ConstantInt *CI = dyn_cast(Val)) { + memcpy(Dest, CI->getValue().getRawData(), ValSize); + } else if (ConstantFP *CF = dyn_cast(Val)) { + APInt Data = CF->getValueAPF().bitcastToAPInt(); + assert((Data.getBitWidth() + 7) / 8 == ValSize); + assert(Data.getBitWidth() % 8 == 0); + memcpy(Dest, Data.getRawData(), ValSize); + } else if (ConstantDataSequential *CD = + dyn_cast(Val)) { + // Note that getRawDataValues() assumes the host endianness is the same. + StringRef Data = CD->getRawDataValues(); + assert(Data.size() == ValSize); + memcpy(Dest, Data.data(), Data.size()); + } else if (isa(Val) || isa(Val) || + isa(Val)) { + uint64_t ElementSize = getDataLayout().getTypeAllocSize( + Val->getType()->getSequentialElementType()); + for (unsigned I = 0; I < Val->getNumOperands(); ++I) { + putAtDest(cast(Val->getOperand(I)), Dest + ElementSize * I); + } + } else if (ConstantStruct *CS = dyn_cast(Val)) { + const StructLayout *Layout = getDataLayout().getStructLayout(CS->getType()); + for (unsigned I = 0; I < CS->getNumOperands(); ++I) { + putAtDest(CS->getOperand(I), Dest + Layout->getElementOffset(I)); + } + } else { + Constant *GV; + uint64_t Offset; + ExpandConstant(&getDataLayout(), Val, &GV, &Offset); + if (GV) { + Constant *NewVal = ConstantExpr::getPtrToInt(GV, getIntPtrType()); + if (Offset) { + // For simplicity, require addends to be 32-bit. + if ((int64_t) Offset != (int32_t) (uint32_t) Offset) { + errs() << "Not handled: " << *Val << "\n"; + report_fatal_error( + "FlattenGlobals: Offset does not fit into 32 bits"); + } + NewVal = ConstantExpr::getAdd( + NewVal, ConstantInt::get(getIntPtrType(), Offset, + /* isSigned= */ true)); + } + Reloc NewRel(State, Dest - Buf, NewVal); + Relocs.push_back(NewRel); + } else { + memcpy(Dest, &Offset, ValSize); + } + } +} + +Constant *FlattenedConstant::getAsNormalFormConstant() const { + // Return a single SimpleElement. + if (Relocs.size() == 0) + return dataSlice(0, BufSize); + if (Relocs.size() == 1 && BufSize == getPtrSize()) { + assert(Relocs[0].getRelOffset() == 0); + return Relocs[0].getRelocUse(); + } + + // Return a CompoundElement. + SmallVector Elements; + unsigned PrevPos = 0; + for (RelocArray::const_iterator Rel = Relocs.begin(), E = Relocs.end(); + Rel != E; ++Rel) { + if (Rel->getRelOffset() > PrevPos) + Elements.push_back(dataSlice(PrevPos, Rel->getRelOffset())); + Elements.push_back(Rel->getRelocUse()); + PrevPos = Rel->getRelOffset() + getPtrSize(); + } + if (PrevPos < BufSize) + Elements.push_back(dataSlice(PrevPos, BufSize)); + return ConstantStruct::getAnon(getModule().getContext(), Elements, true); +} + +Type *FlattenedConstant::getAsNormalFormType() const { + // Return a single element type. + if (Relocs.size() == 0) + return dataSliceType(0, BufSize); + if (Relocs.size() == 1 && BufSize == getPtrSize()) { + assert(Relocs[0].getRelOffset() == 0); + return Relocs[0].getRelocUse()->getType(); + } + + // Return a compound type. 
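+  // Illustrative example (sizes assumed): on a 32-bit target, a 12-byte
+  // initializer with one pointer-sized relocation at offset 4 yields the
+  // packed type <{ [4 x i8], i32, [4 x i8] }>, mirroring the value built by
+  // getAsNormalFormConstant() above.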
+ SmallVector Elements; + unsigned PrevPos = 0; + for (RelocArray::const_iterator Rel = Relocs.begin(), E = Relocs.end(); + Rel != E; ++Rel) { + if (Rel->getRelOffset() > PrevPos) + Elements.push_back(dataSliceType(PrevPos, Rel->getRelOffset())); + Elements.push_back(Rel->getRelocUse()->getType()); + PrevPos = Rel->getRelOffset() + getPtrSize(); + } + if (PrevPos < BufSize) + Elements.push_back(dataSliceType(PrevPos, BufSize)); + return StructType::get(getModule().getContext(), Elements, true); +} + +char FlattenGlobals::ID = 0; +INITIALIZE_PASS(FlattenGlobals, "flatten-globals", + "Flatten global variable initializers into byte arrays", + false, false) + +void FlattenGlobalsState::flattenGlobalsWithInitializers() { + for (Module::global_iterator I = M.global_begin(), E = M.global_end(); + I != E;) { + GlobalVariable *Global = &*I++; + // Variables with "appending" linkage must always be arrays and so + // cannot be normalized, so leave them alone. + if (Global->hasAppendingLinkage()) + continue; + Modified = true; + FlattenedGlobalsVector.push_back(new FlattenedGlobal(*this, Global)); + } +} + +void FlattenGlobalsState::removeDeadInitializerConstants() { + // Detach original initializers. + for (FlattenedGlobalsVectorType::iterator + I = FlattenedGlobalsVector.begin(), E = FlattenedGlobalsVector.end(); + I != E; ++I) { + (*I)->removeOriginalInitializer(); + } + // Do cleanup of old initializers. + for (RelocMapType::iterator I = RelocMap.begin(), E = RelocMap.end(); + I != E; ++I) { + getRelocUseConstant(I->second)->removeDeadConstantUsers(); + } + +} + +void FlattenGlobalsState::replaceGlobalsWithFlattenedGlobals() { + for (FlattenedGlobalsVectorType::iterator + I = FlattenedGlobalsVector.begin(), E = FlattenedGlobalsVector.end(); + I != E; ++I) { + (*I)->replaceGlobalWithFlattenedGlobal(); + } +} + +void FlattenGlobalsState::installFlattenedGlobalInitializers() { + for (FlattenedGlobalsVectorType::iterator + I = FlattenedGlobalsVector.begin(), E = FlattenedGlobalsVector.end(); + I != E; ++I) { + (*I)->installFlattenedInitializer(); + } +} + +bool FlattenGlobals::runOnModule(Module &M) { + FlattenGlobalsState State(M); + State.flattenGlobalsWithInitializers(); + State.removeDeadInitializerConstants(); + State.replaceGlobalsWithFlattenedGlobals(); + State.installFlattenedGlobalInitializers(); + return State.Modified; +} + +ModulePass *llvm::createFlattenGlobalsPass() { + return new FlattenGlobals(); +} diff --git a/lib/Target/JSBackend/NaCl/GlobalCleanup.cpp b/lib/Target/JSBackend/NaCl/GlobalCleanup.cpp new file mode 100644 index 000000000000..13effcb647ab --- /dev/null +++ b/lib/Target/JSBackend/NaCl/GlobalCleanup.cpp @@ -0,0 +1,116 @@ +//===- GlobalCleanup.cpp - Cleanup global symbols post-bitcode-link -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +// ===---------------------------------------------------------------------===// +// +// PNaCl executables should have no external symbols or aliases. These passes +// internalize (or otherwise remove/resolve) GlobalValues and resolve all +// GlobalAliases. 
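+//
+// For illustration (symbol names are hypothetical): an undefined extern_weak
+// declaration such as @optional_hook has its uses replaced by null and is
+// erased; a weak_any definition is demoted to internal linkage; and every
+// alias is replaced by its aliasee and removed.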
+// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/Triple.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Transforms/NaCl.h" + +using namespace llvm; + +namespace { +class GlobalCleanup : public ModulePass { +public: + static char ID; + GlobalCleanup() : ModulePass(ID) { + initializeGlobalCleanupPass(*PassRegistry::getPassRegistry()); + } + bool runOnModule(Module &M) override; +}; + +class ResolveAliases : public ModulePass { +public: + static char ID; + ResolveAliases() : ModulePass(ID) { + initializeResolveAliasesPass(*PassRegistry::getPassRegistry()); + } + bool runOnModule(Module &M) override; +}; +} + +char GlobalCleanup::ID = 0; +INITIALIZE_PASS(GlobalCleanup, "nacl-global-cleanup", + "GlobalValue cleanup for PNaCl " + "(assumes all of the binary is linked statically)", + false, false) + +static bool CleanUpLinkage(GlobalValue *GV) { + // TODO(dschuff): handle the rest of the linkage types as necessary without + // running afoul of the IR verifier or breaking the native link + switch (GV->getLinkage()) { + case GlobalValue::ExternalWeakLinkage: { + auto *NullRef = Constant::getNullValue(GV->getType()); + GV->replaceAllUsesWith(NullRef); + GV->eraseFromParent(); + return true; + } + case GlobalValue::WeakAnyLinkage: { + GV->setLinkage(GlobalValue::InternalLinkage); + return true; + } + default: + // default with fall through to avoid compiler warning + return false; + } + return false; +} + +bool GlobalCleanup::runOnModule(Module &M) { + bool Modified = false; + + // Cleanup llvm.compiler.used. We leave llvm.used as-is, + // because optimization passes feed off it to understand + // what globals may/may not be optimized away. For PNaCl, + // it is removed before ABI validation by CleanupUsedGlobalsMetadata. + if (auto *GV = M.getNamedGlobal("llvm.compiler.used")) { + GV->eraseFromParent(); + Modified = true; + } + + for (auto I = M.global_begin(), E = M.global_end(); I != E;) { + GlobalVariable *GV = &*I++; + Modified |= CleanUpLinkage(GV); + } + + for (auto I = M.begin(), E = M.end(); I != E;) { + Function *F = &*I++; + Modified |= CleanUpLinkage(F); + } + + return Modified; +} + +ModulePass *llvm::createGlobalCleanupPass() { return new GlobalCleanup(); } + +char ResolveAliases::ID = 0; +INITIALIZE_PASS(ResolveAliases, "resolve-aliases", + "resolve global variable and function aliases", false, false) + +bool ResolveAliases::runOnModule(Module &M) { + bool Modified = false; + + for (auto I = M.alias_begin(), E = M.alias_end(); I != E;) { + GlobalAlias *Alias = &*I++; + Alias->replaceAllUsesWith(Alias->getAliasee()); + Alias->eraseFromParent(); + Modified = true; + } + return Modified; +} + +ModulePass *llvm::createResolveAliasesPass() { return new ResolveAliases(); } diff --git a/lib/Target/JSBackend/NaCl/GlobalizeConstantVectors.cpp b/lib/Target/JSBackend/NaCl/GlobalizeConstantVectors.cpp new file mode 100644 index 000000000000..74e866c1a9fe --- /dev/null +++ b/lib/Target/JSBackend/NaCl/GlobalizeConstantVectors.cpp @@ -0,0 +1,176 @@ +//===- GlobalizeConstantVectors.cpp - Globalize constant vector -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This pass replaces all constant vector operands by loads of the same +// vector value from a constant global. After this pass functions don't +// rely on ConstantVector and ConstantDataVector. +// +// The FlattenGlobals pass can be used to further simplify the globals +// that this pass creates. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Transforms/NaCl.h" +#include +#include + +using namespace llvm; + +namespace { +// Must be a ModulePass since it adds globals. +class GlobalizeConstantVectors : public ModulePass { +public: + static char ID; // Pass identification, replacement for typeid + GlobalizeConstantVectors() : ModulePass(ID), DL(0) { + initializeGlobalizeConstantVectorsPass(*PassRegistry::getPassRegistry()); + } + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + } + virtual bool runOnModule(Module &M); + +private: + typedef SmallPtrSet Constants; + typedef std::pair FunctionConstants; + typedef std::vector FunctionConstantList; + typedef DenseMap GlobalizedConstants; + const DataLayout *DL; + + void findConstantVectors(const Function &F, Constants &Cs) const; + void createGlobalConstantVectors(Module &M, const FunctionConstantList &FCs, + GlobalizedConstants &GCs) const; + void materializeConstantVectors(Function &F, const Constants &Cs, + const GlobalizedConstants &GCs) const; +}; + +const char Name[] = "constant_vector"; +} // anonymous namespace + +char GlobalizeConstantVectors::ID = 0; +INITIALIZE_PASS(GlobalizeConstantVectors, "globalize-constant-vectors", + "Replace constant vector operands with equivalent loads", false, + false) + +void GlobalizeConstantVectors::findConstantVectors(const Function &F, + Constants &Cs) const { + for (const_inst_iterator II = inst_begin(F), IE = inst_end(F); II != IE; + ++II) { + for (User::const_op_iterator OI = II->op_begin(), OE = II->op_end(); + OI != OE; ++OI) { + Value *V = OI->get(); + if (isa(V) || isa(V) || + isa(V)) + Cs.insert(cast(V)); + } + } +} + +void GlobalizeConstantVectors::createGlobalConstantVectors( + Module &M, const FunctionConstantList &FCs, + GlobalizedConstants &GCs) const { + for (FunctionConstantList::const_iterator FCI = FCs.begin(), FCE = FCs.end(); + FCI != FCE; ++FCI) { + const Constants &Cs = FCI->second; + + for (Constants::const_iterator CI = Cs.begin(), CE = Cs.end(); CI != CE; + ++CI) { + Constant *C = *CI; + if (GCs.find(C) != GCs.end()) + continue; // The vector has already been globalized. + GlobalVariable *GV = + new GlobalVariable(M, C->getType(), /* isConstant= */ true, + GlobalValue::InternalLinkage, C, Name); + GV->setAlignment(DL->getPrefTypeAlignment(C->getType())); + GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // The content is significant, not the address. + GCs[C] = GV; + } + } +} + +void GlobalizeConstantVectors::materializeConstantVectors( + Function &F, const Constants &Cs, const GlobalizedConstants &GCs) const { + // The first instruction in a function dominates all others, it is therefore a + // safe insertion point. 
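+  // Illustration (operand shape assumed): a splat operand such as
+  // <4 x i32> <i32 1, i32 1, i32 1, i32 1> is loaded once here from its
+  // @constant_vector global, and the function's uses of that constant are
+  // rewritten below to use the loaded value.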
+ Instruction *FirstInst = F.getEntryBlock().getFirstNonPHI(); + + for (Constants::const_iterator CI = Cs.begin(), CE = Cs.end(); CI != CE; + ++CI) { + Constant *C = *CI; + GlobalizedConstants::const_iterator GVI = GCs.find(C); + assert(GVI != GCs.end()); + GlobalVariable *GV = GVI->second; + LoadInst *MaterializedGV = new LoadInst(GV, Name, /* isVolatile= */ false, + GV->getAlignment(), FirstInst); + + // Find users of the constant vector. + typedef SmallVector UserList; + UserList CVUsers; + for (auto U : C->users()) { + if (Instruction *I = dyn_cast(U)) + if (I->getParent()->getParent() != &F) + // Skip uses of the constant vector in other functions: we need to + // materialize it in every function which has a use. + continue; + if (isa(U)) + // Don't replace global uses of the constant vector: we just created a + // new one. This avoid recursive references. + // Also, it's not legal to replace a constant's operand with + // a non-constant (the load instruction). + continue; + CVUsers.push_back(U); + } + + // Replace these Users. Must be done separately to avoid invalidating the + // User iterator. + for (UserList::iterator UI = CVUsers.begin(), UE = CVUsers.end(); UI != UE; + ++UI) { + User *U = *UI; + for (User::op_iterator OI = U->op_begin(), OE = U->op_end(); OI != OE; + ++OI) + if (dyn_cast(*OI) == C) + // The current operand is a use of the constant vector, replace it + // with the materialized one. + *OI = MaterializedGV; + } + } +} + +bool GlobalizeConstantVectors::runOnModule(Module &M) { + DL = &M.getDataLayout(); + + FunctionConstantList FCs; + FCs.reserve(M.size()); + for (Module::iterator FI = M.begin(), FE = M.end(); FI != FE; ++FI) { + Constants Cs; + findConstantVectors(*FI, Cs); + if (!Cs.empty()) + FCs.push_back(std::make_pair(&*FI, Cs)); + } + + GlobalizedConstants GCs; + createGlobalConstantVectors(M, FCs, GCs); + + for (FunctionConstantList::const_iterator FCI = FCs.begin(), FCE = FCs.end(); + FCI != FCE; ++FCI) + materializeConstantVectors(*FCI->first, FCI->second, GCs); + + return FCs.empty(); +} + +ModulePass *llvm::createGlobalizeConstantVectorsPass() { + return new GlobalizeConstantVectors(); +} diff --git a/lib/Target/JSBackend/NaCl/InsertDivideCheck.cpp b/lib/Target/JSBackend/NaCl/InsertDivideCheck.cpp new file mode 100644 index 000000000000..7510931ce2e2 --- /dev/null +++ b/lib/Target/JSBackend/NaCl/InsertDivideCheck.cpp @@ -0,0 +1,112 @@ +//===- InsertDivideCheck.cpp - Add divide by zero checks ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass adds a check for divide by zero before every integer DIV or REM. 
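+//
+// For illustration (value names assumed): '%q = udiv i32 %a, %b' becomes a
+// compare of %b against zero, a conditional branch to a 'divrem.by.zero'
+// block that calls @llvm.trap and is unreachable, and the original division
+// in the 'guarded.divrem' continuation block.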
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "add-divide-check" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/Pass.h" +#include "llvm/Transforms/NaCl.h" + +using namespace llvm; + +namespace { + class InsertDivideCheck : public FunctionPass { + public: + static char ID; + InsertDivideCheck() : FunctionPass(ID) { + initializeInsertDivideCheckPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F); + }; +} + +static BasicBlock *CreateTrapBlock(Function &F, DebugLoc dl) { + BasicBlock *TrapBlock = BasicBlock::Create(F.getContext(), "divrem.by.zero", + &F); + Value *TrapFn = Intrinsic::getDeclaration(F.getParent(), Intrinsic::trap); + CallInst::Create(TrapFn, "", TrapBlock)->setDebugLoc(dl); + (new UnreachableInst(F.getContext(), TrapBlock))->setDebugLoc(dl); + return TrapBlock; +} + +bool InsertDivideCheck::runOnFunction(Function &F) { + SmallPtrSet GuardedDivs; + // If the pass finds a DIV/REM that needs to be checked for zero denominator, + // it will insert a new "trap" block, and split the block that contains the + // DIV/REM into two blocks. The new BasicBlocks are added after the current + // BasicBlock, so that if there is more than one DIV/REM in the same block, + // all are visited. + for (Function::iterator I = F.begin(); I != F.end(); I++) { + BasicBlock *BB = &*I; + + for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); + BI != BE; BI++) { + BinaryOperator *DivInst = dyn_cast(BI); + if (!DivInst || (GuardedDivs.count(DivInst) != 0)) + continue; + unsigned Opcode = DivInst->getOpcode(); + if (Opcode != Instruction::SDiv && Opcode != Instruction::UDiv && + Opcode != Instruction::SRem && Opcode != Instruction::URem) + continue; + Value *Denominator = DivInst->getOperand(1); + if (!Denominator->getType()->isIntegerTy()) + continue; + DebugLoc dl = DivInst->getDebugLoc(); + if (ConstantInt *DenomConst = dyn_cast(Denominator)) { + // Divides by constants do not need a denominator test. + if (DenomConst->isZero()) { + // For explicit divides by zero, insert a trap before DIV/REM + Value *TrapFn = Intrinsic::getDeclaration(F.getParent(), + Intrinsic::trap); + CallInst::Create(TrapFn, "", DivInst)->setDebugLoc(dl); + } + continue; + } + // Create a trap block. + BasicBlock *TrapBlock = CreateTrapBlock(F, dl); + // Move instructions in BB from DivInst to BB's end to a new block. + BasicBlock *Successor = BB->splitBasicBlock(BI, "guarded.divrem"); + // Remove the unconditional branch inserted by splitBasicBlock. + BB->getTerminator()->eraseFromParent(); + // Remember that DivInst was already processed, so that when we process + // inserted blocks later, we do not attempt to again guard it. + GuardedDivs.insert(DivInst); + // Compare the denominator with zero. + Value *Zero = ConstantInt::get(Denominator->getType(), 0); + Value *DenomIsZero = new ICmpInst(*BB, ICmpInst::ICMP_EQ, Denominator, + Zero, ""); + // Put in a condbranch to the trap block. + BranchInst::Create(TrapBlock, Successor, DenomIsZero, BB); + // BI is invalidated when we split. Stop the BasicBlock iterator. 
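+      // The enclosing function iterator still visits the new 'guarded.divrem'
+      // successor, so any remaining DIV/REMs in it get their own guards;
+      // DivInst itself is skipped next time via GuardedDivs.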
+ break; + } + } + + return false; +} + +char InsertDivideCheck::ID = 0; +INITIALIZE_PASS(InsertDivideCheck, "insert-divide-check", + "Insert divide by zero checks", false, false) + +FunctionPass *llvm::createInsertDivideCheckPass() { + return new InsertDivideCheck(); +} diff --git a/lib/Target/JSBackend/NaCl/InternalizeUsedGlobals.cpp b/lib/Target/JSBackend/NaCl/InternalizeUsedGlobals.cpp new file mode 100644 index 000000000000..fef6fc04be30 --- /dev/null +++ b/lib/Target/JSBackend/NaCl/InternalizeUsedGlobals.cpp @@ -0,0 +1,67 @@ +//===- InternalizeUsedGlobals.cpp - mark used globals as internal ----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// The internalize pass does not mark internal globals marked as "used", +// which may be achieved with __attribute((used))__ in C++, for example. +// In PNaCl scenarios, we always perform whole program analysis, and +// the ABI requires all but entrypoint globals to be internal. This pass +// satisfies such requirements. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/PassRegistry.h" +#include "llvm/PassSupport.h" +#include "llvm/Transforms/NaCl.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" + +using namespace llvm; + +namespace { + +class InternalizeUsedGlobals : public ModulePass { +public: + static char ID; + + InternalizeUsedGlobals() : ModulePass(ID) { + initializeInternalizeUsedGlobalsPass(*PassRegistry::getPassRegistry()); + } + virtual bool runOnModule(Module &M); +}; +} + +char InternalizeUsedGlobals::ID = 0; + +INITIALIZE_PASS(InternalizeUsedGlobals, "internalize-used-globals", + "Mark internal globals in the llvm.used list", false, false) + +bool InternalizeUsedGlobals::runOnModule(Module &M) { + bool Changed = false; + + SmallPtrSet Used; + collectUsedGlobalVariables(M, Used, /*CompilerUsed =*/false); + for (GlobalValue *V : Used) { + if (V->getLinkage() != GlobalValue::InternalLinkage) { + // Setting Linkage to InternalLinkage also sets the visibility to + // DefaultVisibility. + // For explicitness, we do so upfront. + V->setVisibility(GlobalValue::DefaultVisibility); + V->setLinkage(GlobalValue::InternalLinkage); + Changed = true; + } + } + return Changed; +} + +ModulePass *llvm::createInternalizeUsedGlobalsPass() { + return new InternalizeUsedGlobals(); +} diff --git a/lib/Target/JSBackend/NaCl/LLVMBuild.txt b/lib/Target/JSBackend/NaCl/LLVMBuild.txt new file mode 100644 index 000000000000..f8b3b9eb13d4 --- /dev/null +++ b/lib/Target/JSBackend/NaCl/LLVMBuild.txt @@ -0,0 +1,23 @@ +;===- ./lib/Target/JSBackend/NaCl/LLVMBuild.txt ---------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===-----------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. 
+; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===-----------------------------------------------------------------------===; + +[component_0] +type = Library +name = PNaClTransforms +parent = JSBackend +required_libraries = Analysis Core Support IPO Scalar TransformUtils +add_to_library_groups = JSBackend diff --git a/lib/Target/JSBackend/NaCl/LowerEmAsyncify.cpp b/lib/Target/JSBackend/NaCl/LowerEmAsyncify.cpp new file mode 100644 index 000000000000..4185fd49a3e8 --- /dev/null +++ b/lib/Target/JSBackend/NaCl/LowerEmAsyncify.cpp @@ -0,0 +1,720 @@ +//===- LowerEmAsyncify - transform asynchronous functions for Emscripten/JS -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Lu Wang +// +// In JS we don't have functions like sleep(), which is on the other hand very popuar in C/C++ etc. +// This pass tries to convert funcitons calling sleep() into a valid form in JavaScript +// The basic idea is to split the callee at the place where sleep() is called, +// then the first half may schedule the second half using setTimeout. +// But we need to pay lots of attention to analyzing/saving/restoring context variables and return values +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Value.h" +#include "llvm/IR/CallSite.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/Transforms/NaCl.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/Local.h" // for DemoteRegToStack, removeUnreachableBlocks +#include "llvm/Transforms/Utils/PromoteMemToReg.h" // for PromoteMemToReg +#include "llvm/Transforms/Utils/ValueMapper.h" +#include "llvm/Pass.h" + +#include + +using namespace llvm; + +static cl::list +AsyncifyFunctions("emscripten-asyncify-functions", + cl::desc("Functions that call one of these functions, directly or indirectly, will be asyncified"), + cl::CommaSeparated); + +static cl::list +AsyncifyWhiteList("emscripten-asyncify-whitelist", + cl::desc("Functions that should not be asyncified"), + cl::CommaSeparated); + +namespace { + class LowerEmAsyncify: public ModulePass { + Module *TheModule; + + public: + static char ID; // Pass identification, replacement for typeid + explicit LowerEmAsyncify() : ModulePass(ID), TheModule(NULL) { + initializeLowerEmAsyncifyPass(*PassRegistry::getPassRegistry()); + } + virtual ~LowerEmAsyncify() { } + bool runOnModule(Module &M); + + private: + const DataLayout *DL; + + Type *Void, *I1, *I32, *I32Ptr; + FunctionType *VFunction, *I1Function, *I32PFunction; + FunctionType *VI32PFunction, *I32PI32Function; + FunctionType *CallbackFunctionType; + + Function *AllocAsyncCtxFunction, *ReallocAsyncCtxFunction, *FreeAsyncCtxFunction; + Function *CheckAsyncFunction; + Function *DoNotUnwindFunction, *DoNotUnwindAsyncFunction; + Function *GetAsyncReturnValueAddrFunction; + + void initTypesAndFunctions(void); + + typedef std::vector Instructions; + typedef DenseMap FunctionInstructionsMap; + typedef 
std::vector Values; + typedef SmallPtrSet BasicBlockSet; + + // all the information we want for an async call + struct AsyncCallEntry { + Instruction *AsyncCallInst; // calling an async function + BasicBlock *AfterCallBlock; // the block we should continue on after getting the return value of AsynCallInst + CallInst *AllocAsyncCtxInst; // where we allocate the async ctx before the async call, in the original function + Values ContextVariables; // those need to be saved and restored for the async call + StructType *ContextStructType; // The structure constructing all the context variables + BasicBlock *SaveAsyncCtxBlock; // the block in which we save all the variables + Function *CallbackFunc; // the callback function for this async call, which is converted from the original function + }; + + BasicBlockSet FindReachableBlocksFrom(BasicBlock *src); + + // Find everything that we should save and restore for the async call + // save them to Entry.ContextVariables + void FindContextVariables(AsyncCallEntry & Entry); + + // The essential function + // F is now in the sync form, transform it into an async form that is valid in JS + void transformAsyncFunction(Function &F, Instructions const& AsyncCalls); + + bool IsFunctionPointerCall(const Instruction *I); + }; +} + +char LowerEmAsyncify::ID = 0; +INITIALIZE_PASS(LowerEmAsyncify, "loweremasyncify", + "Lower async functions for js/emscripten", + false, false) + +bool LowerEmAsyncify::runOnModule(Module &M) { + TheModule = &M; + DL = &M.getDataLayout(); + + std::set WhiteList(AsyncifyWhiteList.begin(), AsyncifyWhiteList.end()); + + /* + * collect all the functions that should be asyncified + * any function that _might_ call an async function is also async + */ + std::vector AsyncFunctionsPending; + for(unsigned i = 0; i < AsyncifyFunctions.size(); ++i) { + std::string const& AFName = AsyncifyFunctions[i]; + Function *F = TheModule->getFunction(AFName); + if (F && !WhiteList.count(F->getName())) { + AsyncFunctionsPending.push_back(F); + } + } + + // No function needed to transform + if (AsyncFunctionsPending.empty()) return false; + + // Walk through the call graph and find all the async functions + FunctionInstructionsMap AsyncFunctionCalls; + { + // pessimistic: consider all indirect calls as possibly async + // TODO: deduce based on function types + for (Module::iterator FI = TheModule->begin(), FE = TheModule->end(); FI != FE; ++FI) { + if (WhiteList.count(FI->getName())) continue; + + bool has_indirect_call = false; + for (inst_iterator I = inst_begin(&*FI), E = inst_end(&*FI); I != E; ++I) { + if (IsFunctionPointerCall(&*I)) { + has_indirect_call = true; + AsyncFunctionCalls[&*FI].push_back(&*I); + } + } + + if (has_indirect_call) AsyncFunctionsPending.push_back(&*FI); + } + + while (!AsyncFunctionsPending.empty()) { + Function *CurFunction = AsyncFunctionsPending.back(); + AsyncFunctionsPending.pop_back(); + + for (Value::user_iterator UI = CurFunction->user_begin(), E = CurFunction->user_end(); UI != E; ++UI) { + ImmutableCallSite ICS(*UI); + if (!ICS) continue; + // we only need those instructions calling the function + // if the function address is used for other purpose, we don't care + if (CurFunction != ICS.getCalledValue()->stripPointerCasts()) continue; + // Now I is either CallInst or InvokeInst + Instruction *I = cast(*UI); + Function *F = I->getParent()->getParent(); + if (AsyncFunctionCalls.count(F) == 0) { + AsyncFunctionsPending.push_back(F); + } + AsyncFunctionCalls[F].push_back(I); + } + } + } + + // exit if no async function 
is found at all + if (AsyncFunctionCalls.empty()) return false; + + initTypesAndFunctions(); + + for (FunctionInstructionsMap::iterator I = AsyncFunctionCalls.begin(), E = AsyncFunctionCalls.end(); + I != E; ++I) { + transformAsyncFunction(*(I->first), I->second); + } + + return true; +} + +void LowerEmAsyncify::initTypesAndFunctions(void) { + // Data types + Void = Type::getVoidTy(TheModule->getContext()); + I1 = Type::getInt1Ty(TheModule->getContext()); + I32 = Type::getInt32Ty(TheModule->getContext()); + I32Ptr = Type::getInt32PtrTy(TheModule->getContext()); + + // Function types + SmallVector ArgTypes; + VFunction = FunctionType::get(Void, false); + I1Function = FunctionType::get(I1, false); + I32PFunction = FunctionType::get(I32Ptr, false); + + ArgTypes.clear(); + ArgTypes.push_back(I32Ptr); + VI32PFunction = FunctionType::get(Void, ArgTypes, false); + + ArgTypes.clear(); + ArgTypes.push_back(I32); + I32PI32Function = FunctionType::get(I32Ptr, ArgTypes, false); + + CallbackFunctionType = VI32PFunction; + + // Functions + CheckAsyncFunction = Function::Create( + I1Function, + GlobalValue::ExternalLinkage, + "emscripten_check_async", + TheModule + ); + + AllocAsyncCtxFunction = Function::Create( + I32PI32Function, + GlobalValue::ExternalLinkage, + "emscripten_alloc_async_context", + TheModule + ); + + ReallocAsyncCtxFunction = Function::Create( + I32PI32Function, + GlobalValue::ExternalLinkage, + "emscripten_realloc_async_context", + TheModule + ); + + FreeAsyncCtxFunction = Function::Create( + VI32PFunction, + GlobalValue::ExternalLinkage, + "emscripten_free_async_context", + TheModule + ); + + DoNotUnwindFunction = Function::Create( + VFunction, + GlobalValue::ExternalLinkage, + "emscripten_do_not_unwind", + TheModule + ); + + DoNotUnwindAsyncFunction = Function::Create( + VFunction, + GlobalValue::ExternalLinkage, + "emscripten_do_not_unwind_async", + TheModule + ); + + GetAsyncReturnValueAddrFunction = Function::Create( + I32PFunction, + GlobalValue::ExternalLinkage, + "emscripten_get_async_return_value_addr", + TheModule + ); +} + +LowerEmAsyncify::BasicBlockSet LowerEmAsyncify::FindReachableBlocksFrom(BasicBlock *src) { + BasicBlockSet ReachableBlockSet; + std::vector pending; + ReachableBlockSet.insert(src); + pending.push_back(src); + while (!pending.empty()) { + BasicBlock *CurBlock = pending.back(); + pending.pop_back(); + for (succ_iterator SI = succ_begin(CurBlock), SE = succ_end(CurBlock); SI != SE; ++SI) { + if (ReachableBlockSet.count(*SI) == 0) { + ReachableBlockSet.insert(*SI); + pending.push_back(*SI); + } + } + } + return ReachableBlockSet; +} + +void LowerEmAsyncify::FindContextVariables(AsyncCallEntry & Entry) { + BasicBlock *AfterCallBlock = Entry.AfterCallBlock; + + Function & F = *AfterCallBlock->getParent(); + + // Create a new entry block as if in the callback function + // theck check variables that no longer properly dominate their uses + BasicBlock *EntryBlock = BasicBlock::Create(TheModule->getContext(), "", &F, &F.getEntryBlock()); + BranchInst::Create(AfterCallBlock, EntryBlock); + + DominatorTreeWrapperPass DTW; + DTW.runOnFunction(F); + DominatorTree& DT = DTW.getDomTree(); + + // These blocks may be using some values defined at or before AsyncCallBlock + BasicBlockSet Ramifications = FindReachableBlocksFrom(AfterCallBlock); + + SmallPtrSet ContextVariables; + Values Pending; + + // Examine the instructions, find all variables that we need to store in the context + for (BasicBlockSet::iterator RI = Ramifications.begin(), RE = Ramifications.end(); RI != 
RE; ++RI) { + for (BasicBlock::iterator I = (*RI)->begin(), E = (*RI)->end(); I != E; ++I) { + for (unsigned i = 0, NumOperands = I->getNumOperands(); i < NumOperands; ++i) { + Value *O = I->getOperand(i); + if (Instruction *Inst = dyn_cast(O)) { + if (Inst == Entry.AsyncCallInst) continue; // for the original async call, we will load directly from async return value + if (ContextVariables.count(Inst) != 0) continue; // already examined + + if (!DT.dominates(Inst, I->getOperandUse(i))) { + // `I` is using `Inst`, yet `Inst` does not dominate `I` if we arrive directly at AfterCallBlock + // so we need to save `Inst` in the context + ContextVariables.insert(Inst); + Pending.push_back(Inst); + } + } else if (Argument *Arg = dyn_cast(O)) { + // count() should be as fast/slow as insert, so just insert here + ContextVariables.insert(Arg); + } + } + } + } + + // restore F + EntryBlock->eraseFromParent(); + + Entry.ContextVariables.clear(); + Entry.ContextVariables.reserve(ContextVariables.size()); + for (SmallPtrSet::iterator I = ContextVariables.begin(), E = ContextVariables.end(); I != E; ++I) { + Entry.ContextVariables.push_back(*I); + } +} + +/* + * Consider that F contains a call to G, both of which are async: + * + * function F: + * ... + * %0 = G(%1, %2, ...); + * ... + * return %%; + * + * We want to convert F and generate F__asyn_cb + * they are similar, but with minor yet important differences + * Note those `main func only` and `callback func only` instructions + +////////////////////////////////////////////////////////// + function F: + ... + ctx = alloc_ctx(len, sp); // main func only + // TODO + // we could also do this only after an async call + // but in that case we will need to pass ctx to the function + // since ctx is no longer in the top async stack frame + %0 = G(%1, %2, ...); + if (async) { // G was async + save context variables in ctx + register F.async_cb as the callback in frame + return without unwinding the stack frame + } else { // G was sync + // use %0 as normal + free_ctx(ctx); // main func only + // ctx is freed here, because so far F is still a sync function + // and we don't want any side effects + ... + async return value = %%; + return & normally unwind the stack frame // main func only + } +////////////////////////////////////////////////////////// + + * And here's F.async_cb + +////////////////////////////////////////////////////////// + function F.async_cb(ctx): + load variables from ctx // callback func only + goto resume_point; // callback func only + ... + ctx = realloc_ctx(len); // callback func only + // realloc_ctx is different from alloc_ctx + // which reused the current async stack frame + // we want to keep the saved stack pointer + %0 = G(%1, %2, ...); + if (async) { + save context variables in ctx + register F.async_cb as the callback + return without unwinding the stack frame + } else { + resume_point: + %0'= either $0 or the async return value // callback func only + ... 
+ async return value = %% + return restore the stack pointer back to the value stored in F // callback func only + // no need to free the ctx + // the scheduler will be aware of this return and handle the stack frames + } +////////////////////////////////////////////////////////// + + */ + +void LowerEmAsyncify::transformAsyncFunction(Function &F, Instructions const& AsyncCalls) { + assert(!AsyncCalls.empty()); + + // Pass 0 + // collect all the return instructions from the original function + // will use later + std::vector OrigReturns; + for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ++I) { + if (ReturnInst *RI = dyn_cast(&*I)) { + OrigReturns.push_back(RI); + } + } + + // Pass 1 + // Scan each async call and make the basic structure: + // All these will be cloned into the callback functions + // - allocate the async context before calling an async function + // - check async right after calling an async function, save context & return if async, continue if not + // - retrieve the async return value and free the async context if the called function turns out to be sync + std::vector AsyncCallEntries; + AsyncCallEntries.reserve(AsyncCalls.size()); + for (Instructions::const_iterator I = AsyncCalls.begin(), E = AsyncCalls.end(); I != E; ++I) { + // prepare blocks + Instruction *CurAsyncCall = *I; + + // The block containing the async call + BasicBlock *CurBlock = CurAsyncCall->getParent(); + // The block should run after the async call + BasicBlock *AfterCallBlock = SplitBlock(CurBlock, CurAsyncCall->getNextNode()); + // The block where we store the context and return + BasicBlock *SaveAsyncCtxBlock = BasicBlock::Create(TheModule->getContext(), "SaveAsyncCtx", &F, AfterCallBlock); + // return a dummy value at the end, to make the block valid + new UnreachableInst(TheModule->getContext(), SaveAsyncCtxBlock); + + // allocate the context before making the call + // we don't know the size yet, will fix it later + // we cannot insert the instruction later because, + // we need to make sure that all the instructions and blocks are fixed before we can generate DT and find context variables + // In CallHandler.h `sp` will be put as the second parameter + // such that we can take a note of the original sp + CallInst *AllocAsyncCtxInst = CallInst::Create(AllocAsyncCtxFunction, Constant::getNullValue(I32), "AsyncCtx", CurAsyncCall); + + // Right after the call + // check async and return if so + // TODO: we can define truly async functions and partial async functions + { + // remove old terminator, which came from SplitBlock + CurBlock->getTerminator()->eraseFromParent(); + // go to SaveAsyncCtxBlock if the previous call is async + // otherwise just continue to AfterCallBlock + CallInst *CheckAsync = CallInst::Create(CheckAsyncFunction, "IsAsync", CurBlock); + BranchInst::Create(SaveAsyncCtxBlock, AfterCallBlock, CheckAsync, CurBlock); + } + + // take a note of this async call + AsyncCallEntry CurAsyncCallEntry; + CurAsyncCallEntry.AsyncCallInst = CurAsyncCall; + CurAsyncCallEntry.AfterCallBlock = AfterCallBlock; + CurAsyncCallEntry.AllocAsyncCtxInst = AllocAsyncCtxInst; + CurAsyncCallEntry.SaveAsyncCtxBlock = SaveAsyncCtxBlock; + // create an empty function for the callback, which will be constructed later + CurAsyncCallEntry.CallbackFunc = Function::Create(CallbackFunctionType, F.getLinkage(), F.getName() + "__async_cb", TheModule); + AsyncCallEntries.push_back(CurAsyncCallEntry); + } + + + // Pass 2 + // analyze the context variables and construct SaveAsyncCtxBlock for each async 
call + // also calculate the size of the context and allocate the async context accordingly + for (std::vector::iterator EI = AsyncCallEntries.begin(), EE = AsyncCallEntries.end(); EI != EE; ++EI) { + AsyncCallEntry & CurEntry = *EI; + + // Collect everything to be saved + FindContextVariables(CurEntry); + + // Pack the variables as a struct + { + // TODO: sort them from large memeber to small ones, in order to make the struct compact even when aligned + SmallVector Types; + Types.push_back(CallbackFunctionType->getPointerTo()); + for (Values::iterator VI = CurEntry.ContextVariables.begin(), VE = CurEntry.ContextVariables.end(); VI != VE; ++VI) { + Types.push_back((*VI)->getType()); + } + CurEntry.ContextStructType = StructType::get(TheModule->getContext(), Types); + } + + // fix the size of allocation + CurEntry.AllocAsyncCtxInst->setOperand(0, + ConstantInt::get(I32, DL->getTypeStoreSize(CurEntry.ContextStructType))); + + // construct SaveAsyncCtxBlock + { + // fill in SaveAsyncCtxBlock + // temporarily remove the terminator for convenience + CurEntry.SaveAsyncCtxBlock->getTerminator()->eraseFromParent(); + assert(CurEntry.SaveAsyncCtxBlock->empty()); + + Type *AsyncCtxAddrTy = CurEntry.ContextStructType->getPointerTo(); + BitCastInst *AsyncCtxAddr = new BitCastInst(CurEntry.AllocAsyncCtxInst, AsyncCtxAddrTy, "AsyncCtxAddr", CurEntry.SaveAsyncCtxBlock); + SmallVector Indices; + // store the callback + { + Indices.push_back(ConstantInt::get(I32, 0)); + Indices.push_back(ConstantInt::get(I32, 0)); + GetElementPtrInst *AsyncVarAddr = GetElementPtrInst::Create(CurEntry.ContextStructType, AsyncCtxAddr, Indices, "", CurEntry.SaveAsyncCtxBlock); + new StoreInst(CurEntry.CallbackFunc, AsyncVarAddr, CurEntry.SaveAsyncCtxBlock); + } + // store the context variables + for (size_t i = 0; i < CurEntry.ContextVariables.size(); ++i) { + Indices.clear(); + Indices.push_back(ConstantInt::get(I32, 0)); + Indices.push_back(ConstantInt::get(I32, i + 1)); // the 0th element is the callback function + GetElementPtrInst *AsyncVarAddr = GetElementPtrInst::Create(CurEntry.ContextStructType, AsyncCtxAddr, Indices, "", CurEntry.SaveAsyncCtxBlock); + new StoreInst(CurEntry.ContextVariables[i], AsyncVarAddr, CurEntry.SaveAsyncCtxBlock); + } + // to exit the block, we want to return without unwinding the stack frame + CallInst::Create(DoNotUnwindFunction, "", CurEntry.SaveAsyncCtxBlock); + ReturnInst::Create(TheModule->getContext(), + (F.getReturnType()->isVoidTy() ? 
0 : Constant::getNullValue(F.getReturnType())), + CurEntry.SaveAsyncCtxBlock); + } + } + + // Pass 3 + // now all the SaveAsyncCtxBlock's have been constructed + // we can clone F and construct callback functions + // we could not construct the callbacks in Pass 2 because we need _all_ those SaveAsyncCtxBlock's appear in _each_ callback + for (std::vector::iterator EI = AsyncCallEntries.begin(), EE = AsyncCallEntries.end(); EI != EE; ++EI) { + AsyncCallEntry & CurEntry = *EI; + + Function *CurCallbackFunc = CurEntry.CallbackFunc; + ValueToValueMapTy VMap; + + // Add the entry block + // load variables from the context + // also update VMap for CloneFunction + BasicBlock *EntryBlock = BasicBlock::Create(TheModule->getContext(), "AsyncCallbackEntry", CurCallbackFunc); + std::vector LoadedAsyncVars; + { + Type *AsyncCtxAddrTy = CurEntry.ContextStructType->getPointerTo(); + BitCastInst *AsyncCtxAddr = new BitCastInst(&*CurCallbackFunc->arg_begin(), AsyncCtxAddrTy, "AsyncCtx", EntryBlock); + SmallVector Indices; + for (size_t i = 0; i < CurEntry.ContextVariables.size(); ++i) { + Indices.clear(); + Indices.push_back(ConstantInt::get(I32, 0)); + Indices.push_back(ConstantInt::get(I32, i + 1)); // the 0th element of AsyncCtx is the callback function + GetElementPtrInst *AsyncVarAddr = GetElementPtrInst::Create(CurEntry.ContextStructType, AsyncCtxAddr, Indices, "", EntryBlock); + LoadedAsyncVars.push_back(new LoadInst(AsyncVarAddr, "", EntryBlock)); + // we want the argument to be replaced by the loaded value + if (isa(CurEntry.ContextVariables[i])) + VMap[CurEntry.ContextVariables[i]] = LoadedAsyncVars.back(); + } + } + + // we don't need any argument, just leave dummy entries there to cheat CloneFunctionInto + for (Function::const_arg_iterator AI = F.arg_begin(), AE = F.arg_end(); AI != AE; ++AI) { + if (VMap.count(&*AI) == 0) + VMap[&*AI] = Constant::getNullValue(AI->getType()); + } + + // Clone the function + { + SmallVector Returns; + CloneFunctionInto(CurCallbackFunc, &F, VMap, false, Returns); + + // return type of the callback functions is always void + // need to fix the return type + if (!F.getReturnType()->isVoidTy()) { + // for those return instructions that are from the original function + // it means we are 'truly' leaving this function + // need to store the return value right before ruturn + for (size_t i = 0; i < OrigReturns.size(); ++i) { + ReturnInst *RI = cast(VMap[OrigReturns[i]]); + // Need to store the return value into the global area + CallInst *RawRetValAddr = CallInst::Create(GetAsyncReturnValueAddrFunction, "", RI); + BitCastInst *RetValAddr = new BitCastInst(RawRetValAddr, F.getReturnType()->getPointerTo(), "AsyncRetValAddr", RI); + new StoreInst(RI->getOperand(0), RetValAddr, RI); + } + // we want to unwind the stack back to where it was before the original function as called + // but we don't actually need to do this here + // at this point it must be true that no callback is pended + // so the scheduler will correct the stack pointer and pop the frame + // here we just fix the return type + for (size_t i = 0; i < Returns.size(); ++i) { + ReplaceInstWithInst(Returns[i], ReturnInst::Create(TheModule->getContext())); + } + } + } + + // the callback function does not have any return value + // so clear all the attributes for return + { + AttributeSet Attrs = CurCallbackFunc->getAttributes(); + CurCallbackFunc->setAttributes( + Attrs.removeAttributes(TheModule->getContext(), AttributeSet::ReturnIndex, Attrs.getRetAttributes()) + ); + } + + // in the callback function, we 
never allocate a new async frame + // instead we reuse the existing one + for (std::vector::iterator EI = AsyncCallEntries.begin(), EE = AsyncCallEntries.end(); EI != EE; ++EI) { + Instruction *I = cast(VMap[EI->AllocAsyncCtxInst]); + ReplaceInstWithInst(I, CallInst::Create(ReallocAsyncCtxFunction, I->getOperand(0), "ReallocAsyncCtx")); + } + + // mapped entry point & async call + BasicBlock *ResumeBlock = cast(VMap[CurEntry.AfterCallBlock]); + Instruction *MappedAsyncCall = cast(VMap[CurEntry.AsyncCallInst]); + + // To save space, for each async call in the callback function, we just ignore the sync case, and leave it to the scheduler + // TODO need an option for this + { + for (std::vector::iterator EI = AsyncCallEntries.begin(), EE = AsyncCallEntries.end(); EI != EE; ++EI) { + AsyncCallEntry & CurEntry = *EI; + Instruction *MappedAsyncCallInst = cast(VMap[CurEntry.AsyncCallInst]); + BasicBlock *MappedAsyncCallBlock = MappedAsyncCallInst->getParent(); + BasicBlock *MappedAfterCallBlock = cast(VMap[CurEntry.AfterCallBlock]); + + // for the sync case of the call, go to NewBlock (instead of MappedAfterCallBlock) + BasicBlock *NewBlock = BasicBlock::Create(TheModule->getContext(), "", CurCallbackFunc, MappedAfterCallBlock); + MappedAsyncCallBlock->getTerminator()->setSuccessor(1, NewBlock); + // store the return value + if (!MappedAsyncCallInst->use_empty()) { + CallInst *RawRetValAddr = CallInst::Create(GetAsyncReturnValueAddrFunction, "", NewBlock); + BitCastInst *RetValAddr = new BitCastInst(RawRetValAddr, MappedAsyncCallInst->getType()->getPointerTo(), "AsyncRetValAddr", NewBlock); + new StoreInst(MappedAsyncCallInst, RetValAddr, NewBlock); + } + // tell the scheduler that we want to keep the current async stack frame + CallInst::Create(DoNotUnwindAsyncFunction, "", NewBlock); + // finally we go to the SaveAsyncCtxBlock, to register the callbac, save the local variables and leave + BasicBlock *MappedSaveAsyncCtxBlock = cast(VMap[CurEntry.SaveAsyncCtxBlock]); + BranchInst::Create(MappedSaveAsyncCtxBlock, NewBlock); + } + } + + std::vector ToPromote; + // applying loaded variables in the entry block + { + BasicBlockSet ReachableBlocks = FindReachableBlocksFrom(ResumeBlock); + for (size_t i = 0; i < CurEntry.ContextVariables.size(); ++i) { + Value *OrigVar = CurEntry.ContextVariables[i]; + if (isa(OrigVar)) continue; // already processed + Value *CurVar = VMap[OrigVar]; + assert(CurVar != MappedAsyncCall); + if (Instruction *Inst = dyn_cast(CurVar)) { + if (ReachableBlocks.count(Inst->getParent())) { + // Inst could be either defined or loaded from the async context + // Do the dirty works in memory + // TODO: might need to check the safety first + // TODO: can we create phi directly? 
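+            // Spill Inst to a stack slot, store the value loaded from the
+            // async context into that slot in the entry block, and let
+            // PromoteMemToReg (run below on ToPromote) rebuild SSA form with
+            // the phis this merge point needs.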
+ AllocaInst *Addr = DemoteRegToStack(*Inst, false); + new StoreInst(LoadedAsyncVars[i], Addr, EntryBlock); + ToPromote.push_back(Addr); + } else { + // The parent block is not reachable, which means there is no confliction + // it's safe to replace Inst with the loaded value + assert(Inst != LoadedAsyncVars[i]); // this should only happen when OrigVar is an Argument + Inst->replaceAllUsesWith(LoadedAsyncVars[i]); + } + } + } + } + + // resolve the return value of the previous async function + // it could be the value just loaded from the global area + // or directly returned by the function (in its sync case) + if (!CurEntry.AsyncCallInst->use_empty()) { + // load the async return value + CallInst *RawRetValAddr = CallInst::Create(GetAsyncReturnValueAddrFunction, "", EntryBlock); + BitCastInst *RetValAddr = new BitCastInst(RawRetValAddr, MappedAsyncCall->getType()->getPointerTo(), "AsyncRetValAddr", EntryBlock); + LoadInst *RetVal = new LoadInst(RetValAddr, "AsyncRetVal", EntryBlock); + + AllocaInst *Addr = DemoteRegToStack(*MappedAsyncCall, false); + new StoreInst(RetVal, Addr, EntryBlock); + ToPromote.push_back(Addr); + } + + // TODO remove unreachable blocks before creating phi + + // We go right to ResumeBlock from the EntryBlock + BranchInst::Create(ResumeBlock, EntryBlock); + + /* + * Creating phi's + * Normal stack frames and async stack frames are interleaving with each other. + * In a callback function, if we call an async function, we might need to realloc the async ctx. + * at this point we don't want anything stored after the ctx, + * such that we can free and extend the ctx by simply update STACKTOP. + * Therefore we don't want any alloca's in callback functions. + * + */ + if (!ToPromote.empty()) { + DominatorTreeWrapperPass DTW; + DTW.runOnFunction(*CurCallbackFunc); + PromoteMemToReg(ToPromote, DTW.getDomTree()); + } + + removeUnreachableBlocks(*CurCallbackFunc); + } + + // Pass 4 + // Here are modifications to the original function, which we won't want to be cloned into the callback functions + for (std::vector::iterator EI = AsyncCallEntries.begin(), EE = AsyncCallEntries.end(); EI != EE; ++EI) { + AsyncCallEntry & CurEntry = *EI; + // remove the frame if no async functinon has been called + CallInst::Create(FreeAsyncCtxFunction, CurEntry.AllocAsyncCtxInst, "", CurEntry.AfterCallBlock->getFirstNonPHI()); + } +} + +bool LowerEmAsyncify::IsFunctionPointerCall(const Instruction *I) { + // mostly from CallHandler.h + ImmutableCallSite CS(I); + if (!CS) return false; // not call nor invoke + const Value *CV = CS.getCalledValue()->stripPointerCasts(); + return !isa(CV); +} + +ModulePass *llvm::createLowerEmAsyncifyPass() { + return new LowerEmAsyncify(); +} diff --git a/lib/Target/JSBackend/NaCl/LowerEmExceptionsPass.cpp b/lib/Target/JSBackend/NaCl/LowerEmExceptionsPass.cpp new file mode 100644 index 000000000000..6e6803664c41 --- /dev/null +++ b/lib/Target/JSBackend/NaCl/LowerEmExceptionsPass.cpp @@ -0,0 +1,275 @@ +//===- LowerEmExceptions - Lower exceptions for Emscripten/JS -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This is based off the 'cheap' version of LowerInvoke. 
It does two things: +// +// 1) Lower +// invoke() to l1 unwind l2 +// into +// preinvoke(id); // (will clear __THREW__) +// call(); +// threw = postinvoke(id); (check __THREW__) +// br threw, l1, l2 +// +// We do this to avoid introducing a new LLVM IR type, or to try to reuse +// invoke-landingpad for our special purposes (as they are checked very +// carefully by llvm) +// +// 2) Lower landingpads to a call to emscripten_landingpad +// +// 3) Lower resume to emscripten_resume which receives non-aggregate inputs +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/NaCl.h" +#include "llvm/Support/raw_ostream.h" + +#include +#include + +using namespace llvm; + +static cl::list +Whitelist("emscripten-cpp-exceptions-whitelist", + cl::desc("Enables C++ exceptions in emscripten (see emscripten EXCEPTION_CATCHING_WHITELIST option)"), + cl::CommaSeparated); + +namespace { + class LowerEmExceptions : public ModulePass { + Function *GetHigh, *PreInvoke, *PostInvoke, *LandingPad, *Resume; + Module *TheModule; + + public: + static char ID; // Pass identification, replacement for typeid + explicit LowerEmExceptions() : ModulePass(ID), GetHigh(NULL), PreInvoke(NULL), PostInvoke(NULL), LandingPad(NULL), Resume(NULL), TheModule(NULL) { + initializeLowerEmExceptionsPass(*PassRegistry::getPassRegistry()); + } + bool runOnModule(Module &M); + }; +} + +char LowerEmExceptions::ID = 0; +INITIALIZE_PASS(LowerEmExceptions, "loweremexceptions", + "Lower invoke and unwind for js/emscripten", + false, false) + +bool canThrow(Value *V) { + if (Function *F = dyn_cast(V)) { + // intrinsics and some emscripten builtins cannot throw + if (F->isIntrinsic()) return false; + StringRef Name = F->getName(); + if (Name.startswith("emscripten_asm_")) return false; + if (Name == "setjmp" || Name == "longjmp") return false; // leave setjmp and longjmp (mostly) alone, we process them properly later + return true; + } + return true; // not a function, so an indirect call - can throw, we can't tell +} + +bool LowerEmExceptions::runOnModule(Module &M) { + TheModule = &M; + + // Add functions + + Type *i32 = Type::getInt32Ty(M.getContext()); + Type *i8 = Type::getInt8Ty(M.getContext()); + Type *i1 = Type::getInt1Ty(M.getContext()); + Type *i8P = i8->getPointerTo(); + Type *Void = Type::getVoidTy(M.getContext()); + + if (!(GetHigh = TheModule->getFunction("getHigh32"))) { + FunctionType *GetHighFunc = FunctionType::get(i32, false); + GetHigh = Function::Create(GetHighFunc, GlobalValue::ExternalLinkage, + "getHigh32", TheModule); + } + + if (!(PreInvoke = TheModule->getFunction("emscripten_preinvoke"))) { + SmallVector IntArgTypes; + IntArgTypes.push_back(i32); + FunctionType *VoidIntFunc = FunctionType::get(Void, IntArgTypes, false); + PreInvoke = Function::Create(VoidIntFunc, GlobalValue::ExternalLinkage, "emscripten_preinvoke", TheModule); + } + + if (!(PostInvoke = TheModule->getFunction("emscripten_postinvoke"))) { + SmallVector IntArgTypes; + IntArgTypes.push_back(i32); + 
FunctionType *IntIntFunc = FunctionType::get(i32, IntArgTypes, false); + PostInvoke = Function::Create(IntIntFunc, GlobalValue::ExternalLinkage, "emscripten_postinvoke", TheModule); + } + + FunctionType *LandingPadFunc = FunctionType::get(i8P, true); + LandingPad = Function::Create(LandingPadFunc, GlobalValue::ExternalLinkage, "emscripten_landingpad", TheModule); + + FunctionType *ResumeFunc = FunctionType::get(Void, true); + Resume = Function::Create(ResumeFunc, GlobalValue::ExternalLinkage, "emscripten_resume", TheModule); + + // Process + + std::set WhitelistSet(Whitelist.begin(), Whitelist.end()); + + bool Changed = false; + + unsigned InvokeId = 0; + + for (Module::iterator Iter = M.begin(), E = M.end(); Iter != E; ) { + Function *F = &*Iter++; + + std::vector ToErase; + std::set LandingPads; + + bool AllowExceptionsInFunc = WhitelistSet.empty() || (WhitelistSet.count("_" + F->getName().str()) != 0); + + for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { + // check terminator for invokes + if (InvokeInst *II = dyn_cast(BB->getTerminator())) { + LandingPads.insert(II->getLandingPadInst()); + + bool NeedInvoke = AllowExceptionsInFunc && canThrow(II->getCalledValue()); + + if (NeedInvoke) { + // If we are calling a function that is noreturn, we must remove that attribute. The code we + // insert here does expect it to return, after we catch the exception. + if (II->doesNotReturn()) { + if (Function *F = dyn_cast(II->getCalledValue())) { + F->removeFnAttr(Attribute::NoReturn); + } + II->setAttributes(II->getAttributes().removeAttribute(TheModule->getContext(), AttributeSet::FunctionIndex, Attribute::NoReturn)); + assert(!II->doesNotReturn()); + } + + // Insert a normal call instruction folded in between pre- and post-invoke + SmallVector HelperArgs; + HelperArgs.push_back(ConstantInt::get(i32, InvokeId++)); + CallInst::Create(PreInvoke, HelperArgs, "", II); + + SmallVector CallArgs(II->op_begin(), II->op_end() - 3); + CallInst *NewCall = CallInst::Create(II->getCalledValue(), + CallArgs, "", II); + NewCall->takeName(II); + NewCall->setCallingConv(II->getCallingConv()); + NewCall->setAttributes(II->getAttributes()); + NewCall->setDebugLoc(II->getDebugLoc()); + II->replaceAllUsesWith(NewCall); + ToErase.push_back(II); + + CallInst *Post = CallInst::Create(PostInvoke, HelperArgs, "", II); + Instruction *Post1 = new TruncInst(Post, i1, "", II); + + // Insert a branch based on the postInvoke + BranchInst::Create(II->getUnwindDest(), II->getNormalDest(), Post1, II); + } else { + // This can't throw, and we don't need this invoke, just replace it with a call+branch + SmallVector CallArgs(II->op_begin(), II->op_end() - 3); + CallInst *NewCall = CallInst::Create(II->getCalledValue(), + CallArgs, "", II); + NewCall->takeName(II); + NewCall->setCallingConv(II->getCallingConv()); + NewCall->setAttributes(II->getAttributes()); + NewCall->setDebugLoc(II->getDebugLoc()); + II->replaceAllUsesWith(NewCall); + ToErase.push_back(II); + + BranchInst::Create(II->getNormalDest(), II); + + // Remove any PHI node entries from the exception destination. 
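        // (Since the callee cannot throw, the unwind edge is dead; the exception
        // destination must therefore drop this block from any of its PHI nodes,
        // which removePredecessor below takes care of.)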
+ II->getUnwindDest()->removePredecessor(&*BB); + } + + Changed = true; + } + // scan the body of the basic block for resumes + for (BasicBlock::iterator Iter = BB->begin(), E = BB->end(); + Iter != E; ) { + Instruction *I = &*Iter++; + if (ResumeInst *R = dyn_cast(I)) { + // split the input into legal values + Value *Input = R->getValue(); + ExtractValueInst *Low = ExtractValueInst::Create(Input, 0, "", R); + ExtractValueInst *High = ExtractValueInst::Create(Input, 1, "", R); + + // create a resume call + SmallVector CallArgs; + CallArgs.push_back(Low); + CallArgs.push_back(High); + CallInst::Create(Resume, CallArgs, "", R); + + new UnreachableInst(TheModule->getContext(), R); // add a terminator to the block + + ToErase.push_back(R); + } + } + } + + // Look for orphan landingpads, can occur in blocks with no predecesors + for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { + Instruction *I = BB->getFirstNonPHI(); + if (LandingPadInst *LP = dyn_cast(I)) { + LandingPads.insert(LP); + } + } + + // Handle all the landingpad for this function together, as multiple invokes may share a single lp + for (std::set::iterator I = LandingPads.begin(); I != LandingPads.end(); I++) { + // Replace the landingpad with a landingpad call to get the low part, and a getHigh for the high + LandingPadInst *LP = *I; + unsigned Num = LP->getNumClauses(); + SmallVector NewLPArgs; + NewLPArgs.push_back(F->getPersonalityFn()); + for (unsigned i = 0; i < Num; i++) { + Value *Arg = LP->getClause(i); + // As a temporary workaround for the lack of aggregate varargs support + // in the varargs lowering code, break out filter operands into their + // component elements. + if (LP->isFilter(i)) { + ArrayType *ATy = cast(Arg->getType()); + for (unsigned elem = 0, elemEnd = ATy->getNumElements(); elem != elemEnd; ++elem) { + Instruction *EE = ExtractValueInst::Create(Arg, makeArrayRef(elem), "", LP); + NewLPArgs.push_back(EE); + } + } else { + NewLPArgs.push_back(Arg); + } + } + NewLPArgs.push_back(LP->isCleanup() ? ConstantInt::getTrue(i1) : ConstantInt::getFalse(i1)); + CallInst *NewLP = CallInst::Create(LandingPad, NewLPArgs, "", LP); + + Instruction *High = CallInst::Create(GetHigh, "", LP); + + // New recreate an aggregate for them, which will be all simplified later (simplification cannot handle landingpad, hence all this) + InsertValueInst *IVA = InsertValueInst::Create(UndefValue::get(LP->getType()), NewLP, 0, "", LP); + InsertValueInst *IVB = InsertValueInst::Create(IVA, High, 1, "", LP); + + LP->replaceAllUsesWith(IVB); + ToErase.push_back(LP); + } + + // erase everything we no longer need in this function + for (unsigned i = 0; i < ToErase.size(); i++) ToErase[i]->eraseFromParent(); + } + + return Changed; +} + +ModulePass *llvm::createLowerEmExceptionsPass() { + return new LowerEmExceptions(); +} + diff --git a/lib/Target/JSBackend/NaCl/LowerEmSetjmp.cpp b/lib/Target/JSBackend/NaCl/LowerEmSetjmp.cpp new file mode 100644 index 000000000000..64500d286804 --- /dev/null +++ b/lib/Target/JSBackend/NaCl/LowerEmSetjmp.cpp @@ -0,0 +1,349 @@ +//===- LowerEmSetjmp - Lower setjmp/longjmp for Emscripten/JS -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Lowers setjmp to a reasonably-performant approach for emscripten. 
The idea +// is that each block with a setjmp is broken up into the part right after +// the setjmp, and a new basic block is added which is either reached from +// the setjmp, or later from a longjmp. To handle the longjmp, all calls that +// might longjmp are checked immediately afterwards. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/NaCl.h" +#include "llvm/IR/Dominators.h" +#include "llvm/Transforms/Utils/PromoteMemToReg.h" +#include +#include +#include + +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +// Utilities for mem/reg: based on Reg2Mem and MemToReg + +bool valueEscapes(const Instruction *Inst) { + const BasicBlock *BB = Inst->getParent(); + for (Value::const_user_iterator UI = Inst->user_begin(),E = Inst->user_end(); + UI != E; ++UI) { + const User *U = *UI; + const Instruction *I = cast(U); + if (I->getParent() != BB || isa(I)) + return true; + } + return false; +} + +void doRegToMem(Function &F) { // see Reg2Mem.cpp + // Insert all new allocas into entry block. + BasicBlock *BBEntry = &F.getEntryBlock(); + assert(pred_begin(BBEntry) == pred_end(BBEntry) && + "Entry block to function must not have predecessors!"); + + // Find first non-alloca instruction and create insertion point. This is + // safe if block is well-formed: it always have terminator, otherwise + // we'll get and assertion. + BasicBlock::iterator I = BBEntry->begin(); + while (isa(I)) ++I; + + CastInst *AllocaInsertionPoint = + new BitCastInst(Constant::getNullValue(Type::getInt32Ty(F.getContext())), + Type::getInt32Ty(F.getContext()), + "reg2mem alloca point", &*I); + + // Find the escaped instructions. But don't create stack slots for + // allocas in entry block. 
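  // A value "escapes" (see valueEscapes above) when it is used outside its
  // defining basic block or by a phi node. Giving every such value a stack
  // slot up front means the setjmp-related CFG surgery later in this pass
  // cannot break SSA dominance; doMemToReg then folds the slots back into
  // registers and phis once the rewriting is done.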
+ std::list WorkList; + for (Function::iterator ibb = F.begin(), ibe = F.end(); + ibb != ibe; ++ibb) + for (BasicBlock::iterator iib = ibb->begin(), iie = ibb->end(); + iib != iie; ++iib) { + if (!(isa(iib) && iib->getParent() == BBEntry) && + valueEscapes(&*iib)) { + WorkList.push_front(&*iib); + } + } + + // Demote escaped instructions + for (std::list::iterator ilb = WorkList.begin(), + ile = WorkList.end(); ilb != ile; ++ilb) + DemoteRegToStack(**ilb, false, AllocaInsertionPoint); + + WorkList.clear(); + + // Find all phi's + for (Function::iterator ibb = F.begin(), ibe = F.end(); + ibb != ibe; ++ibb) + for (BasicBlock::iterator iib = ibb->begin(), iie = ibb->end(); + iib != iie; ++iib) + if (isa(iib)) + WorkList.push_front(&*iib); + + // Demote phi nodes + for (std::list::iterator ilb = WorkList.begin(), + ile = WorkList.end(); ilb != ile; ++ilb) + DemotePHIToStack(cast(*ilb), AllocaInsertionPoint); +} + +void doMemToReg(Function &F) { + std::vector Allocas; + + BasicBlock &BB = F.getEntryBlock(); // Get the entry node for the function + + DominatorTreeWrapperPass DTW; + DTW.runOnFunction(F); + DominatorTree& DT = DTW.getDomTree(); + + while (1) { + Allocas.clear(); + + // Find allocas that are safe to promote, by looking at all instructions in + // the entry node + for (BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I) + if (AllocaInst *AI = dyn_cast(I)) // Is it an alloca? + if (isAllocaPromotable(AI)) + Allocas.push_back(AI); + + if (Allocas.empty()) break; + + PromoteMemToReg(Allocas, DT); + } +} + +// LowerEmSetjmp + +namespace { + class LowerEmSetjmp : public ModulePass { + Module *TheModule; + + public: + static char ID; // Pass identification, replacement for typeid + explicit LowerEmSetjmp() : ModulePass(ID), TheModule(NULL) { + initializeLowerEmSetjmpPass(*PassRegistry::getPassRegistry()); + } + bool runOnModule(Module &M); + }; +} + +char LowerEmSetjmp::ID = 0; +INITIALIZE_PASS(LowerEmSetjmp, "loweremsetjmp", + "Lower setjmp and longjmp for js/emscripten", + false, false) + +bool LowerEmSetjmp::runOnModule(Module &M) { + TheModule = &M; + + Function *Setjmp = TheModule->getFunction("setjmp"); + Function *Longjmp = TheModule->getFunction("longjmp"); + if (!Setjmp && !Longjmp) return false; + + Type *i32 = Type::getInt32Ty(M.getContext()); + Type *Void = Type::getVoidTy(M.getContext()); + + // Add functions + + Function *EmSetjmp = NULL; + + if (Setjmp) { + SmallVector EmSetjmpTypes; + EmSetjmpTypes.push_back(Setjmp->getFunctionType()->getParamType(0)); + EmSetjmpTypes.push_back(i32); // extra param that says which setjmp in the function it is + FunctionType *EmSetjmpFunc = FunctionType::get(i32, EmSetjmpTypes, false); + EmSetjmp = Function::Create(EmSetjmpFunc, GlobalValue::ExternalLinkage, "emscripten_setjmp", TheModule); + } + + Function *EmLongjmp = Longjmp ? 
Function::Create(Longjmp->getFunctionType(), GlobalValue::ExternalLinkage, "emscripten_longjmp", TheModule) : NULL; + + SmallVector IntArgTypes; + IntArgTypes.push_back(i32); + FunctionType *IntIntFunc = FunctionType::get(i32, IntArgTypes, false); + FunctionType *VoidIntFunc = FunctionType::get(Void, IntArgTypes, false); + + Function *CheckLongjmp = Function::Create(IntIntFunc, GlobalValue::ExternalLinkage, "emscripten_check_longjmp", TheModule); // gets control flow + + Function *GetLongjmpResult = Function::Create(IntIntFunc, GlobalValue::ExternalLinkage, "emscripten_get_longjmp_result", TheModule); // gets int value longjmp'd + + FunctionType *VoidFunc = FunctionType::get(Void, false); + Function *PrepSetjmp = Function::Create(VoidFunc, GlobalValue::ExternalLinkage, "emscripten_prep_setjmp", TheModule); + + Function *CleanupSetjmp = Function::Create(VoidFunc, GlobalValue::ExternalLinkage, "emscripten_cleanup_setjmp", TheModule); + + Function *PreInvoke = TheModule->getFunction("emscripten_preinvoke"); + if (!PreInvoke) PreInvoke = Function::Create(VoidIntFunc, GlobalValue::ExternalLinkage, "emscripten_preinvoke", TheModule); + + Function *PostInvoke = TheModule->getFunction("emscripten_postinvoke"); + if (!PostInvoke) PostInvoke = Function::Create(IntIntFunc, GlobalValue::ExternalLinkage, "emscripten_postinvoke", TheModule); + + // Process all callers of setjmp and longjmp. Start with setjmp. + + typedef std::vector Phis; + typedef std::map FunctionPhisMap; + FunctionPhisMap SetjmpOutputPhis; + std::vector ToErase; + + if (Setjmp) { + for (Instruction::user_iterator UI = Setjmp->user_begin(), UE = Setjmp->user_end(); UI != UE; ++UI) { + User *U = *UI; + if (CallInst *CI = dyn_cast(U)) { + BasicBlock *SJBB = CI->getParent(); + // The tail is everything right after the call, and will be reached once when setjmp is + // called, and later when longjmp returns to the setjmp + BasicBlock *Tail = SplitBlock(SJBB, CI->getNextNode()); + // Add a phi to the tail, which will be the output of setjmp, which indicates if this is the + // first call or a longjmp back. The phi directly uses the right value based on where we + // arrive from + PHINode *SetjmpOutput = PHINode::Create(i32, 2, "", Tail->getFirstNonPHI()); + SetjmpOutput->addIncoming(ConstantInt::get(i32, 0), SJBB); // setjmp initial call returns 0 + CI->replaceAllUsesWith(SetjmpOutput); // The proper output is now this, not the setjmp call itself + // longjmp returns to the setjmp will add themselves to this phi + Phis& P = SetjmpOutputPhis[SJBB->getParent()]; + P.push_back(SetjmpOutput); + // fix call target + SmallVector Args; + Args.push_back(CI->getArgOperand(0)); + Args.push_back(ConstantInt::get(i32, P.size())); // our index in the function is our place in the array + 1 + CallInst::Create(EmSetjmp, Args, "", CI); + ToErase.push_back(CI); + } else { + errs() << **UI << "\n"; + report_fatal_error("bad use of setjmp, should only call it"); + } + } + } + + // Update longjmp FIXME: we could avoid throwing in longjmp as an optimization when longjmping back into the current function perhaps? 
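  // Sketch of what the rewriting below produces for each call that may
  // longjmp inside a setjmp-using function (illustrative only; the helpers
  // are the emscripten_* functions declared above):
  //
  //   preinvoke(id);
  //   foo(...);                                  // the original call
  //   threw  = postinvoke(id);                   // reused if the exceptions
  //                                              // pass already added one
  //   label  = emscripten_check_longjmp(threw);
  //   result = emscripten_get_longjmp_result(threw);
  //   switch (label) {
  //     case 1:  goto setjmp1.tail;              // the phi there receives 'result'
  //     ...
  //     default: break;                          // no longjmp, or not ours to handle
  //   }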
+ + if (Longjmp) Longjmp->replaceAllUsesWith(EmLongjmp); + + // Update all setjmping functions + + unsigned InvokeId = 0; + + for (FunctionPhisMap::iterator I = SetjmpOutputPhis.begin(); I != SetjmpOutputPhis.end(); I++) { + Function *F = I->first; + Phis& P = I->second; + + CallInst::Create(PrepSetjmp, "", &*F->begin()->begin()); + + // Update each call that can longjmp so it can return to a setjmp where relevant + + for (Function::iterator BBI = F->begin(), E = F->end(); BBI != E; ) { + BasicBlock *BB = &*BBI++; + for (BasicBlock::iterator Iter = BB->begin(), E = BB->end(); Iter != E; ) { + Instruction *I = &*Iter++; + CallInst *CI; + if ((CI = dyn_cast(I))) { + Value *V = CI->getCalledValue(); + if (V == PrepSetjmp || V == EmSetjmp || V == CheckLongjmp || V == GetLongjmpResult || V == PreInvoke || V == PostInvoke) continue; + if (Function *CF = dyn_cast(V)) if (CF->isIntrinsic()) continue; + // TODO: proper analysis of what can actually longjmp. Currently we assume anything but setjmp can. + // This may longjmp, so we need to check if it did. Split at that point, and + // envelop the call in pre/post invoke, if we need to + CallInst *After; + Instruction *Check = NULL; + if (Iter != E && (After = dyn_cast(Iter)) && After->getCalledValue() == PostInvoke) { + // use the pre|postinvoke that exceptions lowering already made + Check = &*Iter++; + } + BasicBlock *Tail = SplitBlock(BB, &*Iter); // Iter already points to the next instruction, as we need + TerminatorInst *TI = BB->getTerminator(); + if (!Check) { + // no existing pre|postinvoke, create our own + SmallVector HelperArgs; + HelperArgs.push_back(ConstantInt::get(i32, InvokeId++)); + + CallInst::Create(PreInvoke, HelperArgs, "", CI); + Check = CallInst::Create(PostInvoke, HelperArgs, "", TI); // CI is at the end of the block + + // If we are calling a function that is noreturn, we must remove that attribute. The code we + // insert here does expect it to return, after we catch the exception. + if (CI->doesNotReturn()) { + if (Function *F = dyn_cast(CI->getCalledValue())) { + F->removeFnAttr(Attribute::NoReturn); + } + CI->setAttributes(CI->getAttributes().removeAttribute(TheModule->getContext(), AttributeSet::FunctionIndex, Attribute::NoReturn)); + assert(!CI->doesNotReturn()); + } + } + + // We need to replace the terminator in Tail - SplitBlock makes BB go straight to Tail, we need to check if a longjmp occurred, and + // go to the right setjmp-tail if so + SmallVector Args; + Args.push_back(Check); + Instruction *LongjmpCheck = CallInst::Create(CheckLongjmp, Args, "", BB); + Instruction *LongjmpResult = CallInst::Create(GetLongjmpResult, Args, "", BB); + SwitchInst *SI = SwitchInst::Create(LongjmpCheck, Tail, 2, BB); + // -1 means no longjmp happened, continue normally (will hit the default switch case). 0 means a longjmp that is not ours to handle, needs a rethrow. Otherwise + // the index mean is the same as the index in P+1 (to avoid 0). + for (unsigned i = 0; i < P.size(); i++) { + SI->addCase(cast(ConstantInt::get(i32, i+1)), P[i]->getParent()); + P[i]->addIncoming(LongjmpResult, BB); + } + ToErase.push_back(TI); // new terminator is now the switch + + // we are splitting the block here, and must continue to find other calls in the block - which is now split. 
so continue + // to traverse in the Tail + BB = Tail; + Iter = BB->begin(); + E = BB->end(); + } else if (InvokeInst *CI = dyn_cast(I)) { // XXX check if target is setjmp + (void)CI; + report_fatal_error("TODO: invoke inside setjmping functions"); + } + } + } + + // add a cleanup before each return + for (Function::iterator BBI = F->begin(), E = F->end(); BBI != E; ) { + BasicBlock *BB = &*BBI++; + TerminatorInst *TI = BB->getTerminator(); + if (isa(TI)) { + CallInst::Create(CleanupSetjmp, "", TI); + } + } + } + + for (unsigned i = 0; i < ToErase.size(); i++) { + ToErase[i]->eraseFromParent(); + } + + // Finally, our modifications to the cfg can break dominance of SSA variables. For example, + // if (x()) { .. setjmp() .. } + // if (y()) { .. longjmp() .. } + // We must split the longjmp block, and it can jump into the setjmp one. But that means that when + // we split the setjmp block, it's first part no longer dominates its second part - there is + // a theoretically possible control flow path where x() is false, then y() is true and we + // reach the second part of the setjmp block, without ever reaching the first part. So, + // we recalculate regs vs. mem + for (FunctionPhisMap::iterator I = SetjmpOutputPhis.begin(); I != SetjmpOutputPhis.end(); I++) { + Function *F = I->first; + doRegToMem(*F); + doMemToReg(*F); + } + + return true; +} + +ModulePass *llvm::createLowerEmSetjmpPass() { + return new LowerEmSetjmp(); +} diff --git a/lib/Target/JSBackend/NaCl/NoExitRuntime.cpp b/lib/Target/JSBackend/NaCl/NoExitRuntime.cpp new file mode 100644 index 000000000000..aad9f4f42eec --- /dev/null +++ b/lib/Target/JSBackend/NaCl/NoExitRuntime.cpp @@ -0,0 +1,91 @@ +//===- NoExitRuntime.cpp - Expand i64 and wider integer types -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===------------------------------------------------------------------===// +// +//===------------------------------------------------------------------===// + +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/IR/CFG.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Transforms/NaCl.h" +#include "llvm/Transforms/Utils/Local.h" +#include + +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +namespace { + class NoExitRuntime : public ModulePass { + Module *TheModule; + + public: + static char ID; + NoExitRuntime() : ModulePass(ID) { + initializeNoExitRuntimePass(*PassRegistry::getPassRegistry()); + } + + virtual bool runOnModule(Module &M); + }; +} + +char NoExitRuntime::ID = 0; +INITIALIZE_PASS(NoExitRuntime, "emscripten-no-exit-runtime", + "Generate code which assumes the runtime is never exited (so atexit etc. 
is unneeded; see emscripten NO_EXIT_RUNTIME setting)", + false, false) + + +// Implementation of NoExitRuntime + +bool NoExitRuntime::runOnModule(Module &M) { + TheModule = &M; + + Function *AtExit = TheModule->getFunction("__cxa_atexit"); + if (!AtExit || !AtExit->isDeclaration() || AtExit->getNumUses() == 0) return false; + + // The system atexit is used - let's remove calls to it + + Type *i32 = Type::getInt32Ty(TheModule->getContext()); + Value *Zero = Constant::getNullValue(i32); + + std::vector ToErase; + + for (Instruction::user_iterator UI = AtExit->user_begin(), UE = AtExit->user_end(); UI != UE; ++UI) { + if (CallInst *CI = dyn_cast(*UI)) { + if (CI->getCalledValue() == AtExit) { + // calls to atexit can just be removed + CI->replaceAllUsesWith(Zero); + ToErase.push_back(CI); + continue; + } + } + // Possibly other uses of atexit are done - ptrtoint, etc. - but we leave those alone + } + + for (unsigned i = 0; i < ToErase.size(); i++) { + ToErase[i]->eraseFromParent(); + } + + return true; +} + +ModulePass *llvm::createNoExitRuntimePass() { + return new NoExitRuntime(); +} diff --git a/lib/Target/JSBackend/NaCl/NormalizeAlignment.cpp b/lib/Target/JSBackend/NaCl/NormalizeAlignment.cpp new file mode 100644 index 000000000000..d5419bc2caa3 --- /dev/null +++ b/lib/Target/JSBackend/NaCl/NormalizeAlignment.cpp @@ -0,0 +1,86 @@ +//===- NormalizeAlignment.cpp - Normalize Alignment -----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Normalize the alignment of loads and stores to better fit the PNaCl ABI: +// +// * On memcpy/memmove/memset intrinsic calls. +// * On regular memory accesses. +// * On atomic memory accesses. +// +//===----------------------------------------------------------------------===// + +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Transforms/NaCl.h" + +using namespace llvm; + +namespace { +class NormalizeAlignment : public FunctionPass { +public: + static char ID; + NormalizeAlignment() : FunctionPass(ID) { + initializeNormalizeAlignmentPass(*PassRegistry::getPassRegistry()); + } + bool runOnFunction(Function &F) override; +}; +} + +char NormalizeAlignment::ID = 0; +INITIALIZE_PASS(NormalizeAlignment, "normalize-alignment", + "Normalize the alignment of loads and stores", false, false) + +static unsigned normalizeAlignment(DataLayout *DL, unsigned Alignment, Type *Ty, + bool IsAtomic) { + unsigned MaxAllowed = 1; + if (isa(Ty)) + // Already handled properly by FixVectorLoadStoreAlignment. + return Alignment; + if (Ty->isDoubleTy() || Ty->isFloatTy() || IsAtomic) + MaxAllowed = DL->getTypeAllocSize(Ty); + // If the alignment is set to 0, this means "use the default + // alignment for the target", which we fill in explicitly. 
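  // Concretely, given the check below: float/double and atomic accesses keep
  // at most their natural alignment (the type's alloc size); everything else
  // is forced to align 1. For example, a double load with align 0 or 8 stays
  // at 8, a double load with align 4 drops to 1, and a plain i32 store always
  // ends up with align 1.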
+ if (Alignment == 0 || Alignment >= MaxAllowed) + return MaxAllowed; + return 1; +} + +bool NormalizeAlignment::runOnFunction(Function &F) { + DataLayout DL(F.getParent()); + bool Modified = false; + + for (BasicBlock &BB : F) { + for (Instruction &I : BB) { + if (auto *MemOp = dyn_cast(&I)) { + Modified = true; + Type *AlignTy = MemOp->getAlignmentCst()->getType(); + MemOp->setAlignment(ConstantInt::get(AlignTy, 1)); + } else if (auto *Load = dyn_cast(&I)) { + Modified = true; + Load->setAlignment(normalizeAlignment( + &DL, Load->getAlignment(), Load->getType(), Load->isAtomic())); + } else if (auto *Store = dyn_cast(&I)) { + Modified = true; + Store->setAlignment(normalizeAlignment( + &DL, Store->getAlignment(), Store->getValueOperand()->getType(), + Store->isAtomic())); + } + } + } + + return Modified; +} + +FunctionPass *llvm::createNormalizeAlignmentPass() { + return new NormalizeAlignment(); +} diff --git a/lib/Target/JSBackend/NaCl/PNaClSjLjEH.cpp b/lib/Target/JSBackend/NaCl/PNaClSjLjEH.cpp new file mode 100644 index 000000000000..1e333ab6b14b --- /dev/null +++ b/lib/Target/JSBackend/NaCl/PNaClSjLjEH.cpp @@ -0,0 +1,465 @@ +//===- PNaClSjLjEH.cpp - Lower C++ exception handling to use setjmp()------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// The PNaClSjLjEH pass is part of an implementation of C++ exception +// handling for PNaCl that uses setjmp() and longjmp() to handle C++ +// exceptions. The pass lowers LLVM "invoke" instructions to use +// setjmp(). +// +// For example, consider the following C++ code fragment: +// +// int catcher_func() { +// try { +// int result = external_func(); +// return result + 100; +// } catch (MyException &exc) { +// return exc.value + 200; +// } +// } +// +// PNaClSjLjEH converts the IR for that function to the following +// pseudo-code: +// +// struct LandingPadResult { +// void *exception_obj; // For passing to __cxa_begin_catch() +// int matched_clause_id; // See ExceptionInfoWriter.cpp +// }; +// +// struct ExceptionFrame { +// union { +// jmp_buf jmpbuf; // Context for jumping to landingpad block +// struct LandingPadResult result; // Data returned to landingpad block +// }; +// struct ExceptionFrame *next; // Next frame in linked list +// int clause_list_id; // Reference to landingpad's exception info +// }; +// +// // Thread-local exception state +// __thread struct ExceptionFrame *__pnacl_eh_stack; +// +// int catcher_func() { +// struct ExceptionFrame frame; +// frame.next = __pnacl_eh_stack; +// frame.clause_list_id = 123; +// __pnacl_eh_stack = &frame; // Add frame to stack +// int result; +// if (!catcher_func_setjmp_caller(external_func, &frame.jmpbuf, &result)) { +// __pnacl_eh_stack = frame.next; // Remove frame from stack +// return result + 100; +// } else { +// // Handle exception. This is a simplification. Real code would +// // call __cxa_begin_catch() to extract the thrown object. 
+// MyException &exc = *(MyException *) frame.result.exception_obj; +// return exc.value + 200; +// } +// } +// +// // Helper function +// static int catcher_func_setjmp_caller(int (*func)(void), jmp_buf jmpbuf, +// int *result) { +// if (!setjmp(jmpbuf)) { +// *result = func(); +// return 0; +// } +// return 1; +// } +// +// We use a helper function so that setjmp() is not called directly +// from catcher_func(), due to a quirk of how setjmp() and longjmp() +// are specified in C. +// +// func() might modify variables (allocas) that are local to +// catcher_func() (if the variables' addresses are taken). The C +// standard says that these variables' values would become undefined +// after longjmp() returned if setjmp() were called from +// catcher_func(). Specifically, LLVM's GVN pass can optimize away +// stores to allocas between setjmp() and longjmp() (see +// pnacl-sjlj-eh-bug.ll for an example). But this only applies to +// allocas inside the caller of setjmp(), not to allocas inside the +// caller of the caller of setjmp(), so doing the setjmp() call inside +// a helper function that catcher_func() calls avoids the problem. +// +// The pass makes the following changes to IR: +// +// * Convert "invoke" and "landingpad" instructions. +// * Convert "resume" instructions into __pnacl_eh_resume() calls. +// * Replace each call to llvm.eh.typeid.for() with an integer +// constant representing the exception type. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/DenseMap.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/NaCl.h" +#include "ExceptionInfoWriter.h" + +using namespace llvm; + +namespace { + // This is a ModulePass so that it can introduce new global variables. + class PNaClSjLjEH : public ModulePass { + public: + static char ID; // Pass identification, replacement for typeid + PNaClSjLjEH() : ModulePass(ID) { + initializePNaClSjLjEHPass(*PassRegistry::getPassRegistry()); + } + + virtual bool runOnModule(Module &M); + }; + + class FuncRewriter { + Type *ExceptionFrameTy; + ExceptionInfoWriter *ExcInfoWriter; + Function *Func; + + // FrameInitialized indicates whether the following variables have + // been initialized. + bool FrameInitialized; + Function *SetjmpIntrinsic; // setjmp() intrinsic function + Instruction *EHStackTlsVar; // Bitcast of thread-local __pnacl_eh_stack var + Instruction *Frame; // Frame allocated for this function + Instruction *FrameJmpBuf; // Frame's jmp_buf field + Instruction *FrameNextPtr; // Frame's next field + Instruction *FrameExcInfo; // Frame's clause_list_id field + + Function *EHResumeFunc; // __pnacl_eh_resume() function + + // Initialize values that are shared across all "invoke" + // instructions within the function. 
+ void initializeFrame(); + + public: + FuncRewriter(Type *ExceptionFrameTy, ExceptionInfoWriter *ExcInfoWriter, + Function *Func): + ExceptionFrameTy(ExceptionFrameTy), + ExcInfoWriter(ExcInfoWriter), + Func(Func), + FrameInitialized(false), + SetjmpIntrinsic(NULL), EHStackTlsVar(NULL), + Frame(NULL), FrameJmpBuf(NULL), FrameNextPtr(NULL), FrameExcInfo(NULL), + EHResumeFunc(NULL) {} + + Value *createSetjmpWrappedCall(InvokeInst *Invoke); + void expandInvokeInst(InvokeInst *Invoke); + void expandResumeInst(ResumeInst *Resume); + void expandFunc(); + }; +} + +char PNaClSjLjEH::ID = 0; +INITIALIZE_PASS(PNaClSjLjEH, "pnacl-sjlj-eh", + "Lower C++ exception handling to use setjmp()", + false, false) + +static const int kPNaClJmpBufSize = 1024; +static const int kPNaClJmpBufAlign = 8; + +void FuncRewriter::initializeFrame() { + if (FrameInitialized) + return; + FrameInitialized = true; + Module *M = Func->getParent(); + + SetjmpIntrinsic = Intrinsic::getDeclaration(M, Intrinsic::nacl_setjmp); + + Value *EHStackTlsVarUncast = M->getGlobalVariable("__pnacl_eh_stack"); + if (!EHStackTlsVarUncast) + report_fatal_error("__pnacl_eh_stack not defined"); + EHStackTlsVar = new BitCastInst( + EHStackTlsVarUncast, ExceptionFrameTy->getPointerTo()->getPointerTo(), + "pnacl_eh_stack"); + Func->getEntryBlock().getInstList().push_front(EHStackTlsVar); + + // Allocate the new exception frame. This is reused across all + // invoke instructions in the function. + Type *I32 = Type::getInt32Ty(M->getContext()); + Frame = new AllocaInst(ExceptionFrameTy, ConstantInt::get(I32, 1), + kPNaClJmpBufAlign, "invoke_frame"); + Func->getEntryBlock().getInstList().push_front(Frame); + + // Calculate addresses of fields in the exception frame. + Value *JmpBufIndexes[] = { ConstantInt::get(I32, 0), + ConstantInt::get(I32, 0), + ConstantInt::get(I32, 0) }; + FrameJmpBuf = GetElementPtrInst::Create( + ExceptionFrameTy, Frame, JmpBufIndexes, "invoke_jmp_buf"); + FrameJmpBuf->insertAfter(Frame); + + Value *NextPtrIndexes[] = { ConstantInt::get(I32, 0), + ConstantInt::get(I32, 1) }; + FrameNextPtr = GetElementPtrInst::Create( + ExceptionFrameTy, Frame, NextPtrIndexes, "invoke_next"); + FrameNextPtr->insertAfter(Frame); + + Value *ExcInfoIndexes[] = { ConstantInt::get(I32, 0), + ConstantInt::get(I32, 2) }; + FrameExcInfo = GetElementPtrInst::Create( + ExceptionFrameTy, Frame, ExcInfoIndexes, "exc_info_ptr"); + FrameExcInfo->insertAfter(Frame); +} + +// Creates the helper function that will do the setjmp() call and +// function call for implementing Invoke. Creates the call to the +// helper function. Returns a Value which is zero on the normal +// execution path and non-zero if the landingpad block should be +// entered. +Value *FuncRewriter::createSetjmpWrappedCall(InvokeInst *Invoke) { + Type *I32 = Type::getInt32Ty(Func->getContext()); + + // Allocate space for storing the invoke's result temporarily (so + // that the helper function can return multiple values). We don't + // need to do this if the result is unused, and we can't if its type + // is void. + Instruction *ResultAlloca = NULL; + if (!Invoke->use_empty()) { + ResultAlloca = new AllocaInst(Invoke->getType(), "invoke_result_ptr"); + Func->getEntryBlock().getInstList().push_front(ResultAlloca); + } + + // Create type for the helper function. 
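  // The helper receives the invoke's own arguments, then the callee pointer,
  // then a pointer to the frame's jmp_buf and, if the invoke's result is
  // used, an out-pointer for the return value. It returns i32: 0 on the
  // normal path, 1 if the landingpad should be entered (this mirrors the
  // catcher_func_setjmp_caller() pseudo-code in the file header).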
+ SmallVector ArgTypes; + for (unsigned I = 0, E = Invoke->getNumArgOperands(); I < E; ++I) + ArgTypes.push_back(Invoke->getArgOperand(I)->getType()); + ArgTypes.push_back(Invoke->getCalledValue()->getType()); + ArgTypes.push_back(FrameJmpBuf->getType()); + if (ResultAlloca) + ArgTypes.push_back(Invoke->getType()->getPointerTo()); + FunctionType *FTy = FunctionType::get(I32, ArgTypes, false); + + // Create the helper function. + Function *HelperFunc = Function::Create( + FTy, GlobalValue::InternalLinkage, Func->getName() + "_setjmp_caller"); + Func->getParent()->getFunctionList().insertAfter(Func->getIterator(), HelperFunc); + BasicBlock *EntryBB = BasicBlock::Create(Func->getContext(), "", HelperFunc); + BasicBlock *NormalBB = BasicBlock::Create(Func->getContext(), "normal", + HelperFunc); + BasicBlock *ExceptionBB = BasicBlock::Create(Func->getContext(), "exception", + HelperFunc); + + // Unpack the helper function's arguments. + Function::arg_iterator ArgIter = HelperFunc->arg_begin(); + SmallVector InnerCallArgs; + for (unsigned I = 0, E = Invoke->getNumArgOperands(); I < E; ++I) { + ArgIter->setName("arg"); + InnerCallArgs.push_back(&*ArgIter++); + } + Argument *CalleeArg = &*ArgIter++; + Argument *JmpBufArg = &*ArgIter++; + CalleeArg->setName("func_ptr"); + JmpBufArg->setName("jmp_buf"); + + // Create setjmp() call. + Value *SetjmpArgs[] = { JmpBufArg }; + CallInst *SetjmpCall = CallInst::Create(SetjmpIntrinsic, SetjmpArgs, + "invoke_sj", EntryBB); + CopyDebug(SetjmpCall, Invoke); + // Setting the "returns_twice" attribute here prevents optimization + // passes from inlining HelperFunc into its caller. + SetjmpCall->setCanReturnTwice(); + // Check setjmp()'s result. + Value *IsZero = CopyDebug(new ICmpInst(*EntryBB, CmpInst::ICMP_EQ, SetjmpCall, + ConstantInt::get(I32, 0), + "invoke_sj_is_zero"), Invoke); + CopyDebug(BranchInst::Create(NormalBB, ExceptionBB, IsZero, EntryBB), Invoke); + // Handle the normal, non-exceptional code path. + CallInst *InnerCall = CallInst::Create(CalleeArg, InnerCallArgs, "", + NormalBB); + CopyDebug(InnerCall, Invoke); + InnerCall->setAttributes(Invoke->getAttributes()); + InnerCall->setCallingConv(Invoke->getCallingConv()); + if (ResultAlloca) { + InnerCall->setName("result"); + Argument *ResultArg = &*ArgIter++; + ResultArg->setName("result_ptr"); + CopyDebug(new StoreInst(InnerCall, ResultArg, NormalBB), Invoke); + } + ReturnInst::Create(Func->getContext(), ConstantInt::get(I32, 0), NormalBB); + // Handle the exceptional code path. + ReturnInst::Create(Func->getContext(), ConstantInt::get(I32, 1), ExceptionBB); + + // Create the outer call to the helper function. + SmallVector OuterCallArgs; + for (unsigned I = 0, E = Invoke->getNumArgOperands(); I < E; ++I) + OuterCallArgs.push_back(Invoke->getArgOperand(I)); + OuterCallArgs.push_back(Invoke->getCalledValue()); + OuterCallArgs.push_back(FrameJmpBuf); + if (ResultAlloca) + OuterCallArgs.push_back(ResultAlloca); + CallInst *OuterCall = CallInst::Create(HelperFunc, OuterCallArgs, + "invoke_is_exc", Invoke); + CopyDebug(OuterCall, Invoke); + + // Retrieve the function return value stored in the alloca. We only + // need to do this on the non-exceptional path, but we currently do + // it unconditionally because that is simpler. 
+ if (ResultAlloca) { + Value *Result = new LoadInst(ResultAlloca, "", Invoke); + Result->takeName(Invoke); + Invoke->replaceAllUsesWith(Result); + } + return OuterCall; +} + +static void convertInvokeToCall(InvokeInst *Invoke) { + SmallVector CallArgs(Invoke->op_begin(), Invoke->op_end() - 3); + // Insert a normal call instruction. + CallInst *NewCall = CallInst::Create(Invoke->getCalledValue(), + CallArgs, "", Invoke); + CopyDebug(NewCall, Invoke); + NewCall->takeName(Invoke); + NewCall->setCallingConv(Invoke->getCallingConv()); + NewCall->setAttributes(Invoke->getAttributes()); + Invoke->replaceAllUsesWith(NewCall); + + // Insert an unconditional branch to the normal destination. + BranchInst::Create(Invoke->getNormalDest(), Invoke); + // Remove any PHI node entries from the exception destination. + Invoke->getUnwindDest()->removePredecessor(Invoke->getParent()); + Invoke->eraseFromParent(); +} + +void FuncRewriter::expandInvokeInst(InvokeInst *Invoke) { + // Calls to ReturnsTwice functions, i.e. setjmp(), can't be moved + // into a helper function. setjmp() can't throw an exception + // anyway, so convert the invoke to a call. + if (Invoke->hasFnAttr(Attribute::ReturnsTwice)) { + convertInvokeToCall(Invoke); + return; + } + + initializeFrame(); + + LandingPadInst *LP = Invoke->getLandingPadInst(); + Type *I32 = Type::getInt32Ty(Func->getContext()); + Value *ExcInfo = ConstantInt::get( + I32, ExcInfoWriter->getIDForLandingPadClauseList(LP)); + + // Append the new frame to the list. + Value *OldList = CopyDebug( + new LoadInst(EHStackTlsVar, "old_eh_stack", Invoke), Invoke); + CopyDebug(new StoreInst(OldList, FrameNextPtr, Invoke), Invoke); + CopyDebug(new StoreInst(ExcInfo, FrameExcInfo, Invoke), Invoke); + CopyDebug(new StoreInst(Frame, EHStackTlsVar, Invoke), Invoke); + Value *IsException = createSetjmpWrappedCall(Invoke); + // Restore the old frame list. We only need to do this on the + // non-exception code path, but we currently do it unconditionally + // because that is simpler. (The PNaCl C++ runtime library restores + // the old frame list on the exceptional path; doing it again here + // redundantly is OK.) + CopyDebug(new StoreInst(OldList, EHStackTlsVar, Invoke), Invoke); + + Value *IsZero = CopyDebug(new ICmpInst(Invoke, CmpInst::ICMP_EQ, IsException, + ConstantInt::get(I32, 0), + "invoke_sj_is_zero"), Invoke); + CopyDebug(BranchInst::Create(Invoke->getNormalDest(), Invoke->getUnwindDest(), + IsZero, Invoke), + Invoke); + + Invoke->eraseFromParent(); +} + +void FuncRewriter::expandResumeInst(ResumeInst *Resume) { + if (!EHResumeFunc) { + EHResumeFunc = Func->getParent()->getFunction("__pnacl_eh_resume"); + if (!EHResumeFunc) + report_fatal_error("__pnacl_eh_resume() not defined"); + } + + // The "resume" instruction gets passed the landingpad's full result + // (struct LandingPadResult above). Extract the exception_obj field + // to pass to __pnacl_eh_resume(), which doesn't need the + // matched_clause_id field. + unsigned Indexes[] = { 0 }; + Value *ExceptionPtr = + CopyDebug(ExtractValueInst::Create(Resume->getValue(), Indexes, + "resume_exc", Resume), Resume); + + // Cast to the pointer type that __pnacl_eh_resume() expects. 
+ if (EHResumeFunc->getFunctionType()->getFunctionNumParams() != 1) + report_fatal_error("Bad type for __pnacl_eh_resume()"); + Type *ArgType = EHResumeFunc->getFunctionType()->getFunctionParamType(0); + ExceptionPtr = new BitCastInst(ExceptionPtr, ArgType, "resume_cast", Resume); + + Value *Args[] = { ExceptionPtr }; + CopyDebug(CallInst::Create(EHResumeFunc, Args, "", Resume), Resume); + new UnreachableInst(Func->getContext(), Resume); + Resume->eraseFromParent(); +} + +void FuncRewriter::expandFunc() { + Type *I32 = Type::getInt32Ty(Func->getContext()); + + // We need to do two passes: When we process an invoke we need to + // look at its landingpad, so we can't remove the landingpads until + // all the invokes have been processed. + for (Function::iterator BB = Func->begin(), E = Func->end(); BB != E; ++BB) { + for (BasicBlock::iterator Iter = BB->begin(), E = BB->end(); Iter != E; ) { + Instruction *Inst = &*Iter++; + if (InvokeInst *Invoke = dyn_cast(Inst)) { + expandInvokeInst(Invoke); + } else if (ResumeInst *Resume = dyn_cast(Inst)) { + expandResumeInst(Resume); + } else if (IntrinsicInst *Intrinsic = dyn_cast(Inst)) { + if (Intrinsic->getIntrinsicID() == Intrinsic::eh_typeid_for) { + Value *ExcType = Intrinsic->getArgOperand(0); + Value *Val = ConstantInt::get( + I32, ExcInfoWriter->getIDForExceptionType(ExcType)); + Intrinsic->replaceAllUsesWith(Val); + Intrinsic->eraseFromParent(); + } + } + } + } + for (Function::iterator BB = Func->begin(), E = Func->end(); BB != E; ++BB) { + for (BasicBlock::iterator Iter = BB->begin(), E = BB->end(); Iter != E; ) { + Instruction *Inst = &*Iter++; + if (LandingPadInst *LP = dyn_cast(Inst)) { + initializeFrame(); + Value *LPPtr = new BitCastInst( + FrameJmpBuf, LP->getType()->getPointerTo(), "landingpad_ptr", LP); + Value *LPVal = CopyDebug(new LoadInst(LPPtr, "", LP), LP); + LPVal->takeName(LP); + LP->replaceAllUsesWith(LPVal); + LP->eraseFromParent(); + } + } + } +} + +bool PNaClSjLjEH::runOnModule(Module &M) { + Type *JmpBufTy = ArrayType::get(Type::getInt8Ty(M.getContext()), + kPNaClJmpBufSize); + + // Define "struct ExceptionFrame". + StructType *ExceptionFrameTy = StructType::create(M.getContext(), + "ExceptionFrame"); + Type *ExceptionFrameFields[] = { + JmpBufTy, // jmp_buf + ExceptionFrameTy->getPointerTo(), // struct ExceptionFrame *next + Type::getInt32Ty(M.getContext()) // Exception info (clause list ID) + }; + ExceptionFrameTy->setBody(ExceptionFrameFields); + + ExceptionInfoWriter ExcInfoWriter(&M.getContext()); + for (Module::iterator Func = M.begin(), E = M.end(); Func != E; ++Func) { + FuncRewriter Rewriter(ExceptionFrameTy, &ExcInfoWriter, &*Func); + Rewriter.expandFunc(); + } + ExcInfoWriter.defineGlobalVariables(&M); + return true; +} + +ModulePass *llvm::createPNaClSjLjEHPass() { + return new PNaClSjLjEH(); +} diff --git a/lib/Target/JSBackend/NaCl/PromoteI1Ops.cpp b/lib/Target/JSBackend/NaCl/PromoteI1Ops.cpp new file mode 100644 index 000000000000..2bb23b217a52 --- /dev/null +++ b/lib/Target/JSBackend/NaCl/PromoteI1Ops.cpp @@ -0,0 +1,170 @@ +//===- PromoteI1Ops.cpp - Promote various operations on the i1 type--------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass expands out various operations on the i1 type so that +// these i1 operations do not need to be supported by the PNaCl +// translator. 
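//
// For example (illustrative IR only):
//
//   %v = load i1, i1* %p
//
// becomes a byte-sized access plus a truncation:
//
//   %p8 = bitcast i1* %p to i8*
//   %v8 = load i8, i8* %p8
//   %v  = trunc i8 %v8 to i1
//
// and an i1 add is performed on zero-extended i8 operands and truncated back
// to i1, while i1 comparisons compare the widened (zext/sext) operands.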
+// +// This is similar to the PromoteIntegers pass in that it removes uses +// of an unusual-size integer type. The difference is that i1 remains +// a valid type in other operations. i1 can still be used in phi +// nodes, "select" instructions, in "sext" and "zext", and so on. In +// contrast, the integer types that PromoteIntegers removes are not +// allowed in any context by PNaCl's ABI verifier. +// +// This pass expands out the following: +// +// * i1 loads and stores. +// * All i1 comparisons and arithmetic operations, with the exception +// of "and", "or" and "xor", because these are used in practice and +// don't overflow. +// +//===----------------------------------------------------------------------===// + +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Instructions.h" +#include "llvm/Pass.h" +#include "llvm/Transforms/NaCl.h" + +using namespace llvm; + +namespace { + class PromoteI1Ops : public BasicBlockPass { + public: + static char ID; // Pass identification, replacement for typeid + PromoteI1Ops() : BasicBlockPass(ID) { + initializePromoteI1OpsPass(*PassRegistry::getPassRegistry()); + } + + virtual bool runOnBasicBlock(BasicBlock &BB); + }; +} + +char PromoteI1Ops::ID = 0; +INITIALIZE_PASS(PromoteI1Ops, "nacl-promote-i1-ops", + "Promote various operations on the i1 type", + false, false) + +static Value *promoteValue(Value *Val, bool SignExt, Instruction *InsertPt) { + Instruction::CastOps CastType = + SignExt ? Instruction::SExt : Instruction::ZExt; + return CopyDebug(CastInst::Create(CastType, Val, + Type::getInt8Ty(Val->getContext()), + Val->getName() + ".expand_i1_val", + InsertPt), InsertPt); +} + +bool PromoteI1Ops::runOnBasicBlock(BasicBlock &BB) { + bool Changed = false; + + Type *I1Ty = Type::getInt1Ty(BB.getContext()); + Type *I8Ty = Type::getInt8Ty(BB.getContext()); + + // Rewrite boolean Switch terminators: + if (SwitchInst *Switch = dyn_cast(BB.getTerminator())) { + Value *Condition = Switch->getCondition(); + Type *ConditionTy = Condition->getType(); + if (ConditionTy->isIntegerTy(1)) { + ConstantInt *False = + cast(ConstantInt::getFalse(ConditionTy)); + ConstantInt *True = + cast(ConstantInt::getTrue(ConditionTy)); + + SwitchInst::CaseIt FalseCase = Switch->findCaseValue(False); + SwitchInst::CaseIt TrueCase = Switch->findCaseValue(True); + + BasicBlock *FalseBlock = FalseCase.getCaseSuccessor(); + BasicBlock *TrueBlock = TrueCase.getCaseSuccessor(); + BasicBlock *DefaultDest = Switch->getDefaultDest(); + + if (TrueBlock && FalseBlock) { + // impossible destination + DefaultDest->removePredecessor(Switch->getParent()); + } + + if (!TrueBlock) { + TrueBlock = DefaultDest; + } + if (!FalseBlock) { + FalseBlock = DefaultDest; + } + + CopyDebug(BranchInst::Create(TrueBlock, FalseBlock, Condition, Switch), + Switch); + Switch->eraseFromParent(); + } + } + + for (BasicBlock::iterator Iter = BB.begin(), E = BB.end(); Iter != E; ) { + Instruction *Inst = &*Iter++; + if (LoadInst *Load = dyn_cast(Inst)) { + if (Load->getType() == I1Ty) { + Changed = true; + Value *Ptr = CopyDebug( + new BitCastInst( + Load->getPointerOperand(), I8Ty->getPointerTo(), + Load->getPointerOperand()->getName() + ".i8ptr", Load), Load); + LoadInst *NewLoad = new LoadInst( + Ptr, Load->getName() + ".pre_trunc", Load); + CopyDebug(NewLoad, Load); + CopyLoadOrStoreAttrs(NewLoad, Load); + Value *Result = CopyDebug(new TruncInst(NewLoad, I1Ty, "", Load), Load); + Result->takeName(Load); + Load->replaceAllUsesWith(Result); + Load->eraseFromParent(); + } + } 
else if (StoreInst *Store = dyn_cast(Inst)) { + if (Store->getValueOperand()->getType() == I1Ty) { + Changed = true; + Value *Ptr = CopyDebug( + new BitCastInst( + Store->getPointerOperand(), I8Ty->getPointerTo(), + Store->getPointerOperand()->getName() + ".i8ptr", Store), + Store); + Value *Val = promoteValue(Store->getValueOperand(), false, Store); + StoreInst *NewStore = new StoreInst(Val, Ptr, Store); + CopyDebug(NewStore, Store); + CopyLoadOrStoreAttrs(NewStore, Store); + Store->eraseFromParent(); + } + } else if (BinaryOperator *Op = dyn_cast(Inst)) { + if (Op->getType() == I1Ty && + !(Op->getOpcode() == Instruction::And || + Op->getOpcode() == Instruction::Or || + Op->getOpcode() == Instruction::Xor)) { + Value *Arg1 = promoteValue(Op->getOperand(0), false, Op); + Value *Arg2 = promoteValue(Op->getOperand(1), false, Op); + Value *NewOp = CopyDebug( + BinaryOperator::Create( + Op->getOpcode(), Arg1, Arg2, + Op->getName() + ".pre_trunc", Op), Op); + Value *Result = CopyDebug(new TruncInst(NewOp, I1Ty, "", Op), Op); + Result->takeName(Op); + Op->replaceAllUsesWith(Result); + Op->eraseFromParent(); + } + } else if (ICmpInst *Op = dyn_cast(Inst)) { + if (Op->getOperand(0)->getType() == I1Ty) { + Value *Arg1 = promoteValue(Op->getOperand(0), Op->isSigned(), Op); + Value *Arg2 = promoteValue(Op->getOperand(1), Op->isSigned(), Op); + Value *Result = CopyDebug( + new ICmpInst(Op, Op->getPredicate(), Arg1, Arg2, ""), Op); + Result->takeName(Op); + Op->replaceAllUsesWith(Result); + Op->eraseFromParent(); + } + } + } + return Changed; +} + +BasicBlockPass *llvm::createPromoteI1OpsPass() { + return new PromoteI1Ops(); +} diff --git a/lib/Target/JSBackend/NaCl/PromoteIntegers.cpp b/lib/Target/JSBackend/NaCl/PromoteIntegers.cpp new file mode 100644 index 000000000000..761f409b33a6 --- /dev/null +++ b/lib/Target/JSBackend/NaCl/PromoteIntegers.cpp @@ -0,0 +1,737 @@ +//===- PromoteIntegers.cpp - Promote illegal integers for PNaCl ABI -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +// A limited set of transformations to promote illegal-sized int types. +// +//===----------------------------------------------------------------------===// +// +// Legal sizes are currently 1, 8, and large power-of-two sizes. Operations on +// illegal integers are changed to operate on the next-higher legal size. +// +// It maintains no invariants about the upper bits (above the size of the +// original type); therefore before operations which can be affected by the +// value of these bits (e.g. cmp, select, lshr), the upper bits of the operands +// are cleared. 
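//
// For example (sketch): an i24 value is carried in an i32, and a logical
// shift right first masks off the junk in the carried value's upper bits:
//
//   %c = lshr i24 %a, %b
//
// becomes, roughly,
//
//   %a.clear = and i32 %a.promoted, 16777215     ; keep the low 24 bits
//   %c.promoted = lshr i32 %a.clear, %b.promoted
//
// whereas operations that are insensitive to the upper bits (add, and, etc.)
// operate on the promoted values directly.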
+// +// Limitations: +// 1) It can't change function signatures or global variables +// 2) Doesn't handle arrays or structs with illegal types +// 3) Doesn't handle constant expressions (it also doesn't produce them, so it +// can run after ExpandConstantExpr) +// +//===----------------------------------------------------------------------===// + +#include "SimplifiedFuncTypeMap.h" + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/NaCl.h" + +using namespace llvm; + +static Type *getPromotedType(Type *Ty); + +namespace { + +class TypeMap : public SimplifiedFuncTypeMap { +protected: + MappingResult getSimpleFuncType(LLVMContext &Ctx, StructMap &Tentatives, + FunctionType *OldFnTy) override { + ParamTypeVector NewArgTypes; + + auto Ret = getPromotedArgType(Ctx, OldFnTy->getReturnType(), Tentatives); + bool Changed = Ret.isChanged(); + for (auto &ArgTy : OldFnTy->params()) { + auto NewArgTy = getPromotedArgType(Ctx, ArgTy, Tentatives); + NewArgTypes.push_back(NewArgTy); + Changed |= NewArgTy.isChanged(); + } + + auto *NewFctType = FunctionType::get(Ret, NewArgTypes, OldFnTy->isVarArg()); + return {NewFctType, Changed}; + } + +private: + MappingResult getPromotedArgType(LLVMContext &Ctx, Type *Ty, + StructMap &Tentatives) { + if (Ty->isIntegerTy()) { + auto *NTy = getPromotedType(Ty); + return {NTy, NTy != Ty}; + } + return getSimpleAggregateTypeInternal(Ctx, Ty, Tentatives); + } +}; + +class PromoteIntegers : public ModulePass { +public: + static char ID; + + PromoteIntegers() : ModulePass(ID) { + initializePromoteIntegersPass(*PassRegistry::getPassRegistry()); + } + + bool runOnModule(Module &M) override; + +private: + typedef DenseMap DebugMap; + TypeMap TypeMapper; + + bool ensureCompliantSignature(LLVMContext &Ctx, Function *OldFct, Module &M); +}; +} // anonymous namespace + +char PromoteIntegers::ID = 0; + +INITIALIZE_PASS(PromoteIntegers, "nacl-promote-ints", + "Promote integer types which are illegal in PNaCl", false, + false) + +static bool isLegalSize(unsigned Size) { + return Size == 1 || (Size >= 8 && isPowerOf2_32(Size)); +} + +static Type *getPromotedIntType(IntegerType *Ty) { + auto Width = Ty->getBitWidth(); + if (isLegalSize(Width)) + return Ty; + assert(Width < (1ull << (sizeof(Width) * CHAR_BIT - 1)) && + "width can't be rounded to the next power of two"); + return IntegerType::get(Ty->getContext(), + Width < 8 ? 8 : NextPowerOf2(Width)); +} + +// Return a legal integer type, promoting to a larger size if necessary. +static Type *getPromotedType(Type *Ty) { + assert(isa(Ty) && "Trying to convert a non-integer type"); + return getPromotedIntType(cast(Ty)); +} + +// Return true if Val is an int which should be converted. +static bool shouldConvert(Value *Val) { + if (auto *ITy = dyn_cast(Val->getType())) + return !isLegalSize(ITy->getBitWidth()); + return false; +} + +// Return a constant which has been promoted to a legal size. +static Value *convertConstant(Constant *C, bool SignExt) { + assert(shouldConvert(C)); + Type *ProTy = getPromotedType(C->getType()); + // ConstantExpr of a Constant yields a Constant, not a ConstantExpr. + return SignExt ? 
ConstantExpr::getSExt(C, ProTy) + : ConstantExpr::getZExt(C, ProTy); +} + +namespace { +// Holds the state for converting/replacing values. Conversion is done in one +// pass, with each value requiring conversion possibly having two stages. When +// an instruction needs to be replaced (i.e. it has illegal operands or result) +// a new instruction is created, and the pass calls getConverted to get its +// operands. If the original operand has already been converted, the new value +// is returned. Otherwise, a placeholder is created and used in the new +// instruction. After a new instruction is created to replace an illegal one, +// recordConverted is called to register the replacement. All users are updated, +// and if there is a placeholder, its users are also updated. +// +// recordConverted also queues the old value for deletion. +// +// This strategy avoids the need for recursion or worklists for conversion. +class ConversionState { +public: + // Return the promoted value for Val. If Val has not yet been converted, + // return a placeholder, which will be converted later. + Value *getConverted(Value *Val) { + if (!shouldConvert(Val)) + return Val; + if (isa(Val)) + report_fatal_error("Can't convert illegal GlobalVariables"); + if (RewrittenMap.count(Val)) + return RewrittenMap[Val]; + + // Directly convert constants. + if (auto *C = dyn_cast(Val)) + return convertConstant(C, /*SignExt=*/false); + + // No converted value available yet, so create a placeholder. + auto *P = new Argument(getPromotedType(Val->getType())); + + RewrittenMap[Val] = P; + Placeholders[Val] = P; + return P; + } + + // Replace the uses of From with To, replace the uses of any + // placeholders for From, and optionally give From's name to To. + // Also mark To for deletion. + void recordConverted(Instruction *From, Value *To, bool TakeName = true) { + ToErase.push_back(From); + if (!shouldConvert(From)) { + // From does not produce an illegal value, update its users in place. + From->replaceAllUsesWith(To); + } else { + // From produces an illegal value, so its users will be replaced. When + // replacements are created they will use values returned by getConverted. + if (Placeholders.count(From)) { + // Users of the placeholder can be updated in place. + Placeholders[From]->replaceAllUsesWith(To); + Placeholders.erase(From); + } + RewrittenMap[From] = To; + } + if (TakeName) { + To->takeName(From); + } + } + + void eraseReplacedInstructions() { + for (Instruction *E : ToErase) + E->dropAllReferences(); + for (Instruction *E : ToErase) + E->eraseFromParent(); + } + +private: + // Maps illegal values to their new converted values (or placeholders + // if no new value is available yet) + DenseMap RewrittenMap; + // Maps illegal values with no conversion available yet to their placeholders + DenseMap Placeholders; + // Illegal values which have already been converted, will be erased. + SmallVector ToErase; +}; +} // anonymous namespace + +// Create a BitCast instruction from the original Value being cast. These +// instructions aren't replaced by convertInstruction because they are pointer +// types (which are always valid), but their uses eventually lead to an invalid +// type. +static Value *CreateBitCast(IRBuilder<> *IRB, Value *From, Type *ToTy, + const Twine &Name) { + if (auto *BC = dyn_cast(From)) + return CreateBitCast(IRB, BC->getOperand(0), ToTy, Name); + return IRB->CreateBitCast(From, ToTy, Name); +} + +// Split an illegal load into multiple legal loads and return the resulting +// promoted value. 
The size of the load is assumed to be a multiple of 8. +// +// \param BaseAlign Alignment of the base load. +// \param Offset Offset from the base load. +static Value *splitLoad(DataLayout *DL, LoadInst *Inst, ConversionState &State, + unsigned BaseAlign, unsigned Offset) { + if (Inst->isVolatile() || Inst->isAtomic()) + report_fatal_error("Can't split volatile/atomic loads"); + if (DL->getTypeSizeInBits(Inst->getType()) % 8 != 0) + report_fatal_error("Loads must be a multiple of 8 bits"); + + auto *OrigPtr = State.getConverted(Inst->getPointerOperand()); + // OrigPtr is a placeholder in recursive calls, and so has no name. + if (OrigPtr->getName().empty()) + OrigPtr->setName(Inst->getPointerOperand()->getName()); + unsigned Width = DL->getTypeSizeInBits(Inst->getType()); + auto *NewType = getPromotedType(Inst->getType()); + unsigned LoWidth = PowerOf2Floor(Width); + assert(isLegalSize(LoWidth)); + + auto *LoType = IntegerType::get(Inst->getContext(), LoWidth); + auto *HiType = IntegerType::get(Inst->getContext(), Width - LoWidth); + IRBuilder<> IRB(Inst); + + auto *BCLo = CreateBitCast(&IRB, OrigPtr, LoType->getPointerTo(), + OrigPtr->getName() + ".loty"); + auto *LoadLo = IRB.CreateAlignedLoad(BCLo, MinAlign(BaseAlign, Offset), + Inst->getName() + ".lo"); + auto *LoExt = IRB.CreateZExt(LoadLo, NewType, LoadLo->getName() + ".ext"); + auto *GEPHi = IRB.CreateConstGEP1_32(BCLo, 1, OrigPtr->getName() + ".hi"); + auto *BCHi = CreateBitCast(&IRB, GEPHi, HiType->getPointerTo(), + OrigPtr->getName() + ".hity"); + + auto HiOffset = (Offset + LoWidth) / CHAR_BIT; + auto *LoadHi = IRB.CreateAlignedLoad(BCHi, MinAlign(BaseAlign, HiOffset), + Inst->getName() + ".hi"); + auto *Hi = !isLegalSize(Width - LoWidth) + ? splitLoad(DL, LoadHi, State, BaseAlign, HiOffset) + : LoadHi; + + auto *HiExt = IRB.CreateZExt(Hi, NewType, Hi->getName() + ".ext"); + auto *HiShift = IRB.CreateShl(HiExt, LoWidth, HiExt->getName() + ".sh"); + auto *Result = IRB.CreateOr(LoExt, HiShift); + + State.recordConverted(Inst, Result); + + return Result; +} + +static Value *splitStore(DataLayout *DL, StoreInst *Inst, + ConversionState &State, unsigned BaseAlign, + unsigned Offset) { + if (Inst->isVolatile() || Inst->isAtomic()) + report_fatal_error("Can't split volatile/atomic stores"); + if (DL->getTypeSizeInBits(Inst->getValueOperand()->getType()) % 8 != 0) + report_fatal_error("Stores must be a multiple of 8 bits"); + + auto *OrigPtr = State.getConverted(Inst->getPointerOperand()); + // OrigPtr is now a placeholder in recursive calls, and so has no name. 
+ if (OrigPtr->getName().empty()) + OrigPtr->setName(Inst->getPointerOperand()->getName()); + auto *OrigVal = State.getConverted(Inst->getValueOperand()); + unsigned Width = DL->getTypeSizeInBits(Inst->getValueOperand()->getType()); + unsigned LoWidth = PowerOf2Floor(Width); + assert(isLegalSize(LoWidth)); + + auto *LoType = IntegerType::get(Inst->getContext(), LoWidth); + auto *HiType = IntegerType::get(Inst->getContext(), Width - LoWidth); + IRBuilder<> IRB(Inst); + + auto *BCLo = CreateBitCast(&IRB, OrigPtr, LoType->getPointerTo(), + OrigPtr->getName() + ".loty"); + auto *LoTrunc = IRB.CreateTrunc(OrigVal, LoType, OrigVal->getName() + ".lo"); + IRB.CreateAlignedStore(LoTrunc, BCLo, MinAlign(BaseAlign, Offset)); + + auto HiOffset = (Offset + LoWidth) / CHAR_BIT; + auto *HiLShr = + IRB.CreateLShr(OrigVal, LoWidth, OrigVal->getName() + ".hi.sh"); + auto *GEPHi = IRB.CreateConstGEP1_32(BCLo, 1, OrigPtr->getName() + ".hi"); + auto *HiTrunc = IRB.CreateTrunc(HiLShr, HiType, OrigVal->getName() + ".hi"); + auto *BCHi = CreateBitCast(&IRB, GEPHi, HiType->getPointerTo(), + OrigPtr->getName() + ".hity"); + + auto *StoreHi = + IRB.CreateAlignedStore(HiTrunc, BCHi, MinAlign(BaseAlign, HiOffset)); + Value *Hi = StoreHi; + + if (!isLegalSize(Width - LoWidth)) { + // HiTrunc is still illegal, and is redundant with the truncate in the + // recursive call, so just get rid of it. If HiTrunc is a constant then the + // IRB will have just returned a shifted, truncated constant, which is + // already uniqued (and does not need to be RAUWed), and recordConverted + // expects constants. + if (!isa(HiTrunc)) + State.recordConverted(cast(HiTrunc), HiLShr, + /*TakeName=*/false); + Hi = splitStore(DL, StoreHi, State, BaseAlign, HiOffset); + } + State.recordConverted(Inst, Hi, /*TakeName=*/false); + return Hi; +} + +// Return a converted value with the bits of the operand above the size of the +// original type cleared. +static Value *getClearConverted(Value *Operand, Instruction *InsertPt, + ConversionState &State) { + auto *OrigType = Operand->getType(); + auto *OrigInst = dyn_cast(Operand); + Operand = State.getConverted(Operand); + // If the operand is a constant, it will have been created by + // ConversionState.getConverted, which zero-extends by default. + if (isa(Operand)) + return Operand; + Instruction *NewInst = BinaryOperator::Create( + Instruction::And, Operand, + ConstantInt::get( + getPromotedType(OrigType), + APInt::getLowBitsSet(getPromotedType(OrigType)->getIntegerBitWidth(), + OrigType->getIntegerBitWidth())), + Operand->getName() + ".clear", InsertPt); + if (OrigInst) + CopyDebug(NewInst, OrigInst); + return NewInst; +} + +// Return a value with the bits of the operand above the size of the original +// type equal to the sign bit of the original operand. The new operand is +// assumed to have been legalized already. +// This is done by shifting the sign bit of the smaller value up to the MSB +// position in the larger size, and then arithmetic-shifting it back down. +static Value *getSignExtend(Value *Operand, Value *OrigOperand, + Instruction *InsertPt) { + // If OrigOperand was a constant, NewOperand will have been created by + // ConversionState.getConverted, which zero-extends by default. But that is + // wrong here, so replace it with a sign-extended constant. 
+ if (Constant *C = dyn_cast(OrigOperand)) + return convertConstant(C, /*SignExt=*/true); + Type *OrigType = OrigOperand->getType(); + ConstantInt *ShiftAmt = + ConstantInt::getSigned(cast(getPromotedType(OrigType)), + getPromotedType(OrigType)->getIntegerBitWidth() - + OrigType->getIntegerBitWidth()); + BinaryOperator *Shl = + BinaryOperator::Create(Instruction::Shl, Operand, ShiftAmt, + Operand->getName() + ".getsign", InsertPt); + if (Instruction *Inst = dyn_cast(OrigOperand)) + CopyDebug(Shl, Inst); + return CopyDebug(BinaryOperator::Create(Instruction::AShr, Shl, ShiftAmt, + Operand->getName() + ".signed", + InsertPt), + Shl); +} + +static void convertInstruction(DataLayout *DL, Instruction *Inst, + ConversionState &State) { + if (SExtInst *Sext = dyn_cast(Inst)) { + Value *Op = Sext->getOperand(0); + Value *NewInst = nullptr; + // If the operand to be extended is illegal, we first need to fill its + // upper bits with its sign bit. + if (shouldConvert(Op)) { + NewInst = getSignExtend(State.getConverted(Op), Op, Sext); + } + // If the converted type of the operand is the same as the converted + // type of the result, we won't actually be changing the type of the + // variable, just its value. + if (getPromotedType(Op->getType()) != getPromotedType(Sext->getType())) { + NewInst = CopyDebug( + new SExtInst(NewInst ? NewInst : State.getConverted(Op), + getPromotedType(cast(Sext->getType())), + Sext->getName() + ".sext", Sext), + Sext); + } + assert(NewInst && "Failed to convert sign extension"); + State.recordConverted(Sext, NewInst); + } else if (ZExtInst *Zext = dyn_cast(Inst)) { + Value *Op = Zext->getOperand(0); + Value *NewInst = nullptr; + if (shouldConvert(Op)) { + NewInst = getClearConverted(Op, Zext, State); + } + // If the converted type of the operand is the same as the converted + // type of the result, we won't actually be changing the type of the + // variable, just its value. + if (getPromotedType(Op->getType()) != getPromotedType(Zext->getType())) { + NewInst = CopyDebug( + CastInst::CreateZExtOrBitCast( + NewInst ? NewInst : State.getConverted(Op), + getPromotedType(cast(Zext->getType())), "", Zext), + Zext); + } + assert(NewInst); + State.recordConverted(Zext, NewInst); + } else if (TruncInst *Trunc = dyn_cast(Inst)) { + Value *Op = Trunc->getOperand(0); + Value *NewInst; + // If the converted type of the operand is the same as the converted + // type of the result, we don't actually need to change the type of the + // variable, just its value. However, because we don't care about the values + // of the upper bits until they are consumed, truncation can be a no-op. + if (getPromotedType(Op->getType()) != getPromotedType(Trunc->getType())) { + NewInst = CopyDebug( + new TruncInst(State.getConverted(Op), + getPromotedType(cast(Trunc->getType())), + State.getConverted(Op)->getName() + ".trunc", Trunc), + Trunc); + } else { + NewInst = State.getConverted(Op); + } + State.recordConverted(Trunc, NewInst); + } else if (LoadInst *Load = dyn_cast(Inst)) { + if (shouldConvert(Load)) { + unsigned BaseAlign = Load->getAlignment() == 0 + ? DL->getABITypeAlignment(Load->getType()) + : Load->getAlignment(); + splitLoad(DL, Load, State, BaseAlign, /*Offset=*/0); + } + } else if (StoreInst *Store = dyn_cast(Inst)) { + if (shouldConvert(Store->getValueOperand())) { + unsigned BaseAlign = + Store->getAlignment() == 0 + ? 
DL->getABITypeAlignment(Store->getValueOperand()->getType())
+              : Store->getAlignment();
+      splitStore(DL, Store, State, BaseAlign, /*Offset=*/0);
+    }
+  } else if (isa(Inst) || isa(Inst) ||
+             isa(Inst)) {
+    for (unsigned I = 0; I < Inst->getNumOperands(); I++) {
+      auto *Arg = Inst->getOperand(I);
+      if (shouldConvert(Arg))
+        Inst->setOperand(I, State.getConverted(Arg));
+    }
+    if (shouldConvert(Inst)) {
+      Inst->mutateType(getPromotedType(Inst->getType()));
+    }
+  } else if (auto *Ret = dyn_cast<ReturnInst>(Inst)) {
+    auto *NewRet = ReturnInst::Create(
+        Ret->getContext(), State.getConverted(Ret->getReturnValue()), Inst);
+    State.recordConverted(Ret, NewRet);
+  } else if (auto *Resume = dyn_cast<ResumeInst>(Inst)) {
+    auto *NewRes =
+        ResumeInst::Create(State.getConverted(Resume->getValue()), Inst);
+    State.recordConverted(Resume, NewRes);
+  } else if (BinaryOperator *Binop = dyn_cast<BinaryOperator>(Inst)) {
+    Value *NewInst = nullptr;
+    switch (Binop->getOpcode()) {
+    case Instruction::AShr: {
+      // The AShr operand needs to be sign-extended to the promoted size
+      // before shifting. Because the sign-extension is implemented with
+      // AShr, it can be combined with the original operation.
+      Value *Op = Binop->getOperand(0);
+      Value *ShiftAmount = nullptr;
+      APInt SignShiftAmt =
+          APInt(getPromotedType(Op->getType())->getIntegerBitWidth(),
+                getPromotedType(Op->getType())->getIntegerBitWidth() -
+                    Op->getType()->getIntegerBitWidth());
+      NewInst = CopyDebug(
+          BinaryOperator::Create(
+              Instruction::Shl, State.getConverted(Op),
+              ConstantInt::get(getPromotedType(Op->getType()), SignShiftAmt),
+              State.getConverted(Op)->getName() + ".getsign", Binop),
+          Binop);
+      if (ConstantInt *C =
+              dyn_cast<ConstantInt>(State.getConverted(Binop->getOperand(1)))) {
+        ShiftAmount = ConstantInt::get(getPromotedType(Op->getType()),
+                                       SignShiftAmt + C->getValue());
+      } else {
+        // Clear the upper bits of the original shift amount, and add back the
+        // amount we shifted to get the sign bit.
+        ShiftAmount = getClearConverted(Binop->getOperand(1), Binop, State);
+        ShiftAmount =
+            CopyDebug(BinaryOperator::Create(
+                          Instruction::Add, ShiftAmount,
+                          ConstantInt::get(
+                              getPromotedType(Binop->getOperand(1)->getType()),
+                              SignShiftAmt),
+                          State.getConverted(Op)->getName() + ".shamt", Binop),
+                      Binop);
+      }
+      NewInst = CopyDebug(
+          BinaryOperator::Create(Instruction::AShr, NewInst, ShiftAmount,
+                                 Binop->getName() + ".result", Binop),
+          Binop);
+      break;
+    }
+
+    case Instruction::LShr:
+    case Instruction::Shl: {
+      // For LShr, clear the upper bits of the operand before shifting them
+      // down into the valid part of the value.
+      Value *Op = Binop->getOpcode() == Instruction::LShr
+                      ? getClearConverted(Binop->getOperand(0), Binop, State)
+                      : State.getConverted(Binop->getOperand(0));
+      NewInst = BinaryOperator::Create(
+          Binop->getOpcode(), Op,
+          // Clear the upper bits of the shift amount.
+          getClearConverted(Binop->getOperand(1), Binop, State),
+          Binop->getName() + ".result", Binop);
+      break;
+    }
+    case Instruction::Add:
+    case Instruction::Sub:
+    case Instruction::Mul:
+    case Instruction::And:
+    case Instruction::Or:
+    case Instruction::Xor:
+      // These operations don't care about the state of the upper bits.
+ NewInst = CopyDebug( + BinaryOperator::Create(Binop->getOpcode(), + State.getConverted(Binop->getOperand(0)), + State.getConverted(Binop->getOperand(1)), + Binop->getName() + ".result", Binop), + Binop); + break; + case Instruction::UDiv: + case Instruction::URem: + NewInst = + CopyDebug(BinaryOperator::Create( + Binop->getOpcode(), + getClearConverted(Binop->getOperand(0), Binop, State), + getClearConverted(Binop->getOperand(1), Binop, State), + Binop->getName() + ".result", Binop), + Binop); + break; + case Instruction::SDiv: + case Instruction::SRem: + NewInst = + CopyDebug(BinaryOperator::Create( + Binop->getOpcode(), + getSignExtend(State.getConverted(Binop->getOperand(0)), + Binop->getOperand(0), Binop), + getSignExtend(State.getConverted(Binop->getOperand(1)), + Binop->getOperand(0), Binop), + Binop->getName() + ".result", Binop), + Binop); + break; + case Instruction::FAdd: + case Instruction::FSub: + case Instruction::FMul: + case Instruction::FDiv: + case Instruction::FRem: + case Instruction::BinaryOpsEnd: + // We should not see FP operators here. + errs() << *Inst << "\n"; + llvm_unreachable("Cannot handle binary operator"); + break; + } + if (isa(NewInst)) { + cast(NewInst) + ->setHasNoUnsignedWrap(Binop->hasNoUnsignedWrap()); + cast(NewInst) + ->setHasNoSignedWrap(Binop->hasNoSignedWrap()); + } + State.recordConverted(Binop, NewInst); + } else if (ICmpInst *Cmp = dyn_cast(Inst)) { + Value *Op0, *Op1; + // For signed compares, operands are sign-extended to their + // promoted type. For unsigned or equality compares, the upper bits are + // cleared. + if (Cmp->isSigned()) { + Op0 = getSignExtend(State.getConverted(Cmp->getOperand(0)), + Cmp->getOperand(0), Cmp); + Op1 = getSignExtend(State.getConverted(Cmp->getOperand(1)), + Cmp->getOperand(1), Cmp); + } else { + Op0 = getClearConverted(Cmp->getOperand(0), Cmp, State); + Op1 = getClearConverted(Cmp->getOperand(1), Cmp, State); + } + Instruction *NewInst = + CopyDebug(new ICmpInst(Cmp, Cmp->getPredicate(), Op0, Op1, ""), Cmp); + State.recordConverted(Cmp, NewInst); + } else if (SelectInst *Select = dyn_cast(Inst)) { + Instruction *NewInst = CopyDebug( + SelectInst::Create( + Select->getCondition(), State.getConverted(Select->getTrueValue()), + State.getConverted(Select->getFalseValue()), "", Select), + Select); + State.recordConverted(Select, NewInst); + } else if (PHINode *Phi = dyn_cast(Inst)) { + PHINode *NewPhi = PHINode::Create(getPromotedType(Phi->getType()), + Phi->getNumIncomingValues(), "", Phi); + CopyDebug(NewPhi, Phi); + for (unsigned I = 0, E = Phi->getNumIncomingValues(); I < E; ++I) { + NewPhi->addIncoming(State.getConverted(Phi->getIncomingValue(I)), + Phi->getIncomingBlock(I)); + } + State.recordConverted(Phi, NewPhi); + } else if (SwitchInst *Switch = dyn_cast(Inst)) { + Value *Condition = getClearConverted(Switch->getCondition(), Switch, State); + SwitchInst *NewInst = SwitchInst::Create( + Condition, Switch->getDefaultDest(), Switch->getNumCases(), Switch); + CopyDebug(NewInst, Switch); + for (SwitchInst::CaseIt I = Switch->case_begin(), E = Switch->case_end(); + I != E; ++I) { + NewInst->addCase(cast(convertConstant(I.getCaseValue(), + /*SignExt=*/false)), + I.getCaseSuccessor()); + } + Switch->eraseFromParent(); + } else { + errs() << *Inst << "\n"; + llvm_unreachable("unhandled instruction"); + } +} + +static bool processFunction(Function &F, DataLayout &DL) { + ConversionState State; + bool Modified = false; // XXX Emscripten: Fixed use of an uninitialized variable. 
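+  // Walk each instruction once; any instruction whose result or operands have
+  // an illegal integer type is rewritten via convertInstruction, and the
+  // replaced instructions are erased in one batch at the end.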
+ for (auto FI = F.begin(), FE = F.end(); FI != FE; ++FI) { + for (auto BBI = FI->begin(), BBE = FI->end(); BBI != BBE;) { + Instruction *Inst = &*BBI++; + // Only attempt to convert an instruction if its result or any of its + // operands are illegal. + bool ShouldConvert = shouldConvert(Inst); + for (auto OI = Inst->op_begin(), OE = Inst->op_end(); OI != OE; ++OI) + ShouldConvert |= shouldConvert(cast(OI)); + + if (ShouldConvert) { + convertInstruction(&DL, Inst, State); + Modified = true; + } + } + } + State.eraseReplacedInstructions(); + + if (Modified) + // Clean up bitcasts that were create with constexprs in them. + std::unique_ptr(createExpandConstantExprPass()) + ->runOnFunction(F); + return Modified; +} + +bool PromoteIntegers::ensureCompliantSignature( + LLVMContext &Ctx, Function *OldFct, Module &M) { + + auto *NewFctType = cast( + TypeMapper.getSimpleType(Ctx, OldFct->getFunctionType())); + if (NewFctType == OldFct->getFunctionType()) + return false; + + auto *NewFct = Function::Create(NewFctType, OldFct->getLinkage(), "", &M); + + NewFct->takeName(OldFct); + NewFct->copyAttributesFrom(OldFct); + for (auto UseIter = OldFct->use_begin(), E = OldFct->use_end(); + E != UseIter;) { + Use &FctUse = *(UseIter++); + // Types are not going to match after this. + FctUse.set(NewFct); + } + + if (OldFct->empty()) + return true; + + NewFct->getBasicBlockList().splice(NewFct->begin(), + OldFct->getBasicBlockList()); + IRBuilder<> Builder(&*NewFct->getEntryBlock().getFirstInsertionPt()); + + auto OldArgIter = OldFct->getArgumentList().begin(); + for (auto &NewArg : NewFct->getArgumentList()) { + Argument *OldArg = &*OldArgIter++; + + if (OldArg->getType() != NewArg.getType()) { + if (NewArg.getType()->isIntegerTy()) { + auto *Replacement = Builder.CreateTrunc(&NewArg, OldArg->getType()); + Replacement->takeName(OldArg); + NewArg.setName(Replacement->getName() + ".exp"); + OldArg->replaceAllUsesWith(Replacement); + } else { + // Blindly replace the type of the uses, this is some composite + // like a function type. + NewArg.takeName(OldArg); + for (auto UseIter = OldArg->use_begin(), E = OldArg->use_end(); + E != UseIter;) { + Use &AUse = *(UseIter++); + AUse.set(&NewArg); + } + } + } else { + NewArg.takeName(OldArg); + OldArg->replaceAllUsesWith(&NewArg); + } + } + + return true; +} + +bool PromoteIntegers::runOnModule(Module &M) { + DataLayout DL(&M); + LLVMContext &Ctx = M.getContext(); + bool Modified = false; + + // Change function signatures first. + for (auto I = M.begin(), E = M.end(); I != E;) { + Function *F = &*I++; + bool Changed = ensureCompliantSignature(Ctx, F, M); + if (Changed) + F->eraseFromParent(); + Modified |= Changed; + } + + for (auto &F : M.getFunctionList()) + Modified |= processFunction(F, DL); + + return Modified; +} + +ModulePass *llvm::createPromoteIntegersPass() { return new PromoteIntegers(); } diff --git a/lib/Target/JSBackend/NaCl/RemoveAsmMemory.cpp b/lib/Target/JSBackend/NaCl/RemoveAsmMemory.cpp new file mode 100644 index 000000000000..f06933b6dd2a --- /dev/null +++ b/lib/Target/JSBackend/NaCl/RemoveAsmMemory.cpp @@ -0,0 +1,70 @@ +//===- RemoveAsmMemory.cpp - Remove ``asm("":::"memory")`` ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass removes all instances of ``asm("":::"memory")``. 
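+//
+// For reference (illustrative, not taken from these sources), such a compiler
+// barrier typically reaches LLVM IR as an empty inline-asm call that only
+// clobbers memory, roughly:
+//
+//   call void asm sideeffect "", "~{memory}"()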
+// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/Twine.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/Pass.h" +#include + +using namespace llvm; + +namespace { +class RemoveAsmMemory : public FunctionPass { +public: + static char ID; // Pass identification, replacement for typeid + RemoveAsmMemory() : FunctionPass(ID) { + initializeRemoveAsmMemoryPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override; +}; + +class AsmDirectivesVisitor : public InstVisitor { +public: + AsmDirectivesVisitor() : ModifiedFunction(false) {} + ~AsmDirectivesVisitor() {} + bool modifiedFunction() const { return ModifiedFunction; } + + /// Only Call Instructions are ever inline assembly directives. + void visitCallInst(CallInst &CI); + +private: + bool ModifiedFunction; + + AsmDirectivesVisitor(const AsmDirectivesVisitor &) = delete; + AsmDirectivesVisitor &operator=(const AsmDirectivesVisitor &) = delete; +}; +} + +char RemoveAsmMemory::ID = 0; +INITIALIZE_PASS(RemoveAsmMemory, "remove-asm-memory", + "remove all instances of ``asm(\"\":::\"memory\")``", false, + false) + +bool RemoveAsmMemory::runOnFunction(Function &F) { + AsmDirectivesVisitor AV; + AV.visit(F); + return AV.modifiedFunction(); +} + +void AsmDirectivesVisitor::visitCallInst(CallInst &CI) { + llvm_unreachable("no longer maintained"); +} + +namespace llvm { +FunctionPass *createRemoveAsmMemoryPass() { return new RemoveAsmMemory(); } +} diff --git a/lib/Target/JSBackend/NaCl/ReplacePtrsWithInts.cpp b/lib/Target/JSBackend/NaCl/ReplacePtrsWithInts.cpp new file mode 100644 index 000000000000..86f311915b36 --- /dev/null +++ b/lib/Target/JSBackend/NaCl/ReplacePtrsWithInts.cpp @@ -0,0 +1,593 @@ +//===- ReplacePtrsWithInts.cpp - Convert pointer values to integer values--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass strips out aggregate pointer types and replaces them with +// the integer type iPTR, which is i32 for PNaCl (though this pass +// will allow iPTR to be i64 if the DataLayout specifies 64-bit +// pointers). +// +// This pass relies on -simplify-allocas to transform allocas into arrays of +// bytes. +// +// The pass converts IR to the following normal form: +// +// All inttoptr and ptrtoint instructions use the same integer size +// (iPTR), so they do not implicitly truncate or zero-extend. +// +// Pointer types only appear in the following instructions: +// * loads and stores: the pointer operand is a NormalizedPtr. +// * function calls: the function operand is a NormalizedPtr. +// * intrinsic calls: any pointer arguments are NormalizedPtrs. +// * alloca +// * bitcast and inttoptr: only used as part of a NormalizedPtr. +// * ptrtoint: the operand is an InherentPtr. +// +// Where an InherentPtr is defined as a pointer value that is: +// * an alloca; +// * a GlobalValue (a function or global variable); or +// * an intrinsic call. +// +// And a NormalizedPtr is defined as a pointer value that is: +// * an inttoptr instruction; +// * an InherentPtr; or +// * a bitcast of an InherentPtr. 
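+//
+// As an illustrative sketch (not from the original sources), after this pass a
+// load no longer sees a pointer-typed SSA value directly; the address flows
+// through iPTR and is converted back immediately before use, roughly:
+//
+//   %addr.asptr = inttoptr i32 %addr to i32*
+//   %val = load i32, i32* %addr.asptr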
+// +// This pass currently strips out lifetime markers (that is, calls to +// the llvm.lifetime.start/end intrinsics) and invariant markers +// (calls to llvm.invariant.start/end). +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/Pass.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/NaCl.h" + +using namespace llvm; + +namespace { + // This is a ModulePass because the pass must recreate functions in + // order to change their argument and return types. + struct ReplacePtrsWithInts : public ModulePass { + static char ID; // Pass identification, replacement for typeid + ReplacePtrsWithInts() : ModulePass(ID) { + initializeReplacePtrsWithIntsPass(*PassRegistry::getPassRegistry()); + } + + virtual bool runOnModule(Module &M); + }; + + // FunctionConverter stores the state for mapping old instructions + // (of pointer type) to converted instructions (of integer type) + // within a function, and provides methods for doing the conversion. + class FunctionConverter { + // Int type that pointer types are to be replaced with, typically i32. + Type *IntPtrType; + + struct RewrittenVal { + RewrittenVal(): Placeholder(NULL), NewIntVal(NULL) {} + Value *Placeholder; + Value *NewIntVal; + }; + // Maps from old values (of pointer type) to converted values (of + // IntPtrType type). + DenseMap RewriteMap; + + public: + FunctionConverter(Type *IntPtrType) : IntPtrType(IntPtrType) {} + + // Returns the normalized version of the given type, converting + // pointer types to IntPtrType. + Type *convertType(Type *Ty); + // Returns the normalized version of the given function type by + // normalizing the function's argument types. + FunctionType *convertFuncType(FunctionType *FTy); + + // Records that 'To' is the normalized version of 'From'. If 'To' + // is not of pointer type, no type conversion is required, so this + // can take the short cut of replacing 'To' with 'From'. + void recordConverted(Value *From, Value *To); + void recordConvertedAndErase(Instruction *From, Value *To); + + // Returns Val with no-op casts (those that convert between + // IntPtrType and pointer types) stripped off. + Value *stripNoopCasts(Value *Val); + + // Returns the normalized version of the given value. + // + // If the conversion of Val has been deferred, this returns a + // placeholder object, which will later be replaceAllUsesWith'd to + // the final value. Since replaceAllUsesWith does not work on + // references by metadata nodes, this can be bypassed using + // BypassPlaceholder to get the real converted value, assuming it + // is available. + Value *convert(Value *Val, bool BypassPlaceholder = false); + // Returns the NormalizedPtr form of the given pointer value. + // Inserts conversion instructions at InsertPt. + Value *convertBackToPtr(Value *Val, Instruction *InsertPt); + // Returns the NormalizedPtr form of the given function pointer. + // Inserts conversion instructions at InsertPt. + Value *convertFunctionPtr(Value *Callee, Instruction *InsertPt); + // Converts an instruction without recreating it, by wrapping its + // operands and result. 
+ void convertInPlace(Instruction *Inst); + + void eraseReplacedInstructions(); + + // List of instructions whose deletion has been deferred. + SmallVector ToErase; + }; +} + +Type *FunctionConverter::convertType(Type *Ty) { + if (Ty->isPointerTy()) + return IntPtrType; + return Ty; +} + +FunctionType *FunctionConverter::convertFuncType(FunctionType *FTy) { + SmallVector ArgTypes; + for (FunctionType::param_iterator ArgTy = FTy->param_begin(), + E = FTy->param_end(); ArgTy != E; ++ArgTy) { + ArgTypes.push_back(convertType(*ArgTy)); + } + return FunctionType::get(convertType(FTy->getReturnType()), ArgTypes, + FTy->isVarArg()); +} + +void FunctionConverter::recordConverted(Value *From, Value *To) { + if (!From->getType()->isPointerTy()) { + From->replaceAllUsesWith(To); + return; + } + RewrittenVal *RV = &RewriteMap[From]; + assert(!RV->NewIntVal); + RV->NewIntVal = To; +} + +void FunctionConverter::recordConvertedAndErase(Instruction *From, Value *To) { + recordConverted(From, To); + // There may still be references to this value, so defer deleting it. + ToErase.push_back(From); +} + +Value *FunctionConverter::stripNoopCasts(Value *Val) { + SmallPtrSet Visited; + for (;;) { + if (!Visited.insert(Val).second) { + // It is possible to get a circular reference in unreachable + // basic blocks. Handle this case for completeness. + return UndefValue::get(Val->getType()); + } + if (CastInst *Cast = dyn_cast(Val)) { + Value *Src = Cast->getOperand(0); + if ((isa(Cast) && Cast->getType()->isPointerTy()) || + (isa(Cast) && Cast->getType() == IntPtrType) || + (isa(Cast) && Src->getType() == IntPtrType)) { + Val = Src; + continue; + } + } + return Val; + } +} + +Value *FunctionConverter::convert(Value *Val, bool BypassPlaceholder) { + Val = stripNoopCasts(Val); + if (!Val->getType()->isPointerTy()) + return Val; + if (Constant *C = dyn_cast(Val)) + return ConstantExpr::getPtrToInt(C, IntPtrType); + RewrittenVal *RV = &RewriteMap[Val]; + if (BypassPlaceholder) { + assert(RV->NewIntVal); + return RV->NewIntVal; + } + if (!RV->Placeholder) + RV->Placeholder = new Argument(convertType(Val->getType())); + return RV->Placeholder; +} + +Value *FunctionConverter::convertBackToPtr(Value *Val, Instruction *InsertPt) { + Type *NewTy = + convertType(Val->getType()->getPointerElementType())->getPointerTo(); + return new IntToPtrInst(convert(Val), NewTy, "", InsertPt); +} + +Value *FunctionConverter::convertFunctionPtr(Value *Callee, + Instruction *InsertPt) { + FunctionType *FuncType = cast( + Callee->getType()->getPointerElementType()); + return new IntToPtrInst(convert(Callee), + convertFuncType(FuncType)->getPointerTo(), + "", InsertPt); +} + +static bool ShouldLeaveAlone(Value *V) { + if (Function *F = dyn_cast(V)) + return F->isIntrinsic(); + if (isa(V)) + return true; + return false; +} + +void FunctionConverter::convertInPlace(Instruction *Inst) { + // Convert operands. + for (unsigned I = 0; I < Inst->getNumOperands(); ++I) { + Value *Arg = Inst->getOperand(I); + if (Arg->getType()->isPointerTy() && !ShouldLeaveAlone(Arg)) { + Value *Conv = convert(Arg); + Inst->setOperand(I, new IntToPtrInst(Conv, Arg->getType(), "", Inst)); + } + } + // Convert result. 
+ if (Inst->getType()->isPointerTy()) { + Instruction *Cast = new PtrToIntInst( + Inst, convertType(Inst->getType()), Inst->getName() + ".asint"); + Cast->insertAfter(Inst); + recordConverted(Inst, Cast); + } +} + +void FunctionConverter::eraseReplacedInstructions() { + bool Error = false; + for (DenseMap::iterator I = RewriteMap.begin(), + E = RewriteMap.end(); I != E; ++I) { + if (I->second.Placeholder) { + if (I->second.NewIntVal) { + I->second.Placeholder->replaceAllUsesWith(I->second.NewIntVal); + } else { + errs() << "Not converted: " << *I->first << "\n"; + Error = true; + } + } + } + if (Error) + report_fatal_error("Case not handled in ReplacePtrsWithInts"); + + // Delete the placeholders in a separate pass. This means that if + // one placeholder is accidentally rewritten to another, we will get + // a useful error message rather than accessing a dangling pointer. + for (DenseMap::iterator I = RewriteMap.begin(), + E = RewriteMap.end(); I != E; ++I) { + delete I->second.Placeholder; + } + + // We must do dropAllReferences() before doing eraseFromParent(), + // otherwise we will try to erase instructions that are still + // referenced. + for (SmallVectorImpl::iterator I = ToErase.begin(), + E = ToErase.end(); + I != E; ++I) { + (*I)->dropAllReferences(); + } + for (SmallVectorImpl::iterator I = ToErase.begin(), + E = ToErase.end(); + I != E; ++I) { + (*I)->eraseFromParent(); + } +} + +// Remove attributes that only apply to pointer arguments. Returns +// the updated AttributeSet. +static AttributeSet RemovePointerAttrs(LLVMContext &Context, + AttributeSet Attrs) { + SmallVector AttrList; + for (unsigned Slot = 0; Slot < Attrs.getNumSlots(); ++Slot) { + unsigned Index = Attrs.getSlotIndex(Slot); + AttrBuilder AB; + for (AttributeSet::iterator Attr = Attrs.begin(Slot), E = Attrs.end(Slot); + Attr != E; ++Attr) { + if (!Attr->isEnumAttribute()) { + continue; + } + switch (Attr->getKindAsEnum()) { + // ByVal and StructRet should already have been removed by the + // ExpandByVal pass. + case Attribute::ByVal: + case Attribute::StructRet: + case Attribute::Nest: + Attrs.dump(); + report_fatal_error("ReplacePtrsWithInts cannot handle " + "byval, sret or nest attrs"); + break; + // Strip these attributes because they apply only to pointers. This pass + // rewrites pointer arguments, thus these parameter attributes are + // meaningless. Also, they are rejected by the PNaCl module verifier. 
+ case Attribute::NoCapture: + case Attribute::NoAlias: + case Attribute::ReadNone: + case Attribute::ReadOnly: + case Attribute::NonNull: + case Attribute::Dereferenceable: + case Attribute::DereferenceableOrNull: + break; + default: + AB.addAttribute(*Attr); + } + } + AttrList.push_back(AttributeSet::get(Context, Index, AB)); + } + return AttributeSet::get(Context, AttrList); +} + +static void ConvertInstruction(DataLayout *DL, Type *IntPtrType, + FunctionConverter *FC, Instruction *Inst) { + if (ReturnInst *Ret = dyn_cast(Inst)) { + Value *Result = Ret->getReturnValue(); + if (Result) + Result = FC->convert(Result); + CopyDebug(ReturnInst::Create(Ret->getContext(), Result, Ret), Inst); + Ret->eraseFromParent(); + } else if (PHINode *Phi = dyn_cast(Inst)) { + PHINode *Phi2 = PHINode::Create(FC->convertType(Phi->getType()), + Phi->getNumIncomingValues(), + "", Phi); + CopyDebug(Phi2, Phi); + for (unsigned I = 0; I < Phi->getNumIncomingValues(); ++I) { + Phi2->addIncoming(FC->convert(Phi->getIncomingValue(I)), + Phi->getIncomingBlock(I)); + } + Phi2->takeName(Phi); + FC->recordConvertedAndErase(Phi, Phi2); + } else if (SelectInst *Op = dyn_cast(Inst)) { + Instruction *Op2 = SelectInst::Create(Op->getCondition(), + FC->convert(Op->getTrueValue()), + FC->convert(Op->getFalseValue()), + "", Op); + CopyDebug(Op2, Op); + Op2->takeName(Op); + FC->recordConvertedAndErase(Op, Op2); + } else if (isa(Inst) || isa(Inst)) { + Value *Arg = FC->convert(Inst->getOperand(0)); + Type *ResultTy = FC->convertType(Inst->getType()); + unsigned ArgSize = Arg->getType()->getIntegerBitWidth(); + unsigned ResultSize = ResultTy->getIntegerBitWidth(); + Value *Result; + // We avoid using IRBuilder's CreateZExtOrTrunc() here because it + // constant-folds ptrtoint ConstantExprs. This leads to creating + // ptrtoints of non-IntPtrType type, which is not what we want, + // because we want truncation/extension to be done explicitly by + // separate instructions. + if (ArgSize == ResultSize) { + Result = Arg; + } else { + Instruction::CastOps CastType = + ArgSize > ResultSize ? 
Instruction::Trunc : Instruction::ZExt; + Result = CopyDebug(CastInst::Create(CastType, Arg, ResultTy, "", Inst), + Inst); + } + if (Result != Arg) + Result->takeName(Inst); + FC->recordConvertedAndErase(Inst, Result); + } else if (isa(Inst)) { + if (Inst->getType()->isPointerTy()) { + FC->ToErase.push_back(Inst); + } + } else if (ICmpInst *Cmp = dyn_cast(Inst)) { + Value *Cmp2 = CopyDebug(new ICmpInst(Inst, Cmp->getPredicate(), + FC->convert(Cmp->getOperand(0)), + FC->convert(Cmp->getOperand(1)), ""), + Inst); + Cmp2->takeName(Cmp); + Cmp->replaceAllUsesWith(Cmp2); + Cmp->eraseFromParent(); + } else if (LoadInst *Load = dyn_cast(Inst)) { + Value *Ptr = FC->convertBackToPtr(Load->getPointerOperand(), Inst); + LoadInst *Result = new LoadInst(Ptr, "", Inst); + Result->takeName(Inst); + CopyDebug(Result, Inst); + CopyLoadOrStoreAttrs(Result, Load); + FC->recordConvertedAndErase(Inst, Result); + } else if (StoreInst *Store = dyn_cast(Inst)) { + Value *Ptr = FC->convertBackToPtr(Store->getPointerOperand(), Inst); + StoreInst *Result = new StoreInst(FC->convert(Store->getValueOperand()), + Ptr, Inst); + CopyDebug(Result, Inst); + CopyLoadOrStoreAttrs(Result, Store); + Inst->eraseFromParent(); + } else if (CallInst *Call = dyn_cast(Inst)) { + if (IntrinsicInst *ICall = dyn_cast(Inst)) { + if (ICall->getIntrinsicID() == Intrinsic::lifetime_start || + ICall->getIntrinsicID() == Intrinsic::lifetime_end || + ICall->getIntrinsicID() == Intrinsic::invariant_start) { + // Remove alloca lifetime markers for now. This is because + // the GVN pass can introduce lifetime markers taking PHI + // nodes as arguments. If ReplacePtrsWithInts converts the + // PHI node to int type, we will render those lifetime markers + // ineffective. But dropping a subset of lifetime markers is + // not safe in general. So, until LLVM better defines the + // semantics of lifetime markers, we drop them all. See: + // https://code.google.com/p/nativeclient/issues/detail?id=3443 + // We do the same for invariant.start/end because they work in + // a similar way. + Inst->eraseFromParent(); + } else { + FC->convertInPlace(Inst); + } + } else if (isa(Call->getCalledValue())) { + FC->convertInPlace(Inst); + } else { + SmallVector Args; + for (unsigned I = 0; I < Call->getNumArgOperands(); ++I) + Args.push_back(FC->convert(Call->getArgOperand(I))); + CallInst *NewCall = CallInst::Create( + FC->convertFunctionPtr(Call->getCalledValue(), Call), + Args, "", Inst); + CopyDebug(NewCall, Call); + NewCall->setAttributes(RemovePointerAttrs(Call->getContext(), + Call->getAttributes())); + NewCall->setCallingConv(Call->getCallingConv()); + NewCall->setTailCall(Call->isTailCall()); + NewCall->takeName(Call); + FC->recordConvertedAndErase(Call, NewCall); + } + } else if (InvokeInst *Call = dyn_cast(Inst)) { + SmallVector Args; + for (unsigned I = 0; I < Call->getNumArgOperands(); ++I) + Args.push_back(FC->convert(Call->getArgOperand(I))); + InvokeInst *NewCall = InvokeInst::Create( + FC->convertFunctionPtr(Call->getCalledValue(), Call), + Call->getNormalDest(), + Call->getUnwindDest(), + Args, "", Inst); + CopyDebug(NewCall, Call); + NewCall->setAttributes(RemovePointerAttrs(Call->getContext(), + Call->getAttributes())); + NewCall->setCallingConv(Call->getCallingConv()); + NewCall->takeName(Call); + FC->recordConvertedAndErase(Call, NewCall); + } else if (// Handle these instructions as a convenience to allow + // the pass to be used in more situations, even though we + // don't expect them in PNaCl's stable ABI. 
+ isa(Inst) || + isa(Inst) || + isa(Inst) || + isa(Inst) || + isa(Inst) || + isa(Inst) || + // These atomics only operate on integer pointers, not + // other pointers, so we don't need to recreate the + // instruction. + isa(Inst) || + isa(Inst)) { + FC->convertInPlace(Inst); + } +} + +// Convert ptrtoint+inttoptr to a bitcast because it's shorter and +// because some intrinsics work on bitcasts but not on +// ptrtoint+inttoptr, in particular: +// * llvm.lifetime.start/end (although we strip these out) +// * llvm.eh.typeid.for +static void SimplifyCasts(Instruction *Inst, Type *IntPtrType) { + if (IntToPtrInst *Cast1 = dyn_cast(Inst)) { + if (PtrToIntInst *Cast2 = dyn_cast(Cast1->getOperand(0))) { + assert(Cast2->getType() == IntPtrType); + Value *V = Cast2->getPointerOperand(); + if (V->getType() != Cast1->getType()) + V = new BitCastInst(V, Cast1->getType(), V->getName() + ".bc", Cast1); + Cast1->replaceAllUsesWith(V); + if (Cast1->use_empty()) + Cast1->eraseFromParent(); + if (Cast2->use_empty()) + Cast2->eraseFromParent(); + } + } +} + +static void CleanUpFunction(Function *Func, Type *IntPtrType) { + // Remove the ptrtoint/bitcast ConstantExprs we introduced for + // referencing globals. + FunctionPass *Pass = createExpandConstantExprPass(); + Pass->runOnFunction(*Func); + delete Pass; + + for (Function::iterator BB = Func->begin(), E = Func->end(); + BB != E; ++BB) { + for (BasicBlock::iterator Iter = BB->begin(), E = BB->end(); + Iter != E; ) { + SimplifyCasts(&*Iter++, IntPtrType); + } + } + // Cleanup pass. + for (Function::iterator BB = Func->begin(), E = Func->end(); + BB != E; ++BB) { + for (BasicBlock::iterator Iter = BB->begin(), E = BB->end(); + Iter != E; ) { + Instruction *Inst = &*Iter++; + // Add names to inttoptrs to make the output more readable. The + // placeholder values get in the way of doing this earlier when + // the inttoptrs are created. + if (isa(Inst)) + Inst->setName(Inst->getOperand(0)->getName() + ".asptr"); + // Remove ptrtoints that were introduced for allocas but not used. + if (isa(Inst) && Inst->use_empty()) + Inst->eraseFromParent(); + } + } +} + +char ReplacePtrsWithInts::ID = 0; +INITIALIZE_PASS(ReplacePtrsWithInts, "replace-ptrs-with-ints", + "Convert pointer values to integer values", + false, false) + +bool ReplacePtrsWithInts::runOnModule(Module &M) { + DataLayout DL(&M); + Type *IntPtrType = DL.getIntPtrType(M.getContext()); + + for (Module::iterator Iter = M.begin(), E = M.end(); Iter != E; ) { + Function *OldFunc = &*Iter++; + // Intrinsics' types must be left alone. + if (OldFunc->isIntrinsic()) + continue; + + FunctionConverter FC(IntPtrType); + FunctionType *NFTy = FC.convertFuncType(OldFunc->getFunctionType()); + OldFunc->setAttributes(RemovePointerAttrs(M.getContext(), + OldFunc->getAttributes())); + Function *NewFunc = RecreateFunction(OldFunc, NFTy); + + // Move the arguments across to the new function. + for (Function::arg_iterator Arg = OldFunc->arg_begin(), + E = OldFunc->arg_end(), NewArg = NewFunc->arg_begin(); + Arg != E; ++Arg, ++NewArg) { + FC.recordConverted(&*Arg, &*NewArg); + NewArg->takeName(&*Arg); + } + + // invariant.end calls refer to invariant.start calls, so we must + // remove the former first. 
+ for (Function::iterator BB = NewFunc->begin(), E = NewFunc->end(); + BB != E; ++BB) { + for (BasicBlock::iterator Iter = BB->begin(), E = BB->end(); + Iter != E; ) { + if (IntrinsicInst *ICall = dyn_cast(Iter++)) { + if (ICall->getIntrinsicID() == Intrinsic::invariant_end) + ICall->eraseFromParent(); + } + } + } + + // Convert the function body. + for (Function::iterator BB = NewFunc->begin(), E = NewFunc->end(); + BB != E; ++BB) { + for (BasicBlock::iterator Iter = BB->begin(), E = BB->end(); + Iter != E; ) { + ConvertInstruction(&DL, IntPtrType, &FC, &*Iter++); + } + } + FC.eraseReplacedInstructions(); + + OldFunc->eraseFromParent(); + } + // Now that all functions have their normalized types, we can remove + // various casts. + for (Module::iterator Func = M.begin(), E = M.end(); Func != E; ++Func) { + CleanUpFunction(&*Func, IntPtrType); + // Delete the now-unused bitcast ConstantExprs that we created so + // that they don't interfere with StripDeadPrototypes. + Func->removeDeadConstantUsers(); + } + return true; +} + +ModulePass *llvm::createReplacePtrsWithIntsPass() { + return new ReplacePtrsWithInts(); +} diff --git a/lib/Target/JSBackend/NaCl/ResolvePNaClIntrinsics.cpp b/lib/Target/JSBackend/NaCl/ResolvePNaClIntrinsics.cpp new file mode 100644 index 000000000000..616866782014 --- /dev/null +++ b/lib/Target/JSBackend/NaCl/ResolvePNaClIntrinsics.cpp @@ -0,0 +1,489 @@ +//===- ResolvePNaClIntrinsics.cpp - Resolve calls to PNaCl intrinsics ----====// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass resolves calls to PNaCl stable bitcode intrinsics. It is +// normally run in the PNaCl translator. +// +// Running AddPNaClExternalDeclsPass is a precondition for running this +// pass. They are separate because one is a ModulePass and the other is +// a FunctionPass. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Triple.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/NaClAtomicIntrinsics.h" +#include "llvm/IR/Value.h" +#include "llvm/Pass.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/NaCl.h" +#include "llvm/Transforms/Utils/Local.h" +#if defined(PNACL_BROWSER_TRANSLATOR) +#include "native_client/src/untrusted/nacl/pnacl.h" +#endif + +using namespace llvm; + +namespace { +class ResolvePNaClIntrinsics : public FunctionPass { +public: + ResolvePNaClIntrinsics() : FunctionPass(ID) { + initializeResolvePNaClIntrinsicsPass(*PassRegistry::getPassRegistry()); + } + + static char ID; + bool runOnFunction(Function &F) override; + + /// Interface specifying how intrinsic calls should be resolved. Each + /// intrinsic call handled by the implementor will be visited by the + /// doResolve method. + class CallResolver { + public: + /// Called once per \p Call to the intrinsic in the module. + /// Returns true if the Function was changed. + bool resolve(IntrinsicInst *Call) { + // To be a well-behaving FunctionPass, don't touch uses in other + // functions. 
These will be handled when the pass manager gets to + // those functions. + if (Call->getParent()->getParent() == &F) + return doResolve(Call); + return false; + } + Function *getDeclaration() const { return doGetDeclaration(); } + std::string getName() { return Intrinsic::getName(IntrinsicID); } + + protected: + Function &F; + Module *M; + Intrinsic::ID IntrinsicID; + + CallResolver(Function &F, Intrinsic::ID IntrinsicID) + : F(F), M(F.getParent()), IntrinsicID(IntrinsicID) {} + virtual ~CallResolver() {} + + /// The following pure virtual methods must be defined by + /// implementors, and will be called once per intrinsic call. + /// NOTE: doGetDeclaration() should only "get" the intrinsic declaration + /// and not *add* decls to the module. Declarations should be added + /// up front by the AddPNaClExternalDecls module pass. + virtual Function *doGetDeclaration() const = 0; + /// Returns true if the Function was changed. + virtual bool doResolve(IntrinsicInst *Call) = 0; + + private: + CallResolver(const CallResolver &) = delete; + CallResolver &operator=(const CallResolver &) = delete; + }; + +private: + /// Visit all calls matching the \p Resolver's declaration, and invoke + /// the CallResolver methods on each of them. + bool visitCalls(CallResolver &Resolver); +}; + +/// Rewrite intrinsic calls to another function. +class IntrinsicCallToFunctionCall : + public ResolvePNaClIntrinsics::CallResolver { +public: + IntrinsicCallToFunctionCall(Function &F, Intrinsic::ID IntrinsicID, + const char *TargetFunctionName) + : CallResolver(F, IntrinsicID), + TargetFunction(M->getFunction(TargetFunctionName)) { + // Expect to find the target function for this intrinsic already + // declared, even if it is never used. + if (!TargetFunction) + report_fatal_error(std::string( + "Expected to find external declaration of ") + TargetFunctionName); + } + ~IntrinsicCallToFunctionCall() override {} + +private: + Function *TargetFunction; + + Function *doGetDeclaration() const override { + return Intrinsic::getDeclaration(M, IntrinsicID); + } + + bool doResolve(IntrinsicInst *Call) override { + Call->setCalledFunction(TargetFunction); + if (IntrinsicID == Intrinsic::nacl_setjmp) { + // The "returns_twice" attribute is required for correctness, + // otherwise the backend will reuse stack slots in a way that is + // incorrect for setjmp(). See: + // https://code.google.com/p/nativeclient/issues/detail?id=3733 + Call->setCanReturnTwice(); + } + return true; + } + + IntrinsicCallToFunctionCall(const IntrinsicCallToFunctionCall &) = delete; + IntrinsicCallToFunctionCall & + operator=(const IntrinsicCallToFunctionCall &) = delete; +}; + +/// Rewrite intrinsic calls to a constant whose value is determined by a +/// functor. This functor is called once per Call, and returns a +/// Constant that should replace the Call. 
+template +class ConstantCallResolver : public ResolvePNaClIntrinsics::CallResolver { +public: + ConstantCallResolver(Function &F, Intrinsic::ID IntrinsicID, + Callable Functor) + : CallResolver(F, IntrinsicID), Functor(Functor) {} + ~ConstantCallResolver() override {} + +private: + Callable Functor; + + Function *doGetDeclaration() const override { + return Intrinsic::getDeclaration(M, IntrinsicID); + } + + bool doResolve(IntrinsicInst *Call) override { + Constant *C = Functor(Call); + Call->replaceAllUsesWith(C); + Call->eraseFromParent(); + return true; + } + + ConstantCallResolver(const ConstantCallResolver &) = delete; + ConstantCallResolver &operator=(const ConstantCallResolver &) = delete; +}; + +/// Resolve __nacl_atomic_is_lock_free to true/false at translation +/// time. PNaCl's currently supported platforms all support lock-free atomics at +/// byte sizes {1,2,4,8} except for MIPS and asmjs architectures that supports +/// lock-free atomics at byte sizes {1,2,4}, and the alignment of the pointer is +/// always expected to be natural (as guaranteed by C11 and C++11). PNaCl's +/// Module-level ABI verification checks that the byte size is constant and in +/// {1,2,4,8}. +struct IsLockFreeToConstant { + Constant *operator()(CallInst *Call) { + uint64_t MaxLockFreeByteSize = 8; + const APInt &ByteSize = + cast(Call->getOperand(0))->getUniqueInteger(); + +# if defined(PNACL_BROWSER_TRANSLATOR) + switch (__builtin_nacl_target_arch()) { + case PnaclTargetArchitectureX86_32: + case PnaclTargetArchitectureX86_64: + case PnaclTargetArchitectureARM_32: + break; + case PnaclTargetArchitectureMips_32: + MaxLockFreeByteSize = 4; + break; + default: + errs() << "Architecture: " << Triple::getArchTypeName(Arch) << "\n"; + report_fatal_error("is_lock_free: unhandled architecture"); + } +# else + switch (Arch) { + case Triple::x86: + case Triple::x86_64: + case Triple::arm: + break; + case Triple::mipsel: + case Triple::asmjs: + MaxLockFreeByteSize = 4; + break; + default: + errs() << "Architecture: " << Triple::getArchTypeName(Arch) << "\n"; + report_fatal_error("is_lock_free: unhandled architecture"); + } +# endif + + bool IsLockFree = ByteSize.ule(MaxLockFreeByteSize); + auto *C = ConstantInt::get(Call->getType(), IsLockFree); + return C; + } + + Triple::ArchType Arch; + IsLockFreeToConstant(Module *M) + : Arch(Triple(M->getTargetTriple()).getArch()) {} + IsLockFreeToConstant() = delete; +}; + +/// Rewrite atomic intrinsics to LLVM IR instructions. +class AtomicCallResolver : public ResolvePNaClIntrinsics::CallResolver { +public: + AtomicCallResolver(Function &F, + const NaCl::AtomicIntrinsics::AtomicIntrinsic *I) + : CallResolver(F, I->ID), I(I) {} + ~AtomicCallResolver() override {} + +private: + const NaCl::AtomicIntrinsics::AtomicIntrinsic *I; + + Function *doGetDeclaration() const override { return I->getDeclaration(M); } + + bool doResolve(IntrinsicInst *Call) override { + // Assume the @llvm.nacl.atomic.* intrinsics follow the PNaCl ABI: + // this should have been checked by the verifier. 
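+    // Each @llvm.nacl.atomic.* call is lowered to the corresponding native IR
+    // instruction (atomic load/store, atomicrmw, cmpxchg or fence); the
+    // memory-order and RMW-operation operands are integer constants decoded by
+    // thawMemoryOrder / thawRMWOperation below.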
+ bool isVolatile = false; + SynchronizationScope SS = CrossThread; + Instruction *I; + SmallVector MaybeDead; + + switch (Call->getIntrinsicID()) { + default: + llvm_unreachable("unknown atomic intrinsic"); + case Intrinsic::nacl_atomic_load: + I = new LoadInst(Call->getArgOperand(0), "", isVolatile, + alignmentFromPointer(Call->getArgOperand(0)), + thawMemoryOrder(Call->getArgOperand(1)), SS, Call); + break; + case Intrinsic::nacl_atomic_store: + I = new StoreInst(Call->getArgOperand(0), Call->getArgOperand(1), + isVolatile, + alignmentFromPointer(Call->getArgOperand(1)), + thawMemoryOrder(Call->getArgOperand(2)), SS, Call); + break; + case Intrinsic::nacl_atomic_rmw: + I = new AtomicRMWInst(thawRMWOperation(Call->getArgOperand(0)), + Call->getArgOperand(1), Call->getArgOperand(2), + thawMemoryOrder(Call->getArgOperand(3)), SS, Call); + break; + case Intrinsic::nacl_atomic_cmpxchg: + I = new AtomicCmpXchgInst( + Call->getArgOperand(0), Call->getArgOperand(1), + Call->getArgOperand(2), thawMemoryOrder(Call->getArgOperand(3)), + thawMemoryOrder(Call->getArgOperand(4)), SS, Call); + + // cmpxchg returns struct { T loaded, i1 success } whereas the PNaCl + // intrinsic only returns the loaded value. The Call can't simply be + // replaced. Identify loaded+success structs that can be replaced by the + // cmxpchg's returned struct. + { + Instruction *Loaded = nullptr; + Instruction *Success = nullptr; + for (User *CallUser : Call->users()) { + if (auto ICmp = dyn_cast(CallUser)) { + // Identify comparisons for cmpxchg's success. + if (ICmp->getPredicate() != CmpInst::ICMP_EQ) + continue; + Value *LHS = ICmp->getOperand(0); + Value *RHS = ICmp->getOperand(1); + Value *Old = I->getOperand(1); + if (RHS != Old && LHS != Old) // Call is either RHS or LHS. + continue; // The comparison isn't checking for cmpxchg's success. + + // Recognize the pattern creating struct { T loaded, i1 success }: + // it can be replaced by cmpxchg's result. + for (User *InsUser : ICmp->users()) { + if (!isa(InsUser) || + cast(InsUser)->getParent() != Call->getParent()) + continue; // Different basic blocks, don't be clever. + auto Ins = dyn_cast(InsUser); + if (!Ins) + continue; + auto InsTy = dyn_cast(Ins->getType()); + if (!InsTy) + continue; + if (!InsTy->isLayoutIdentical(cast(I->getType()))) + continue; // Not a struct { T loaded, i1 success }. + if (Ins->getNumIndices() != 1 || Ins->getIndices()[0] != 1) + continue; // Not an insert { T, i1 } %something, %success, 1. + auto TIns = dyn_cast(Ins->getAggregateOperand()); + if (!TIns) + continue; // T wasn't inserted into the struct, don't be clever. + if (!isa(TIns->getAggregateOperand())) + continue; // Not an insert into an undef value, don't be clever. + if (TIns->getInsertedValueOperand() != Call) + continue; // Not inserting the loaded value. + if (TIns->getNumIndices() != 1 || TIns->getIndices()[0] != 0) + continue; // Not an insert { T, i1 } undef, %loaded, 0. + // Hooray! This is the struct you're looking for. + + // Keep track of values extracted from the struct, instead of + // recreating them. 
+ for (User *StructUser : Ins->users()) { + if (auto Extract = dyn_cast(StructUser)) { + MaybeDead.push_back(Extract); + if (!Loaded && Extract->getIndices()[0] == 0) { + Loaded = cast(StructUser); + Loaded->moveBefore(Call); + } else if (!Success && Extract->getIndices()[0] == 1) { + Success = cast(StructUser); + Success->moveBefore(Call); + } + } + } + + MaybeDead.push_back(Ins); + MaybeDead.push_back(TIns); + Ins->replaceAllUsesWith(I); + } + + MaybeDead.push_back(ICmp); + if (!Success) + Success = ExtractValueInst::Create(I, 1, "success", Call); + ICmp->replaceAllUsesWith(Success); + } + } + + // Clean up remaining uses of the loaded value, if any. Later code will + // try to replace Call with I, make sure the types match. + if (Call->hasNUsesOrMore(1)) { + if (!Loaded) + Loaded = ExtractValueInst::Create(I, 0, "loaded", Call); + I = Loaded; + } else { + I = nullptr; + } + + if (Loaded) + MaybeDead.push_back(Loaded); + if (Success) + MaybeDead.push_back(Success); + } + break; + case Intrinsic::nacl_atomic_fence: + I = new FenceInst(M->getContext(), + thawMemoryOrder(Call->getArgOperand(0)), SS, Call); + break; + case Intrinsic::nacl_atomic_fence_all: { + FunctionType *FTy = + FunctionType::get(Type::getVoidTy(M->getContext()), false); + std::string AsmString; // Empty. + std::string Constraints("~{memory}"); + bool HasSideEffect = true; + CallInst *Asm = CallInst::Create( + InlineAsm::get(FTy, AsmString, Constraints, HasSideEffect), "", Call); + Asm->setDebugLoc(Call->getDebugLoc()); + I = new FenceInst(M->getContext(), AtomicOrdering::SequentiallyConsistent, SS, Asm); + Asm = CallInst::Create( + InlineAsm::get(FTy, AsmString, Constraints, HasSideEffect), "", I); + Asm->setDebugLoc(Call->getDebugLoc()); + } break; + } + + if (I) { + I->setName(Call->getName()); + I->setDebugLoc(Call->getDebugLoc()); + Call->replaceAllUsesWith(I); + } + Call->eraseFromParent(); + + // Remove dead code. + for (Instruction *Kill : MaybeDead) + if (isInstructionTriviallyDead(Kill)) + Kill->eraseFromParent(); + + return true; + } + + unsigned alignmentFromPointer(const Value *Ptr) const { + auto *PtrType = cast(Ptr->getType()); + unsigned BitWidth = PtrType->getElementType()->getIntegerBitWidth(); + return BitWidth / 8; + } + + AtomicOrdering thawMemoryOrder(const Value *MemoryOrder) const { + auto MO = static_cast( + cast(MemoryOrder)->getUniqueInteger().getLimitedValue()); + switch (MO) { + // Only valid values should pass validation. + default: llvm_unreachable("unknown memory order"); + case NaCl::MemoryOrderRelaxed: return AtomicOrdering::Monotonic; + // TODO Consume is unspecified by LLVM's internal IR. + case NaCl::MemoryOrderConsume: return AtomicOrdering::SequentiallyConsistent; + case NaCl::MemoryOrderAcquire: return AtomicOrdering::Acquire; + case NaCl::MemoryOrderRelease: return AtomicOrdering::Release; + case NaCl::MemoryOrderAcquireRelease: return AtomicOrdering::AcquireRelease; + case NaCl::MemoryOrderSequentiallyConsistent: return AtomicOrdering::SequentiallyConsistent; + } + } + + AtomicRMWInst::BinOp thawRMWOperation(const Value *Operation) const { + auto Op = static_cast( + cast(Operation)->getUniqueInteger().getLimitedValue()); + switch (Op) { + // Only valid values should pass validation. 
+ default: llvm_unreachable("unknown atomic RMW operation"); + case NaCl::AtomicAdd: return AtomicRMWInst::Add; + case NaCl::AtomicSub: return AtomicRMWInst::Sub; + case NaCl::AtomicOr: return AtomicRMWInst::Or; + case NaCl::AtomicAnd: return AtomicRMWInst::And; + case NaCl::AtomicXor: return AtomicRMWInst::Xor; + case NaCl::AtomicExchange: return AtomicRMWInst::Xchg; + } + } + + AtomicCallResolver(const AtomicCallResolver &); + AtomicCallResolver &operator=(const AtomicCallResolver &); +}; +} + +bool ResolvePNaClIntrinsics::visitCalls( + ResolvePNaClIntrinsics::CallResolver &Resolver) { + bool Changed = false; + Function *IntrinsicFunction = Resolver.getDeclaration(); + if (!IntrinsicFunction) + return false; + + SmallVector Calls; + for (User *U : IntrinsicFunction->users()) { + // At this point, the only uses of the intrinsic can be calls, since we + // assume this pass runs on bitcode that passed ABI verification. + auto *Call = dyn_cast(U); + if (!Call) + report_fatal_error("Expected use of intrinsic to be a call: " + + Resolver.getName()); + Calls.push_back(Call); + } + + for (IntrinsicInst *Call : Calls) + Changed |= Resolver.resolve(Call); + + return Changed; +} + +bool ResolvePNaClIntrinsics::runOnFunction(Function &F) { + Module *M = F.getParent(); + LLVMContext &C = M->getContext(); + bool Changed = false; + + IntrinsicCallToFunctionCall SetJmpResolver(F, Intrinsic::nacl_setjmp, + "setjmp"); + IntrinsicCallToFunctionCall LongJmpResolver(F, Intrinsic::nacl_longjmp, + "longjmp"); + Changed |= visitCalls(SetJmpResolver); + Changed |= visitCalls(LongJmpResolver); + + NaCl::AtomicIntrinsics AI(C); + NaCl::AtomicIntrinsics::View V = AI.allIntrinsicsAndOverloads(); + for (auto I = V.begin(), E = V.end(); I != E; ++I) { + AtomicCallResolver AtomicResolver(F, I); + Changed |= visitCalls(AtomicResolver); + } + + ConstantCallResolver IsLockFreeResolver( + F, Intrinsic::nacl_atomic_is_lock_free, IsLockFreeToConstant(M)); + Changed |= visitCalls(IsLockFreeResolver); + + return Changed; +} + +char ResolvePNaClIntrinsics::ID = 0; +INITIALIZE_PASS(ResolvePNaClIntrinsics, "resolve-pnacl-intrinsics", + "Resolve PNaCl intrinsic calls", false, false) + +FunctionPass *llvm::createResolvePNaClIntrinsicsPass() { + return new ResolvePNaClIntrinsics(); +} diff --git a/lib/Target/JSBackend/NaCl/RewriteAtomics.cpp b/lib/Target/JSBackend/NaCl/RewriteAtomics.cpp new file mode 100644 index 000000000000..c7f17a4f72cb --- /dev/null +++ b/lib/Target/JSBackend/NaCl/RewriteAtomics.cpp @@ -0,0 +1,411 @@ +//===- RewriteAtomics.cpp - Stabilize instructions used for concurrency ---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass encodes atomics, volatiles and fences using NaCl intrinsics +// instead of LLVM's regular IR instructions. +// +// All of the above are transformed into one of the +// @llvm.nacl.atomic.* intrinsics. 
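+//
+// For example (illustrative; the overload suffix and the memory-order
+// encoding depend on the operand type and ordering involved):
+//
+//   %old = atomicrmw add i32* %p, i32 1 seq_cst
+// becomes
+//   %old = call i32 @llvm.nacl.atomic.rmw.i32(i32 <add>, i32* %p, i32 1,
+//                                             i32 <seq_cst>)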
+// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/Twine.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/NaClAtomicIntrinsics.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/NaCl.h" +#include +#include + +using namespace llvm; + +static cl::opt PNaClMemoryOrderSeqCstOnly( + "pnacl-memory-order-seq-cst-only", + cl::desc("PNaCl should upgrade all atomic memory orders to seq_cst"), + cl::init(false)); + +namespace { + +class RewriteAtomics : public ModulePass { +public: + static char ID; // Pass identification, replacement for typeid + RewriteAtomics() : ModulePass(ID) { + // This is a module pass because it may have to introduce + // intrinsic declarations into the module and modify a global function. + initializeRewriteAtomicsPass(*PassRegistry::getPassRegistry()); + } + + virtual bool runOnModule(Module &M); +}; + +template std::string ToStr(const T &V) { + std::string S; + raw_string_ostream OS(S); + OS << const_cast(V); + return OS.str(); +} + +class AtomicVisitor : public InstVisitor { +public: + AtomicVisitor(Module &M, Pass &P) + : M(M), C(M.getContext()), + TD(M.getDataLayout()), AI(C), + ModifiedModule(false) {} + ~AtomicVisitor() {} + bool modifiedModule() const { return ModifiedModule; } + + void visitLoadInst(LoadInst &I); + void visitStoreInst(StoreInst &I); + void visitAtomicCmpXchgInst(AtomicCmpXchgInst &I); + void visitAtomicRMWInst(AtomicRMWInst &I); + void visitFenceInst(FenceInst &I); + +private: + Module &M; + LLVMContext &C; + const DataLayout TD; + NaCl::AtomicIntrinsics AI; + bool ModifiedModule; + + AtomicVisitor() = delete; + AtomicVisitor(const AtomicVisitor &) = delete; + AtomicVisitor &operator=(const AtomicVisitor &) = delete; + + /// Create an integer constant holding a NaCl::MemoryOrder that can be + /// passed as an argument to one of the @llvm.nacl.atomic.* + /// intrinsics. This function may strengthen the ordering initially + /// specified by the instruction \p I for stability purpose. + template + ConstantInt *freezeMemoryOrder(const Instruction &I, AtomicOrdering O) const; + std::pair + freezeMemoryOrder(const AtomicCmpXchgInst &I, AtomicOrdering S, + AtomicOrdering F) const; + + /// Sanity-check that instruction \p I which has pointer and value + /// parameters have matching sizes \p BitSize for the type-pointed-to + /// and the value's type \p T. + void checkSizeMatchesType(const Instruction &I, unsigned BitSize, + const Type *T) const; + + /// Verify that loads and stores are at least naturally aligned. Use + /// byte alignment because converting to bits could truncate the + /// value. + void checkAlignment(const Instruction &I, unsigned ByteAlignment, + unsigned ByteSize) const; + + /// Create a cast before Instruction \p I from \p Src to \p Dst with \p Name. + CastInst *createCast(Instruction &I, Value *Src, Type *Dst, Twine Name) const; + + /// Try to find the atomic intrinsic of with its \p ID and \OverloadedType. + /// Report fatal error on failure. 
+ const NaCl::AtomicIntrinsics::AtomicIntrinsic * + findAtomicIntrinsic(const Instruction &I, Intrinsic::ID ID, + Type *OverloadedType) const; + + /// Helper function which rewrites a single instruction \p I to a + /// particular \p intrinsic with overloaded type \p OverloadedType, + /// and argument list \p Args. Will perform a bitcast to the proper \p + /// DstType, if different from \p OverloadedType. + void replaceInstructionWithIntrinsicCall( + Instruction &I, const NaCl::AtomicIntrinsics::AtomicIntrinsic *Intrinsic, + Type *DstType, Type *OverloadedType, ArrayRef Args); + + /// Most atomics instructions deal with at least one pointer, this + /// struct automates some of this and has generic sanity checks. + template struct PointerHelper { + Value *P; + Type *OriginalPET; + Type *PET; + unsigned BitSize; + PointerHelper(const AtomicVisitor &AV, Instruction &I) + : P(I.getPointerOperand()) { + if (I.getPointerAddressSpace() != 0) + report_fatal_error("unhandled pointer address space " + + Twine(I.getPointerAddressSpace()) + " for atomic: " + + ToStr(I)); + assert(P->getType()->isPointerTy() && "expected a pointer"); + PET = OriginalPET = P->getType()->getPointerElementType(); + BitSize = AV.TD.getTypeSizeInBits(OriginalPET); + if (!OriginalPET->isIntegerTy()) { + // The pointer wasn't to an integer type. We define atomics in + // terms of integers, so bitcast the pointer to an integer of + // the proper width. + Type *IntNPtr = Type::getIntNPtrTy(AV.C, BitSize); + P = AV.createCast(I, P, IntNPtr, P->getName() + ".cast"); + PET = P->getType()->getPointerElementType(); + } + AV.checkSizeMatchesType(I, BitSize, PET); + } + }; +}; +} + +char RewriteAtomics::ID = 0; +INITIALIZE_PASS(RewriteAtomics, "nacl-rewrite-atomics", + "rewrite atomics, volatiles and fences into stable " + "@llvm.nacl.atomics.* intrinsics", + false, false) + +bool RewriteAtomics::runOnModule(Module &M) { + AtomicVisitor AV(M, *this); + AV.visit(M); + return AV.modifiedModule(); +} + +template +ConstantInt *AtomicVisitor::freezeMemoryOrder(const Instruction &I, + AtomicOrdering O) const { + NaCl::MemoryOrder AO = NaCl::MemoryOrderInvalid; + + // TODO Volatile load/store are promoted to sequentially consistent + // for now. We could do something weaker. + if (const LoadInst *L = dyn_cast(&I)) { + if (L->isVolatile()) + AO = NaCl::MemoryOrderSequentiallyConsistent; + } else if (const StoreInst *S = dyn_cast(&I)) { + if (S->isVolatile()) + AO = NaCl::MemoryOrderSequentiallyConsistent; + } + + if (AO == NaCl::MemoryOrderInvalid) { + switch (O) { + case AtomicOrdering::NotAtomic: llvm_unreachable("unexpected memory order"); + // Monotonic is a strict superset of Unordered. Both can therefore + // map to Relaxed ordering, which is in the C11/C++11 standard. + case AtomicOrdering::Unordered: AO = NaCl::MemoryOrderRelaxed; break; + case AtomicOrdering::Monotonic: AO = NaCl::MemoryOrderRelaxed; break; + // TODO Consume is currently unspecified by LLVM's internal IR. + case AtomicOrdering::Acquire: AO = NaCl::MemoryOrderAcquire; break; + case AtomicOrdering::Release: AO = NaCl::MemoryOrderRelease; break; + case AtomicOrdering::AcquireRelease: AO = NaCl::MemoryOrderAcquireRelease; break; + case AtomicOrdering::SequentiallyConsistent: + AO = NaCl::MemoryOrderSequentiallyConsistent; break; + } + } + + // TODO For now only acquire/release/acq_rel/seq_cst are allowed. 
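+  // Net effect, as a rough sketch: unordered/monotonic have already been
+  // mapped to relaxed above and are strengthened to seq_cst here; acquire,
+  // release, acq_rel and seq_cst pass through unchanged unless
+  // -pnacl-memory-order-seq-cst-only is set, in which case every order
+  // becomes seq_cst.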
+  if (PNaClMemoryOrderSeqCstOnly || AO == NaCl::MemoryOrderRelaxed)
+    AO = NaCl::MemoryOrderSequentiallyConsistent;
+
+  return ConstantInt::get(Type::getInt32Ty(C), AO);
+}
+
+std::pair<ConstantInt *, ConstantInt *>
+AtomicVisitor::freezeMemoryOrder(const AtomicCmpXchgInst &I, AtomicOrdering S,
+                                 AtomicOrdering F) const {
+  if (S == AtomicOrdering::Release ||
+      (S == AtomicOrdering::AcquireRelease && F != AtomicOrdering::Acquire))
+    // According to C++11's [atomics.types.operations.req], a cmpxchg with
+    // release success memory ordering must have relaxed failure memory
+    // ordering, which PNaCl currently disallows. The next-strongest ordering,
+    // acq_rel, is also an invalid failure ordering, so we have to strengthen
+    // the success ordering to seq_cst, which can then also fail as seq_cst.
+    S = F = AtomicOrdering::SequentiallyConsistent;
+  if (F == AtomicOrdering::Unordered || F == AtomicOrdering::Monotonic)
+    // Both are treated as relaxed.
+    F = AtomicCmpXchgInst::getStrongestFailureOrdering(S);
+  return std::make_pair(freezeMemoryOrder(I, S), freezeMemoryOrder(I, F));
+}
+
+void AtomicVisitor::checkSizeMatchesType(const Instruction &I, unsigned BitSize,
+                                         const Type *T) const {
+  Type *IntType = Type::getIntNTy(C, BitSize);
+  if (IntType && T == IntType)
+    return;
+  report_fatal_error("unsupported atomic type " + ToStr(*T) + " of size " +
+                     Twine(BitSize) + " bits in: " + ToStr(I));
+}
+
+void AtomicVisitor::checkAlignment(const Instruction &I, unsigned ByteAlignment,
+                                   unsigned ByteSize) const {
+  if (ByteAlignment < ByteSize)
+    report_fatal_error("atomic load/store must be at least naturally aligned, "
+                       "got " + Twine(ByteAlignment) +
+                       " bytes, expected at least " + Twine(ByteSize) +
+                       " bytes, in: " + ToStr(I));
+}
+
+CastInst *AtomicVisitor::createCast(Instruction &I, Value *Src, Type *Dst,
+                                    Twine Name) const {
+  Type *SrcT = Src->getType();
+  Instruction::CastOps Op = SrcT->isIntegerTy() && Dst->isPointerTy()
+                                ? Instruction::IntToPtr
+                                : SrcT->isPointerTy() && Dst->isIntegerTy()
+                                      ? 
Instruction::PtrToInt + : Instruction::BitCast; + if (!CastInst::castIsValid(Op, Src, Dst)) + report_fatal_error("cannot emit atomic instruction while converting type " + + ToStr(*SrcT) + " to " + ToStr(*Dst) + " for " + Name + + " in " + ToStr(I)); + return CastInst::Create(Op, Src, Dst, Name, &I); +} + +const NaCl::AtomicIntrinsics::AtomicIntrinsic * +AtomicVisitor::findAtomicIntrinsic(const Instruction &I, Intrinsic::ID ID, + Type *OverloadedType) const { + if (const NaCl::AtomicIntrinsics::AtomicIntrinsic *Intrinsic = + AI.find(ID, OverloadedType)) + return Intrinsic; + report_fatal_error("unsupported atomic instruction: " + ToStr(I)); +} + +void AtomicVisitor::replaceInstructionWithIntrinsicCall( + Instruction &I, const NaCl::AtomicIntrinsics::AtomicIntrinsic *Intrinsic, + Type *DstType, Type *OverloadedType, ArrayRef Args) { + std::string Name(I.getName()); + Function *F = Intrinsic->getDeclaration(&M); + CallInst *Call = CallInst::Create(F, Args, "", &I); + Call->setDebugLoc(I.getDebugLoc()); + Instruction *Res = Call; + + assert((I.getType()->isStructTy() == isa(&I)) && + "cmpxchg returns a struct, and other instructions don't"); + if (auto S = dyn_cast(I.getType())) { + assert(S->getNumElements() == 2 && + "cmpxchg returns a struct with two elements"); + assert(S->getElementType(0) == DstType && + "cmpxchg struct's first member should be the value type"); + assert(S->getElementType(1) == Type::getInt1Ty(C) && + "cmpxchg struct's second member should be the success flag"); + // Recreate struct { T value, i1 success } after the call. + auto Success = CmpInst::Create( + Instruction::ICmp, CmpInst::ICMP_EQ, Res, + cast(&I)->getCompareOperand(), "success", &I); + Res = InsertValueInst::Create( + InsertValueInst::Create(UndefValue::get(S), Res, 0, + Name + ".insert.value", &I), + Success, 1, Name + ".insert.success", &I); + } else if (!Call->getType()->isVoidTy() && DstType != OverloadedType) { + // The call returns a value which needs to be cast to a non-integer. + Res = createCast(I, Call, DstType, Name + ".cast"); + Res->setDebugLoc(I.getDebugLoc()); + } + + I.replaceAllUsesWith(Res); + I.eraseFromParent(); + Call->setName(Name); + ModifiedModule = true; +} + +/// %res = load {atomic|volatile} T* %ptr memory_order, align sizeof(T) +/// becomes: +/// %res = call T @llvm.nacl.atomic.load.i(%ptr, memory_order) +void AtomicVisitor::visitLoadInst(LoadInst &I) { + return; // XXX EMSCRIPTEN + if (I.isSimple()) + return; + PointerHelper PH(*this, I); + const NaCl::AtomicIntrinsics::AtomicIntrinsic *Intrinsic = + findAtomicIntrinsic(I, Intrinsic::nacl_atomic_load, PH.PET); + checkAlignment(I, I.getAlignment(), PH.BitSize / CHAR_BIT); + Value *Args[] = {PH.P, freezeMemoryOrder(I, I.getOrdering())}; + replaceInstructionWithIntrinsicCall(I, Intrinsic, PH.OriginalPET, PH.PET, + Args); +} + +/// store {atomic|volatile} T %val, T* %ptr memory_order, align sizeof(T) +/// becomes: +/// call void @llvm.nacl.atomic.store.i(%val, %ptr, memory_order) +void AtomicVisitor::visitStoreInst(StoreInst &I) { + return; // XXX EMSCRIPTEN + if (I.isSimple()) + return; + PointerHelper PH(*this, I); + const NaCl::AtomicIntrinsics::AtomicIntrinsic *Intrinsic = + findAtomicIntrinsic(I, Intrinsic::nacl_atomic_store, PH.PET); + checkAlignment(I, I.getAlignment(), PH.BitSize / CHAR_BIT); + Value *V = I.getValueOperand(); + if (!V->getType()->isIntegerTy()) { + // The store isn't of an integer type. We define atomics in terms of + // integers, so bitcast the value to store to an integer of the + // proper width. 
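+    // For instance (illustrative), an atomic store of a float would be
+    // rewritten roughly as
+    //   %v.cast = bitcast float %v to i32
+    //   call void @llvm.nacl.atomic.store.i32(i32 %v.cast, i32* %p.cast, ...)
+    // so the intrinsic only ever sees integer operands.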
+ CastInst *Cast = createCast(I, V, Type::getIntNTy(C, PH.BitSize), + V->getName() + ".cast"); + Cast->setDebugLoc(I.getDebugLoc()); + V = Cast; + } + checkSizeMatchesType(I, PH.BitSize, V->getType()); + Value *Args[] = {V, PH.P, freezeMemoryOrder(I, I.getOrdering())}; + replaceInstructionWithIntrinsicCall(I, Intrinsic, PH.OriginalPET, PH.PET, + Args); +} + +/// %res = atomicrmw OP T* %ptr, T %val memory_order +/// becomes: +/// %res = call T @llvm.nacl.atomic.rmw.i(OP, %ptr, %val, memory_order) +void AtomicVisitor::visitAtomicRMWInst(AtomicRMWInst &I) { + return; // XXX EMSCRIPTEN + NaCl::AtomicRMWOperation Op; + switch (I.getOperation()) { + default: report_fatal_error("unsupported atomicrmw operation: " + ToStr(I)); + case AtomicRMWInst::Add: Op = NaCl::AtomicAdd; break; + case AtomicRMWInst::Sub: Op = NaCl::AtomicSub; break; + case AtomicRMWInst::And: Op = NaCl::AtomicAnd; break; + case AtomicRMWInst::Or: Op = NaCl::AtomicOr; break; + case AtomicRMWInst::Xor: Op = NaCl::AtomicXor; break; + case AtomicRMWInst::Xchg: Op = NaCl::AtomicExchange; break; + } + PointerHelper PH(*this, I); + const NaCl::AtomicIntrinsics::AtomicIntrinsic *Intrinsic = + findAtomicIntrinsic(I, Intrinsic::nacl_atomic_rmw, PH.PET); + checkSizeMatchesType(I, PH.BitSize, I.getValOperand()->getType()); + Value *Args[] = {ConstantInt::get(Type::getInt32Ty(C), Op), PH.P, + I.getValOperand(), freezeMemoryOrder(I, I.getOrdering())}; + replaceInstructionWithIntrinsicCall(I, Intrinsic, PH.OriginalPET, PH.PET, + Args); +} + +/// %res = cmpxchg [weak] T* %ptr, T %old, T %new, memory_order_success +/// memory_order_failure +/// %val = extractvalue { T, i1 } %res, 0 +/// %success = extractvalue { T, i1 } %res, 1 +/// becomes: +/// %val = call T @llvm.nacl.atomic.cmpxchg.i( +/// %object, %expected, %desired, memory_order_success, +/// memory_order_failure) +/// %success = icmp eq %old, %val +/// Note: weak is currently dropped if present, the cmpxchg is always strong. +void AtomicVisitor::visitAtomicCmpXchgInst(AtomicCmpXchgInst &I) { + PointerHelper PH(*this, I); + const NaCl::AtomicIntrinsics::AtomicIntrinsic *Intrinsic = + findAtomicIntrinsic(I, Intrinsic::nacl_atomic_cmpxchg, PH.PET); + checkSizeMatchesType(I, PH.BitSize, I.getCompareOperand()->getType()); + checkSizeMatchesType(I, PH.BitSize, I.getNewValOperand()->getType()); + auto Order = + freezeMemoryOrder(I, I.getSuccessOrdering(), I.getFailureOrdering()); + Value *Args[] = {PH.P, I.getCompareOperand(), I.getNewValOperand(), + Order.first, Order.second}; + replaceInstructionWithIntrinsicCall(I, Intrinsic, PH.OriginalPET, PH.PET, + Args); +} + +/// fence memory_order +/// becomes: +/// call void @llvm.nacl.atomic.fence(memory_order) +/// and +/// call void asm sideeffect "", "~{memory}"() +/// fence seq_cst +/// call void asm sideeffect "", "~{memory}"() +/// becomes: +/// call void asm sideeffect "", "~{memory}"() +/// call void @llvm.nacl.atomic.fence.all() +/// call void asm sideeffect "", "~{memory}"() +/// Note that the assembly gets eliminated by the -remove-asm-memory pass. 
+void AtomicVisitor::visitFenceInst(FenceInst &I) { + return; // XXX EMSCRIPTEN +} + +ModulePass *llvm::createRewriteAtomicsPass() { return new RewriteAtomics(); } diff --git a/lib/Target/JSBackend/NaCl/RewriteLLVMIntrinsics.cpp b/lib/Target/JSBackend/NaCl/RewriteLLVMIntrinsics.cpp new file mode 100644 index 000000000000..119b85aaa1a4 --- /dev/null +++ b/lib/Target/JSBackend/NaCl/RewriteLLVMIntrinsics.cpp @@ -0,0 +1,149 @@ +//===- RewriteLLVMIntrinsics.cpp - Rewrite LLVM intrinsics to other values ===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass replaces calls to LLVM intrinsics that are *not* part of the +// PNaCl stable bitcode ABI into simpler values. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/Twine.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/NaCl.h" +#include + +using namespace llvm; + +namespace { +class RewriteLLVMIntrinsics : public ModulePass { +public: + static char ID; + RewriteLLVMIntrinsics() : ModulePass(ID) { + // This is a module pass because this makes it easier to access uses + // of global intrinsic functions. + initializeRewriteLLVMIntrinsicsPass(*PassRegistry::getPassRegistry()); + } + + virtual bool runOnModule(Module &M); + + /// Rewrite an intrinsic to something different. + class IntrinsicRewriter { + public: + Function *function() const { return F; } + /// Called once per \p Call of the Intrinsic Function. + void rewriteCall(CallInst *Call) { doRewriteCall(Call); } + + protected: + IntrinsicRewriter(Module &M, Intrinsic::ID IntrinsicID) + : F(Intrinsic::getDeclaration(&M, IntrinsicID)) {} + virtual ~IntrinsicRewriter() {} + /// This pure virtual method must be defined by implementors, and + /// will be called by rewriteCall. + virtual void doRewriteCall(CallInst *Call) = 0; + + Function *F; + + private: + IntrinsicRewriter() = delete; + IntrinsicRewriter(const IntrinsicRewriter &) = delete; + IntrinsicRewriter &operator=(const IntrinsicRewriter &) = delete; + }; + +private: + /// Visit all uses of a Function, rewrite it using the \p Rewriter, + /// and then delete the Call. Later delete the Function from the + /// Module. Returns true if the Module was changed. + bool visitUses(IntrinsicRewriter &Rewriter); +}; + +/// Rewrite a Call to nothing. +class ToNothing : public RewriteLLVMIntrinsics::IntrinsicRewriter { +public: + ToNothing(Module &M, Intrinsic::ID IntrinsicID) + : IntrinsicRewriter(M, IntrinsicID) {} + virtual ~ToNothing() {} + +protected: + virtual void doRewriteCall(CallInst *Call) { + // Nothing to do: the visit does the deletion. + } +}; + +/// Rewrite a Call to a ConstantInt of the same type. 
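+/// For example, as used below for @llvm.flt.rounds, every
+///   %r = call i32 @llvm.flt.rounds()
+/// is replaced by the constant i32 1, and the intrinsic declaration is then
+/// erased by visitUses().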
+class ToConstantInt : public RewriteLLVMIntrinsics::IntrinsicRewriter { +public: + ToConstantInt(Module &M, Intrinsic::ID IntrinsicID, uint64_t Value) + : IntrinsicRewriter(M, IntrinsicID), Value(Value), + RetType(function()->getFunctionType()->getReturnType()) {} + virtual ~ToConstantInt() {} + +protected: + virtual void doRewriteCall(CallInst *Call) { + Constant *C = ConstantInt::get(RetType, Value); + Call->replaceAllUsesWith(C); + } + +private: + uint64_t Value; + Type *RetType; +}; +} + +char RewriteLLVMIntrinsics::ID = 0; +INITIALIZE_PASS(RewriteLLVMIntrinsics, "rewrite-llvm-intrinsic-calls", + "Rewrite LLVM intrinsic calls to simpler expressions", false, + false) + +bool RewriteLLVMIntrinsics::runOnModule(Module &M) { + // Replace all uses of the @llvm.flt.rounds intrinsic with the constant + // "1" (round-to-nearest). Until we add a second intrinsic like + // @llvm.set.flt.round it is impossible to have a rounding mode that is + // not the initial rounding mode (round-to-nearest). We can remove + // this rewrite after adding a set() intrinsic. + ToConstantInt FltRoundsRewriter(M, Intrinsic::flt_rounds, 1); + + // Remove all @llvm.prefetch intrinsics. + ToNothing PrefetchRewriter(M, Intrinsic::prefetch); + ToNothing AssumeRewriter(M, Intrinsic::assume); + + return visitUses(FltRoundsRewriter) | visitUses(PrefetchRewriter) + | visitUses(AssumeRewriter); +} + +bool RewriteLLVMIntrinsics::visitUses(IntrinsicRewriter &Rewriter) { + Function *F = Rewriter.function(); + SmallVector Calls; + for (User *U : F->users()) { + if (CallInst *Call = dyn_cast(U)) { + Calls.push_back(Call); + } else { + // Intrinsics we care about currently don't need to handle this case. + std::string S; + raw_string_ostream OS(S); + OS << "Taking the address of this intrinsic is invalid: " << *U; + report_fatal_error(OS.str()); + } + } + + for (auto Call : Calls) { + Rewriter.rewriteCall(Call); + Call->eraseFromParent(); + } + + F->eraseFromParent(); + return !Calls.empty(); +} + +ModulePass *llvm::createRewriteLLVMIntrinsicsPass() { + return new RewriteLLVMIntrinsics(); +} diff --git a/lib/Target/JSBackend/NaCl/RewritePNaClLibraryCalls.cpp b/lib/Target/JSBackend/NaCl/RewritePNaClLibraryCalls.cpp new file mode 100644 index 000000000000..c3f1e9409a92 --- /dev/null +++ b/lib/Target/JSBackend/NaCl/RewritePNaClLibraryCalls.cpp @@ -0,0 +1,545 @@ +//===- RewritePNaClLibraryCalls.cpp - PNaCl library calls to intrinsics ---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass replaces calls to known library functions with calls to intrinsics +// that are part of the PNaCl stable bitcode ABI. 
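+//
+// For example (illustrative), a call to the C library function
+//   %r = call i32 @setjmp(i64* %env)
+// is rewritten into a call to the stable intrinsic, roughly
+//   %env.i8 = bitcast i64* %env to i8*
+//   %r = call i32 @llvm.nacl.setjmp(i8* %env.i8)
+// For functions whose address may legitimately be taken (longjmp and the
+// mem* functions), any remaining non-call uses are handled by turning the
+// libc symbol into a small internal wrapper around the intrinsic.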
+// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/Twine.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Transforms/NaCl.h" +#include + +using namespace llvm; + +namespace { + class RewritePNaClLibraryCalls : public ModulePass { + public: + static char ID; + RewritePNaClLibraryCalls() : + ModulePass(ID), TheModule(NULL), Context(NULL), SetjmpIntrinsic(NULL), + LongjmpIntrinsic(NULL), MemcpyIntrinsic(NULL), + MemmoveIntrinsic(NULL), MemsetIntrinsic(NULL) { + // This is a module pass because it may have to introduce + // intrinsic declarations into the module and modify globals. + initializeRewritePNaClLibraryCallsPass(*PassRegistry::getPassRegistry()); + } + + virtual bool runOnModule(Module &M); + private: + typedef void (RewritePNaClLibraryCalls::*RewriteCallFunc)(CallInst *); + typedef void (RewritePNaClLibraryCalls::*PopulateWrapperFunc)(Function *); + + /// Handles a certain pattern of library function -> intrinsic rewrites. + /// Currently all library functions this pass knows how to rewrite fall into + /// this pattern. + /// RewriteLibraryCall performs the rewrite for a single library function + /// and is customized by its arguments. + /// + /// \p LibraryFunctionName Name of the library function to look for. + /// \p CorrectFunctionType is the correct type of this library function. + /// \p CallRewriter Method that rewrites the library function call into an + /// intrinsic call. + /// \p OnlyCallsAllowed Only calls to this library function are allowed. + /// \p WrapperPopulator called to populate the body of the library function + /// with a wrapped intrinsic call. + bool RewriteLibraryCall( + const char *LibraryFunctionName, + FunctionType *CorrectFunctionType, + RewriteCallFunc CallRewriter, + bool OnlyCallsAllowed, + PopulateWrapperFunc WrapperPopulator); + + /// Two function types are compatible if they have compatible return types + /// and the same number of compatible parameters. Return types and + /// parameters are compatible if they are exactly the same type or both are + /// pointer types. + static bool compatibleFunctionTypes(FunctionType *FTy1, FunctionType *FTy2); + static bool compatibleParamOrRetTypes(Type *Ty1, Type *Ty2); + + void rewriteSetjmpCall(CallInst *Call); + void rewriteLongjmpCall(CallInst *Call); + void rewriteMemcpyCall(CallInst *Call); + void rewriteMemmoveCall(CallInst *Call); + void rewriteMemsetCall(CallInst *Call); + + void populateSetjmpWrapper(Function *SetjmpFunc); + void populateLongjmpWrapper(Function *LongjmpFunc); + void populateMemcpyWrapper(Function *MemcpyFunc); + void populateMemmoveWrapper(Function *MemmoveFunc); + void populateMemsetWrapper(Function *MemsetFunc); + + /// Generic implementation of populating a wrapper function. + /// Initially, the function exists in the module as a declaration with + /// unnamed arguments. This method is called with a NULL-terminated list + /// of argument names that get assigned in the generated IR for + /// readability. + void populateWrapperCommon( + Function *Func, + StringRef FuncName, + RewriteCallFunc CallRewriter, + bool CallCannotReturn, + ...); + + /// Find and cache known intrinsics. 
+ Function *findSetjmpIntrinsic(); + Function *findLongjmpIntrinsic(); + Function *findMemcpyIntrinsic(); + Function *findMemmoveIntrinsic(); + Function *findMemsetIntrinsic(); + + /// Cached data that remains the same throughout a module run. + Module *TheModule; + LLVMContext *Context; + + /// These are cached but computed lazily. + Function *SetjmpIntrinsic; + Function *LongjmpIntrinsic; + Function *MemcpyIntrinsic; + Function *MemmoveIntrinsic; + Function *MemsetIntrinsic; + }; +} + +char RewritePNaClLibraryCalls::ID = 0; +INITIALIZE_PASS(RewritePNaClLibraryCalls, "rewrite-pnacl-library-calls", + "Rewrite PNaCl library calls to stable intrinsics", + false, false) + +bool RewritePNaClLibraryCalls::RewriteLibraryCall( + const char *LibraryFunctionName, + FunctionType *CorrectFunctionType, + RewriteCallFunc CallRewriter, + bool OnlyCallsAllowed, + PopulateWrapperFunc WrapperPopulator) { + bool Changed = false; + + Function *LibFunc = TheModule->getFunction(LibraryFunctionName); + + // Iterate over all uses of this function, if it exists in the module with + // external linkage. If it exists but the linkage is not external, this may + // come from code that defines its own private function with the same name + // and doesn't actually include the standard libc header declaring it. + // In such a case we leave the code as it is. + // + // Another case we need to handle here is this function having the wrong + // prototype (incompatible with the C library function prototype, and hence + // incompatible with the intrinsic). In general, this is undefined behavior, + // but we can't fail compilation because some workflows rely on it + // compiling correctly (for example, autoconf). The solution is: + // When the declared type of the function in the module is not correct, we + // re-create the function with the correct prototype and replace all calls + // to this new function (casted to the old function type). Effectively this + // delays the undefined behavior until run-time. + if (LibFunc && LibFunc->hasExternalLinkage()) { + if (!compatibleFunctionTypes(LibFunc->getFunctionType(), + CorrectFunctionType)) { + // Use the RecreateFunction utility to create a new function with the + // correct prototype. RecreateFunction also RAUWs the function with + // proper bitcasts. + // + // One interesting case that may arise is when the original module had + // calls to both a correct and an incorrect version of the library + // function. Depending on the linking order, either version could be + // selected as the global declaration in the module, so even valid calls + // could end up being bitcast-ed from the incorrect to the correct + // function type. The RecreateFunction call below will eliminate such + // bitcasts (because the new type matches the call type), but dead + // constant expressions may be left behind. + // These are cleaned up with removeDeadConstantUsers. + Function *NewFunc = RecreateFunction(LibFunc, CorrectFunctionType); + LibFunc->eraseFromParent(); + NewFunc->setLinkage(Function::InternalLinkage); + Changed = true; + NewFunc->removeDeadConstantUsers(); + LibFunc = NewFunc; + } + + // Handle all uses that are calls. These are simply replaced with + // equivalent intrinsic calls. + { + SmallVector Calls; + for (User *U : LibFunc->users()) + // users() will also provide call instructions in which the used value + // is an argument, and not the value being called. Make sure we rewrite + // only actual calls to LibFunc here. 
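+        // For example (hypothetical IR), in
+        //   call void @apply(i8* %buf, i8* (i8*, i32, i32)* @memset)
+        // the use of @memset is a call-instruction operand but not the
+        // callee, so it must not be rewritten here; such address-taken uses
+        // are handled by the wrapper-population path below.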
+ if (CallInst *Call = dyn_cast(U)) + if (Call->getCalledValue() == LibFunc) + Calls.push_back(Call); + + for (auto Call : Calls) + (this->*(CallRewriter))(Call); + + Changed |= !Calls.empty(); + } + + if (LibFunc->use_empty()) { + LibFunc->eraseFromParent(); + } else if (OnlyCallsAllowed) { + // If additional uses remain, these aren't calls. + report_fatal_error(Twine("Taking the address of ") + + LibraryFunctionName + " is invalid"); + } else { + // If non-call uses remain and allowed for this function, populate it + // with a wrapper. + (this->*(WrapperPopulator))(LibFunc); + LibFunc->setLinkage(Function::InternalLinkage); + Changed = true; + } + } + + return Changed; +} + +bool RewritePNaClLibraryCalls::runOnModule(Module &M) { + TheModule = &M; + Context = &TheModule->getContext(); + bool Changed = false; + + Type *Int8PtrTy = Type::getInt8PtrTy(*Context); + Type *Int64PtrTy = Type::getInt64PtrTy(*Context); + Type *Int32Ty = Type::getInt32Ty(*Context); + Type *VoidTy = Type::getVoidTy(*Context); + + Type *SetjmpParams[] = { Int64PtrTy }; + FunctionType *SetjmpFunctionType = FunctionType::get(Int32Ty, SetjmpParams, + false); + Changed |= RewriteLibraryCall( + "setjmp", + SetjmpFunctionType, + &RewritePNaClLibraryCalls::rewriteSetjmpCall, + true, + &RewritePNaClLibraryCalls::populateSetjmpWrapper); + + Type *LongjmpParams[] = { Int64PtrTy, Int32Ty }; + FunctionType *LongjmpFunctionType = FunctionType::get(VoidTy, LongjmpParams, + false); + Changed |= RewriteLibraryCall( + "longjmp", + LongjmpFunctionType, + &RewritePNaClLibraryCalls::rewriteLongjmpCall, + false, + &RewritePNaClLibraryCalls::populateLongjmpWrapper); + + Type *MemsetParams[] = { Int8PtrTy, Int32Ty, Int32Ty }; + FunctionType *MemsetFunctionType = FunctionType::get(Int8PtrTy, MemsetParams, + false); + Changed |= RewriteLibraryCall( + "memset", + MemsetFunctionType, + &RewritePNaClLibraryCalls::rewriteMemsetCall, + false, + &RewritePNaClLibraryCalls::populateMemsetWrapper); + + Type *MemcpyParams[] = { Int8PtrTy, Int8PtrTy, Int32Ty }; + FunctionType *MemcpyFunctionType = FunctionType::get(Int8PtrTy, MemcpyParams, + false); + Changed |= RewriteLibraryCall( + "memcpy", + MemcpyFunctionType, + &RewritePNaClLibraryCalls::rewriteMemcpyCall, + false, + &RewritePNaClLibraryCalls::populateMemcpyWrapper); + + Type *MemmoveParams[] = { Int8PtrTy, Int8PtrTy, Int32Ty }; + FunctionType *MemmoveFunctionType = FunctionType::get(Int8PtrTy, + MemmoveParams, + false); + Changed |= RewriteLibraryCall( + "memmove", + MemmoveFunctionType, + &RewritePNaClLibraryCalls::rewriteMemmoveCall, + false, + &RewritePNaClLibraryCalls::populateMemmoveWrapper); + + return Changed; +} + +bool RewritePNaClLibraryCalls::compatibleFunctionTypes(FunctionType *FTy1, + FunctionType *FTy2) { + if (FTy1->getNumParams() != FTy2->getNumParams()) { + return false; + } + + if (!compatibleParamOrRetTypes(FTy1->getReturnType(), + FTy2->getReturnType())) { + return false; + } + + for (unsigned I = 0, End = FTy1->getNumParams(); I != End; ++I) { + if (!compatibleParamOrRetTypes(FTy1->getParamType(I), + FTy2->getParamType(I))) { + return false; + } + } + + return true; +} + +bool RewritePNaClLibraryCalls::compatibleParamOrRetTypes(Type *Ty1, + Type *Ty2) { + return (Ty1 == Ty2 || (Ty1->isPointerTy() && Ty2->isPointerTy())); +} + +void RewritePNaClLibraryCalls::rewriteSetjmpCall(CallInst *Call) { + // Find the intrinsic function. + Function *NaClSetjmpFunc = findSetjmpIntrinsic(); + // Cast the jmp_buf argument to the type NaClSetjmpCall expects. 
+ Type *PtrTy = NaClSetjmpFunc->getFunctionType()->getParamType(0); + BitCastInst *JmpBufCast = new BitCastInst(Call->getArgOperand(0), PtrTy, + "jmp_buf_i8", Call); + const DebugLoc &DLoc = Call->getDebugLoc(); + JmpBufCast->setDebugLoc(DLoc); + + // Emit the updated call. + Value *Args[] = { JmpBufCast }; + CallInst *NaClSetjmpCall = CallInst::Create(NaClSetjmpFunc, Args, "", Call); + NaClSetjmpCall->setDebugLoc(DLoc); + NaClSetjmpCall->takeName(Call); + + // Replace the original call. + Call->replaceAllUsesWith(NaClSetjmpCall); + Call->eraseFromParent(); +} + +void RewritePNaClLibraryCalls::rewriteLongjmpCall(CallInst *Call) { + // Find the intrinsic function. + Function *NaClLongjmpFunc = findLongjmpIntrinsic(); + // Cast the jmp_buf argument to the type NaClLongjmpCall expects. + Type *PtrTy = NaClLongjmpFunc->getFunctionType()->getParamType(0); + BitCastInst *JmpBufCast = new BitCastInst(Call->getArgOperand(0), PtrTy, + "jmp_buf_i8", Call); + const DebugLoc &DLoc = Call->getDebugLoc(); + JmpBufCast->setDebugLoc(DLoc); + + // Emit the call. + Value *Args[] = { JmpBufCast, Call->getArgOperand(1) }; + CallInst *NaClLongjmpCall = CallInst::Create(NaClLongjmpFunc, Args, "", Call); + NaClLongjmpCall->setDebugLoc(DLoc); + // No takeName here since longjmp is a void call that does not get assigned to + // a value. + + // Remove the original call. There's no need for RAUW because longjmp + // returns void. + Call->eraseFromParent(); +} + +void RewritePNaClLibraryCalls::rewriteMemcpyCall(CallInst *Call) { + Function *MemcpyIntrinsic = findMemcpyIntrinsic(); + // dest, src, len, align, isvolatile + Value *Args[] = { Call->getArgOperand(0), + Call->getArgOperand(1), + Call->getArgOperand(2), + ConstantInt::get(Type::getInt32Ty(*Context), 1), + ConstantInt::get(Type::getInt1Ty(*Context), 0) }; + CallInst *MemcpyIntrinsicCall = CallInst::Create(MemcpyIntrinsic, + Args, "", Call); + MemcpyIntrinsicCall->setDebugLoc(Call->getDebugLoc()); + + // libc memcpy returns the source pointer, but the LLVM intrinsic doesn't; if + // the return value has actual uses, just replace them with the dest + // argument itself. + Call->replaceAllUsesWith(Call->getArgOperand(0)); + Call->eraseFromParent(); +} + +void RewritePNaClLibraryCalls::rewriteMemmoveCall(CallInst *Call) { + Function *MemmoveIntrinsic = findMemmoveIntrinsic(); + // dest, src, len, align, isvolatile + Value *Args[] = { Call->getArgOperand(0), + Call->getArgOperand(1), + Call->getArgOperand(2), + ConstantInt::get(Type::getInt32Ty(*Context), 1), + ConstantInt::get(Type::getInt1Ty(*Context), 0) }; + CallInst *MemmoveIntrinsicCall = CallInst::Create(MemmoveIntrinsic, + Args, "", Call); + MemmoveIntrinsicCall->setDebugLoc(Call->getDebugLoc()); + + // libc memmove returns the source pointer, but the LLVM intrinsic doesn't; if + // the return value has actual uses, just replace them with the dest + // argument itself. + Call->replaceAllUsesWith(Call->getArgOperand(0)); + Call->eraseFromParent(); +} + +void RewritePNaClLibraryCalls::rewriteMemsetCall(CallInst *Call) { + Function *MemsetIntrinsic = findMemsetIntrinsic(); + // libc memset has 'int c' for the filler byte, but the LLVM intrinsic uses + // a i8; truncation is required. 
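+  // Illustrative: for "call i8* @memset(i8* %p, i32 65, i32 %n)" the filler
+  // is narrowed first ("%trunc_byte = trunc i32 65 to i8") and then passed
+  // to the memset intrinsic with align 1 and isvolatile false, as built
+  // below.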
+ TruncInst *ByteTrunc = new TruncInst(Call->getArgOperand(1), + Type::getInt8Ty(*Context), + "trunc_byte", Call); + + const DebugLoc &DLoc = Call->getDebugLoc(); + ByteTrunc->setDebugLoc(DLoc); + + // dest, val, len, align, isvolatile + Value *Args[] = { Call->getArgOperand(0), + ByteTrunc, + Call->getArgOperand(2), + ConstantInt::get(Type::getInt32Ty(*Context), 1), + ConstantInt::get(Type::getInt1Ty(*Context), 0) }; + CallInst *MemsetIntrinsicCall = CallInst::Create(MemsetIntrinsic, + Args, "", Call); + MemsetIntrinsicCall->setDebugLoc(DLoc); + + // libc memset returns the source pointer, but the LLVM intrinsic doesn't; if + // the return value has actual uses, just replace them with the dest + // argument itself. + Call->replaceAllUsesWith(Call->getArgOperand(0)); + Call->eraseFromParent(); +} + +void RewritePNaClLibraryCalls::populateWrapperCommon( + Function *Func, + StringRef FuncName, + RewriteCallFunc CallRewriter, + bool CallCannotReturn, + ...) { + if (!Func->isDeclaration()) { + report_fatal_error(Twine("Expected ") + FuncName + + " to be declared, not defined"); + } + + // Populate the function body with code. + BasicBlock *BB = BasicBlock::Create(*Context, "entry", Func); + + // Collect and name the function arguments. + Function::arg_iterator FuncArgs = Func->arg_begin(); + SmallVector Args; + va_list ap; + va_start(ap, CallCannotReturn); + while (true) { + // Iterate over the varargs until a terminated NULL is encountered. + const char *ArgName = va_arg(ap, const char *); + if (!ArgName) + break; + Value *Arg = &*FuncArgs++; + Arg->setName(ArgName); + Args.push_back(Arg); + } + va_end(ap); + + // Emit a call to self, and then call CallRewriter to rewrite it to the + // intrinsic. This is done in order to keep the call rewriting logic in a + // single place. + CallInst *SelfCall = CallInst::Create(Func, Args, "", BB); + + if (CallCannotReturn) { + new UnreachableInst(*Context, BB); + } else if (Func->getReturnType()->isVoidTy()) { + ReturnInst::Create(*Context, BB); + } else { + ReturnInst::Create(*Context, SelfCall, BB); + } + + (this->*(CallRewriter))(SelfCall); +} + +void RewritePNaClLibraryCalls::populateSetjmpWrapper(Function *SetjmpFunc) { + populateWrapperCommon( + /* Func */ SetjmpFunc, + /* FuncName */ "setjmp", + /* CallRewriter */ &RewritePNaClLibraryCalls::rewriteSetjmpCall, + /* CallCannotReturn */ false, + /* ... */ "env", NULL); +} + +void RewritePNaClLibraryCalls::populateLongjmpWrapper(Function *LongjmpFunc) { + populateWrapperCommon( + /* Func */ LongjmpFunc, + /* FuncName */ "longjmp", + /* CallRewriter */ &RewritePNaClLibraryCalls::rewriteLongjmpCall, + /* CallCannotReturn */ true, + /* ... */ "env", "val", NULL); +} + +void RewritePNaClLibraryCalls::populateMemcpyWrapper(Function *MemcpyFunc) { + populateWrapperCommon( + /* Func */ MemcpyFunc, + /* FuncName */ "memcpy", + /* CallRewriter */ &RewritePNaClLibraryCalls::rewriteMemcpyCall, + /* CallCannotReturn */ false, + /* ... */ "dest", "src", "len", NULL); +} + +void RewritePNaClLibraryCalls::populateMemmoveWrapper(Function *MemmoveFunc) { + populateWrapperCommon( + /* Func */ MemmoveFunc, + /* FuncName */ "memmove", + /* CallRewriter */ &RewritePNaClLibraryCalls::rewriteMemmoveCall, + /* CallCannotReturn */ false, + /* ... 
*/ "dest", "src", "len", NULL); +} + +void RewritePNaClLibraryCalls::populateMemsetWrapper(Function *MemsetFunc) { + populateWrapperCommon( + /* Func */ MemsetFunc, + /* FuncName */ "memset", + /* CallRewriter */ &RewritePNaClLibraryCalls::rewriteMemsetCall, + /* CallCannotReturn */ false, + /* ... */ "dest", "val", "len", NULL); +} + +Function *RewritePNaClLibraryCalls::findSetjmpIntrinsic() { + if (!SetjmpIntrinsic) { + SetjmpIntrinsic = Intrinsic::getDeclaration( + TheModule, Intrinsic::nacl_setjmp); + } + return SetjmpIntrinsic; +} + +Function *RewritePNaClLibraryCalls::findLongjmpIntrinsic() { + if (!LongjmpIntrinsic) { + LongjmpIntrinsic = Intrinsic::getDeclaration( + TheModule, Intrinsic::nacl_longjmp); + } + return LongjmpIntrinsic; +} + +Function *RewritePNaClLibraryCalls::findMemcpyIntrinsic() { + if (!MemcpyIntrinsic) { + Type *Tys[] = { Type::getInt8PtrTy(*Context), + Type::getInt8PtrTy(*Context), + Type::getInt32Ty(*Context) }; + MemcpyIntrinsic = Intrinsic::getDeclaration( + TheModule, Intrinsic::memcpy, Tys); + } + return MemcpyIntrinsic; +} + +Function *RewritePNaClLibraryCalls::findMemmoveIntrinsic() { + if (!MemmoveIntrinsic) { + Type *Tys[] = { Type::getInt8PtrTy(*Context), + Type::getInt8PtrTy(*Context), + Type::getInt32Ty(*Context) }; + MemmoveIntrinsic = Intrinsic::getDeclaration( + TheModule, Intrinsic::memmove, Tys); + } + return MemmoveIntrinsic; +} + +Function *RewritePNaClLibraryCalls::findMemsetIntrinsic() { + if (!MemsetIntrinsic) { + Type *Tys[] = { Type::getInt8PtrTy(*Context), Type::getInt32Ty(*Context) }; + MemsetIntrinsic = Intrinsic::getDeclaration( + TheModule, Intrinsic::memset, Tys); + } + return MemsetIntrinsic; +} + +ModulePass *llvm::createRewritePNaClLibraryCallsPass() { + return new RewritePNaClLibraryCalls(); +} diff --git a/lib/Target/JSBackend/NaCl/SimplifiedFuncTypeMap.cpp b/lib/Target/JSBackend/NaCl/SimplifiedFuncTypeMap.cpp new file mode 100644 index 000000000000..5e09e1ac4c29 --- /dev/null +++ b/lib/Target/JSBackend/NaCl/SimplifiedFuncTypeMap.cpp @@ -0,0 +1,140 @@ +//===-- SimplifiedFuncTypeMap.cpp - Consistent type remapping----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "SimplifiedFuncTypeMap.h" + +using namespace llvm; + +Type *SimplifiedFuncTypeMap::getSimpleType(LLVMContext &Ctx, Type *Ty) { + auto Found = MappedTypes.find(Ty); + if (Found != MappedTypes.end()) { + return Found->second; + } + + StructMap Tentatives; + auto Ret = getSimpleAggregateTypeInternal(Ctx, Ty, Tentatives); + assert(Tentatives.size() == 0); + + if (!Ty->isStructTy()) { + // Structs are memoized in getSimpleAggregateTypeInternal. + MappedTypes[Ty] = Ret; + } + return Ret; +} + +// Transforms any type that could transitively reference a function pointer +// into a simplified type. +// We enter this function trying to determine the mapping of a type. Because +// of how structs are handled (not interned by llvm - see further comments +// below) we may be working with temporary types - types (pointers, for example) +// transitively referencing "tentative" structs. For that reason, we do not +// memoize anything here, except for structs. The latter is so that we avoid +// unnecessary repeated creation of types (pointers, function types, etc), +// as we try to map a given type. 
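+// For example (illustrative, assuming the subclass maps function types by
+// turning aggregate parameters into pointers), an identified struct such as
+//   %a_struct = type { void (%some_struct)*, i32 }
+// is remapped to a fresh identified struct
+//   %a_struct.simplified = type { void (%some_struct*)*, i32 }
+// while types that never transitively reach a function type are returned
+// unchanged.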
+SimplifiedFuncTypeMap::MappingResult +SimplifiedFuncTypeMap::getSimpleAggregateTypeInternal(LLVMContext &Ctx, + Type *Ty, + StructMap &Tentatives) { + // Leverage the map for types we encounter on the way. + auto Found = MappedTypes.find(Ty); + if (Found != MappedTypes.end()) { + return {Found->second, Found->second != Ty}; + } + + if (auto *OldFnTy = dyn_cast(Ty)) { + return getSimpleFuncType(Ctx, Tentatives, OldFnTy); + } + + if (auto PtrTy = dyn_cast(Ty)) { + auto NewTy = getSimpleAggregateTypeInternal( + Ctx, PtrTy->getPointerElementType(), Tentatives); + + return {NewTy->getPointerTo(PtrTy->getAddressSpace()), NewTy.isChanged()}; + } + + if (auto ArrTy = dyn_cast(Ty)) { + auto NewTy = getSimpleAggregateTypeInternal( + Ctx, ArrTy->getArrayElementType(), Tentatives); + return {ArrayType::get(NewTy, ArrTy->getArrayNumElements()), + NewTy.isChanged()}; + } + + if (auto VecTy = dyn_cast(Ty)) { + auto NewTy = getSimpleAggregateTypeInternal( + Ctx, VecTy->getVectorElementType(), Tentatives); + return {VectorType::get(NewTy, VecTy->getVectorNumElements()), + NewTy.isChanged()}; + } + + // LLVM doesn't intern identified structs (the ones with a name). This, + // together with the fact that such structs can be recursive, + // complicates things a bit. We want to make sure that we only change + // "unsimplified" structs (those that somehow reference funcs that + // are not simple). + // We don't want to change "simplified" structs, otherwise converting + // instruction types will become trickier. + if (auto StructTy = dyn_cast(Ty)) { + ParamTypeVector ElemTypes; + if (!StructTy->isLiteral()) { + // Literals - struct without a name - cannot be recursive, so we + // don't need to form tentatives. + auto Found = Tentatives.find(StructTy); + + // Having a tentative means we are in a recursion trying to map this + // particular struct, so arriving back to it is not a change. + // We will determine if this struct is actually + // changed by checking its other fields. + if (Found != Tentatives.end()) { + return {Found->second, false}; + } + // We have never seen this struct, so we start a tentative. + std::string NewName = StructTy->getStructName(); + NewName += ".simplified"; + StructType *Tentative = StructType::create(Ctx, NewName); + Tentatives[StructTy] = Tentative; + + bool Changed = isChangedStruct(Ctx, StructTy, ElemTypes, Tentatives); + + Tentatives.erase(StructTy); + // We can now decide the mapping of the struct. We will register it + // early with MappedTypes, to avoid leaking tentatives unnecessarily. + // We are leaking the created struct here, but there is no way to + // correctly delete it. + if (!Changed) { + return {MappedTypes[StructTy] = StructTy, false}; + } else { + Tentative->setBody(ElemTypes, StructTy->isPacked()); + return {MappedTypes[StructTy] = Tentative, true}; + } + } else { + bool Changed = isChangedStruct(Ctx, StructTy, ElemTypes, Tentatives); + return {MappedTypes[StructTy] = + StructType::get(Ctx, ElemTypes, StructTy->isPacked()), + Changed}; + } + } + + // Anything else stays the same. 
+ return {Ty, false}; +} + +bool SimplifiedFuncTypeMap::isChangedStruct(LLVMContext &Ctx, + StructType *StructTy, + ParamTypeVector &ElemTypes, + StructMap &Tentatives) { + bool Changed = false; + unsigned StructElemCount = StructTy->getStructNumElements(); + for (unsigned I = 0; I < StructElemCount; I++) { + auto NewElem = getSimpleAggregateTypeInternal( + Ctx, StructTy->getStructElementType(I), Tentatives); + ElemTypes.push_back(NewElem); + Changed |= NewElem.isChanged(); + } + return Changed; +} \ No newline at end of file diff --git a/lib/Target/JSBackend/NaCl/SimplifiedFuncTypeMap.h b/lib/Target/JSBackend/NaCl/SimplifiedFuncTypeMap.h new file mode 100644 index 000000000000..3847a27247ec --- /dev/null +++ b/lib/Target/JSBackend/NaCl/SimplifiedFuncTypeMap.h @@ -0,0 +1,61 @@ +//===-- SimplifiedFuncTypeMap.h - Consistent type remapping------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_SIMPLIFIEDFUNCTYPEMAP_H +#define LLVM_SIMPLIFIEDFUNCTYPEMAP_H + +#include +#include "llvm/IR/DerivedTypes.h" + +namespace llvm { +// SimplifiedFuncTypeMap provides a consistent type map, given a rule +// for mapping function types - which is provided by implementing +// getSimpleFuncType. +// A few transformations require changing function types, for example +// SimplifyStructRegSignatures or PromoteIntegers. When doing so, we also +// want to change any references to function types - for example structs +// with fields typed as function pointer(s). Structs are not interned by LLVM, +// which is what SimplifiedFuncTypeMap addresses. +class SimplifiedFuncTypeMap { +public: + typedef DenseMap StructMap; + Type *getSimpleType(LLVMContext &Ctx, Type *Ty); + virtual ~SimplifiedFuncTypeMap() {} + +protected: + class MappingResult { + public: + MappingResult(Type *ATy, bool Chg) { + Ty = ATy; + Changed = Chg; + } + bool isChanged() { return Changed; } + Type *operator->() { return Ty; } + operator Type *() { return Ty; } + + private: + Type *Ty; + bool Changed; + }; + + virtual MappingResult getSimpleFuncType(LLVMContext &Ctx, + StructMap &Tentatives, + FunctionType *OldFnTy) = 0; + + typedef SmallVector ParamTypeVector; + DenseMap MappedTypes; + + MappingResult getSimpleAggregateTypeInternal(LLVMContext &Ctx, Type *Ty, + StructMap &Tentatives); + + bool isChangedStruct(LLVMContext &Ctx, StructType *StructTy, + ParamTypeVector &ElemTypes, StructMap &Tentatives); +}; +} +#endif // LLVM_SIMPLIFIEDFUNCTYPEMAP_H diff --git a/lib/Target/JSBackend/NaCl/SimplifyAllocas.cpp b/lib/Target/JSBackend/NaCl/SimplifyAllocas.cpp new file mode 100644 index 000000000000..9b9789619deb --- /dev/null +++ b/lib/Target/JSBackend/NaCl/SimplifyAllocas.cpp @@ -0,0 +1,147 @@ +//===- SimplifyAllocas.cpp - Simplify allocas to arrays of bytes --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Simplify all allocas into allocas of byte arrays. 
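+//
+// For example (illustrative),
+//   %buf = alloca %struct.Pair, i32 %n, align 8
+// becomes roughly
+//   %buf.alloca_mul = mul i32 <sizeof(%struct.Pair)>, %n
+//   %buf = alloca i8, i32 %buf.alloca_mul, align 8
+//   %buf.bc = bitcast i8* %buf to %struct.Pair*
+// with all former uses of the original alloca redirected to the bitcast.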
+// +//===----------------------------------------------------------------------===// + +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/Transforms/NaCl.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; +namespace { +class SimplifyAllocas : public BasicBlockPass { +public: + static char ID; // Pass identification, replacement for typeid + SimplifyAllocas() + : BasicBlockPass(ID), Initialized(false), M(nullptr), IntPtrType(nullptr), + Int8Type(nullptr), DL(nullptr) { + initializeSimplifyAllocasPass(*PassRegistry::getPassRegistry()); + } + +private: + bool Initialized; + const Module *M; + Type *IntPtrType; + Type *Int8Type; + const DataLayout *DL; + + using llvm::Pass::doInitialization; + bool doInitialization(Function &F) override { + if (!Initialized) { + M = F.getParent(); + DL = &M->getDataLayout(); + IntPtrType = DL->getIntPtrType(M->getContext()); + Int8Type = Type::getInt8Ty(M->getContext()); + Initialized = true; + return true; + } + return false; + } + + AllocaInst *findAllocaFromCast(CastInst *CInst) { + Value *Op0 = CInst->getOperand(0); + while (!llvm::isa(Op0)) { + auto *NextCast = llvm::dyn_cast(Op0); + if (NextCast && NextCast->isNoopCast(IntPtrType)) { + Op0 = NextCast->getOperand(0); + } else { + return nullptr; + } + } + return llvm::cast(Op0); + } + + bool runOnBasicBlock(BasicBlock &BB) override { + bool Changed = false; + for (BasicBlock::iterator I = BB.getFirstInsertionPt(), E = BB.end(); + I != E;) { + Instruction *Inst = &*I++; + if (AllocaInst *Alloca = dyn_cast(Inst)) { + Changed = true; + Type *ElementTy = Alloca->getType()->getPointerElementType(); + Constant *ElementSize = + ConstantInt::get(IntPtrType, DL->getTypeAllocSize(ElementTy)); + // Expand out alloca's built-in multiplication. + Value *MulSize; + if (ConstantInt *C = dyn_cast(Alloca->getArraySize())) { + const APInt Value = + C->getValue().zextOrTrunc(IntPtrType->getScalarSizeInBits()); + MulSize = ConstantExpr::getMul(ElementSize, + ConstantInt::get(IntPtrType, Value)); + } else { + Value *ArraySize = Alloca->getArraySize(); + if (ArraySize->getType() != IntPtrType) { + // We assume ArraySize is always positive, and thus is unsigned. + assert(!isa(ArraySize) || + !cast(ArraySize)->isNegative()); + ArraySize = + CastInst::CreateIntegerCast(ArraySize, IntPtrType, + /* isSigned = */ false, "", Alloca); + } + MulSize = CopyDebug( + BinaryOperator::Create(Instruction::Mul, ElementSize, ArraySize, + Alloca->getName() + ".alloca_mul", Alloca), + Alloca); + } + unsigned Alignment = Alloca->getAlignment(); + if (Alignment == 0) + Alignment = DL->getPrefTypeAlignment(ElementTy); + AllocaInst *Tmp = + new AllocaInst(Int8Type, MulSize, Alignment, "", Alloca); + CopyDebug(Tmp, Alloca); + Tmp->takeName(Alloca); + BitCastInst *BC = new BitCastInst(Tmp, Alloca->getType(), + Tmp->getName() + ".bc", Alloca); + CopyDebug(BC, Alloca); + Alloca->replaceAllUsesWith(BC); + Alloca->eraseFromParent(); + } + else if (auto *Call = dyn_cast(Inst)) { + if (Call->getIntrinsicID() == Intrinsic::dbg_declare) { + // dbg.declare's first argument is a special metadata that wraps a + // value, and RAUW works on those. It is supposed to refer to the + // alloca that represents the variable's storage, but the alloca + // simplification may have RAUWed it to use the bitcast. 
+ // Fix it up here by recreating the metadata to use the new alloca. + auto *MV = cast(Call->getArgOperand(0)); + // Sometimes dbg.declare points to an argument instead of an alloca. + if (auto *VM = dyn_cast(MV->getMetadata())) { + if (auto *CInst = dyn_cast(VM->getValue())) { + if (AllocaInst *Alloca = findAllocaFromCast(CInst)) { + Call->setArgOperand( + 0, + MetadataAsValue::get(Inst->getContext(), + ValueAsMetadata::get(Alloca))); + Changed = true; + } + } + } + } + } + } + return Changed; + } +}; +} +char SimplifyAllocas::ID = 0; + +INITIALIZE_PASS(SimplifyAllocas, "simplify-allocas", + "Simplify allocas to arrays of bytes", false, false) + +BasicBlockPass *llvm::createSimplifyAllocasPass() { + return new SimplifyAllocas(); +} diff --git a/lib/Target/JSBackend/NaCl/SimplifyStructRegSignatures.cpp b/lib/Target/JSBackend/NaCl/SimplifyStructRegSignatures.cpp new file mode 100644 index 000000000000..70d5e7763cd6 --- /dev/null +++ b/lib/Target/JSBackend/NaCl/SimplifyStructRegSignatures.cpp @@ -0,0 +1,597 @@ +//===- SimplifyStructRegSignatures.cpp - struct regs to struct pointers----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass replaces function signatures exposing struct registers +// to byval pointer-based signatures. +// +// There are 2 types of signatures that are thus changed: +// +// @foo(%some_struct %val) -> @foo(%some_struct* byval %val) +// and +// %someStruct @bar() -> void @bar(%someStruct* sret, ) +// +// Such function types may appear in other type declarations, for example: +// +// %a_struct = type { void (%some_struct)*, i32 } +// +// We map such types to corresponding types, mapping the function types +// appropriately: +// +// %a_struct.0 = type { void (%some_struct*)*, i32 } +//===----------------------------------------------------------------------===// +#include "SimplifiedFuncTypeMap.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/ilist.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Twine.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/Pass.h" +#include "llvm/PassInfo.h" +#include "llvm/PassRegistry.h" +#include "llvm/PassSupport.h" +#include "llvm/Transforms/NaCl.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include +#include +using namespace llvm; +namespace { +static const unsigned int TypicalFuncArity = 8; +static bool shouldPromote(const Type *Ty) { + return Ty->isAggregateType(); +} +// Utility class. For any given type, get the associated type that is free of +// struct register arguments. 
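+//
+// For illustration (hypothetical types, not from this patch): given
+//
+//   %pair = type { i32, i32 }
+//
+// the function type
+//
+//   %pair (i64, %pair)
+//
+// is mapped to
+//
+//   void (%pair*, i64, %pair*)
+//
+// i.e. an aggregate return becomes a leading pointer out-parameter and
+// aggregate parameters become pointers; non-aggregate types are left alone.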
+class TypeMapper : public SimplifiedFuncTypeMap { +protected: + MappingResult getSimpleFuncType(LLVMContext &Ctx, StructMap &Tentatives, + FunctionType *OldFnTy) override { + Type *OldRetType = OldFnTy->getReturnType(); + Type *NewRetType = OldRetType; + Type *Void = Type::getVoidTy(Ctx); + ParamTypeVector NewArgs; + bool Changed = false; + // Struct register returns become the first parameter of the new FT. + // The new FT has void for the return type + if (shouldPromote(OldRetType)) { + NewRetType = Void; + Changed = true; + NewArgs.push_back(getSimpleArgumentType(Ctx, OldRetType, Tentatives)); + } + for (auto OldParam : OldFnTy->params()) { + auto NewType = getSimpleArgumentType(Ctx, OldParam, Tentatives); + Changed |= NewType.isChanged(); + NewArgs.push_back(NewType); + } + Type *NewFuncType = + FunctionType::get(NewRetType, NewArgs, OldFnTy->isVarArg()); + return {NewFuncType, Changed}; + } +private: + // Get the simplified type of a function argument. + MappingResult getSimpleArgumentType(LLVMContext &Ctx, Type *Ty, + StructMap &Tentatives) { + // struct registers become pointers to simple structs + if (shouldPromote(Ty)) { + return {PointerType::get( + getSimpleAggregateTypeInternal(Ctx, Ty, Tentatives), 0), + true}; + } + return getSimpleAggregateTypeInternal(Ctx, Ty, Tentatives); + } +}; +// This is a ModulePass because the pass recreates functions in +// order to change their signatures. +class SimplifyStructRegSignatures : public ModulePass { +public: + static char ID; + SimplifyStructRegSignatures() : ModulePass(ID) { + initializeSimplifyStructRegSignaturesPass(*PassRegistry::getPassRegistry()); + } + virtual bool runOnModule(Module &M); +private: + TypeMapper Mapper; + DenseSet FunctionsToDelete; + SetVector CallsToPatch; + SetVector InvokesToPatch; + DenseMap FunctionMap; + + struct FunctionAddressing { + Value *Temp; + Function *Old; + FunctionAddressing(Value *Temp, Function *Old) : Temp(Temp), Old(Old) {} + }; + std::vector FunctionAddressings; + + bool + simplifyFunction(LLVMContext &Ctx, Function *OldFunc); + void scheduleInstructionsForCleanup(Function *NewFunc); + template + void fixCallSite(LLVMContext &Ctx, TCall *Call, unsigned PreferredAlignment); + void fixFunctionBody(LLVMContext &Ctx, Function *OldFunc, Function *NewFunc); + template + TCall *fixCallTargetAndArguments(LLVMContext &Ctx, IRBuilder<> &Builder, + TCall *OldCall, Value *NewTarget, + FunctionType *NewType, + BasicBlock::iterator AllocaInsPoint, + Value *ExtraArg = nullptr); + void checkNoUnsupportedInstructions(LLVMContext &Ctx, Function *Fct); +}; +} +char SimplifyStructRegSignatures::ID = 0; +INITIALIZE_PASS( + SimplifyStructRegSignatures, "simplify-struct-reg-signatures", + "Simplify function signatures by removing struct register parameters", + false, false) +// Update the arg names for a newly created function. +static void UpdateArgNames(Function *OldFunc, Function *NewFunc) { + auto NewArgIter = NewFunc->arg_begin(); + if (shouldPromote(OldFunc->getReturnType())) { + NewArgIter->setName("retVal"); + NewArgIter++; + } + for (const Argument &OldArg : OldFunc->args()) { + Argument *NewArg = &*NewArgIter++; + NewArg->setName(OldArg.getName() + + (shouldPromote(OldArg.getType()) ? ".ptr" : "")); + } +} +// Replace all uses of an old value with a new one, disregarding the type. We +// correct the types after we wire the new parameters in, in fixFunctionBody. 
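+//
+// For instance (hypothetical names): an aggregate argument %v that became a
+// pointer argument %v.ptr has all of its uses blind-replaced with a load
+// "%v.sreg" of %v.ptr, inserted at the top of the entry block by
+// ConvertArgumentValue below.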
+static void BlindReplace(Value *Old, Value *New) { + for (auto UseIter = Old->use_begin(), E = Old->use_end(); E != UseIter;) { + Use &AUse = *(UseIter++); + AUse.set(New); + } +} +// Adapt the body of a function for the new arguments. +static void ConvertArgumentValue(Value *Old, Value *New, Instruction *InsPoint, + const bool IsAggregateToPtr) { + if (Old == New) + return; + if (Old->getType() == New->getType()) { + Old->replaceAllUsesWith(New); + New->takeName(Old); + return; + } + BlindReplace(Old, (IsAggregateToPtr + ? new LoadInst(New, Old->getName() + ".sreg", InsPoint) + : New)); +} +// Fix returns. Return true if fixes were needed. +static void FixReturn(Function *OldFunc, Function *NewFunc) { + Argument *FirstNewArg = &*NewFunc->getArgumentList().begin(); + for (auto BIter = NewFunc->begin(), LastBlock = NewFunc->end(); + LastBlock != BIter;) { + BasicBlock *BB = &*BIter++; + for (auto IIter = BB->begin(), LastI = BB->end(); LastI != IIter;) { + Instruction *Instr = &*IIter++; + if (ReturnInst *Ret = dyn_cast(Instr)) { + auto RetVal = Ret->getReturnValue(); + IRBuilder<> Builder(Ret); + StoreInst *Store = Builder.CreateStore(RetVal, FirstNewArg); + Store->setAlignment(FirstNewArg->getParamAlignment()); + Builder.CreateRetVoid(); + Ret->eraseFromParent(); + } + } + } +} +/// In the next two functions, `RetIndex` is the index of the possibly promoted +/// return. +/// Ie if the return is promoted, `RetIndex` should be `1`, else `0`. +static AttributeSet CopyRetAttributes(LLVMContext &C, const DataLayout &DL, + const AttributeSet From, Type *RetTy, + const unsigned RetIndex) { + AttributeSet NewAttrs; + if (RetIndex != 0) { + NewAttrs = NewAttrs.addAttribute(C, RetIndex, Attribute::StructRet); + NewAttrs = NewAttrs.addAttribute(C, RetIndex, Attribute::NonNull); + NewAttrs = NewAttrs.addAttribute(C, RetIndex, Attribute::NoCapture); + if (RetTy->isSized()) { + NewAttrs = NewAttrs.addDereferenceableAttr(C, RetIndex, + DL.getTypeAllocSize(RetTy)); + } + } else { + NewAttrs = NewAttrs.addAttributes(C, RetIndex, From.getRetAttributes()); + } + auto FnAttrs = From.getFnAttributes(); + if (RetIndex != 0) { + FnAttrs = FnAttrs.removeAttribute(C, AttributeSet::FunctionIndex, + Attribute::ReadOnly); + FnAttrs = FnAttrs.removeAttribute(C, AttributeSet::FunctionIndex, + Attribute::ReadNone); + } + NewAttrs = NewAttrs.addAttributes(C, AttributeSet::FunctionIndex, FnAttrs); + return NewAttrs; +} +/// Iff the argument in question was promoted, `NewArgTy` should be non-null. 
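+///
+/// Illustrative example: with a promoted return (RetIndex == 1), the first
+/// original argument (OldArg == 0) has its attributes re-added at
+/// NewIndex = RetIndex + OldArg + 1 == 2; in other words every argument's
+/// attribute slot shifts by one to make room for the new sret pointer.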
+static AttributeSet CopyArgAttributes(AttributeSet NewAttrs, LLVMContext &C, + const DataLayout &DL, + const AttributeSet From, + const unsigned OldArg, Type *NewArgTy, + const unsigned RetIndex) { + const unsigned NewIndex = RetIndex + OldArg + 1; + if (!NewArgTy) { + const unsigned OldIndex = OldArg + 1; + auto OldAttrs = From.getParamAttributes(OldIndex); + if (OldAttrs.getNumSlots() == 0) { + return NewAttrs; + } + // move the params to the new index position: + unsigned OldSlot = 0; + for (; OldSlot < OldAttrs.getNumSlots(); ++OldSlot) { + if (OldAttrs.getSlotIndex(OldSlot) == OldIndex) { + break; + } + } + assert(OldSlot != OldAttrs.getNumSlots()); + AttrBuilder B(AttributeSet(), NewIndex); + for (auto II = OldAttrs.begin(OldSlot), IE = OldAttrs.end(OldSlot); + II != IE; ++II) { + B.addAttribute(*II); + } + auto Attrs = AttributeSet::get(C, NewIndex, B); + NewAttrs = NewAttrs.addAttributes(C, NewIndex, Attrs); + return NewAttrs; + } else { + NewAttrs = NewAttrs.addAttribute(C, NewIndex, Attribute::NonNull); + NewAttrs = NewAttrs.addAttribute(C, NewIndex, Attribute::NoCapture); + NewAttrs = NewAttrs.addAttribute(C, NewIndex, Attribute::ReadOnly); + if (NewArgTy->isSized()) { + NewAttrs = NewAttrs.addDereferenceableAttr(C, NewIndex, + DL.getTypeAllocSize(NewArgTy)); + } + return NewAttrs; + } +} +// TODO (mtrofin): is this comprehensive? +template +void CopyCallAttributesAndMetadata(TCall *Orig, TCall *NewCall) { + NewCall->setCallingConv(Orig->getCallingConv()); + NewCall->setAttributes(NewCall->getAttributes().addAttributes( + Orig->getContext(), AttributeSet::FunctionIndex, + Orig->getAttributes().getFnAttributes())); + NewCall->takeName(Orig); +} +static InvokeInst *CreateCallFrom(InvokeInst *Orig, Value *Target, + ArrayRef &Args, + IRBuilder<> &Builder) { + auto Ret = Builder.CreateInvoke(Target, Orig->getNormalDest(), + Orig->getUnwindDest(), Args); + CopyCallAttributesAndMetadata(Orig, Ret); + return Ret; +} +static CallInst *CreateCallFrom(CallInst *Orig, Value *Target, + ArrayRef &Args, IRBuilder<> &Builder) { + CallInst *Ret = Builder.CreateCall(Target, Args); + Ret->setTailCallKind(Orig->getTailCallKind()); + CopyCallAttributesAndMetadata(Orig, Ret); + return Ret; +} +// Insert Alloca at a specified location (normally, beginning of function) +// to avoid memory leaks if reason for inserting the Alloca +// (typically a call/invoke) is in a loop. +static AllocaInst *InsertAllocaAtLocation(IRBuilder<> &Builder, + BasicBlock::iterator &AllocaInsPoint, + Type *ValType) { + auto SavedInsPoint = Builder.GetInsertPoint(); + Builder.SetInsertPoint(&*AllocaInsPoint); + auto *Alloca = Builder.CreateAlloca(ValType); + AllocaInsPoint = Builder.GetInsertPoint(); + Builder.SetInsertPoint(&*SavedInsPoint); + return Alloca; +} +// Fix a call site by handing return type changes and/or parameter type and +// attribute changes. 
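+//
+// For illustration (hypothetical IR): a call returning a struct register,
+//
+//   %r = call %pair @f(%pair %v)
+//
+// is rewritten along the lines of
+//
+//   %r = alloca %pair                       ; in the entry block
+//   %v.ptr = alloca %pair                   ; in the entry block
+//   store %pair %v, %pair* %v.ptr
+//   call void @f(%pair* %r, %pair* %v.ptr)  ; sret/nocapture attrs re-added
+//   %r.sreg = load %pair, %pair* %r
+//
+// and all users of the original call are redirected to %r.sreg.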
+template +void SimplifyStructRegSignatures::fixCallSite(LLVMContext &Ctx, TCall *OldCall, + unsigned PreferredAlignment) { + Value *NewTarget = OldCall->getCalledValue(); + bool IsTargetFunction = false; + if (Function *CalledFunc = dyn_cast(NewTarget)) { + NewTarget = this->FunctionMap[CalledFunc]; + IsTargetFunction = true; + } + assert(NewTarget); + auto *NewType = cast( + Mapper.getSimpleType(Ctx, NewTarget->getType())->getPointerElementType()); + IRBuilder<> Builder(OldCall); + if (!IsTargetFunction) { + NewTarget = Builder.CreateBitCast(NewTarget, NewType->getPointerTo()); + } + auto *OldRetType = OldCall->getType(); + const bool IsSRet = + !OldCall->getType()->isVoidTy() && NewType->getReturnType()->isVoidTy(); + auto AllocaInsPoint = + OldCall->getParent()->getParent()->getEntryBlock().getFirstInsertionPt(); + if (IsSRet) { + auto *Alloca = InsertAllocaAtLocation(Builder, AllocaInsPoint, OldRetType); + Alloca->takeName(OldCall); + Alloca->setAlignment(PreferredAlignment); + auto *NewCall = fixCallTargetAndArguments(Ctx, Builder, OldCall, NewTarget, + NewType, AllocaInsPoint, Alloca); + assert(NewCall); + if (auto *Invoke = dyn_cast(OldCall)) + Builder.SetInsertPoint(&*Invoke->getNormalDest()->getFirstInsertionPt()); + auto *Load = Builder.CreateLoad(Alloca, Alloca->getName() + ".sreg"); + Load->setAlignment(Alloca->getAlignment()); + OldCall->replaceAllUsesWith(Load); + } else { + auto *NewCall = fixCallTargetAndArguments(Ctx, Builder, OldCall, NewTarget, + NewType, AllocaInsPoint); + OldCall->replaceAllUsesWith(NewCall); + } + OldCall->eraseFromParent(); +} +template +TCall *SimplifyStructRegSignatures::fixCallTargetAndArguments( + LLVMContext &Ctx, IRBuilder<> &Builder, TCall *OldCall, Value *NewTarget, + FunctionType *NewType, BasicBlock::iterator AllocaInsPoint, + Value *ExtraArg) { + SmallVector NewArgs; + const DataLayout &DL = OldCall->getParent() // BB + ->getParent() // F + ->getParent() // M + ->getDataLayout(); + const AttributeSet OldSet = OldCall->getAttributes(); + unsigned argOffset = ExtraArg ? 1 : 0; + const unsigned RetSlot = AttributeSet::ReturnIndex + argOffset; + if (ExtraArg) + NewArgs.push_back(ExtraArg); + AttributeSet NewSet = + CopyRetAttributes(Ctx, DL, OldSet, OldCall->getType(), RetSlot); + // Go over the argument list used in the call/invoke, in order to + // correctly deal with varargs scenarios. + unsigned NumActualParams = OldCall->getNumArgOperands(); + unsigned VarargMark = NewType->getNumParams(); + for (unsigned ArgPos = 0; ArgPos < NumActualParams; ArgPos++) { + Use &OldArgUse = OldCall->getOperandUse(ArgPos); + Value *OldArg = OldArgUse; + Type *OldArgType = OldArg->getType(); + unsigned NewArgPos = OldArgUse.getOperandNo() + argOffset; + Type *NewArgType = NewArgPos < VarargMark ? NewType->getFunctionParamType(NewArgPos) : nullptr; + Type *InnerNewArgType = nullptr; + if (OldArgType != NewArgType && shouldPromote(OldArgType)) { + if (NewArgPos >= VarargMark) { + errs() << *OldCall << '\n'; + report_fatal_error("Aggregate register vararg is not supported"); + } + auto *Alloca = + InsertAllocaAtLocation(Builder, AllocaInsPoint, OldArgType); + Alloca->setName(OldArg->getName() + ".ptr"); + Builder.CreateStore(OldArg, Alloca); + NewArgs.push_back(Alloca); + InnerNewArgType = NewArgType->getPointerElementType(); + } else if (NewArgType && OldArgType != NewArgType && isa(OldArg)) { + // If a function pointer has a changed type due to struct reg changes, it will still have + // the wrong type here, since we may have not changed that method yet. 
We'll fix it up + // later, and meanwhile place an undef of the right type in that slot. + Value *Temp = UndefValue::get(NewArgType); + FunctionAddressings.emplace_back(Temp, cast(OldArg)); + NewArgs.push_back(Temp); + } else if (NewArgType && OldArgType != NewArgType && OldArgType->isPointerTy()) { + // This would be a function ptr or would have a function type nested in + // it. + NewArgs.push_back(Builder.CreatePointerCast(OldArg, NewArgType)); + } else { + NewArgs.push_back(OldArg); + } + NewSet = CopyArgAttributes(NewSet, Ctx, DL, OldSet, ArgPos, InnerNewArgType, + RetSlot); + } + + if (isa(NewTarget)) { + Type* NewPointerType = PointerType::get(NewType, 0); + if (NewPointerType != OldCall->getType()) { + // This is a function pointer, and it has the wrong type after our + // changes. Bitcast it. + NewTarget = Builder.CreateBitCast(NewTarget, NewPointerType, ".casttarget"); + } + } + + ArrayRef ArrRef = NewArgs; + TCall *NewCall = CreateCallFrom(OldCall, NewTarget, ArrRef, Builder); + NewCall->setAttributes(NewSet); + return NewCall; +} +void +SimplifyStructRegSignatures::scheduleInstructionsForCleanup(Function *NewFunc) { + for (auto &BBIter : NewFunc->getBasicBlockList()) { + for (auto &IIter : BBIter.getInstList()) { + if (CallInst *Call = dyn_cast(&IIter)) { + if (Function *F = dyn_cast(Call->getCalledValue())) { + if (F->isIntrinsic()) { + continue; + } + } + CallsToPatch.insert(Call); + } else if (InvokeInst *Invoke = dyn_cast(&IIter)) { + InvokesToPatch.insert(Invoke); + } + } + } +} +// Change function body in the light of type changes. +void SimplifyStructRegSignatures::fixFunctionBody(LLVMContext &Ctx, + Function *OldFunc, + Function *NewFunc) { + const DataLayout &DL = OldFunc->getParent()->getDataLayout(); + bool returnWasFixed = shouldPromote(OldFunc->getReturnType()); + const AttributeSet OldSet = OldFunc->getAttributes(); + const unsigned RetSlot = AttributeSet::ReturnIndex + (returnWasFixed ? 1 : 0); + AttributeSet NewSet = + CopyRetAttributes(Ctx, DL, OldSet, OldFunc->getReturnType(), RetSlot); + Instruction *InsPoint = &*NewFunc->begin()->begin(); + auto NewArgIter = NewFunc->arg_begin(); + // Advance one more if we used to return a struct register. + if (returnWasFixed) + NewArgIter++; + // Wire new parameters in. + unsigned ArgIndex = 0; + for (auto ArgIter = OldFunc->arg_begin(), E = OldFunc->arg_end(); + E != ArgIter; ArgIndex++) { + Argument *OldArg = &*ArgIter++; + Argument *NewArg = &*NewArgIter++; + const bool IsAggregateToPtr = + shouldPromote(OldArg->getType()) && NewArg->getType()->isPointerTy(); + if (!NewFunc->empty()) { + ConvertArgumentValue(OldArg, NewArg, InsPoint, IsAggregateToPtr); + } + Type *Inner = nullptr; + if (IsAggregateToPtr) { + Inner = NewArg->getType()->getPointerElementType(); + } + NewSet = + CopyArgAttributes(NewSet, Ctx, DL, OldSet, ArgIndex, Inner, RetSlot); + } + NewFunc->setAttributes(NewSet); + // Now fix instruction types. We know that each value could only possibly be + // of a simplified type. 
At the end of this, call sites will be invalid, but + // we handle that afterwards, to make sure we have all the functions changed + // first (so that calls have valid targets) + for (auto BBIter = NewFunc->begin(), LBlock = NewFunc->end(); + LBlock != BBIter;) { + auto Block = BBIter++; + for (auto IIter = Block->begin(), LIns = Block->end(); LIns != IIter;) { + auto Instr = IIter++; + auto *NewTy = Mapper.getSimpleType(Ctx, Instr->getType()); + Instr->mutateType(NewTy); + if (isa(Instr) || + isa(Instr)) { + continue; + } + for (unsigned OpI = 0; OpI < Instr->getNumOperands(); OpI++) { + if(Constant *C = dyn_cast(Instr->getOperand(OpI))) { + auto *NewTy = Mapper.getSimpleType(Ctx, C->getType()); + if (NewTy == C->getType()) { continue; } + const auto CastOp = CastInst::getCastOpcode(C, false, NewTy, false); + auto *NewOp = ConstantExpr::getCast(CastOp, C, NewTy); + Instr->setOperand(OpI, NewOp); + } + } + } + } + if (returnWasFixed) + FixReturn(OldFunc, NewFunc); +} +// Ensure function is simplified, returning true if the function +// had to be changed. +bool SimplifyStructRegSignatures::simplifyFunction( + LLVMContext &Ctx, Function *OldFunc) { + auto *OldFT = OldFunc->getFunctionType(); + auto *NewFT = cast(Mapper.getSimpleType(Ctx, OldFT)); + Function *&AssociatedFctLoc = FunctionMap[OldFunc]; + if (NewFT != OldFT) { + auto *NewFunc = Function::Create(NewFT, OldFunc->getLinkage()); + AssociatedFctLoc = NewFunc; + OldFunc->getParent()->getFunctionList().insert(OldFunc->getIterator(), NewFunc); + NewFunc->takeName(OldFunc); + UpdateArgNames(OldFunc, NewFunc); + NewFunc->getBasicBlockList().splice(NewFunc->begin(), + OldFunc->getBasicBlockList()); + fixFunctionBody(Ctx, OldFunc, NewFunc); + Constant *Cast = ConstantExpr::getPointerCast(NewFunc, OldFunc->getType()); + OldFunc->replaceAllUsesWith(Cast); + FunctionsToDelete.insert(OldFunc); + } else { + AssociatedFctLoc = OldFunc; + } + scheduleInstructionsForCleanup(AssociatedFctLoc); + return NewFT != OldFT; +} +bool SimplifyStructRegSignatures::runOnModule(Module &M) { + bool Changed = false; + unsigned PreferredAlignment = 0; + PreferredAlignment = M.getDataLayout().getStackAlignment(); + LLVMContext &Ctx = M.getContext(); + // Change function signatures and fix a changed function body by + // wiring the new arguments. Call sites are unchanged at this point. + for (Module::iterator Iter = M.begin(), E = M.end(); Iter != E;) { + Function *Func = &*Iter++; + if (Func->isIntrinsic()) { + // Can't rewrite intrinsics. + continue; + } + checkNoUnsupportedInstructions(Ctx, Func); + Changed |= simplifyFunction(Ctx, Func); + } + // Fix call sites. + for (auto &CallToFix : CallsToPatch) { + fixCallSite(Ctx, CallToFix, PreferredAlignment); + } + for (auto &InvokeToFix : InvokesToPatch) { + fixCallSite(Ctx, InvokeToFix, PreferredAlignment); + } + + // Update taking of a function's address from a parameter + for (auto &Addressing : FunctionAddressings) { + Value *Temp = Addressing.Temp; + Function *Old = Addressing.Old; + Function *New = FunctionMap[Old]; + assert(New); + Temp->replaceAllUsesWith(New); + } + + // Remaining uses of functions we modified (like in a global vtable) + // can be handled via a constantexpr bitcast + for (auto &Old : FunctionsToDelete) { + Function *New = FunctionMap[Old]; + assert(New); + Old->replaceAllUsesWith(ConstantExpr::getBitCast(New, Old->getType())); + } + + // Delete leftover functions - the ones with old signatures. 
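+  // (Deferred until this point so that every remaining use has already been
+  // rewritten to a constantexpr bitcast of the replacement function above,
+  // and eraseFromParent() does not leave dangling references.)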
+ for (auto &ToDelete : FunctionsToDelete) { + ToDelete->eraseFromParent(); + } + return Changed; +} +void +SimplifyStructRegSignatures::checkNoUnsupportedInstructions(LLVMContext &Ctx, + Function *Fct) { + for (auto &BB : Fct->getBasicBlockList()) { + for (auto &Inst : BB.getInstList()) { + if (auto *Landing = dyn_cast(&Inst)) { + auto *LType = Fct->getPersonalityFn()->getType(); + if (LType != Mapper.getSimpleType(Ctx, LType)) { + errs() << *Landing << '\n'; + report_fatal_error("Landing pads with aggregate register " + "signatures are not supported."); + } + } else if (auto *Resume = dyn_cast(&Inst)) { + auto *RType = Resume->getValue()->getType(); + if (RType != Mapper.getSimpleType(Ctx, RType)) { + errs() << *Resume << '\n'; + report_fatal_error( + "Resumes with aggregate register signatures are not supported."); + } + } + } + } +} +ModulePass *llvm::createSimplifyStructRegSignaturesPass() { + return new SimplifyStructRegSignatures(); +} diff --git a/lib/Target/JSBackend/NaCl/StripAttributes.cpp b/lib/Target/JSBackend/NaCl/StripAttributes.cpp new file mode 100644 index 000000000000..97051b277e47 --- /dev/null +++ b/lib/Target/JSBackend/NaCl/StripAttributes.cpp @@ -0,0 +1,246 @@ +//===- StripAttributes.cpp - Remove attributes not supported by PNaCl------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass strips out attributes that are not supported by PNaCl's +// stable ABI. Currently, this strips out: +// +// * Function and argument attributes from functions and function +// calls. +// * Calling conventions from functions and function calls. +// * The "align" attribute on functions. +// * The "unnamed_addr" attribute on functions and global variables. +// * The distinction between "internal" and "private" linkage. +// * "protected" and "internal" visibility of functions and globals. +// * All sections are stripped. A few sections cause warnings. +// * The arithmetic attributes "nsw", "nuw" and "exact". +// +//===----------------------------------------------------------------------===// + +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" +#include "llvm/Pass.h" +#include "llvm/IR/CallSite.h" +#include "llvm/Transforms/NaCl.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +namespace { + // This is a ModulePass so that it can modify attributes of global + // variables. 
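+  //
+  // For example (illustrative, not exhaustive): a definition such as
+  //
+  //   define internal fastcc i32 @f(i32 inreg %x) alwaysinline align 16 { ... }
+  //
+  // ends up as roughly
+  //
+  //   define internal i32 @f(i32 %x) { ... }
+  //
+  // with the attributes dropped, the calling convention reset to C, and the
+  // function alignment cleared; visibility, sections and unnamed_addr are
+  // normalized separately in stripGlobalValueAttrs().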
+ class StripAttributes : public ModulePass { + public: + static char ID; // Pass identification, replacement for typeid + StripAttributes() : ModulePass(ID) { + initializeStripAttributesPass(*PassRegistry::getPassRegistry()); + } + + bool runOnModule(Module &M) override; + }; +} + +char StripAttributes::ID = 0; +INITIALIZE_PASS(StripAttributes, "nacl-strip-attributes", + "Strip out attributes that are not part of PNaCl's ABI", + false, false) + +static void CheckAttributes(AttributeSet Attrs) { + for (unsigned Slot = 0; Slot < Attrs.getNumSlots(); ++Slot) { + for (AttributeSet::iterator Attr = Attrs.begin(Slot), E = Attrs.end(Slot); + Attr != E; ++Attr) { + if (!Attr->isEnumAttribute()) { + continue; + } + switch (Attr->getKindAsEnum()) { + // The vast majority of attributes are hints that can safely + // be removed, so don't complain if we see attributes we don't + // recognize. + default: + + // The following attributes can affect calling conventions. + // Rather than complaining, we just strip these out. + // ExpandSmallArguments should have rendered SExt/ZExt + // meaningless since the function arguments will be at least + // 32-bit. + case Attribute::InReg: + case Attribute::SExt: + case Attribute::ZExt: + // These attributes influence ABI decisions that should not be + // visible to PNaCl pexes. + case Attribute::NonLazyBind: // Only relevant to dynamic linking. + case Attribute::NoRedZone: + case Attribute::StackAlignment: + + // The following attributes are just hints, which can be + // safely removed. + case Attribute::AlwaysInline: + case Attribute::InlineHint: + case Attribute::MinSize: + case Attribute::NoAlias: + case Attribute::NoBuiltin: + case Attribute::NoCapture: + case Attribute::NoDuplicate: + case Attribute::NoImplicitFloat: + case Attribute::NoInline: + case Attribute::NoReturn: + case Attribute::OptimizeForSize: + case Attribute::ReadNone: + case Attribute::ReadOnly: + + // PNaCl does not support -fstack-protector in the translator. + case Attribute::StackProtect: + case Attribute::StackProtectReq: + case Attribute::StackProtectStrong: + // PNaCl does not support ASan in the translator. + case Attribute::SanitizeAddress: + case Attribute::SanitizeThread: + case Attribute::SanitizeMemory: + + // The Language References cites setjmp() as an example of a + // function which returns twice, and says ReturnsTwice is + // necessary to disable optimizations such as tail calls. + // However, in the PNaCl ABI, setjmp() is an intrinsic, and + // user-defined functions are not allowed to return twice. + case Attribute::ReturnsTwice: + + // NoUnwind is not a hint if it causes unwind info to be + // omitted, since this will prevent C++ exceptions from + // propagating. In the future, when PNaCl supports zero-cost + // C++ exception handling using unwind info, we might allow + // NoUnwind and UWTable. Alternatively, we might continue to + // disallow them, and just generate unwind info for all + // functions. + case Attribute::NoUnwind: + case Attribute::UWTable: + break; + + // A few attributes can change program behaviour if removed, + // so check for these. 
+ case Attribute::ByVal: + case Attribute::StructRet: + case Attribute::Alignment: + Attrs.dump(); + report_fatal_error( + "Attribute should already have been removed by ExpandByVal"); + + case Attribute::Naked: + case Attribute::Nest: + Attrs.dump(); + report_fatal_error("Unsupported attribute"); + } + } + } +} + +static const char* ShouldWarnAboutSection(const char* Section) { + static const char* SpecialSections[] = { + ".init_array", + ".init", + ".fini_array", + ".fini", + + // Java/LSB: + ".jcr", + + // LSB: + ".ctors", + ".dtors", + }; + + for (auto CheckSection : SpecialSections) { + if (strcmp(Section, CheckSection) == 0) { + return CheckSection; + } + } + + return nullptr; +} + +void stripGlobalValueAttrs(GlobalValue *GV) { + // In case source code uses __attribute__((visibility("hidden"))) or + // __attribute__((visibility("protected"))), strip these attributes. + GV->setVisibility(GlobalValue::DefaultVisibility); + + GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + + if (GV->hasSection()) { + const char *Section = GV->getSection().data(); + // check for a few special cases + if (const char *WarnSection = ShouldWarnAboutSection(Section)) { + errs() << "Warning: " << GV->getName() << + " will have its section (" << + WarnSection << ") stripped.\n"; + } + + if(GlobalObject* GO = dyn_cast(GV)) { + GO->setSection(""); + } + // Nothing we can do if GV isn't a GlobalObject. + } + + // Convert "private" linkage to "internal" to reduce the number of + // linkage types that need to be represented in PNaCl's wire format. + // + // We convert "private" to "internal" rather than vice versa because + // "private" symbols are omitted from the nexe's symbol table, which + // would get in the way of debugging when an unstripped pexe is + // translated offline. + if (GV->getLinkage() == GlobalValue::PrivateLinkage) + GV->setLinkage(GlobalValue::InternalLinkage); +} + +void stripFunctionAttrs(DataLayout *DL, Function *F) { + CheckAttributes(F->getAttributes()); + F->setAttributes(AttributeSet()); + F->setCallingConv(CallingConv::C); + F->setAlignment(0); + + for (BasicBlock &BB : *F) { + for (Instruction &I : BB) { + CallSite Call(&I); + if (Call) { + CheckAttributes(Call.getAttributes()); + Call.setAttributes(AttributeSet()); + Call.setCallingConv(CallingConv::C); + } else if (OverflowingBinaryOperator *Op = + dyn_cast(&I)) { + cast(Op)->setHasNoUnsignedWrap(false); + cast(Op)->setHasNoSignedWrap(false); + } else if (PossiblyExactOperator *Op = + dyn_cast(&I)) { + cast(Op)->setIsExact(false); + } + } + } +} + +bool StripAttributes::runOnModule(Module &M) { + DataLayout DL(&M); + for (Function &F : M) + // Avoid stripping attributes from intrinsics because the + // constructor for Functions just adds them back again. It would + // be confusing if the attributes were sometimes present on + // intrinsics and sometimes not. 
+ if (!F.isIntrinsic()) { + stripGlobalValueAttrs(&F); + stripFunctionAttrs(&DL, &F); + } + + for (GlobalVariable &GV : M.globals()) + stripGlobalValueAttrs(&GV); + + return true; +} + +ModulePass *llvm::createStripAttributesPass() { + return new StripAttributes(); +} diff --git a/lib/Target/JSBackend/NaCl/StripMetadata.cpp b/lib/Target/JSBackend/NaCl/StripMetadata.cpp new file mode 100644 index 000000000000..bf8eb92c699d --- /dev/null +++ b/lib/Target/JSBackend/NaCl/StripMetadata.cpp @@ -0,0 +1,163 @@ +//===- StripMetadata.cpp - Strip non-stable non-debug metadata ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// The StripMetadata transformation strips instruction attachment +// metadata, such as !tbaa and !prof metadata. +// TODO: Strip NamedMetadata too. +// +// It does not strip debug metadata. Debug metadata is used by debug +// intrinsic functions and calls to those intrinsic functions. Use the +// -strip-debug or -strip pass to strip that instead. +// +// The goal of this pass is to reduce bitcode ABI surface area. +// We don't know yet which kind of metadata is considered stable. +//===----------------------------------------------------------------------===// + +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Transforms/NaCl.h" + +using namespace llvm; + +namespace { + class StripMetadata : public ModulePass { + public: + static char ID; + StripMetadata() : ModulePass(ID), ShouldStripModuleFlags(false) { + initializeStripMetadataPass(*PassRegistry::getPassRegistry()); + } + + virtual bool runOnModule(Module &M); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + } + + protected: + bool ShouldStripModuleFlags; + }; + + class StripModuleFlags : public StripMetadata { + public: + static char ID; + StripModuleFlags() : StripMetadata() { + initializeStripModuleFlagsPass(*PassRegistry::getPassRegistry()); + ShouldStripModuleFlags = true; + } + }; + +// In certain cases, linked bitcode files can have DISupbrogram metadata which +// points to a Function that has no dbg attachments. This causes problem later +// (e.g. in inlining). See https://llvm.org/bugs/show_bug.cgi?id=23874 +// Until that bug is fixed upstream (the fix will involve infrastructure that we +// don't have in our branch yet) we have to ensure we don't expose this case +// to further optimizations. So we'd like to strip out such debug info. +// Unfortunately once created the metadata is not easily deleted or even +// modified; the best we can easily do is to set the Function object it points +// to to null. Fortunately this is legitimate (declarations have no Function +// either) and should be workable until the fix lands. 
+class StripDanglingDISubprograms : public ModulePass { + public: + static char ID; + StripDanglingDISubprograms() : ModulePass(ID) { + initializeStripDanglingDISubprogramsPass(*PassRegistry::getPassRegistry()); + } + bool runOnModule(Module &M) override; +}; +} + +char StripMetadata::ID = 0; +INITIALIZE_PASS(StripMetadata, "strip-metadata", + "Strip all non-stable non-debug metadata from a module.", + false, false) + +char StripModuleFlags::ID = 0; +INITIALIZE_PASS(StripModuleFlags, "strip-module-flags", + "Strip all non-stable non-debug metadata from a module, " + "including the llvm.module.flags metadata.", + false, false) + +char StripDanglingDISubprograms::ID = 0; +INITIALIZE_PASS(StripDanglingDISubprograms, "strip-dangling-disubprograms", + "Strip DISubprogram metadata for functions with no debug info", + false, false) + +ModulePass *llvm::createStripMetadataPass() { + return new StripMetadata(); +} + +ModulePass *llvm::createStripModuleFlagsPass() { + return new StripModuleFlags(); +} + +ModulePass *llvm::createStripDanglingDISubprogramsPass() { + return new StripDanglingDISubprograms(); +} + +static bool IsWhitelistedMetadata(const NamedMDNode *node, + bool StripModuleFlags) { + // Leave debug metadata to the -strip-debug pass. + return (node->getName().startswith("llvm.dbg.") || + // "Debug Info Version" is in llvm.module.flags. + (!StripModuleFlags && node->getName().equals("llvm.module.flags"))); +} + +static bool DoStripMetadata(Module &M, bool StripModuleFlags) { + bool Changed = false; + + if (!StripModuleFlags) + for (Function &F : M) + for (BasicBlock &B : F) + for (Instruction &I : B) { + SmallVector, 8> InstMeta; + // Let the debug metadata be stripped by the -strip-debug pass. + I.getAllMetadataOtherThanDebugLoc(InstMeta); + for (size_t i = 0; i < InstMeta.size(); ++i) { + I.setMetadata(InstMeta[i].first, NULL); + Changed = true; + } + } + + // Strip unsupported named metadata. + SmallVector ToErase; + for (Module::NamedMDListType::iterator I = M.named_metadata_begin(), + E = M.named_metadata_end(); I != E; ++I) { + if (!IsWhitelistedMetadata(&*I, StripModuleFlags)) + ToErase.push_back(&*I); + } + for (size_t i = 0; i < ToErase.size(); ++i) + M.eraseNamedMetadata(ToErase[i]); + + return Changed; +} + +bool StripMetadata::runOnModule(Module &M) { + return DoStripMetadata(M, ShouldStripModuleFlags); +} + +static bool functionHasDbgAttachment(const Function &F) { + for (const BasicBlock &BB : F) { + for (const Instruction &I : BB) { + if (I.getDebugLoc()) { + return true; + } + } + } + return false; +} + +bool StripDanglingDISubprograms::runOnModule(Module &M) { + NamedMDNode *CU_Nodes = M.getNamedMetadata("llvm.dbg.cu"); + if (!CU_Nodes) + return false; + + return false; // TODO: we don't need this anymore +} diff --git a/lib/Target/JSBackend/OptPasses.h b/lib/Target/JSBackend/OptPasses.h new file mode 100644 index 000000000000..86e3c707e1c3 --- /dev/null +++ b/lib/Target/JSBackend/OptPasses.h @@ -0,0 +1,24 @@ +//===-- JSTargetMachine.h - TargetMachine for the JS Backend ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===---------------------------------------------------------------------===// + +#ifndef OPT_PASSES_H +#define OPT_PASSES_H + +#include "llvm/Pass.h" + +namespace llvm { + + extern FunctionPass *createEmscriptenSimplifyAllocasPass(); + extern ModulePass *createEmscriptenRemoveLLVMAssumePass(); + extern FunctionPass *createEmscriptenExpandBigSwitchesPass(); + +} // End llvm namespace + +#endif + diff --git a/lib/Target/JSBackend/Relooper.cpp b/lib/Target/JSBackend/Relooper.cpp new file mode 100644 index 000000000000..00415f2d27e9 --- /dev/null +++ b/lib/Target/JSBackend/Relooper.cpp @@ -0,0 +1,1432 @@ +// We are implementing the Relooper C API, so always export from this file. +#ifndef RELOOPERDLL_EXPORTS +#define RELOOPERDLL_EXPORTS +#endif + +#include "Relooper.h" + +#include +#include +#include +#include +#include + +// uncomment these out to get LLVM errs() debugging support +//#include +//using namespace llvm; + +template static bool contains(const T& container, const U& contained) { + return container.count(contained); +} + +#if DEBUG +static void PrintDebug(const char *Format, ...); +#define DebugDump(x, ...) Debugging::Dump(x, __VA_ARGS__) +#else +#define PrintDebug(x, ...) +#define DebugDump(x, ...) +#endif + +#define INDENTATION 1 + +struct Indenter { + static int CurrIndent; + + static void Indent() { CurrIndent++; } + static void Unindent() { CurrIndent--; } +}; + +static void PrintIndented(const char *Format, ...); +static void PutIndented(const char *String); + +static char *OutputBufferRoot = NULL; +static char *OutputBuffer = NULL; +static int OutputBufferSize = 0; +static int OutputBufferOwned = false; + +static int LeftInOutputBuffer() { + return OutputBufferSize - (OutputBuffer - OutputBufferRoot); +} + +static bool EnsureOutputBuffer(int Needed) { // ensures the output buffer is sufficient. returns true is no problem happened + Needed++; // ensure the trailing \0 is not forgotten + int Left = LeftInOutputBuffer(); + if (!OutputBufferOwned) { + assert(Needed < Left); + } else { + // we own the buffer, and can resize if necessary + if (Needed >= Left) { + int Offset = OutputBuffer - OutputBufferRoot; + int TotalNeeded = OutputBufferSize + Needed - Left + 10240; + int NewSize = OutputBufferSize; + while (NewSize < TotalNeeded) NewSize = NewSize + (NewSize/2); + //printf("resize %d => %d\n", OutputBufferSize, NewSize); + OutputBufferRoot = (char*)realloc(OutputBufferRoot, NewSize); + assert(OutputBufferRoot); + OutputBuffer = OutputBufferRoot + Offset; + OutputBufferSize = NewSize; + return false; + } + } + return true; +} + +void PrintIndented(const char *Format, ...) { + assert(OutputBuffer); + EnsureOutputBuffer(Indenter::CurrIndent*INDENTATION); + for (int i = 0; i < Indenter::CurrIndent*INDENTATION; i++, OutputBuffer++) *OutputBuffer = ' '; + int Written; + while (1) { // write and potentially resize buffer until we have enough room + int Left = LeftInOutputBuffer(); + va_list Args; + va_start(Args, Format); + Written = vsnprintf(OutputBuffer, Left, Format, Args); + va_end(Args); +#ifdef _MSC_VER + // VC CRT specific: vsnprintf returns -1 on failure, other runtimes return the number of characters that would have been + // written. On VC, if we get -1, count the number of characters manually. 
+ if (Written < 0) { + va_start(Args, Format); + Written = _vscprintf(Format, Args); + va_end(Args); + } +#endif + + if (EnsureOutputBuffer(Written)) break; + } + OutputBuffer += Written; +} + +void PutIndented(const char *String) { + assert(OutputBuffer); + EnsureOutputBuffer(Indenter::CurrIndent*INDENTATION); + for (int i = 0; i < Indenter::CurrIndent*INDENTATION; i++, OutputBuffer++) *OutputBuffer = ' '; + int Needed = strlen(String)+1; + EnsureOutputBuffer(Needed); + strcpy(OutputBuffer, String); + OutputBuffer += strlen(String); + *OutputBuffer++ = '\n'; + *OutputBuffer = 0; +} + +static int AsmJS = 0; + +// Indenter + +int Indenter::CurrIndent = 1; + +// Branch + +Branch::Branch(const char *ConditionInit, const char *CodeInit) : Ancestor(NULL), Labeled(true) { + Condition = ConditionInit ? strdup(ConditionInit) : NULL; + Code = CodeInit ? strdup(CodeInit) : NULL; +} + +Branch::~Branch() { + free(static_cast(const_cast(Condition))); + free(static_cast(const_cast(Code))); +} + +void Branch::Render(Block *Target, bool SetLabel) { + if (Code) PrintIndented("%s\n", Code); + if (SetLabel) PrintIndented("label = %d;\n", Target->Id); + if (Ancestor) { + if (Type == Break || Type == Continue) { + if (Labeled) { + PrintIndented("%s L%d;\n", Type == Break ? "break" : "continue", Ancestor->Id); + } else { + PrintIndented("%s;\n", Type == Break ? "break" : "continue"); + } + } + } +} + +// Block + +Block::Block(const char *CodeInit, const char *BranchVarInit) : Parent(NULL), Id(-1), IsCheckedMultipleEntry(false) { + Code = strdup(CodeInit); + BranchVar = BranchVarInit ? strdup(BranchVarInit) : NULL; +} + +Block::~Block() { + free(static_cast(const_cast(Code))); + free(static_cast(const_cast(BranchVar))); + for (BlockBranchMap::iterator iter = ProcessedBranchesOut.begin(); iter != ProcessedBranchesOut.end(); iter++) { + delete iter->second; + } +} + +void Block::AddBranchTo(Block *Target, const char *Condition, const char *Code) { + assert(!contains(BranchesOut, Target)); // cannot add more than one branch to the same target + BranchesOut[Target] = new Branch(Condition, Code); +} + +void Block::Render(bool InLoop) { + if (IsCheckedMultipleEntry && InLoop) { + PrintIndented("label = 0;\n"); + } + + if (Code) { + // Print code in an indented manner, even over multiple lines + char *Start = const_cast(Code); + while (*Start) { + char *End = strchr(Start, '\n'); + if (End) *End = 0; + PutIndented(Start); + if (End) *End = '\n'; else break; + Start = End+1; + } + } + + if (!ProcessedBranchesOut.size()) return; + + bool SetLabel = true; // in some cases it is clear we can avoid setting label, see later + bool ForceSetLabel = Shape::IsEmulated(Parent); + + // A setting of the label variable (label = x) is necessary if it can + // cause an impact. The main case is where we set label to x, then elsewhere + // we check if label is equal to that value, i.e., that label is an entry + // in a multiple block. We also need to reset the label when we enter + // that block, so that each setting is a one-time action: consider + // + // while (1) { + // if (check) label = 1; + // if (label == 1) { label = 0 } + // } + // + // (Note that this case is impossible due to fusing, but that is not + // material here.) So setting to 0 is important just to clear the 1 for + // future iterations. + // TODO: When inside a loop, if necessary clear the label variable + // once on the top, and never do settings that are in effect clears + + // Fusing: If the next is a Multiple, we can fuse it with this block. 
Note + // that we must be the Inner of a Simple, so fusing means joining a Simple + // to a Multiple. What happens there is that all options in the Multiple + // *must* appear in the Simple (the Simple is the only one reaching the + // Multiple), so we can remove the Multiple and add its independent groups + // into the Simple's branches. + MultipleShape *Fused = Shape::IsMultiple(Parent->Next); + if (Fused) { + PrintDebug("Fusing Multiple to Simple\n", 0); + Parent->Next = Parent->Next->Next; + Fused->UseSwitch = false; // TODO: emit switches here + Fused->RenderLoopPrefix(); + + // When the Multiple has the same number of groups as we have branches, + // they will all be fused, so it is safe to not set the label at all + if (SetLabel && Fused->InnerMap.size() == ProcessedBranchesOut.size()) { + SetLabel = false; + } + } + + Block *DefaultTarget(NULL); // The block we branch to without checking the condition, if none of the other conditions held. + + // Find the default target, the one without a condition + for (BlockBranchMap::iterator iter = ProcessedBranchesOut.begin(); iter != ProcessedBranchesOut.end(); iter++) { + if (!iter->second->Condition) { + assert(!DefaultTarget); // Must be exactly one default + DefaultTarget = iter->first; + } + } + assert(DefaultTarget); // Since each block *must* branch somewhere, this must be set + + bool useSwitch = BranchVar != NULL; + + if (useSwitch) { + PrintIndented("switch (%s) {\n", BranchVar); + } + + std::string RemainingConditions; + bool First = !useSwitch; // when using a switch, there is no special first + for (BlockBranchMap::iterator iter = ProcessedBranchesOut.begin();; iter++) { + Block *Target; + Branch *Details; + if (iter != ProcessedBranchesOut.end()) { + Target = iter->first; + if (Target == DefaultTarget) continue; // done at the end + Details = iter->second; + assert(Details->Condition); // must have a condition if this is not the default target + } else { + Target = DefaultTarget; + Details = ProcessedBranchesOut[DefaultTarget]; + } + bool SetCurrLabel = (SetLabel && Target->IsCheckedMultipleEntry) || ForceSetLabel; + bool HasFusedContent = Fused && contains(Fused->InnerMap, Target->Id); + bool HasContent = SetCurrLabel || Details->Type != Branch::Direct || HasFusedContent || Details->Code; + if (iter != ProcessedBranchesOut.end()) { + // If there is nothing to show in this branch, omit the condition + if (useSwitch) { + PrintIndented("%s {\n", Details->Condition); + } else { + if (HasContent) { + PrintIndented("%sif (%s) {\n", First ? 
"" : "} else ", Details->Condition); + First = false; + } else { + if (RemainingConditions.size() > 0) RemainingConditions += " && "; + RemainingConditions += "!("; + if (BranchVar) { + RemainingConditions += BranchVar; + RemainingConditions += " == "; + } + RemainingConditions += Details->Condition; + RemainingConditions += ")"; + } + } + } else { + // this is the default + if (useSwitch) { + PrintIndented("default: {\n"); + } else { + if (HasContent) { + if (RemainingConditions.size() > 0) { + if (First) { + PrintIndented("if (%s) {\n", RemainingConditions.c_str()); + First = false; + } else { + PrintIndented("} else if (%s) {\n", RemainingConditions.c_str()); + } + } else if (!First) { + PrintIndented("} else {\n"); + } + } + } + } + if (!First) Indenter::Indent(); + Details->Render(Target, SetCurrLabel); + if (HasFusedContent) { + Fused->InnerMap.find(Target->Id)->second->Render(InLoop); + } else if (Details->Type == Branch::Nested) { + // Nest the parent content here, and remove it from showing up afterwards as Next + assert(Parent->Next); + Parent->Next->Render(InLoop); + Parent->Next = NULL; + } + if (useSwitch && iter != ProcessedBranchesOut.end()) { + PrintIndented("break;\n"); + } + if (!First) Indenter::Unindent(); + if (useSwitch) { + PrintIndented("}\n"); + } + if (iter == ProcessedBranchesOut.end()) break; + } + if (!First) PrintIndented("}\n"); + + if (Fused) { + Fused->RenderLoopPostfix(); + } +} + +// MultipleShape + +void MultipleShape::RenderLoopPrefix() { + if (Breaks) { + if (UseSwitch) { + if (Labeled) { + PrintIndented("L%d: ", Id); + } + } else { + if (Labeled) { + PrintIndented("L%d: do {\n", Id); + } else { + PrintIndented("do {\n"); + } + Indenter::Indent(); + } + } +} + +void MultipleShape::RenderLoopPostfix() { + if (Breaks && !UseSwitch) { + Indenter::Unindent(); + PrintIndented("} while(0);\n"); + } +} + +void MultipleShape::Render(bool InLoop) { + RenderLoopPrefix(); + + if (!UseSwitch) { + // emit an if-else chain + bool First = true; + for (IdShapeMap::iterator iter = InnerMap.begin(); iter != InnerMap.end(); iter++) { + if (AsmJS) { + PrintIndented("%sif ((label|0) == %d) {\n", First ? "" : "else ", iter->first); + } else { + PrintIndented("%sif (label == %d) {\n", First ? 
"" : "else ", iter->first); + } + First = false; + Indenter::Indent(); + iter->second->Render(InLoop); + Indenter::Unindent(); + PrintIndented("}\n"); + } + } else { + // emit a switch + if (AsmJS) { + PrintIndented("switch (label|0) {\n"); + } else { + PrintIndented("switch (label) {\n"); + } + Indenter::Indent(); + for (IdShapeMap::iterator iter = InnerMap.begin(); iter != InnerMap.end(); iter++) { + PrintIndented("case %d: {\n", iter->first); + Indenter::Indent(); + iter->second->Render(InLoop); + PrintIndented("break;\n"); + Indenter::Unindent(); + PrintIndented("}\n"); + } + Indenter::Unindent(); + PrintIndented("}\n"); + } + + RenderLoopPostfix(); + if (Next) Next->Render(InLoop); +} + +// LoopShape + +void LoopShape::Render(bool InLoop) { + if (Labeled) { + PrintIndented("L%d: while(1) {\n", Id); + } else { + PrintIndented("while(1) {\n"); + } + Indenter::Indent(); + Inner->Render(true); + Indenter::Unindent(); + PrintIndented("}\n"); + if (Next) Next->Render(InLoop); +} + +// EmulatedShape + +void EmulatedShape::Render(bool InLoop) { + PrintIndented("label = %d;\n", Entry->Id); + if (Labeled) { + PrintIndented("L%d: ", Id); + } + PrintIndented("while(1) {\n"); + Indenter::Indent(); + PrintIndented("switch(label|0) {\n"); + Indenter::Indent(); + for (BlockSet::iterator iter = Blocks.begin(); iter != Blocks.end(); iter++) { + Block *Curr = *iter; + PrintIndented("case %d: {\n", Curr->Id); + Indenter::Indent(); + Curr->Render(InLoop); + PrintIndented("break;\n"); + Indenter::Unindent(); + PrintIndented("}\n"); + } + Indenter::Unindent(); + PrintIndented("}\n"); + Indenter::Unindent(); + PrintIndented("}\n"); + if (Next) Next->Render(InLoop); +} + +// Relooper + +Relooper::Relooper() : Root(NULL), Emulate(false), MinSize(false), BlockIdCounter(1), ShapeIdCounter(0) { // block ID 0 is reserved for clearings +} + +Relooper::~Relooper() { + for (unsigned i = 0; i < Blocks.size(); i++) delete Blocks[i]; + for (unsigned i = 0; i < Shapes.size(); i++) delete Shapes[i]; +} + +void Relooper::AddBlock(Block *New, int Id) { + New->Id = Id == -1 ? BlockIdCounter++ : Id; + Blocks.push_back(New); +} + +struct RelooperRecursor { + Relooper *Parent; + RelooperRecursor(Relooper *ParentInit) : Parent(ParentInit) {} +}; + +typedef std::list BlockList; + +void Relooper::Calculate(Block *Entry) { + // Scan and optimize the input + struct PreOptimizer : public RelooperRecursor { + PreOptimizer(Relooper *Parent) : RelooperRecursor(Parent) {} + BlockSet Live; + + void FindLive(Block *Root) { + BlockList ToInvestigate; + ToInvestigate.push_back(Root); + while (ToInvestigate.size() > 0) { + Block *Curr = ToInvestigate.front(); + ToInvestigate.pop_front(); + if (contains(Live, Curr)) continue; + Live.insert(Curr); + for (BlockBranchMap::iterator iter = Curr->BranchesOut.begin(); iter != Curr->BranchesOut.end(); iter++) { + ToInvestigate.push_back(iter->first); + } + } + } + + // If a block has multiple entries but no exits, and it is small enough, it is useful to split it. + // A common example is a C++ function where everything ends up at a final exit block and does some + // RAII cleanup. 
Without splitting, we will be forced to introduce labelled loops to allow + // reaching the final block + void SplitDeadEnds() { + unsigned TotalCodeSize = 0; + for (BlockSet::iterator iter = Live.begin(); iter != Live.end(); iter++) { + Block *Curr = *iter; + TotalCodeSize += strlen(Curr->Code); + } + BlockSet Splits; + BlockSet Removed; + //DebugDump(Live, "before"); + for (BlockSet::iterator iter = Live.begin(); iter != Live.end(); iter++) { + Block *Original = *iter; + if (Original->BranchesIn.size() <= 1 || Original->BranchesOut.size() > 0) continue; // only dead ends, for now + if (contains(Original->BranchesOut, Original)) continue; // cannot split a looping node + if (strlen(Original->Code)*(Original->BranchesIn.size()-1) > TotalCodeSize/5) continue; // if splitting increases raw code size by a significant amount, abort + // Split the node (for simplicity, we replace all the blocks, even though we could have reused the original) + PrintDebug("Splitting block %d\n", Original->Id); + for (BlockSet::iterator iter = Original->BranchesIn.begin(); iter != Original->BranchesIn.end(); iter++) { + Block *Prior = *iter; + Block *Split = new Block(Original->Code, Original->BranchVar); + Parent->AddBlock(Split, Original->Id); + Split->BranchesIn.insert(Prior); + Branch *Details = Prior->BranchesOut[Original]; + Prior->BranchesOut[Split] = new Branch(Details->Condition, Details->Code); + delete Details; + Prior->BranchesOut.erase(Original); + for (BlockBranchMap::iterator iter = Original->BranchesOut.begin(); iter != Original->BranchesOut.end(); iter++) { + Block *Post = iter->first; + Branch *Details = iter->second; + Split->BranchesOut[Post] = new Branch(Details->Condition, Details->Code); + Post->BranchesIn.insert(Split); + } + Splits.insert(Split); + Removed.insert(Original); + } + for (BlockBranchMap::iterator iter = Original->BranchesOut.begin(); iter != Original->BranchesOut.end(); iter++) { + Block *Post = iter->first; + Post->BranchesIn.erase(Original); + } + //DebugDump(Live, "mid"); + } + for (BlockSet::iterator iter = Splits.begin(); iter != Splits.end(); iter++) { + Live.insert(*iter); + } + for (BlockSet::iterator iter = Removed.begin(); iter != Removed.end(); iter++) { + Live.erase(*iter); + } + //DebugDump(Live, "after"); + } + }; + PreOptimizer Pre(this); + Pre.FindLive(Entry); + + // Add incoming branches from live blocks, ignoring dead code + for (unsigned i = 0; i < Blocks.size(); i++) { + Block *Curr = Blocks[i]; + if (!contains(Pre.Live, Curr)) continue; + for (BlockBranchMap::iterator iter = Curr->BranchesOut.begin(); iter != Curr->BranchesOut.end(); iter++) { + iter->first->BranchesIn.insert(Curr); + } + } + + if (!Emulate && !MinSize) Pre.SplitDeadEnds(); + + // Recursively process the graph + + struct Analyzer : public RelooperRecursor { + Analyzer(Relooper *Parent) : RelooperRecursor(Parent) {} + + // Add a shape to the list of shapes in this Relooper calculation + void Notice(Shape *New) { + New->Id = Parent->ShapeIdCounter++; + Parent->Shapes.push_back(New); + } + + // Create a list of entries from a block. 
If LimitTo is provided, only results in that set + // will appear + void GetBlocksOut(Block *Source, BlockSet& Entries, BlockSet *LimitTo=NULL) { + for (BlockBranchMap::iterator iter = Source->BranchesOut.begin(); iter != Source->BranchesOut.end(); iter++) { + if (!LimitTo || contains(*LimitTo, iter->first)) { + Entries.insert(iter->first); + } + } + } + + // Converts/processes all branchings to a specific target + void Solipsize(Block *Target, Branch::FlowType Type, Shape *Ancestor, BlockSet &From) { + PrintDebug("Solipsizing branches into %d\n", Target->Id); + DebugDump(From, " relevant to solipsize: "); + for (BlockSet::iterator iter = Target->BranchesIn.begin(); iter != Target->BranchesIn.end();) { + Block *Prior = *iter; + if (!contains(From, Prior)) { + iter++; + continue; + } + Branch *PriorOut = Prior->BranchesOut[Target]; + PriorOut->Ancestor = Ancestor; + PriorOut->Type = Type; + if (MultipleShape *Multiple = Shape::IsMultiple(Ancestor)) { + Multiple->Breaks++; // We are breaking out of this Multiple, so need a loop + } + iter++; // carefully increment iter before erasing + Target->BranchesIn.erase(Prior); + Target->ProcessedBranchesIn.insert(Prior); + Prior->BranchesOut.erase(Target); + Prior->ProcessedBranchesOut[Target] = PriorOut; + PrintDebug(" eliminated branch from %d\n", Prior->Id); + } + } + + Shape *MakeSimple(BlockSet &Blocks, Block *Inner, BlockSet &NextEntries) { + PrintDebug("creating simple block with block #%d\n", Inner->Id); + SimpleShape *Simple = new SimpleShape; + Notice(Simple); + Simple->Inner = Inner; + Inner->Parent = Simple; + if (Blocks.size() > 1) { + Blocks.erase(Inner); + GetBlocksOut(Inner, NextEntries, &Blocks); + BlockSet JustInner; + JustInner.insert(Inner); + for (BlockSet::iterator iter = NextEntries.begin(); iter != NextEntries.end(); iter++) { + Solipsize(*iter, Branch::Direct, Simple, JustInner); + } + } + return Simple; + } + + Shape *MakeEmulated(BlockSet &Blocks, Block *Entry, BlockSet &NextEntries) { + PrintDebug("creating emulated block with entry #%d and everything it can reach, %d blocks\n", Entry->Id, Blocks.size()); + EmulatedShape *Emulated = new EmulatedShape; + Notice(Emulated); + Emulated->Entry = Entry; + for (BlockSet::iterator iter = Blocks.begin(); iter != Blocks.end(); iter++) { + Block *Curr = *iter; + Emulated->Blocks.insert(Curr); + Curr->Parent = Emulated; + Solipsize(Curr, Branch::Continue, Emulated, Blocks); + } + Blocks.clear(); + return Emulated; + } + + Shape *MakeLoop(BlockSet &Blocks, BlockSet& Entries, BlockSet &NextEntries) { + // Find the inner blocks in this loop. Proceed backwards from the entries until + // you reach a seen block, collecting as you go. + BlockSet InnerBlocks; + BlockSet Queue = Entries; + while (Queue.size() > 0) { + Block *Curr = *(Queue.begin()); + Queue.erase(Queue.begin()); + if (!contains(InnerBlocks, Curr)) { + // This element is new, mark it as inner and remove from outer + InnerBlocks.insert(Curr); + Blocks.erase(Curr); + // Add the elements prior to it + for (BlockSet::iterator iter = Curr->BranchesIn.begin(); iter != Curr->BranchesIn.end(); iter++) { + Queue.insert(*iter); + } +#if 0 + // Add elements it leads to, if they are dead ends. 
There is no reason not to hoist dead ends + // into loops, as it can avoid multiple entries after the loop + for (BlockBranchMap::iterator iter = Curr->BranchesOut.begin(); iter != Curr->BranchesOut.end(); iter++) { + Block *Target = iter->first; + if (Target->BranchesIn.size() <= 1 && Target->BranchesOut.size() == 0) { + Queue.insert(Target); + } + } +#endif + } + } + assert(InnerBlocks.size() > 0); + + for (BlockSet::iterator iter = InnerBlocks.begin(); iter != InnerBlocks.end(); iter++) { + Block *Curr = *iter; + for (BlockBranchMap::iterator iter = Curr->BranchesOut.begin(); iter != Curr->BranchesOut.end(); iter++) { + Block *Possible = iter->first; + if (!contains(InnerBlocks, Possible)) { + NextEntries.insert(Possible); + } + } + } + +#if 0 + // We can avoid multiple next entries by hoisting them into the loop. + if (NextEntries.size() > 1) { + BlockBlockSetMap IndependentGroups; + FindIndependentGroups(NextEntries, IndependentGroups, &InnerBlocks); + + while (IndependentGroups.size() > 0 && NextEntries.size() > 1) { + Block *Min = NULL; + int MinSize = 0; + for (BlockBlockSetMap::iterator iter = IndependentGroups.begin(); iter != IndependentGroups.end(); iter++) { + Block *Entry = iter->first; + BlockSet &Blocks = iter->second; + if (!Min || Blocks.size() < MinSize) { // TODO: code size, not # of blocks + Min = Entry; + MinSize = Blocks.size(); + } + } + // check how many new entries this would cause + BlockSet &Hoisted = IndependentGroups[Min]; + bool abort = false; + for (BlockSet::iterator iter = Hoisted.begin(); iter != Hoisted.end() && !abort; iter++) { + Block *Curr = *iter; + for (BlockBranchMap::iterator iter = Curr->BranchesOut.begin(); iter != Curr->BranchesOut.end(); iter++) { + Block *Target = iter->first; + if (!contains(Hoisted, Target) && !contains(NextEntries, Target)) { + // abort this hoisting + abort = true; + break; + } + } + } + if (abort) { + IndependentGroups.erase(Min); + continue; + } + // hoist this entry + PrintDebug("hoisting %d into loop\n", Min->Id); + NextEntries.erase(Min); + for (BlockSet::iterator iter = Hoisted.begin(); iter != Hoisted.end(); iter++) { + Block *Curr = *iter; + InnerBlocks.insert(Curr); + Blocks.erase(Curr); + } + IndependentGroups.erase(Min); + } + } +#endif + + PrintDebug("creating loop block:\n", 0); + DebugDump(InnerBlocks, " inner blocks:"); + DebugDump(Entries, " inner entries:"); + DebugDump(Blocks, " outer blocks:"); + DebugDump(NextEntries, " outer entries:"); + + LoopShape *Loop = new LoopShape(); + Notice(Loop); + + // Solipsize the loop, replacing with break/continue and marking branches as Processed (will not affect later calculations) + // A. Branches to the loop entries become a continue to this shape + for (BlockSet::iterator iter = Entries.begin(); iter != Entries.end(); iter++) { + Solipsize(*iter, Branch::Continue, Loop, InnerBlocks); + } + // B. Branches to outside the loop (a next entry) become breaks on this shape + for (BlockSet::iterator iter = NextEntries.begin(); iter != NextEntries.end(); iter++) { + Solipsize(*iter, Branch::Break, Loop, InnerBlocks); + } + // Finish up + Shape *Inner = Process(InnerBlocks, Entries, NULL); + Loop->Inner = Inner; + return Loop; + } + + // For each entry, find the independent group reachable by it. The independent group is + // the entry itself, plus all the blocks it can reach that cannot be directly reached by another entry. Note that we + // ignore directly reaching the entry itself by another entry. 
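MakeLoop above collects the loop body by walking backwards from the loop entries along BranchesIn until no new blocks turn up; whatever it never touches stays in the outer set, and the outside targets of inner blocks become the loop's next entries. A minimal standalone sketch of that backward walk, using plain std::set/std::vector and invented block names instead of the relooper's Block/InsertOrderedSet types:

#include <cstdio>
#include <map>
#include <set>
#include <string>
#include <vector>

// Backward reachability: starting from the loop entries, follow predecessor
// edges to collect every block that belongs inside the loop; anything left in
// 'blocks' afterwards is outside it.
int main() {
  // Hypothetical CFG: preds[b] = blocks that branch to b.
  std::map<std::string, std::set<std::string>> preds = {
      {"entry", {"latch"}}, {"body", {"entry"}},
      {"latch", {"body"}},  {"exit", {"latch"}}};
  std::set<std::string> blocks = {"entry", "body", "latch", "exit"};
  std::set<std::string> inner;
  std::vector<std::string> queue = {"entry"}; // the loop entries
  while (!queue.empty()) {
    std::string curr = queue.back();
    queue.pop_back();
    if (!inner.insert(curr).second) continue; // already collected
    blocks.erase(curr);                       // no longer an outer block
    for (const std::string &p : preds[curr])  // enqueue predecessors
      queue.push_back(p);
  }
  for (const std::string &b : inner)  std::printf("inner: %s\n", b.c_str());
  for (const std::string &b : blocks) std::printf("outer: %s\n", b.c_str());
}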
+ // @param Ignore - previous blocks that are irrelevant + void FindIndependentGroups(BlockSet &Entries, BlockBlockSetMap& IndependentGroups, BlockSet *Ignore=NULL) { + typedef std::map BlockBlockMap; + + struct HelperClass { + BlockBlockSetMap& IndependentGroups; + BlockBlockMap Ownership; // For each block, which entry it belongs to. We have reached it from there. + + HelperClass(BlockBlockSetMap& IndependentGroupsInit) : IndependentGroups(IndependentGroupsInit) {} + void InvalidateWithChildren(Block *New) { // TODO: rename New + BlockList ToInvalidate; // Being in the list means you need to be invalidated + ToInvalidate.push_back(New); + while (ToInvalidate.size() > 0) { + Block *Invalidatee = ToInvalidate.front(); + ToInvalidate.pop_front(); + Block *Owner = Ownership[Invalidatee]; + if (contains(IndependentGroups, Owner)) { // Owner may have been invalidated, do not add to IndependentGroups! + IndependentGroups[Owner].erase(Invalidatee); + } + if (Ownership[Invalidatee]) { // may have been seen before and invalidated already + Ownership[Invalidatee] = NULL; + for (BlockBranchMap::iterator iter = Invalidatee->BranchesOut.begin(); iter != Invalidatee->BranchesOut.end(); iter++) { + Block *Target = iter->first; + BlockBlockMap::iterator Known = Ownership.find(Target); + if (Known != Ownership.end()) { + Block *TargetOwner = Known->second; + if (TargetOwner) { + ToInvalidate.push_back(Target); + } + } + } + } + } + } + }; + HelperClass Helper(IndependentGroups); + + // We flow out from each of the entries, simultaneously. + // When we reach a new block, we add it as belonging to the one we got to it from. + // If we reach a new block that is already marked as belonging to someone, it is reachable by + // two entries and is not valid for any of them. Remove it and all it can reach that have been + // visited. + + BlockList Queue; // Being in the queue means we just added this item, and we need to add its children + for (BlockSet::iterator iter = Entries.begin(); iter != Entries.end(); iter++) { + Block *Entry = *iter; + Helper.Ownership[Entry] = Entry; + IndependentGroups[Entry].insert(Entry); + Queue.push_back(Entry); + } + while (Queue.size() > 0) { + Block *Curr = Queue.front(); + Queue.pop_front(); + Block *Owner = Helper.Ownership[Curr]; // Curr must be in the ownership map if we are in the queue + if (!Owner) continue; // we have been invalidated meanwhile after being reached from two entries + // Add all children + for (BlockBranchMap::iterator iter = Curr->BranchesOut.begin(); iter != Curr->BranchesOut.end(); iter++) { + Block *New = iter->first; + BlockBlockMap::iterator Known = Helper.Ownership.find(New); + if (Known == Helper.Ownership.end()) { + // New node. Add it, and put it in the queue + Helper.Ownership[New] = Owner; + IndependentGroups[Owner].insert(New); + Queue.push_back(New); + continue; + } + Block *NewOwner = Known->second; + if (!NewOwner) continue; // We reached an invalidated node + if (NewOwner != Owner) { + // Invalidate this and all reachable that we have seen - we reached this from two locations + Helper.InvalidateWithChildren(New); + } + // otherwise, we have the same owner, so do nothing + } + } + + // Having processed all the interesting blocks, we remain with just one potential issue: + // If a->b, and a was invalidated, but then b was later reached by someone else, we must + // invalidate b. 
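The ownership map used above behaves like a simultaneous flood fill: each entry claims the blocks it reaches first, and a block reached from two different entries is invalidated together with everything already claimed downstream of it, so it lands in no group. A simplified self-contained sketch under the assumption of a toy string-keyed CFG; the later parent re-check pass is omitted here:

#include <cstdio>
#include <deque>
#include <map>
#include <set>
#include <string>

typedef std::map<std::string, std::set<std::string>> Graph; // successor edges

// Invalidate a block and everything reachable from it that has been claimed.
static void InvalidateWithChildren(const std::string &B, const Graph &Succ,
                                   std::map<std::string, std::string> &Owner,
                                   std::map<std::string, std::set<std::string>> &Groups) {
  std::deque<std::string> todo = {B};
  while (!todo.empty()) {
    std::string cur = todo.front();
    todo.pop_front();
    auto it = Owner.find(cur);
    if (it == Owner.end() || it->second.empty()) continue; // unseen or already invalidated
    Groups[it->second].erase(cur);
    it->second.clear(); // mark as owned by nobody
    for (const std::string &next : Succ.at(cur))
      if (Owner.count(next)) todo.push_back(next);
  }
}

int main() {
  // Hypothetical CFG with two entries A and B that both reach D.
  Graph succ = {{"A", {"C"}}, {"B", {"D"}}, {"C", {"D"}}, {"D", {}}};
  std::set<std::string> entries = {"A", "B"};

  std::map<std::string, std::string> owner;            // block -> owning entry
  std::map<std::string, std::set<std::string>> groups; // entry -> its group
  std::deque<std::string> queue;
  for (const std::string &e : entries) {
    owner[e] = e;
    groups[e].insert(e);
    queue.push_back(e);
  }
  while (!queue.empty()) {
    std::string cur = queue.front();
    queue.pop_front();
    const std::string own = owner[cur];
    if (own.empty()) continue; // invalidated meanwhile
    for (const std::string &next : succ[cur]) {
      auto known = owner.find(next);
      if (known == owner.end()) {          // new block: claim it for our entry
        owner[next] = own;
        groups[own].insert(next);
        queue.push_back(next);
      } else if (!known->second.empty() && known->second != own) {
        // reached from two different entries: nobody gets it
        InvalidateWithChildren(next, succ, owner, groups);
      }
    }
  }
  for (auto &g : groups) {
    std::printf("group %s:", g.first.c_str());
    for (const std::string &b : g.second) std::printf(" %s", b.c_str());
    std::printf("\n");
  }
}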
To check for this, we go over all elements in the independent groups, + // if an element has a parent which does *not* have the same owner, we must remove it + // and all its children. + + for (BlockSet::iterator iter = Entries.begin(); iter != Entries.end(); iter++) { + BlockSet &CurrGroup = IndependentGroups[*iter]; + BlockList ToInvalidate; + for (BlockSet::iterator iter = CurrGroup.begin(); iter != CurrGroup.end(); iter++) { + Block *Child = *iter; + for (BlockSet::iterator iter = Child->BranchesIn.begin(); iter != Child->BranchesIn.end(); iter++) { + Block *Parent = *iter; + if (Ignore && contains(*Ignore, Parent)) continue; + if (Helper.Ownership[Parent] != Helper.Ownership[Child]) { + ToInvalidate.push_back(Child); + } + } + } + while (ToInvalidate.size() > 0) { + Block *Invalidatee = ToInvalidate.front(); + ToInvalidate.pop_front(); + Helper.InvalidateWithChildren(Invalidatee); + } + } + + // Remove empty groups + for (BlockSet::iterator iter = Entries.begin(); iter != Entries.end(); iter++) { + if (IndependentGroups[*iter].size() == 0) { + IndependentGroups.erase(*iter); + } + } + +#if DEBUG + PrintDebug("Investigated independent groups:\n"); + for (BlockBlockSetMap::iterator iter = IndependentGroups.begin(); iter != IndependentGroups.end(); iter++) { + DebugDump(iter->second, " group: "); + } +#endif + } + + Shape *MakeMultiple(BlockSet &Blocks, BlockSet& Entries, BlockBlockSetMap& IndependentGroups, Shape *Prev, BlockSet &NextEntries) { + PrintDebug("creating multiple block with %d inner groups\n", IndependentGroups.size()); + bool Fused = !!(Shape::IsSimple(Prev)); + MultipleShape *Multiple = new MultipleShape(); + Notice(Multiple); + BlockSet CurrEntries; + for (BlockBlockSetMap::iterator iter = IndependentGroups.begin(); iter != IndependentGroups.end(); iter++) { + Block *CurrEntry = iter->first; + BlockSet &CurrBlocks = iter->second; + PrintDebug(" multiple group with entry %d:\n", CurrEntry->Id); + DebugDump(CurrBlocks, " "); + // Create inner block + CurrEntries.clear(); + CurrEntries.insert(CurrEntry); + for (BlockSet::iterator iter = CurrBlocks.begin(); iter != CurrBlocks.end(); iter++) { + Block *CurrInner = *iter; + // Remove the block from the remaining blocks + Blocks.erase(CurrInner); + // Find new next entries and fix branches to them + for (BlockBranchMap::iterator iter = CurrInner->BranchesOut.begin(); iter != CurrInner->BranchesOut.end();) { + Block *CurrTarget = iter->first; + BlockBranchMap::iterator Next = iter; + Next++; + if (!contains(CurrBlocks, CurrTarget)) { + NextEntries.insert(CurrTarget); + Solipsize(CurrTarget, Branch::Break, Multiple, CurrBlocks); + } + iter = Next; // increment carefully because Solipsize can remove us + } + } + Multiple->InnerMap[CurrEntry->Id] = Process(CurrBlocks, CurrEntries, NULL); + // If we are not fused, then our entries will actually be checked + if (!Fused) { + CurrEntry->IsCheckedMultipleEntry = true; + } + } + DebugDump(Blocks, " remaining blocks after multiple:"); + // Add entries not handled as next entries, they are deferred + for (BlockSet::iterator iter = Entries.begin(); iter != Entries.end(); iter++) { + Block *Entry = *iter; + if (!contains(IndependentGroups, Entry)) { + NextEntries.insert(Entry); + } + } + // The multiple has been created, we can decide how to implement it + if (Multiple->InnerMap.size() >= 10) { + Multiple->UseSwitch = true; + Multiple->Breaks++; // switch captures breaks + } + return Multiple; + } + + // Main function. 
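MakeMultiple above sets UseSwitch once a Multiple has ten or more inner groups, trading an if-else chain on the label variable for a switch. The real output is asm.js, so the following is only a C++ analogy of the two dispatch shapes, with invented label values and bodies:

#include <cstdio>

// Analogy for how a MultipleShape dispatches on the label variable: an if-else
// chain for a few entries, or a switch once UseSwitch is set.
static void dispatch_if_chain(int label) {
  if (label == 1) { std::puts("entry 1 body"); }
  else if (label == 2) { std::puts("entry 2 body"); }
  // an unhandled label falls through to the Next shape
}

static void dispatch_switch(int label) {
  switch (label) {       // the UseSwitch form, used for many entries
    case 1: std::puts("entry 1 body"); break;
    case 2: std::puts("entry 2 body"); break;
    default: break;      // unhandled entries defer to the Next shape
  }
}

int main() {
  dispatch_if_chain(2);
  dispatch_switch(1);
  return 0;
}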
+ // Process a set of blocks with specified entries, returns a shape + // The Make* functions receive a NextEntries. If they fill it with data, those are the entries for the + // ->Next block on them, and the blocks are what remains in Blocks (which Make* modify). In this way + // we avoid recursing on Next (imagine a long chain of Simples, if we recursed we could blow the stack). + Shape *Process(BlockSet &Blocks, BlockSet& InitialEntries, Shape *Prev) { + PrintDebug("Process() called\n", 0); + BlockSet *Entries = &InitialEntries; + BlockSet TempEntries[2]; + int CurrTempIndex = 0; + BlockSet *NextEntries; + Shape *Ret = NULL; + #define Make(call) \ + Shape *Temp = call; \ + if (Prev) Prev->Next = Temp; \ + if (!Ret) Ret = Temp; \ + if (!NextEntries->size()) { PrintDebug("Process() returning\n", 0); return Ret; } \ + Prev = Temp; \ + Entries = NextEntries; \ + continue; + while (1) { + PrintDebug("Process() running\n", 0); + DebugDump(Blocks, " blocks : "); + DebugDump(*Entries, " entries: "); + + CurrTempIndex = 1-CurrTempIndex; + NextEntries = &TempEntries[CurrTempIndex]; + NextEntries->clear(); + + if (Entries->size() == 0) return Ret; + if (Entries->size() == 1) { + Block *Curr = *(Entries->begin()); + if (Parent->Emulate) { + Make(MakeEmulated(Blocks, Curr, *NextEntries)); + } + if (Curr->BranchesIn.size() == 0) { + // One entry, no looping ==> Simple + Make(MakeSimple(Blocks, Curr, *NextEntries)); + } + // One entry, looping ==> Loop + Make(MakeLoop(Blocks, *Entries, *NextEntries)); + } + + // More than one entry, try to eliminate through a Multiple groups of + // independent blocks from an entry/ies. It is important to remove through + // multiples as opposed to looping since the former is more performant. + BlockBlockSetMap IndependentGroups; + FindIndependentGroups(*Entries, IndependentGroups); + + PrintDebug("Independent groups: %d\n", IndependentGroups.size()); + + if (IndependentGroups.size() > 0) { + // We can handle a group in a multiple if its entry cannot be reached by another group. + // Note that it might be reachable by itself - a loop. But that is fine, we will create + // a loop inside the multiple block (which is the performant order to do it). + for (BlockBlockSetMap::iterator iter = IndependentGroups.begin(); iter != IndependentGroups.end();) { + Block *Entry = iter->first; + BlockSet &Group = iter->second; + BlockBlockSetMap::iterator curr = iter++; // iterate carefully, we may delete + for (BlockSet::iterator iterBranch = Entry->BranchesIn.begin(); iterBranch != Entry->BranchesIn.end(); iterBranch++) { + Block *Origin = *iterBranch; + if (!contains(Group, Origin)) { + // Reached from outside the group, so we cannot handle this + PrintDebug("Cannot handle group with entry %d because of incoming branch from %d\n", Entry->Id, Origin->Id); + IndependentGroups.erase(curr); + break; + } + } + } + + // As an optimization, if we have 2 independent groups, and one is a small dead end, we can handle only that dead end. + // The other then becomes a Next - without nesting in the code and recursion in the analysis. + // TODO: if the larger is the only dead end, handle that too + // TODO: handle >2 groups + // TODO: handle not just dead ends, but also that do not branch to the NextEntries. However, must be careful + // there since we create a Next, and that Next can prevent eliminating a break (since we no longer + // naturally reach the same place), which may necessitate a one-time loop, which makes the unnesting + // pointless. 
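The Make(...) macro defined above implements the no-recursion-on-Next rule from the comment: rather than calling Process() again for the following shape, the loop ping-pongs between two temporary entry sets. A stripped-down sketch of that pattern, with plain integers standing in for blocks and a stub in place of the Make* constructors:

#include <cstdio>
#include <set>

// Instead of recursing on the Next shape (which could blow the stack on a long
// chain of Simples), keep two temporary entry sets and swap between them.
int main() {
  std::set<int> initial = {0};
  std::set<int> temp[2];
  int currTemp = 0;
  std::set<int> *entries = &initial;

  int step = 0;
  while (!entries->empty()) {
    currTemp = 1 - currTemp;               // swap buffers
    std::set<int> *nextEntries = &temp[currTemp];
    nextEntries->clear();

    // Stand-in for MakeSimple/MakeLoop/MakeMultiple: "handle" the current
    // entries and emit the entries of the shape that follows.
    for (int e : *entries)
      if (e + 1 < 5) nextEntries->insert(e + 1);
    std::printf("step %d handled %zu entries\n", step++, entries->size());

    entries = nextEntries;                 // iterate instead of recursing
  }
  return 0;
}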
+ if (IndependentGroups.size() == 2) { + // Find the smaller one + BlockBlockSetMap::iterator iter = IndependentGroups.begin(); + Block *SmallEntry = iter->first; + int SmallSize = iter->second.size(); + iter++; + Block *LargeEntry = iter->first; + int LargeSize = iter->second.size(); + if (SmallSize != LargeSize) { // ignore the case where they are identical - keep things symmetrical there + if (SmallSize > LargeSize) { + Block *Temp = SmallEntry; + SmallEntry = LargeEntry; + LargeEntry = Temp; // Note: we did not flip the Sizes too, they are now invalid. TODO: use the smaller size as a limit? + } + // Check if dead end + bool DeadEnd = true; + BlockSet &SmallGroup = IndependentGroups[SmallEntry]; + for (BlockSet::iterator iter = SmallGroup.begin(); iter != SmallGroup.end(); iter++) { + Block *Curr = *iter; + for (BlockBranchMap::iterator iter = Curr->BranchesOut.begin(); iter != Curr->BranchesOut.end(); iter++) { + Block *Target = iter->first; + if (!contains(SmallGroup, Target)) { + DeadEnd = false; + break; + } + } + if (!DeadEnd) break; + } + if (DeadEnd) { + PrintDebug("Removing nesting by not handling large group because small group is dead end\n", 0); + IndependentGroups.erase(LargeEntry); + } + } + } + + PrintDebug("Handleable independent groups: %d\n", IndependentGroups.size()); + + if (IndependentGroups.size() > 0) { + // Some groups removable ==> Multiple + Make(MakeMultiple(Blocks, *Entries, IndependentGroups, Prev, *NextEntries)); + } + } + // No independent groups, must be loopable ==> Loop + Make(MakeLoop(Blocks, *Entries, *NextEntries)); + } + } + }; + + // Main + + BlockSet AllBlocks; + for (BlockSet::iterator iter = Pre.Live.begin(); iter != Pre.Live.end(); iter++) { + Block *Curr = *iter; + AllBlocks.insert(Curr); +#if DEBUG + PrintDebug("Adding block %d (%s)\n", Curr->Id, Curr->Code); +#endif + } + + BlockSet Entries; + Entries.insert(Entry); + Root = Analyzer(this).Process(AllBlocks, Entries, NULL); + assert(Root); + + // Post optimizations + + struct PostOptimizer { + Relooper *Parent; + std::stack *Closure; + + PostOptimizer(Relooper *ParentInit) : Parent(ParentInit), Closure(NULL) {} + + #define RECURSE_Multiple(shape, func) \ + for (IdShapeMap::iterator iter = shape->InnerMap.begin(); iter != shape->InnerMap.end(); iter++) { \ + func(iter->second); \ + } + #define RECURSE_Loop(shape, func) \ + func(shape->Inner); + #define RECURSE(shape, func) RECURSE_##shape(shape, func); + + #define SHAPE_SWITCH(var, simple, multiple, loop) \ + if (SimpleShape *Simple = Shape::IsSimple(var)) { \ + (void)Simple; \ + simple; \ + } else if (MultipleShape *Multiple = Shape::IsMultiple(var)) { \ + (void)Multiple; \ + multiple; \ + } else if (LoopShape *Loop = Shape::IsLoop(var)) { \ + (void)Loop; \ + loop; \ + } + + // Find the blocks that natural control flow can get us directly to, or through a multiple that we ignore + void FollowNaturalFlow(Shape *S, BlockSet &Out) { + SHAPE_SWITCH(S, { + Out.insert(Simple->Inner); + }, { + for (IdShapeMap::iterator iter = Multiple->InnerMap.begin(); iter != Multiple->InnerMap.end(); iter++) { + FollowNaturalFlow(iter->second, Out); + } + FollowNaturalFlow(Multiple->Next, Out); + }, { + FollowNaturalFlow(Loop->Inner, Out); + }); + } + + void FindNaturals(Shape *Root, Shape *Otherwise=NULL) { + if (Root->Next) { + Root->Natural = Root->Next; + FindNaturals(Root->Next, Otherwise); + } else { + Root->Natural = Otherwise; + } + + SHAPE_SWITCH(Root, { + }, { + for (IdShapeMap::iterator iter = Multiple->InnerMap.begin(); iter != 
Multiple->InnerMap.end(); iter++) { + FindNaturals(iter->second, Root->Natural); + } + }, { + FindNaturals(Loop->Inner, Loop->Inner); + }); + } + + // Remove unneeded breaks and continues. + // A flow operation is trivially unneeded if the shape we naturally get to by normal code + // execution is the same as the flow forces us to. + void RemoveUnneededFlows(Shape *Root, Shape *Natural=NULL, LoopShape *LastLoop=NULL, unsigned Depth=0) { + BlockSet NaturalBlocks; + FollowNaturalFlow(Natural, NaturalBlocks); + Shape *Next = Root; + while (Next) { + Root = Next; + Next = NULL; + SHAPE_SWITCH(Root, { + if (Simple->Inner->BranchVar) LastLoop = NULL; // a switch clears out the loop (TODO: only for breaks, not continue) + + if (Simple->Next) { + if (!Simple->Inner->BranchVar && Simple->Inner->ProcessedBranchesOut.size() == 2 && Depth < 20) { + // If there is a next block, we already know at Simple creation time to make direct branches, + // and we can do nothing more in general. But, we try to optimize the case of a break and + // a direct: This would normally be if (break?) { break; } .. but if we + // make sure to nest the else, we can save the break, if (!break?) { .. } . This is also + // better because the more canonical nested form is easier to further optimize later. The + // downside is more nesting, which adds to size in builds with whitespace. + // Note that we avoid switches, as it complicates control flow and is not relevant + // for the common case we optimize here. + bool Found = false; + bool Abort = false; + for (BlockBranchMap::iterator iter = Simple->Inner->ProcessedBranchesOut.begin(); iter != Simple->Inner->ProcessedBranchesOut.end(); iter++) { + Block *Target = iter->first; + Branch *Details = iter->second; + if (Details->Type == Branch::Break) { + Found = true; + if (!contains(NaturalBlocks, Target)) Abort = true; + } else if (Details->Type != Branch::Direct) { + Abort = true; + } + } + if (Found && !Abort) { + for (BlockBranchMap::iterator iter = Simple->Inner->ProcessedBranchesOut.begin(); iter != Simple->Inner->ProcessedBranchesOut.end(); iter++) { + Branch *Details = iter->second; + if (Details->Type == Branch::Break) { + Details->Type = Branch::Direct; + if (MultipleShape *Multiple = Shape::IsMultiple(Details->Ancestor)) { + Multiple->Breaks--; + } + } else { + assert(Details->Type == Branch::Direct); + Details->Type = Branch::Nested; + } + } + } + Depth++; // this optimization increases depth, for us and all our next chain (i.e., until this call returns) + } + Next = Simple->Next; + } else { + // If there is no next then Natural is where we will + // go to by doing nothing, so we can potentially optimize some branches to direct. 
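The rule stated in the comment above is that a break is redundant when its target is exactly the block natural fall-through would reach anyway; RemoveUnneededFlows downgrades such branches to Direct and decrements the owning Multiple's Breaks counter, which can later let the wrapper construct disappear. A C++ analogy of the before/after shapes (the actual output is asm.js; the do-while stands in for the relooper's breakable wrapper):

#include <cstdio>

static void work() { std::puts("work"); }
static void next() { std::puts("next"); }

// Before: the break jumps to next(), which is also the natural fall-through,
// so the flow operation carries no information.
static void before() {
  do {
    work();
    break;   // targets exactly what follows the wrapper anyway
  } while (0);
  next();
}

// After: the break is downgraded to a direct continuation, the Breaks counter
// reaches zero, and the wrapper is dropped entirely.
static void after() {
  work();
  next();
}

int main() {
  before();
  after();
  return 0;
}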
+ for (BlockBranchMap::iterator iter = Simple->Inner->ProcessedBranchesOut.begin(); iter != Simple->Inner->ProcessedBranchesOut.end(); iter++) { + Block *Target = iter->first; + Branch *Details = iter->second; + if (Details->Type != Branch::Direct && contains(NaturalBlocks, Target)) { // note: cannot handle split blocks + Details->Type = Branch::Direct; + if (MultipleShape *Multiple = Shape::IsMultiple(Details->Ancestor)) { + Multiple->Breaks--; + } + } else if (Details->Type == Branch::Break && LastLoop && LastLoop->Natural == Details->Ancestor->Natural) { + // it is important to simplify breaks, as simpler breaks enable other optimizations + Details->Labeled = false; + if (MultipleShape *Multiple = Shape::IsMultiple(Details->Ancestor)) { + Multiple->Breaks--; + } + } + } + } + }, { + for (IdShapeMap::iterator iter = Multiple->InnerMap.begin(); iter != Multiple->InnerMap.end(); iter++) { + RemoveUnneededFlows(iter->second, Multiple->Next, Multiple->Breaks ? NULL : LastLoop, Depth+1); + } + Next = Multiple->Next; + }, { + RemoveUnneededFlows(Loop->Inner, Loop->Inner, Loop, Depth+1); + Next = Loop->Next; + }); + } + } + + // After we know which loops exist, we can calculate which need to be labeled + void FindLabeledLoops(Shape *Root) { + bool First = Closure == NULL; + if (First) { + Closure = new std::stack; + } + std::stack &LoopStack = *Closure; + + Shape *Next = Root; + while (Next) { + Root = Next; + Next = NULL; + + SHAPE_SWITCH(Root, { + MultipleShape *Fused = Shape::IsMultiple(Root->Next); + // If we are fusing a Multiple with a loop into this Simple, then visit it now + if (Fused && Fused->Breaks) { + LoopStack.push(Fused); + } + if (Simple->Inner->BranchVar) { + LoopStack.push(NULL); // a switch means breaks are now useless, push a dummy + } + if (Fused) { + if (Fused->UseSwitch) { + LoopStack.push(NULL); // a switch means breaks are now useless, push a dummy + } + RECURSE_Multiple(Fused, FindLabeledLoops); + } + for (BlockBranchMap::iterator iter = Simple->Inner->ProcessedBranchesOut.begin(); iter != Simple->Inner->ProcessedBranchesOut.end(); iter++) { + Branch *Details = iter->second; + if (Details->Type == Branch::Break || Details->Type == Branch::Continue) { + assert(LoopStack.size() > 0); + if (Details->Ancestor != LoopStack.top() && Details->Labeled) { + LabeledShape *Labeled = Shape::IsLabeled(Details->Ancestor); + Labeled->Labeled = true; + } else { + Details->Labeled = false; + } + } + } + if (Fused && Fused->UseSwitch) { + LoopStack.pop(); + } + if (Simple->Inner->BranchVar) { + LoopStack.pop(); + } + if (Fused && Fused->Breaks) { + LoopStack.pop(); + } + if (Fused) { + Next = Fused->Next; + } else { + Next = Root->Next; + } + }, { + if (Multiple->Breaks) { + LoopStack.push(Multiple); + } + RECURSE(Multiple, FindLabeledLoops); + if (Multiple->Breaks) { + LoopStack.pop(); + } + Next = Root->Next; + }, { + LoopStack.push(Loop); + RECURSE(Loop, FindLabeledLoops); + LoopStack.pop(); + Next = Root->Next; + }); + } + + if (First) { + delete Closure; + } + } + + void Process(Shape *Root) { + FindNaturals(Root); + RemoveUnneededFlows(Root); + FindLabeledLoops(Root); + } + }; + + PrintDebug("=== Optimizing shapes ===\n", 0); + + PostOptimizer(this).Process(Root); +} + +void Relooper::Render() { + OutputBuffer = OutputBufferRoot; + assert(Root); + Root->Render(false); +} + +void Relooper::SetOutputBuffer(char *Buffer, int Size) { + OutputBufferRoot = OutputBuffer = Buffer; + OutputBufferSize = Size; + OutputBufferOwned = false; +} + +void Relooper::MakeOutputBuffer(int Size) { + 
if (OutputBufferRoot && OutputBufferSize >= Size && OutputBufferOwned) return; + OutputBufferRoot = OutputBuffer = (char*)malloc(Size); + OutputBufferSize = Size; + OutputBufferOwned = true; +} + +char *Relooper::GetOutputBuffer() { + return OutputBufferRoot; +} + +void Relooper::SetAsmJSMode(int On) { + AsmJS = On; +} + +#if DEBUG +// Debugging + +void Debugging::Dump(BlockSet &Blocks, const char *prefix) { + if (prefix) printf("%s ", prefix); + for (BlockSet::iterator iter = Blocks.begin(); iter != Blocks.end(); iter++) { + Block *Curr = *iter; + printf("%d:\n", Curr->Id); + for (BlockBranchMap::iterator iter2 = Curr->BranchesOut.begin(); iter2 != Curr->BranchesOut.end(); iter2++) { + Block *Other = iter2->first; + printf(" -> %d\n", Other->Id); + assert(contains(Other->BranchesIn, Curr)); + } + } +} + +void Debugging::Dump(Shape *S, const char *prefix) { + if (prefix) printf("%s ", prefix); + if (!S) { + printf(" (null)\n"); + return; + } + printf(" %d ", S->Id); + SHAPE_SWITCH(S, { + printf("<< Simple with block %d\n", Simple->Inner->Id); + }, { + printf("<< Multiple\n"); + for (IdShapeMap::iterator iter = Multiple->InnerMap.begin(); iter != Multiple->InnerMap.end(); iter++) { + printf(" with entry %d\n", iter->first); + } + }, { + printf("<< Loop\n"); + }); +} + +static void PrintDebug(const char *Format, ...) { + printf("// "); + va_list Args; + va_start(Args, Format); + vprintf(Format, Args); + va_end(Args); +} +#endif + +// C API - useful for binding to other languages + +typedef std::map VoidIntMap; +VoidIntMap __blockDebugMap__; // maps block pointers in currently running code to block ids, for generated debug output + +extern "C" { + +RELOOPERDLL_API void rl_set_output_buffer(char *buffer, int size) { +#if DEBUG + printf("#include \"Relooper.h\"\n"); + printf("int main() {\n"); + printf(" char buffer[100000];\n"); + printf(" rl_set_output_buffer(buffer);\n"); +#endif + Relooper::SetOutputBuffer(buffer, size); +} + +RELOOPERDLL_API void rl_make_output_buffer(int size) { + Relooper::SetOutputBuffer((char*)malloc(size), size); +} + +RELOOPERDLL_API void rl_set_asm_js_mode(int on) { + Relooper::SetAsmJSMode(on); +} + +RELOOPERDLL_API void *rl_new_block(const char *text, const char *branch_var) { + Block *ret = new Block(text, branch_var); +#if DEBUG + printf(" void *b%d = rl_new_block(\"// code %d\");\n", ret->Id, ret->Id); + __blockDebugMap__[ret] = ret->Id; + printf(" block_map[%d] = b%d;\n", ret->Id, ret->Id); +#endif + return ret; +} + +RELOOPERDLL_API void rl_delete_block(void *block) { +#if DEBUG + printf(" rl_delete_block(block_map[%d]);\n", ((Block*)block)->Id); +#endif + delete (Block*)block; +} + +RELOOPERDLL_API void rl_block_add_branch_to(void *from, void *to, const char *condition, const char *code) { +#if DEBUG + printf(" rl_block_add_branch_to(block_map[%d], block_map[%d], %s%s%s, %s%s%s);\n", ((Block*)from)->Id, ((Block*)to)->Id, condition ? "\"" : "", condition ? condition : "NULL", condition ? "\"" : "", code ? "\"" : "", code ? code : "NULL", code ? 
"\"" : ""); +#endif + ((Block*)from)->AddBranchTo((Block*)to, condition, code); +} + +RELOOPERDLL_API void *rl_new_relooper() { +#if DEBUG + printf(" void *block_map[10000];\n"); + printf(" void *rl = rl_new_relooper();\n"); +#endif + return new Relooper; +} + +RELOOPERDLL_API void rl_delete_relooper(void *relooper) { + delete (Relooper*)relooper; +} + +RELOOPERDLL_API void rl_relooper_add_block(void *relooper, void *block) { +#if DEBUG + printf(" rl_relooper_add_block(rl, block_map[%d]);\n", ((Block*)block)->Id); +#endif + ((Relooper*)relooper)->AddBlock((Block*)block); +} + +RELOOPERDLL_API void rl_relooper_calculate(void *relooper, void *entry) { +#if DEBUG + printf(" rl_relooper_calculate(rl, block_map[%d]);\n", ((Block*)entry)->Id); + printf(" rl_relooper_render(rl);\n"); + printf(" rl_delete_relooper(rl);\n"); + printf(" puts(buffer);\n"); + printf(" return 0;\n"); + printf("}\n"); +#endif + ((Relooper*)relooper)->Calculate((Block*)entry); +} + +RELOOPERDLL_API void rl_relooper_render(void *relooper) { + ((Relooper*)relooper)->Render(); +} + +} diff --git a/lib/Target/JSBackend/Relooper.h b/lib/Target/JSBackend/Relooper.h new file mode 100644 index 000000000000..776095e4c26a --- /dev/null +++ b/lib/Target/JSBackend/Relooper.h @@ -0,0 +1,376 @@ +/* +This is an optimized C++ implemention of the Relooper algorithm originally +developed as part of Emscripten. This implementation includes optimizations +added since the original academic paper [1] was published about it, and is +written in an LLVM-friendly way with the goal of inclusion in upstream +LLVM. + +[1] Alon Zakai. 2011. Emscripten: an LLVM-to-JavaScript compiler. In Proceedings of the ACM international conference companion on Object oriented programming systems languages and applications companion (SPLASH '11). ACM, New York, NY, USA, 301-312. DOI=10.1145/2048147.2048224 http://doi.acm.org/10.1145/2048147.2048224 +*/ + +#include +#include +#include +#include + +#ifdef __cplusplus + +#include +#include +#include +#include + +struct Block; +struct Shape; + +// Info about a branching from one block to another +struct Branch { + enum FlowType { + Direct = 0, // We will directly reach the right location through other means, no need for continue or break + Break = 1, + Continue = 2, + Nested = 3 // This code is directly reached, but we must be careful to ensure it is nested in an if - it is not reached + // unconditionally, other code paths exist alongside it that we need to make sure do not intertwine + }; + Shape *Ancestor; // If not NULL, this shape is the relevant one for purposes of getting to the target block. We break or continue on it + Branch::FlowType Type; // If Ancestor is not NULL, this says whether to break or continue + bool Labeled; // If a break or continue, whether we need to use a label + const char *Condition; // The condition for which we branch. For example, "my_var == 1". Conditions are checked one by one. One of the conditions should have NULL as the condition, in which case it is the default + const char *Code; // If provided, code that is run right before the branch is taken. 
This is useful for phis + + Branch(const char *ConditionInit, const char *CodeInit=NULL); + ~Branch(); + + // Prints out the branch + void Render(Block *Target, bool SetLabel); +}; + +// like std::set, except that begin() -> end() iterates in the +// order that elements were added to the set (not in the order +// of operator<(T, T)) +template +struct InsertOrderedSet +{ + std::map::iterator> Map; + std::list List; + + typedef typename std::list::iterator iterator; + iterator begin() { return List.begin(); } + iterator end() { return List.end(); } + + void erase(const T& val) { + auto it = Map.find(val); + if (it != Map.end()) { + List.erase(it->second); + Map.erase(it); + } + } + + void erase(iterator position) { + Map.erase(*position); + List.erase(position); + } + + // cheating a bit, not returning the iterator + void insert(const T& val) { + auto it = Map.find(val); + if (it == Map.end()) { + List.push_back(val); + Map.insert(std::make_pair(val, --List.end())); + } + } + + size_t size() const { return Map.size(); } + + void clear() { + Map.clear(); + List.clear(); + } + + size_t count(const T& val) const { return Map.count(val); } + + InsertOrderedSet() {} + InsertOrderedSet(const InsertOrderedSet& other) { + for (auto i : other.List) { + insert(i); // inserting manually creates proper iterators + } + } + InsertOrderedSet& operator=(const InsertOrderedSet& other) { + abort(); // TODO, watch out for iterators + } +}; + +// like std::map, except that begin() -> end() iterates in the +// order that elements were added to the map (not in the order +// of operator<(Key, Key)) +template +struct InsertOrderedMap +{ + std::map>::iterator> Map; + std::list> List; + + T& operator[](const Key& k) { + auto it = Map.find(k); + if (it == Map.end()) { + List.push_back(std::make_pair(k, T())); + auto e = --List.end(); + Map.insert(std::make_pair(k, e)); + return e->second; + } + return it->second->second; + } + + typedef typename std::list>::iterator iterator; + iterator begin() { return List.begin(); } + iterator end() { return List.end(); } + + void erase(const Key& k) { + auto it = Map.find(k); + if (it != Map.end()) { + List.erase(it->second); + Map.erase(it); + } + } + + void erase(iterator position) { + erase(position->first); + } + + size_t size() const { return Map.size(); } + size_t count(const Key& k) const { return Map.count(k); } + + InsertOrderedMap() {} + InsertOrderedMap(InsertOrderedMap& other) { + abort(); // TODO, watch out for iterators + } + InsertOrderedMap& operator=(const InsertOrderedMap& other) { + abort(); // TODO, watch out for iterators + } +}; + + +typedef InsertOrderedSet BlockSet; +typedef InsertOrderedMap BlockBranchMap; + +// Represents a basic block of code - some instructions that end with a +// control flow modifier (a branch, return or throw). +struct Block { + // Branches become processed after we finish the shape relevant to them. For example, + // when we recreate a loop, branches to the loop start become continues and are now + // processed. When we calculate what shape to generate from a set of blocks, we ignore + // processed branches. + // Blocks own the Branch objects they use, and destroy them when done. + BlockBranchMap BranchesOut; + BlockSet BranchesIn; + BlockBranchMap ProcessedBranchesOut; + BlockSet ProcessedBranchesIn; + Shape *Parent; // The shape we are directly inside + int Id; // A unique identifier, defined when added to relooper. 
Note that this uniquely identifies a *logical* block - if we split it, the two instances have the same content *and* the same Id + const char *Code; // The string representation of the code in this block. Owning pointer (we copy the input) + const char *BranchVar; // A variable whose value determines where we go; if this is not NULL, emit a switch on that variable + bool IsCheckedMultipleEntry; // If true, we are a multiple entry, so reaching us requires setting the label variable + + Block(const char *CodeInit, const char *BranchVarInit); + ~Block(); + + void AddBranchTo(Block *Target, const char *Condition, const char *Code=NULL); + + // Prints out the instructions code and branchings + void Render(bool InLoop); +}; + +// Represents a structured control flow shape, one of +// +// Simple: No control flow at all, just instructions. If several +// blocks, then +// +// Multiple: A shape with more than one entry. If the next block to +// be entered is among them, we run it and continue to +// the next shape, otherwise we continue immediately to the +// next shape. +// +// Loop: An infinite loop. +// +// Emulated: Control flow is managed by a switch in a loop. This +// is necessary in some cases, for example when control +// flow is not known until runtime (indirect branches, +// setjmp returns, etc.) +// + +struct SimpleShape; +struct LabeledShape; +struct MultipleShape; +struct LoopShape; +struct EmulatedShape; + +struct Shape { + int Id; // A unique identifier. Used to identify loops, labels are Lx where x is the Id. Defined when added to relooper + Shape *Next; // The shape that will appear in the code right after this one + Shape *Natural; // The shape that control flow gets to naturally (if there is Next, then this is Next) + + enum ShapeType { + Simple, + Multiple, + Loop, + Emulated + }; + ShapeType Type; + + Shape(ShapeType TypeInit) : Id(-1), Next(NULL), Type(TypeInit) {} + virtual ~Shape() {} + + virtual void Render(bool InLoop) = 0; + + static SimpleShape *IsSimple(Shape *It) { return It && It->Type == Simple ? (SimpleShape*)It : NULL; } + static MultipleShape *IsMultiple(Shape *It) { return It && It->Type == Multiple ? (MultipleShape*)It : NULL; } + static LoopShape *IsLoop(Shape *It) { return It && It->Type == Loop ? (LoopShape*)It : NULL; } + static LabeledShape *IsLabeled(Shape *It) { return IsMultiple(It) || IsLoop(It) ? (LabeledShape*)It : NULL; } + static EmulatedShape *IsEmulated(Shape *It) { return It && It->Type == Emulated ? (EmulatedShape*)It : NULL; } +}; + +struct SimpleShape : public Shape { + Block *Inner; + + SimpleShape() : Shape(Simple), Inner(NULL) {} + void Render(bool InLoop) override { + Inner->Render(InLoop); + if (Next) Next->Render(InLoop); + } +}; + +// A shape that may be implemented with a labeled loop. +struct LabeledShape : public Shape { + bool Labeled; // If we have a loop, whether it needs to be labeled + + LabeledShape(ShapeType TypeInit) : Shape(TypeInit), Labeled(false) {} +}; + +// Blocks with the same id were split and are identical, so we just care about ids in Multiple entries +typedef std::map IdShapeMap; + +struct MultipleShape : public LabeledShape { + IdShapeMap InnerMap; // entry block ID -> shape + int Breaks; // If we have branches on us, we need a loop (or a switch). 
This is a counter of requirements, + // if we optimize it to 0, the loop is unneeded + bool UseSwitch; // Whether to switch on label as opposed to an if-else chain + + MultipleShape() : LabeledShape(Multiple), Breaks(0), UseSwitch(false) {} + + void RenderLoopPrefix(); + void RenderLoopPostfix(); + + void Render(bool InLoop) override; +}; + +struct LoopShape : public LabeledShape { + Shape *Inner; + + LoopShape() : LabeledShape(Loop), Inner(NULL) {} + void Render(bool InLoop) override; +}; + +// TODO EmulatedShape is only partially functional. Currently it can be used for the +// entire set of blocks being relooped, but not subsets. +struct EmulatedShape : public LabeledShape { + Block *Entry; + BlockSet Blocks; + + EmulatedShape() : LabeledShape(Emulated) { Labeled = true; } + void Render(bool InLoop) override; +}; + +// Implements the relooper algorithm for a function's blocks. +// +// Usage: +// 1. Instantiate this struct. +// 2. Call AddBlock with the blocks you have. Each should already +// have its branchings in specified (the branchings out will +// be calculated by the relooper). +// 3. Call Render(). +// +// Implementation details: The Relooper instance has +// ownership of the blocks and shapes, and frees them when done. +struct Relooper { + std::deque Blocks; + std::deque Shapes; + Shape *Root; + bool Emulate; + bool MinSize; + int BlockIdCounter; + int ShapeIdCounter; + + Relooper(); + ~Relooper(); + + void AddBlock(Block *New, int Id=-1); + + // Calculates the shapes + void Calculate(Block *Entry); + + // Renders the result. + void Render(); + + // Sets the global buffer all printing goes to. Must call this or MakeOutputBuffer. + // XXX: this is deprecated, see MakeOutputBuffer + static void SetOutputBuffer(char *Buffer, int Size); + + // Creates an internal output buffer. Must call this or SetOutputBuffer. Size is + // a hint for the initial size of the buffer, it can be resized later one demand. + // For that reason this is more recommended than SetOutputBuffer. 
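A minimal sketch of driving this interface end to end, assuming the relooper is linked into the host program; the block code strings, the 64 KB buffer size, and the two-block graph are arbitrary placeholders:

#include "Relooper.h"
#include <cstdio>

// Two blocks, one unconditional edge, rendered into an internally allocated
// buffer. The Relooper takes ownership of the blocks and frees them.
int main() {
  Block *a = new Block("// code for A\n", NULL); // no switch (BranchVar) on this block
  Block *b = new Block("// code for B\n", NULL);
  a->AddBranchTo(b, NULL);                       // NULL condition = default branch

  Relooper r;
  r.AddBlock(a);
  r.AddBlock(b);

  Relooper::MakeOutputBuffer(65536);             // size is only a hint
  r.Calculate(a);                                // a is the entry block
  r.Render();
  std::puts(Relooper::GetOutputBuffer());
  return 0;
}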
+ static void MakeOutputBuffer(int Size); + + static char *GetOutputBuffer(); + + // Sets asm.js mode on or off (default is off) + static void SetAsmJSMode(int On); + + // Sets whether we must emulate everything with switch-loop code + void SetEmulate(int E) { Emulate = E; } + + // Sets us to try to minimize size + void SetMinSize(bool MinSize_) { MinSize = MinSize_; } +}; + +typedef InsertOrderedMap BlockBlockSetMap; + +#if DEBUG +struct Debugging { + static void Dump(BlockSet &Blocks, const char *prefix=NULL); + static void Dump(Shape *S, const char *prefix=NULL); +}; +#endif + +#endif // __cplusplus + +// C API - useful for binding to other languages + +#ifdef _WIN32 + #ifdef RELOOPERDLL_EXPORTS + #define RELOOPERDLL_API __declspec(dllexport) + #else + #define RELOOPERDLL_API __declspec(dllimport) + #endif +#else + #define RELOOPERDLL_API +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +RELOOPERDLL_API void rl_set_output_buffer(char *buffer, int size); +RELOOPERDLL_API void rl_make_output_buffer(int size); +RELOOPERDLL_API void rl_set_asm_js_mode(int on); +RELOOPERDLL_API void *rl_new_block(const char *text, const char *branch_var); +RELOOPERDLL_API void rl_delete_block(void *block); +RELOOPERDLL_API void rl_block_add_branch_to(void *from, void *to, const char *condition, const char *code); +RELOOPERDLL_API void *rl_new_relooper(); +RELOOPERDLL_API void rl_delete_relooper(void *relooper); +RELOOPERDLL_API void rl_relooper_add_block(void *relooper, void *block); +RELOOPERDLL_API void rl_relooper_calculate(void *relooper, void *entry); +RELOOPERDLL_API void rl_relooper_render(void *relooper); + +#ifdef __cplusplus +} +#endif + diff --git a/lib/Target/JSBackend/RemoveLLVMAssume.cpp b/lib/Target/JSBackend/RemoveLLVMAssume.cpp new file mode 100644 index 000000000000..c18a9f13a402 --- /dev/null +++ b/lib/Target/JSBackend/RemoveLLVMAssume.cpp @@ -0,0 +1,64 @@ +//===-- RemoveLLVMAssume.cpp ----------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
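The same flow through the C bindings declared above, matching the trace that a DEBUG build of Relooper.cpp prints; the buffer size and block contents are again placeholders:

#include "Relooper.h"
#include <cstdio>

// Drive the relooper through the C API, which is what non-C++ bindings use.
int main() {
  static char buffer[100000];
  rl_set_output_buffer(buffer, (int)sizeof(buffer));

  void *a = rl_new_block("// code for A\n", NULL);
  void *b = rl_new_block("// code for B\n", NULL);
  rl_block_add_branch_to(a, b, NULL, NULL);  // unconditional branch, no phi code

  void *rl = rl_new_relooper();
  rl_relooper_add_block(rl, a);
  rl_relooper_add_block(rl, b);
  rl_relooper_calculate(rl, a);
  rl_relooper_render(rl);
  rl_delete_relooper(rl);                    // also frees the blocks it owns

  std::puts(buffer);
  return 0;
}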
+// +//===-----------------------------------------------------------------------===// +// +//===-----------------------------------------------------------------------===// + +#include "OptPasses.h" + +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" +#include "llvm/Transforms/Utils/Local.h" + +namespace llvm { + +// Remove all uses of llvm.assume; we don't need them anymore +struct RemoveLLVMAssume : public ModulePass { + static char ID; // Pass identification, replacement for typeid + RemoveLLVMAssume() : ModulePass(ID) {} + // XXX initialize..(*PassRegistry::getPassRegistry()); } + + bool runOnModule(Module &M) override; + + StringRef getPassName() const override { return "RemoveLLVMAssume"; } +}; + +char RemoveLLVMAssume::ID = 0; + +bool RemoveLLVMAssume::runOnModule(Module &M) { + bool Changed = false; + + Function *LLVMAssume = M.getFunction("llvm.assume"); + + if (LLVMAssume) { + SmallVector Assumes; + for (Instruction::user_iterator UI = LLVMAssume->user_begin(), UE = LLVMAssume->user_end(); UI != UE; ++UI) { + User *U = *UI; + if (CallInst *CI = dyn_cast(U)) { + Assumes.push_back(CI); + } + } + + for (auto CI : Assumes) { + Value *V = CI->getOperand(0); + CI->eraseFromParent(); + RecursivelyDeleteTriviallyDeadInstructions(V); // the single operand is likely dead + } + } + return Changed; +} + +// + +extern ModulePass *createEmscriptenRemoveLLVMAssumePass() { + return new RemoveLLVMAssume(); +} + +} // End llvm namespace diff --git a/lib/Target/JSBackend/SimplifyAllocas.cpp b/lib/Target/JSBackend/SimplifyAllocas.cpp new file mode 100644 index 000000000000..ebc92f758564 --- /dev/null +++ b/lib/Target/JSBackend/SimplifyAllocas.cpp @@ -0,0 +1,107 @@ +//===-- SimplifyAllocas.cpp - Alloca optimization ---------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===-----------------------------------------------------------------------===// +// +// There shouldn't be any opportunities for this pass to do anything if the +// regular LLVM optimizer passes are run. However, it does make things nicer +// at -O0. +// +//===-----------------------------------------------------------------------===// + +#include "OptPasses.h" + +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Constants.h" + +namespace llvm { + +/* + * Find cases where an alloca is used only to load and store a single value, + * even though it is bitcast. Then replace it with a direct alloca of that + * simple type, and avoid the bitcasts. 
+ */ + +struct SimplifyAllocas : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + SimplifyAllocas() : FunctionPass(ID) {} + // XXX initialize..(*PassRegistry::getPassRegistry()); } + + bool runOnFunction(Function &Func) override; + + StringRef getPassName() const override { return "SimplifyAllocas"; } +}; + +char SimplifyAllocas::ID = 0; + +bool SimplifyAllocas::runOnFunction(Function &Func) { + bool Changed = false; + Type *i32 = Type::getInt32Ty(Func.getContext()); + std::vector ToRemove; // removing can invalidate our iterators, so do it all at the end + for (Function::iterator B = Func.begin(), E = Func.end(); B != E; ++B) { + for (BasicBlock::iterator BI = B->begin(), BE = B->end(); BI != BE; ) { + Instruction *I = &*BI++; + AllocaInst *AI = dyn_cast(I); + if (!AI) continue; + if (!isa(AI->getArraySize())) continue; + bool Fail = false; + Type *ActualType = NULL; + #define CHECK_TYPE(TT) { \ + Type *T = TT; \ + if (!ActualType) { \ + ActualType = T; \ + } else { \ + if (T != ActualType) Fail = true; \ + } \ + } + std::vector Aliases; // the bitcasts of this alloca + for (Instruction::user_iterator UI = AI->user_begin(), UE = AI->user_end(); UI != UE && !Fail; ++UI) { + Instruction *U = cast(*UI); + if (U->getOpcode() != Instruction::BitCast) { Fail = true; break; } + // bitcasting just to do loads and stores is ok + for (Instruction::user_iterator BUI = U->user_begin(), BUE = U->user_end(); BUI != BUE && !Fail; ++BUI) { + Instruction *BU = cast(*BUI); + if (BU->getOpcode() == Instruction::Load) { + CHECK_TYPE(BU->getType()); + break; + } + if (BU->getOpcode() != Instruction::Store) { Fail = true; break; } + CHECK_TYPE(BU->getOperand(0)->getType()); + if (BU->getOperand(0) == U) { Fail = true; break; } + } + if (!Fail) Aliases.push_back(U); + } + if (!Fail && Aliases.size() > 0 && ActualType) { + // success, replace the alloca and the bitcast aliases with a single simple alloca + AllocaInst *NA = new AllocaInst(ActualType, ConstantInt::get(i32, 1), "", I); + NA->takeName(AI); + NA->setAlignment(AI->getAlignment()); + NA->setDebugLoc(AI->getDebugLoc()); + for (unsigned i = 0; i < Aliases.size(); i++) { + Aliases[i]->replaceAllUsesWith(NA); + ToRemove.push_back(Aliases[i]); + } + ToRemove.push_back(AI); + Changed = true; + } + } + } + for (unsigned i = 0; i < ToRemove.size(); i++) { + ToRemove[i]->eraseFromParent(); + } + return Changed; +} + +// + +extern FunctionPass *createEmscriptenSimplifyAllocasPass() { + return new SimplifyAllocas(); +} + +} // End llvm namespace diff --git a/lib/Target/JSBackend/TargetInfo/CMakeLists.txt b/lib/Target/JSBackend/TargetInfo/CMakeLists.txt new file mode 100644 index 000000000000..29994eb8f95e --- /dev/null +++ b/lib/Target/JSBackend/TargetInfo/CMakeLists.txt @@ -0,0 +1,5 @@ +include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) + +add_llvm_library(LLVMJSBackendInfo + JSBackendTargetInfo.cpp + ) diff --git a/lib/Target/JSBackend/TargetInfo/JSBackendTargetInfo.cpp b/lib/Target/JSBackend/TargetInfo/JSBackendTargetInfo.cpp new file mode 100644 index 000000000000..cdf9752a07e6 --- /dev/null +++ b/lib/Target/JSBackend/TargetInfo/JSBackendTargetInfo.cpp @@ -0,0 +1,20 @@ +//===-- JSBackendTargetInfo.cpp - JSBackend Target Implementation -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===--------------------------------------------------------------------===// + +#include "JSTargetMachine.h" +#include "MCTargetDesc/JSBackendMCTargetDesc.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/TargetRegistry.h" +using namespace llvm; + +Target llvm::TheJSBackendTarget; + +extern "C" void LLVMInitializeJSBackendTargetInfo() { + RegisterTarget X(TheJSBackendTarget, "js", "JavaScript (asm.js, emscripten) backend"); +} diff --git a/lib/Target/JSBackend/TargetInfo/LLVMBuild.txt b/lib/Target/JSBackend/TargetInfo/LLVMBuild.txt new file mode 100644 index 000000000000..732058260970 --- /dev/null +++ b/lib/Target/JSBackend/TargetInfo/LLVMBuild.txt @@ -0,0 +1,23 @@ +;===- ./lib/Target/JSBackend/TargetInfo/LLVMBuild.txt ---------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===-----------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===-----------------------------------------------------------------------===; + +[component_0] +type = Library +name = JSBackendInfo +parent = JSBackend +required_libraries = MC Support Target +add_to_library_groups = JSBackend diff --git a/lib/Target/LLVMBuild.txt b/lib/Target/LLVMBuild.txt index 8be2a898e380..169edb5cf72d 100644 --- a/lib/Target/LLVMBuild.txt +++ b/lib/Target/LLVMBuild.txt @@ -24,6 +24,7 @@ subdirectories = AArch64 AVR BPF + JSBackend Lanai Hexagon MSP430 diff --git a/lib/Transforms/IPO/ConstantMerge.cpp b/lib/Transforms/IPO/ConstantMerge.cpp index d75ed206ad23..66fce3934084 100644 --- a/lib/Transforms/IPO/ConstantMerge.cpp +++ b/lib/Transforms/IPO/ConstantMerge.cpp @@ -81,6 +81,9 @@ static bool mergeConstants(Module &M) { bool MadeChange = false; + // XXX EMSCRIPTEN: mark @__init_array_start as not to be touched + const GlobalValue *InitArrayStart = M.getNamedGlobal("__init_array_start"); + // Iterate constant merging while we are still making progress. Merging two // constants together may allow us to merge other constants together if the // second level constants have initializers which point to the globals that @@ -92,6 +95,10 @@ static bool mergeConstants(Module &M) { GVI != E; ) { GlobalVariable *GV = &*GVI++; + // XXX EMSCRIPTEN: mark @__init_array_start as not to be touched + if (GV == InitArrayStart) + continue; + // If this GV is dead, remove it. GV->removeDeadConstantUsers(); if (GV->use_empty() && GV->hasLocalLinkage()) { diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp index 5b0d5e3bc01e..7abe16b2a239 100644 --- a/lib/Transforms/IPO/GlobalOpt.cpp +++ b/lib/Transforms/IPO/GlobalOpt.cpp @@ -1536,6 +1536,61 @@ static bool optimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal, return false; } +/// TryToAddRangeMetadata - At this point, we have learned that the only +/// two values ever stored into GV are its initializer and OtherVal. See if we +/// can annotate loads from it with range metadata describing this. +/// This exposes the values to other scalar optimizations. +static bool TryToAddRangeMetadata(GlobalVariable *GV, Constant *OtherVal) { + Type *GVElType = GV->getType()->getElementType(); + + // If GVElType is already i1, it already has a minimal range. 
If the type of + // the GV is an FP value, pointer or vector, don't do this optimization + // because range metadata is currently only supported on scalar integers. + if (GVElType == Type::getInt1Ty(GV->getContext()) || + GVElType->isFloatingPointTy() || + GVElType->isPointerTy() || GVElType->isVectorTy()) + return false; + + // Walk the use list of the global seeing if all the uses are load or store. + // If there is anything else, bail out. + for (User *U : GV->users()) + if (!isa(U) && !isa(U)) + return false; + + Constant *InitVal = GV->getInitializer(); + assert(InitVal->getType() != Type::getInt1Ty(GV->getContext()) && + "No reason to add range metadata!"); + + // The MD_range metadata only supports absolute integer constants. + if (!isa(InitVal) || !isa(OtherVal)) + return false; + + DEBUG(dbgs() << " *** ADDING RANGE METADATA: " << *GV); + + for (Value::user_iterator I = GV->user_begin(), E = GV->user_end(); I != E; ++I){ + Instruction *UI = cast(*I); + if (LoadInst *LI = dyn_cast(UI)) { + // If we already have a range, don't add a new one, so that GlobalOpt + // terminates. In theory, we could merge the two ranges. + if (LI->getMetadata(LLVMContext::MD_range)) + return false; + // Add range metadata to the load. We have two possible values, and we + // need to create a half-open range. The range can wrap, so we can use + // either signed or unsigned; we pick signed because it might be prettier + // in common cases. + Constant *Cmp = ConstantExpr::getICmp(ICmpInst::ICMP_SLT, InitVal, OtherVal); + Constant *One = ConstantInt::get(LI->getType(), 1); + SmallVector NewOps; + NewOps.push_back(ConstantAsMetadata::get(ConstantExpr::getSelect(Cmp, InitVal, OtherVal))); + NewOps.push_back(ConstantAsMetadata::get(ConstantExpr::getAdd(ConstantExpr::getSelect(Cmp, OtherVal, InitVal), One))); + MDNode *MD = MDNode::get(LI->getContext(), NewOps); + LI->setMetadata(LLVMContext::MD_range, MD); + } + } + + return true; +} + /// At this point, we have learned that the only two values ever stored into GV /// are its initializer and OtherVal. See if we can shrink the global into a /// boolean and select between the two values whenever it is used. This exposes @@ -1915,9 +1970,10 @@ static bool processInternalGlobal( // Otherwise, if the global was not a boolean, we can shrink it to be a // boolean. 
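The metadata attached above is the half-open signed interval covering exactly the two values the global can ever hold, so later scalar optimizations can reason about loads from it. A small standalone sketch of that interval computation, mirroring the select-on-signed-compare logic in TryToAddRangeMetadata (the printed IR-like syntax is for illustration only):

#include <cstdio>

// Given the global's initializer and the one other value stored to it, build
// the half-open range [lo, hi + 1) carried by the !range metadata on each load.
static void make_range(int init, int other, int out[2]) {
  bool cmp = init < other;            // signed compare, as in the patch
  out[0] = cmp ? init : other;        // lower bound, inclusive
  out[1] = (cmp ? other : init) + 1;  // upper bound, exclusive
}

int main() {
  int r[2];
  make_range(0, 17, r);
  std::printf("!range !{i32 %d, i32 %d}\n", r[0], r[1]); // [0, 18)
  make_range(42, -5, r);
  std::printf("!range !{i32 %d, i32 %d}\n", r[0], r[1]); // [-5, 43)
  return 0;
}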
+ // XXX EMSCRIPTEN - add range metadata instead if (Constant *SOVConstant = dyn_cast(GS.StoredOnceValue)) { if (GS.Ordering == AtomicOrdering::NotAtomic) { - if (TryToShrinkGlobalToBoolean(GV, SOVConstant)) { + if (TryToAddRangeMetadata(GV, SOVConstant)) { // XXX EMSCRIPTEN ++NumShrunkToBool; return true; } diff --git a/readme-emscripten-fastcomp.txt b/readme-emscripten-fastcomp.txt new file mode 100644 index 000000000000..3df85ca78c08 --- /dev/null +++ b/readme-emscripten-fastcomp.txt @@ -0,0 +1,18 @@ +source info: + +llvm: + +origin https://chromium.googlesource.com/native_client/pnacl-llvm + +commit 7026af7138fccfb256456b04b375d39b025bdb7c +Author: Karl Schimpf +Date: Thu Nov 21 10:34:00 2013 -0800 + +clang: + +origin https://chromium.googlesource.com/native_client/pnacl-clang + +commit a963b803407c9d1cac644cc425004e0ccd28fa45 +Author: JF Bastien +Date: Fri Nov 8 15:52:28 2013 -0800 + diff --git a/test/CodeGen/JS/aliases.ll b/test/CodeGen/JS/aliases.ll new file mode 100644 index 000000000000..11ebda5ff21c --- /dev/null +++ b/test/CodeGen/JS/aliases.ll @@ -0,0 +1,52 @@ +; RUN: llc < %s | FileCheck %s + +target datalayout = "e-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-p:32:32:32-v128:32:128-n32-S128" +target triple = "asmjs-unknown-emscripten" + +@.str = private unnamed_addr constant [18 x i8] c"hello, world! %d\0A\00", align 1 ; [#uses=1 type=[18 x i8]*] + +@othername = internal alias void (i32), void (i32)* @doit +@othername2 = internal alias void (i32), void (i32)* @othername +@othername3 = internal alias void (i32), void (i32)* @othername2 +@othername4 = internal alias void (), bitcast (void (i32)* @othername2 to void ()*) + +@list = global i32 ptrtoint (void ()* @othername4 to i32) +@list2 = global <{ i32, i32, i32, i32, i32 }> <{ i32 ptrtoint (void (i32)* @doit to i32), i32 ptrtoint (void (i32)* @othername to i32), i32 ptrtoint (void (i32)* @othername2 to i32), i32 ptrtoint (void (i32)* @othername3 to i32), i32 ptrtoint (void ()* @othername4 to i32) }> + + +@value = global i32 17 +@value2 = alias i32, i32* @value +@value3 = alias i32, i32* @value + +define internal void @doit(i32 %x) { + %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str, i32 0, i32 0), i32 %x) ; [#uses=0 type=i32] + ret void +} + +;;; we just check for compilation to succeed here, specifically of @list and @list2 +; CHECK: function _main() { +; CHECK: } + +define i32 @main() { +entry: + call void () @othername4() + %fp = ptrtoint void ()* @othername4 to i32 + %fp1 = add i32 %fp, 0 + %pf = inttoptr i32 %fp1 to void (i32)* + %x = load i32, i32* @value3 + call void (i32) %pf(i32 %x) + %x1 = load i32, i32* @value2 + call void (i32) @othername3(i32 %x1) + %x2 = load i32, i32* @value + call void (i32) @othername2(i32 %x2) + store i32 18, i32* @value + %x3 = load i32, i32* @value + call void (i32) @othername(i32 %x3) + store i32 19, i32* @value3 + %x4 = load i32, i32* @value3 + call void (i32) @doit(i32 %x4) + ret i32 1 +} + +declare i32 @printf(i8*, ...) + diff --git a/test/CodeGen/JS/alloca-contradiction.ll b/test/CodeGen/JS/alloca-contradiction.ll new file mode 100644 index 000000000000..82b1bf87c9fe --- /dev/null +++ b/test/CodeGen/JS/alloca-contradiction.ll @@ -0,0 +1,35 @@ +; RUN: llc < %s + +; In theory, the @llvm.lifetime intrinsics shouldn't contradict each other, but +; in practice they apparently do sometimes. When they do, we should probably be +; conservative. 
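The conservatism this test asks for concerns stack-slot coalescing: two allocas may share a slot only when their lifetimes provably never overlap, and contradictory lifetime markers must be treated as overlapping. A C++-level analogy with invented names, contrasting lifetimes a compiler may merge with ones it must keep separate:

#include <cstdio>

// 'disjoint' has two buffers whose lifetimes never overlap, so a compiler may
// give them a single stack slot; 'overlapping' keeps both live across every
// loop iteration (like %p and %q in the test below), so they need two slots.
static void disjoint() {
  { char a[16]; std::snprintf(a, sizeof(a), "first");  std::puts(a); }
  { char b[16]; std::snprintf(b, sizeof(b), "second"); std::puts(b); }
}

static void overlapping() {
  char p[16], q[16];
  for (int i = 0; i < 2; ++i) {
    std::snprintf(p, sizeof(p), "p%d", i);
    std::snprintf(q, sizeof(q), "q%d", i);
    std::puts(p);
    std::puts(q); // both buffers are live here, in every iteration
  }
}

int main() {
  disjoint();
  overlapping();
  return 0;
}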
+ +target datalayout = "e-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-p:32:32:32-v128:32:128-n32-S128" +target triple = "asmjs-unknown-emscripten" + +; Don't merge these two allocas, even though lifetime markers may initially +; appear to indicate that it's safe, because they also indicate that it's +; unsafe. + +; CHECK: foo +; CHECK: HEAP8[$p] = 0; +; CHECK: HEAP8[$q] = 1; +define void @foo() nounwind { +entry: + %p = alloca i8 + %q = alloca i8 + br label %loop + +loop: + call void @llvm.lifetime.end(i64 1, i8* %q) + store volatile i8 0, i8* %p + store volatile i8 1, i8* %q + call void @llvm.lifetime.start(i64 1, i8* %p) + br i1 undef, label %loop, label %end + +end: ; preds = %red + ret void +} + +declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind +declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind diff --git a/test/CodeGen/JS/allocamanager-phis.ll b/test/CodeGen/JS/allocamanager-phis.ll new file mode 100644 index 000000000000..c04a21245ef4 --- /dev/null +++ b/test/CodeGen/JS/allocamanager-phis.ll @@ -0,0 +1,142 @@ +; RUN: llc < %s | FileCheck %s + +; Lifetime intrinsics are typically just referencing a single alloca, but +; sometimes PRE decides to totally optimize a redundant bitcast and insert +; phis. We need to look through the phis. In the code below, l_1565.i has +; an overlapping lifetime with l_766.i which is only visible if we can +; see through phis. + +; CHECK: $vararg_buffer3 = sp; +; CHECK: $l_1565$i = sp + 16|0; +; CHECK: $l_766$i = sp + 12|0; + +target datalayout = "e-p:32:32-i64:64-v128:32:128-n32-S128" +target triple = "asmjs-unknown-emscripten" + +@g_15 = external hidden global [4 x i8], align 4 +@g_285 = external hidden global [4 x i8], align 4 +@g_423 = external hidden global i32, align 4 +@g_779 = external hidden global [4 x i8], align 4 +@g_784 = external hidden global [4 x i8], align 4 +@.str = external hidden unnamed_addr constant [25 x i8], align 1 +@.str1 = external hidden unnamed_addr constant [25 x i8], align 1 +@.str2 = external hidden unnamed_addr constant [15 x i8], align 1 +@.str3 = external hidden unnamed_addr constant [8 x i8], align 1 +@__func__._Z6func_6v = external hidden unnamed_addr constant [7 x i8], align 1 + +; Function Attrs: nounwind +declare i32 @printf(i8* nocapture readonly, i8* noalias) #0 + +; Function Attrs: noreturn +declare void @__assert_fail(i8*, i8*, i32, i8*) #1 + +define void @test() { +entry: + %vararg_buffer3 = alloca <{ i32*, i32**, i32* }>, align 8 + %vararg_lifetime_bitcast4 = bitcast <{ i32*, i32**, i32* }>* %vararg_buffer3 to i8* + %vararg_buffer = alloca <{ i32*, i32**, i32* }>, align 8 + %vararg_lifetime_bitcast = bitcast <{ i32*, i32**, i32* }>* %vararg_buffer to i8* + %l_767.i.i = alloca i32, align 4 + %l_1565.i = alloca i32*, align 4 + %l_767.i = alloca i32, align 4 + %l_766.i = alloca [1 x i16*], align 4 + %0 = load i32, i32* bitcast ([4 x i8]* @g_15 to i32*), align 4 + %tobool = icmp eq i32 %0, 0 + br i1 %tobool, label %if.then, label %entry.if.end_crit_edge + +entry.if.end_crit_edge: ; preds = %entry + %.pre = bitcast [1 x i16*]* %l_766.i to i8* + %.pre1 = getelementptr inbounds [1 x i16*], [1 x i16*]* %l_766.i, i32 0, i32 0 + br label %if.end + +if.then: ; preds = %entry + %1 = bitcast i32* %l_767.i to i8* + call void @llvm.lifetime.start(i64 4, i8* %1) + %2 = bitcast [1 x i16*]* %l_766.i to i8* + call void @llvm.lifetime.start(i64 4, i8* %2) + store i32 -1407759351, i32* %l_767.i, align 4 + %3 = getelementptr inbounds [1 x i16*], [1 x i16*]* %l_766.i, i32 0, i32 0 + store 
i16* null, i16** %3, align 4 + br label %for.body.i + +for.body.i: ; preds = %for.body.i, %if.then + %l_82.02.i = phi i32 [ 0, %if.then ], [ %inc.i, %for.body.i ] + %4 = load i32**, i32*** bitcast (i32* @g_423 to i32***), align 4 + store i32* %l_767.i, i32** %4, align 4 + store i16** %3, i16*** bitcast ([4 x i8]* @g_779 to i16***), align 4 + %inc.i = add i32 %l_82.02.i, 1 + %exitcond.i = icmp eq i32 %inc.i, 27 + br i1 %exitcond.i, label %_Z7func_34v.exit, label %for.body.i + +_Z7func_34v.exit: ; preds = %for.body.i + call void @llvm.lifetime.end(i64 4, i8* %1) + call void @llvm.lifetime.end(i64 4, i8* %2) + %5 = load i32**, i32*** bitcast (i32* @g_423 to i32***), align 4 + store i32* bitcast ([4 x i8]* @g_285 to i32*), i32** %5, align 4 + br label %if.end + +if.end: ; preds = %_Z7func_34v.exit, %entry.if.end_crit_edge + %.pre-phi2 = phi i16** [ %.pre1, %entry.if.end_crit_edge ], [ %3, %_Z7func_34v.exit ] + %.pre-phi = phi i8* [ %.pre, %entry.if.end_crit_edge ], [ %2, %_Z7func_34v.exit ] + %6 = bitcast i32** %l_1565.i to i8* + call void @llvm.lifetime.start(i64 4, i8* %6) + store i32* bitcast ([4 x i8]* @g_784 to i32*), i32** %l_1565.i, align 4 + call void @llvm.lifetime.start(i64 12, i8* %vararg_lifetime_bitcast) + %vararg_ptr = getelementptr <{ i32*, i32**, i32* }>, <{ i32*, i32**, i32* }>* %vararg_buffer, i32 0, i32 0 + store i32* bitcast ([4 x i8]* @g_784 to i32*), i32** %vararg_ptr, align 4 + %vararg_ptr1 = getelementptr <{ i32*, i32**, i32* }>, <{ i32*, i32**, i32* }>* %vararg_buffer, i32 0, i32 1 + store i32** %l_1565.i, i32*** %vararg_ptr1, align 4 + %vararg_ptr2 = getelementptr <{ i32*, i32**, i32* }>, <{ i32*, i32**, i32* }>* %vararg_buffer, i32 0, i32 2 + store i32* bitcast ([4 x i8]* @g_784 to i32*), i32** %vararg_ptr2, align 4 + %call.i = call i32 bitcast (i32 (i8*, i8*)* @printf to i32 (i8*, <{ i32*, i32**, i32* }>*)*)(i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str, i32 0, i32 0), <{ i32*, i32**, i32* }>* %vararg_buffer) + call void @llvm.lifetime.end(i64 12, i8* %vararg_lifetime_bitcast) + %7 = bitcast i32* %l_767.i.i to i8* + call void @llvm.lifetime.start(i64 4, i8* %7) + call void @llvm.lifetime.start(i64 4, i8* %.pre-phi) + store i32 -1407759351, i32* %l_767.i.i, align 4 + store i16* null, i16** %.pre-phi2, align 4 + br label %for.body.i.i + +for.body.i.i: ; preds = %for.body.i.i, %if.end + %l_82.02.i.i = phi i32 [ 0, %if.end ], [ %inc.i.i, %for.body.i.i ] + %8 = load i32**, i32*** bitcast (i32* @g_423 to i32***), align 4 + store i32* %l_767.i.i, i32** %8, align 4 + store i16** %.pre-phi2, i16*** bitcast ([4 x i8]* @g_779 to i16***), align 4 + %inc.i.i = add i32 %l_82.02.i.i, 1 + %exitcond.i.i = icmp eq i32 %inc.i.i, 27 + br i1 %exitcond.i.i, label %_Z7func_34v.exit.i, label %for.body.i.i + +_Z7func_34v.exit.i: ; preds = %for.body.i.i + call void @llvm.lifetime.end(i64 4, i8* %7) + call void @llvm.lifetime.end(i64 4, i8* %.pre-phi) + %9 = load i32*, i32** %l_1565.i, align 4 + call void @llvm.lifetime.start(i64 12, i8* %vararg_lifetime_bitcast4) + %vararg_ptr5 = getelementptr <{ i32*, i32**, i32* }>, <{ i32*, i32**, i32* }>* %vararg_buffer3, i32 0, i32 0 + store i32* %9, i32** %vararg_ptr5, align 4 + %vararg_ptr6 = getelementptr <{ i32*, i32**, i32* }>, <{ i32*, i32**, i32* }>* %vararg_buffer3, i32 0, i32 1 + store i32** %l_1565.i, i32*** %vararg_ptr6, align 4 + %vararg_ptr7 = getelementptr <{ i32*, i32**, i32* }>, <{ i32*, i32**, i32* }>* %vararg_buffer3, i32 0, i32 2 + store i32* bitcast ([4 x i8]* @g_784 to i32*), i32** %vararg_ptr7, align 4 + %call1.i = call 
i32 bitcast (i32 (i8*, i8*)* @printf to i32 (i8*, <{ i32*, i32**, i32* }>*)*)(i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str1, i32 0, i32 0), <{ i32*, i32**, i32* }>* %vararg_buffer3) + call void @llvm.lifetime.end(i64 12, i8* %vararg_lifetime_bitcast4) + %10 = load i32*, i32** %l_1565.i, align 4 + %cmp.i = icmp eq i32* %10, bitcast ([4 x i8]* @g_784 to i32*) + br i1 %cmp.i, label %_Z6func_6v.exit, label %lor.rhs.i + +lor.rhs.i: ; preds = %_Z7func_34v.exit.i + call void @__assert_fail(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str2, i32 0, i32 0), i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str3, i32 0, i32 0), i32 33, i8* getelementptr inbounds ([7 x i8], [7 x i8]* @__func__._Z6func_6v, i32 0, i32 0)) #1 + unreachable + +_Z6func_6v.exit: ; preds = %_Z7func_34v.exit.i + call void @llvm.lifetime.end(i64 4, i8* %6) + ret void +} + +; Function Attrs: nounwind +declare void @llvm.lifetime.start(i64, i8* nocapture) #0 + +; Function Attrs: nounwind +declare void @llvm.lifetime.end(i64, i8* nocapture) #0 + +attributes #0 = { nounwind } +attributes #1 = { noreturn } diff --git a/test/CodeGen/JS/allocamanager.ll b/test/CodeGen/JS/allocamanager.ll new file mode 100644 index 000000000000..19f1ca7949f3 --- /dev/null +++ b/test/CodeGen/JS/allocamanager.ll @@ -0,0 +1,165 @@ +; RUN: llc < %s | FileCheck %s + +; Basic AllocaManager feature test. Eliminate user variable cupcake in favor of +; user variable muffin, and combine all the vararg buffers. And align the stack +; pointer. + +target datalayout = "e-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-p:32:32:32-v128:32:128-n32-S128" +target triple = "asmjs-unknown-emscripten" + +%struct._IO_FILE = type opaque + +@stderr = external constant [4 x i8], align 4 +@.str = private unnamed_addr constant [26 x i8] c"hello from %s; argc is %d\00", align 1 +@.str1 = private unnamed_addr constant [33 x i8] c"message from the program: \22%s\22!\0A\00", align 1 +@.str2 = private unnamed_addr constant [38 x i8] c"with argc %d, I, %s, must say goodbye\00", align 1 +@.str3 = private unnamed_addr constant [43 x i8] c"another message from the program: \22%s\22...\0A\00", align 1 + +; CHECK: function _foo($argc,$argv) { +; CHECK-NOT: cupcake +; CHECK: STACKTOP = STACKTOP + 128|0; +; CHECK-NEXT: vararg_buffer0 = +; CHECK-NEXT: $muffin = +; CHECK-NOT: cupcake +; CHECK: } + +; Function Attrs: nounwind +define void @foo(i32 %argc, i8** %argv) #0 { +entry: + %vararg_buffer0 = alloca <{ i8* }>, align 8 + %vararg_lifetime_bitcast10 = bitcast <{ i8* }>* %vararg_buffer0 to i8* + %vararg_buffer5 = alloca <{ i32, i8* }>, align 8 + %vararg_lifetime_bitcast6 = bitcast <{ i32, i8* }>* %vararg_buffer5 to i8* + %vararg_buffer2 = alloca <{ i8* }>, align 8 + %vararg_lifetime_bitcast3 = bitcast <{ i8* }>* %vararg_buffer2 to i8* + %vararg_buffer1 = alloca <{ i8*, i32 }>, align 8 + %vararg_lifetime_bitcast = bitcast <{ i8*, i32 }>* %vararg_buffer1 to i8* + %muffin = alloca [117 x i8], align 1 + %cupcake = alloca [119 x i8], align 1 + %tmp = getelementptr [117 x i8], [117 x i8]* %muffin, i32 0, i32 0 + call void @llvm.lifetime.start(i64 117, i8* %tmp) #0 + %tmp1 = load i8*, i8** %argv, align 4 + call void @llvm.lifetime.start(i64 8, i8* %vararg_lifetime_bitcast) + %vararg_ptr = getelementptr <{ i8*, i32 }>, <{ i8*, i32 }>* %vararg_buffer1, i32 0, i32 0 + store i8* %tmp1, i8** %vararg_ptr, align 4 + %vararg_ptr1 = getelementptr <{ i8*, i32 }>, <{ i8*, i32 }>* %vararg_buffer1, i32 0, i32 1 + store i32 %argc, i32* %vararg_ptr1, align 4 + %call = call i32 
bitcast (i32 (i8*, i8*, i8*)* @sprintf to i32 (i8*, i8*, <{ i8*, i32 }>*)*)(i8* %tmp, i8* getelementptr inbounds ([26 x i8], [26 x i8]* @.str, i32 0, i32 0), <{ i8*, i32 }>* %vararg_buffer1) #0 + call void @llvm.lifetime.end(i64 8, i8* %vararg_lifetime_bitcast) + %tmp2 = load %struct._IO_FILE*, %struct._IO_FILE** bitcast ([4 x i8]* @stderr to %struct._IO_FILE**), align 4 + call void @llvm.lifetime.start(i64 4, i8* %vararg_lifetime_bitcast3) + %vararg_ptr4 = getelementptr <{ i8* }>, <{ i8* }>* %vararg_buffer2, i32 0, i32 0 + store i8* %tmp, i8** %vararg_ptr4, align 4 + %call2 = call i32 bitcast (i32 (%struct._IO_FILE*, i8*, i8*)* @fprintf to i32 (%struct._IO_FILE*, i8*, <{ i8* }>*)*)(%struct._IO_FILE* %tmp2, i8* getelementptr inbounds ([33 x i8], [33 x i8]* @.str1, i32 0, i32 0), <{ i8* }>* %vararg_buffer2) #0 + call void @llvm.lifetime.end(i64 4, i8* %vararg_lifetime_bitcast3) + call void @llvm.lifetime.end(i64 117, i8* %tmp) #0 + %tmp3 = getelementptr [119 x i8], [119 x i8]* %cupcake, i32 0, i32 0 + call void @llvm.lifetime.start(i64 119, i8* %tmp3) #0 + %tmp4 = load i8*, i8** %argv, align 4 + call void @llvm.lifetime.start(i64 8, i8* %vararg_lifetime_bitcast6) + %vararg_ptr7 = getelementptr <{ i32, i8* }>, <{ i32, i8* }>* %vararg_buffer5, i32 0, i32 0 + store i32 %argc, i32* %vararg_ptr7, align 4 + %vararg_ptr8 = getelementptr <{ i32, i8* }>, <{ i32, i8* }>* %vararg_buffer5, i32 0, i32 1 + store i8* %tmp4, i8** %vararg_ptr8, align 4 + %call5 = call i32 bitcast (i32 (i8*, i8*, i8*)* @sprintf to i32 (i8*, i8*, <{ i32, i8* }>*)*)(i8* %tmp3, i8* getelementptr inbounds ([38 x i8], [38 x i8]* @.str2, i32 0, i32 0), <{ i32, i8* }>* %vararg_buffer5) #0 + call void @llvm.lifetime.end(i64 8, i8* %vararg_lifetime_bitcast6) + call void @llvm.lifetime.start(i64 4, i8* %vararg_lifetime_bitcast10) + %vararg_ptr11 = getelementptr <{ i8* }>, <{ i8* }>* %vararg_buffer0, i32 0, i32 0 + store i8* %tmp3, i8** %vararg_ptr11, align 4 + %call7 = call i32 bitcast (i32 (%struct._IO_FILE*, i8*, i8*)* @fprintf to i32 (%struct._IO_FILE*, i8*, <{ i8* }>*)*)(%struct._IO_FILE* %tmp2, i8* getelementptr inbounds ([43 x i8], [43 x i8]* @.str3, i32 0, i32 0), <{ i8* }>* %vararg_buffer0) #0 + call void @llvm.lifetime.end(i64 4, i8* %vararg_lifetime_bitcast10) + call void @llvm.lifetime.end(i64 119, i8* %tmp3) #0 + ret void +} + +; CHECK: function _bar($argc,$argv) { +; CHECK-NOT: cupcake +; CHECK: STACKTOP = STACKTOP + 128|0; +; CHECK-NEXT: vararg_buffer0 = +; CHECK-NEXT: $muffin = +; CHECK-NOT: cupcake +; CHECK: } + +; Function Attrs: nounwind +define void @bar(i32 %argc, i8** %argv) #0 { +entry: + %vararg_buffer0 = alloca <{ i8* }>, align 8 + %vararg_lifetime_bitcast10 = bitcast <{ i8* }>* %vararg_buffer0 to i8* + %vararg_buffer5 = alloca <{ i32, i8* }>, align 8 + %vararg_lifetime_bitcast6 = bitcast <{ i32, i8* }>* %vararg_buffer5 to i8* + %vararg_buffer2 = alloca <{ i8* }>, align 8 + %vararg_lifetime_bitcast3 = bitcast <{ i8* }>* %vararg_buffer2 to i8* + %vararg_buffer1 = alloca <{ i8*, i32 }>, align 8 + %vararg_lifetime_bitcast = bitcast <{ i8*, i32 }>* %vararg_buffer1 to i8* + %muffin = alloca [117 x i8], align 1 + %cupcake = alloca [119 x i8], align 1 + %tmp = getelementptr [117 x i8], [117 x i8]* %muffin, i32 0, i32 0 + call void @llvm.lifetime.start(i64 117, i8* %tmp) #0 + %cmp = icmp eq i32 %argc, 39 + br i1 %cmp, label %if.end.thread, label %if.end + +if.end.thread: ; preds = %entry + call void @llvm.lifetime.end(i64 117, i8* %tmp) #0 + %tmp1 = getelementptr [119 x i8], [119 x i8]* %cupcake, i32 0, i32 0 + call 
void @llvm.lifetime.start(i64 119, i8* %tmp1) #0 + %.pre = load %struct._IO_FILE*, %struct._IO_FILE** bitcast ([4 x i8]* @stderr to %struct._IO_FILE**), align 4 + br label %if.then4 + +if.end: ; preds = %entry + %tmp2 = load i8*, i8** %argv, align 4 + call void @llvm.lifetime.start(i64 8, i8* %vararg_lifetime_bitcast) + %vararg_ptr = getelementptr <{ i8*, i32 }>, <{ i8*, i32 }>* %vararg_buffer1, i32 0, i32 0 + store i8* %tmp2, i8** %vararg_ptr, align 4 + %vararg_ptr1 = getelementptr <{ i8*, i32 }>, <{ i8*, i32 }>* %vararg_buffer1, i32 0, i32 1 + store i32 %argc, i32* %vararg_ptr1, align 4 + %call = call i32 bitcast (i32 (i8*, i8*, i8*)* @sprintf to i32 (i8*, i8*, <{ i8*, i32 }>*)*)(i8* %tmp, i8* getelementptr inbounds ([26 x i8], [26 x i8]* @.str, i32 0, i32 0), <{ i8*, i32 }>* %vararg_buffer1) #0 + call void @llvm.lifetime.end(i64 8, i8* %vararg_lifetime_bitcast) + %tmp3 = load %struct._IO_FILE*, %struct._IO_FILE** bitcast ([4 x i8]* @stderr to %struct._IO_FILE**), align 4 + call void @llvm.lifetime.start(i64 4, i8* %vararg_lifetime_bitcast3) + %vararg_ptr4 = getelementptr <{ i8* }>, <{ i8* }>* %vararg_buffer2, i32 0, i32 0 + store i8* %tmp, i8** %vararg_ptr4, align 4 + %call2 = call i32 bitcast (i32 (%struct._IO_FILE*, i8*, i8*)* @fprintf to i32 (%struct._IO_FILE*, i8*, <{ i8* }>*)*)(%struct._IO_FILE* %tmp3, i8* getelementptr inbounds ([33 x i8], [33 x i8]* @.str1, i32 0, i32 0), <{ i8* }>* %vararg_buffer2) #0 + call void @llvm.lifetime.end(i64 4, i8* %vararg_lifetime_bitcast3) + call void @llvm.lifetime.end(i64 117, i8* %tmp) #0 + %tmp4 = getelementptr [119 x i8], [119 x i8]* %cupcake, i32 0, i32 0 + call void @llvm.lifetime.start(i64 119, i8* %tmp4) #0 + %cmp3 = icmp eq i32 %argc, 45 + br i1 %cmp3, label %if.end10, label %if.then4 + +if.then4: ; preds = %if.end, %if.end.thread + %tmp5 = phi %struct._IO_FILE* [ %.pre, %if.end.thread ], [ %tmp3, %if.end ] + %tmp6 = phi i8* [ %tmp1, %if.end.thread ], [ %tmp4, %if.end ] + %tmp7 = load i8*, i8** %argv, align 4 + call void @llvm.lifetime.start(i64 8, i8* %vararg_lifetime_bitcast6) + %vararg_ptr7 = getelementptr <{ i32, i8* }>, <{ i32, i8* }>* %vararg_buffer5, i32 0, i32 0 + store i32 %argc, i32* %vararg_ptr7, align 4 + %vararg_ptr8 = getelementptr <{ i32, i8* }>, <{ i32, i8* }>* %vararg_buffer5, i32 0, i32 1 + store i8* %tmp7, i8** %vararg_ptr8, align 4 + %call7 = call i32 bitcast (i32 (i8*, i8*, i8*)* @sprintf to i32 (i8*, i8*, <{ i32, i8* }>*)*)(i8* %tmp6, i8* getelementptr inbounds ([38 x i8], [38 x i8]* @.str2, i32 0, i32 0), <{ i32, i8* }>* %vararg_buffer5) #0 + call void @llvm.lifetime.end(i64 8, i8* %vararg_lifetime_bitcast6) + call void @llvm.lifetime.start(i64 4, i8* %vararg_lifetime_bitcast10) + %vararg_ptr11 = getelementptr <{ i8* }>, <{ i8* }>* %vararg_buffer0, i32 0, i32 0 + store i8* %tmp6, i8** %vararg_ptr11, align 4 + %call9 = call i32 bitcast (i32 (%struct._IO_FILE*, i8*, i8*)* @fprintf to i32 (%struct._IO_FILE*, i8*, <{ i8* }>*)*)(%struct._IO_FILE* %tmp5, i8* getelementptr inbounds ([43 x i8], [43 x i8]* @.str3, i32 0, i32 0), <{ i8* }>* %vararg_buffer0) #0 + call void @llvm.lifetime.end(i64 4, i8* %vararg_lifetime_bitcast10) + br label %if.end10 + +if.end10: ; preds = %if.then4, %if.end + %tmp8 = phi i8* [ %tmp4, %if.end ], [ %tmp6, %if.then4 ] + call void @llvm.lifetime.end(i64 119, i8* %tmp8) #0 + ret void +} + +; Function Attrs: nounwind +declare i32 @sprintf(i8*, i8*, i8*) #0 + +; Function Attrs: nounwind +declare i32 @fprintf(%struct._IO_FILE*, i8*, i8*) #0 + +; Function Attrs: nounwind +declare void 
@llvm.lifetime.start(i64, i8* nocapture) #0 + +; Function Attrs: nounwind +declare void @llvm.lifetime.end(i64, i8* nocapture) #0 + +attributes #0 = { nounwind } diff --git a/test/CodeGen/JS/asm.ll b/test/CodeGen/JS/asm.ll new file mode 100644 index 000000000000..c3099e3239ab --- /dev/null +++ b/test/CodeGen/JS/asm.ll @@ -0,0 +1,16 @@ +; RUN: not llc < %s + +; Inline asm isn't supported (yet?). llc should report an error when it +; encounters inline asm. +; +; We could support the special case of an empty inline asm string without much +; work, but code that uses such things most likely isn't portable anyway, and +; there are usually much better alternatives. + +target datalayout = "e-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-p:32:32:32-v128:32:128-n32-S128" +target triple = "asmjs-unknown-emscripten" + +define void @foo() { + call void asm "", ""() + ret void +} diff --git a/test/CodeGen/JS/basics.ll b/test/CodeGen/JS/basics.ll new file mode 100644 index 000000000000..573680f810ee --- /dev/null +++ b/test/CodeGen/JS/basics.ll @@ -0,0 +1,38 @@ +; RUN: llc < %s | FileCheck %s + +target datalayout = "e-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-p:32:32:32-v128:32:128-n32-S128" +target triple = "asmjs-unknown-emscripten" + +; CHECK: function _simple_integer_math( +; CHECK: [[VAL_A:\$[a-z]+]] = [[VAL_A]]|0; +; CHECK: [[VAL_B:\$[a-z]+]] = [[VAL_B]]|0; +; CHECK: [[VAL_C:\$[a-z]+]] = (([[VAL_A]]) + ([[VAL_B]]))|0; +; CHECK: [[VAL_D:\$[a-z]+]] = ([[VAL_C]]*20)|0; +; CHECK: [[VAL_E:\$[a-z]+]] = (([[VAL_D]]|0) / ([[VAL_A]]|0))&-1; +; CHECK: [[VAL_F:\$[a-z]+]] = (([[VAL_E]]) - 3)|0; +; CHECK: return ([[VAL_F]]|0); +define i32 @simple_integer_math(i32 %a, i32 %b) nounwind { + %c = add i32 %a, %b + %d = mul i32 %c, 20 + %e = sdiv i32 %d, %a + %f = sub i32 %e, 3 + ret i32 %f +} + +; CHECK: function _fneg( +; CHECK: [[VAL_D:\$[a-z]+]] = +[[VAL_D]] +; CHECK: [[VAL_F:\$[a-z]+]] = +0 +; CHECK: [[VAL_F]] = -[[VAL_D]] +; CHECK: return (+[[VAL_F]]); +define double @fneg(double %d) nounwind { + %f = fsub double -0.0, %d + ret double %f +} + +; CHECK: function _flt_rounds( +; CHECK: t = 1; +declare i32 @llvm.flt.rounds() +define i32 @flt_rounds() { + %t = call i32 @llvm.flt.rounds() + ret i32 %t +} diff --git a/test/CodeGen/JS/blockchanges.ll b/test/CodeGen/JS/blockchanges.ll new file mode 100644 index 000000000000..fb79af14d735 --- /dev/null +++ b/test/CodeGen/JS/blockchanges.ll @@ -0,0 +1,400 @@ +; RUN: llc < %s + +; regression check for emscripten #3088 - we were not clearing BlockChanges in i64 lowering + +; ModuleID = 'waka.bc' +target datalayout = "e-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-p:32:32:32-v128:32:128-n32-S128" +target triple = "asmjs-unknown-emscripten" + +%"class.ZenLib::uint128" = type <{ i64, i64 }> + +@.str = private unnamed_addr constant [15 x i8] c"hello, world!\0A\00", align 1 + +@.str368164 = external hidden unnamed_addr constant [10 x i8], align 1 +@.str398167 = external hidden unnamed_addr constant [6 x i8], align 1 +@.str718199 = external hidden unnamed_addr constant [9 x i8], align 1 +@.str738201 = external hidden unnamed_addr constant [21 x i8], align 1 +@.str748202 = external hidden unnamed_addr constant [26 x i8], align 1 +@.str758203 = external hidden unnamed_addr constant [21 x i8], align 1 +@.str768204 = external hidden unnamed_addr constant [8 x i8], align 1 +@.str778205 = external hidden unnamed_addr constant [14 x i8], align 1 +@.str788206 = external hidden unnamed_addr constant [22 x i8], align 1 
+@.str798207 = external hidden unnamed_addr constant [25 x i8], align 1 +@.str808208 = external hidden unnamed_addr constant [24 x i8], align 1 +@.str818209 = external hidden unnamed_addr constant [20 x i8], align 1 +@.str828210 = external hidden unnamed_addr constant [34 x i8], align 1 +@.str838211 = external hidden unnamed_addr constant [31 x i8], align 1 +@.str848212 = external hidden unnamed_addr constant [29 x i8], align 1 +@.str858213 = external hidden unnamed_addr constant [44 x i8], align 1 +@.str868214 = external hidden unnamed_addr constant [12 x i8], align 1 +@.str908218 = external hidden unnamed_addr constant [21 x i8], align 1 +@.str918219 = external hidden unnamed_addr constant [8 x i8], align 1 +@.str928220 = external hidden unnamed_addr constant [6 x i8], align 1 +@.str9210864 = external hidden unnamed_addr constant [5 x i8], align 1 +@.str514367 = external hidden unnamed_addr constant [5 x i8], align 1 +@.str214409 = external hidden unnamed_addr constant [4 x i8], align 1 +@.str20216493 = external hidden unnamed_addr constant [3 x i8], align 1 +@.str2017231 = external hidden unnamed_addr constant [11 x i8], align 1 +@.str2317234 = external hidden unnamed_addr constant [14 x i8], align 1 +@.str2417235 = external hidden unnamed_addr constant [4 x i8], align 1 +@.str2717238 = external hidden unnamed_addr constant [5 x i8], align 1 +@.str3217243 = external hidden unnamed_addr constant [4 x i8], align 1 +@.str1717689 = external hidden unnamed_addr constant [5 x i8], align 1 +@.str2104 = external hidden unnamed_addr constant [1 x i8], align 1 + +; Function Attrs: nounwind readonly +define hidden i8* @_ZN12MediaInfoLib22Mxf_EssenceCompressionEN6ZenLib7uint128E(%"class.ZenLib::uint128"* nocapture readonly %EssenceCompression) #0 { +entry: + %hi = getelementptr inbounds %"class.ZenLib::uint128", %"class.ZenLib::uint128"* %EssenceCompression, i32 0, i32 1 + %0 = load i64, i64* %hi, align 1 + %and = and i64 %0, -256 + %cmp = icmp eq i64 %and, 436333716306985216 + br i1 %cmp, label %lor.lhs.false, label %return + +lor.lhs.false: ; preds = %entry + %lo = getelementptr inbounds %"class.ZenLib::uint128", %"class.ZenLib::uint128"* %EssenceCompression, i32 0, i32 0 + %1 = load i64, i64* %lo, align 1 + %and1 = and i64 %1, -72057594037927936 + switch i64 %and1, label %return [ + i64 288230376151711744, label %if.end + i64 1008806316530991104, label %if.end + ] + +if.end: ; preds = %lor.lhs.false, %lor.lhs.false + %shr = lshr i64 %1, 56 + %conv = trunc i64 %shr to i32 + %and10 = lshr i64 %1, 48 + %and14 = lshr i64 %1, 40 + %and18 = lshr i64 %1, 32 + %conv20 = trunc i64 %and18 to i32 + %and22 = lshr i64 %1, 24 + %and26 = lshr i64 %1, 16 + %conv28 = trunc i64 %and26 to i32 + %and30 = lshr i64 %1, 8 + %conv32 = trunc i64 %and30 to i32 + switch i32 %conv, label %return [ + i32 4, label %sw.bb + i32 14, label %sw.bb112 + ] + +sw.bb: ; preds = %if.end + %conv12 = trunc i64 %and10 to i32 + %conv34 = and i32 %conv12, 255 + switch i32 %conv34, label %return [ + i32 1, label %sw.bb35 + i32 2, label %sw.bb64 + ] + +sw.bb35: ; preds = %sw.bb + %conv36 = and i64 %and14, 255 + %cond12 = icmp eq i64 %conv36, 2 + br i1 %cond12, label %sw.bb37, label %return + +sw.bb37: ; preds = %sw.bb35 + %conv38 = and i32 %conv20, 255 + switch i32 %conv38, label %return [ + i32 1, label %sw.bb39 + i32 2, label %sw.bb42 + ] + +sw.bb39: ; preds = %sw.bb37 + %conv40 = and i64 %and22, 255 + %cond14 = icmp eq i64 %conv40, 1 + %. 
= select i1 %cond14, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str214409, i32 0, i32 0), i8* getelementptr inbounds ([1 x i8], [1 x i8]* @.str2104, i32 0, i32 0) + br label %return + +sw.bb42: ; preds = %sw.bb37 + %2 = trunc i64 %and22 to i32 + %conv43 = and i32 %2, 255 + switch i32 %conv43, label %sw.default61 [ + i32 1, label %sw.bb44 + i32 2, label %return + i32 3, label %sw.bb56 + i32 113, label %sw.bb60 + ] + +sw.bb44: ; preds = %sw.bb42 + %conv45 = and i32 %conv28, 255 + switch i32 %conv45, label %sw.default54 [ + i32 0, label %return + i32 1, label %return + i32 2, label %return + i32 3, label %return + i32 4, label %return + i32 17, label %return + i32 32, label %sw.bb52 + i32 48, label %sw.bb53 + i32 49, label %sw.bb53 + i32 50, label %sw.bb53 + i32 51, label %sw.bb53 + i32 52, label %sw.bb53 + i32 53, label %sw.bb53 + i32 54, label %sw.bb53 + i32 55, label %sw.bb53 + i32 56, label %sw.bb53 + i32 57, label %sw.bb53 + i32 58, label %sw.bb53 + i32 59, label %sw.bb53 + i32 60, label %sw.bb53 + i32 61, label %sw.bb53 + i32 62, label %sw.bb53 + i32 63, label %sw.bb53 + ] + +sw.bb52: ; preds = %sw.bb44 + br label %return + +sw.bb53: ; preds = %sw.bb44, %sw.bb44, %sw.bb44, %sw.bb44, %sw.bb44, %sw.bb44, %sw.bb44, %sw.bb44, %sw.bb44, %sw.bb44, %sw.bb44, %sw.bb44, %sw.bb44, %sw.bb44, %sw.bb44, %sw.bb44 + br label %return + +sw.default54: ; preds = %sw.bb44 + br label %return + +sw.bb56: ; preds = %sw.bb42 + %conv57 = and i64 %and26, 255 + %cond13 = icmp eq i64 %conv57, 1 + %.35 = select i1 %cond13, i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str368164, i32 0, i32 0), i8* getelementptr inbounds ([1 x i8], [1 x i8]* @.str2104, i32 0, i32 0) + br label %return + +sw.bb60: ; preds = %sw.bb42 + br label %return + +sw.default61: ; preds = %sw.bb42 + br label %return + +sw.bb64: ; preds = %sw.bb + %conv65 = and i64 %and14, 255 + %cond9 = icmp eq i64 %conv65, 2 + br i1 %cond9, label %sw.bb66, label %return + +sw.bb66: ; preds = %sw.bb64 + %conv67 = and i32 %conv20, 255 + switch i32 %conv67, label %return [ + i32 1, label %sw.bb68 + i32 2, label %sw.bb75 + ] + +sw.bb68: ; preds = %sw.bb66 + %3 = trunc i64 %and22 to i32 + %conv69 = and i32 %3, 255 + switch i32 %conv69, label %sw.default74 [ + i32 0, label %return + i32 1, label %return + i32 126, label %return + i32 127, label %return + ] + +sw.default74: ; preds = %sw.bb68 + br label %return + +sw.bb75: ; preds = %sw.bb66 + %conv76 = and i64 %and22, 255 + %cond10 = icmp eq i64 %conv76, 3 + br i1 %cond10, label %sw.bb77, label %return + +sw.bb77: ; preds = %sw.bb75 + %conv78 = and i32 %conv28, 255 + switch i32 %conv78, label %return [ + i32 1, label %sw.bb79 + i32 2, label %sw.bb84 + i32 3, label %sw.bb92 + i32 4, label %sw.bb96 + ] + +sw.bb79: ; preds = %sw.bb77 + %conv80 = and i32 %conv32, 255 + switch i32 %conv80, label %sw.default83 [ + i32 1, label %return + i32 16, label %sw.bb82 + ] + +sw.bb82: ; preds = %sw.bb79 + br label %return + +sw.default83: ; preds = %sw.bb79 + br label %return + +sw.bb84: ; preds = %sw.bb77 + %conv85 = and i32 %conv32, 255 + switch i32 %conv85, label %sw.default91 [ + i32 1, label %return + i32 4, label %sw.bb87 + i32 5, label %sw.bb88 + i32 6, label %sw.bb89 + i32 28, label %sw.bb90 + ] + +sw.bb87: ; preds = %sw.bb84 + br label %return + +sw.bb88: ; preds = %sw.bb84 + br label %return + +sw.bb89: ; preds = %sw.bb84 + br label %return + +sw.bb90: ; preds = %sw.bb84 + br label %return + +sw.default91: ; preds = %sw.bb84 + br label %return + +sw.bb92: ; preds = %sw.bb77 + %conv93 = and i64 %and30, 255 + 
%cond11 = icmp eq i64 %conv93, 1 + %.36 = select i1 %cond11, i8* getelementptr inbounds ([14 x i8], [14 x i8]* @.str778205, i32 0, i32 0), i8* getelementptr inbounds ([1 x i8], [1 x i8]* @.str2104, i32 0, i32 0) + br label %return + +sw.bb96: ; preds = %sw.bb77 + %conv97 = and i32 %conv32, 255 + switch i32 %conv97, label %sw.default106 [ + i32 1, label %return + i32 2, label %sw.bb99 + i32 3, label %sw.bb100 + i32 4, label %sw.bb101 + i32 5, label %sw.bb102 + i32 6, label %sw.bb103 + i32 7, label %sw.bb104 + i32 8, label %sw.bb105 + ] + +sw.bb99: ; preds = %sw.bb96 + br label %return + +sw.bb100: ; preds = %sw.bb96 + br label %return + +sw.bb101: ; preds = %sw.bb96 + br label %return + +sw.bb102: ; preds = %sw.bb96 + br label %return + +sw.bb103: ; preds = %sw.bb96 + br label %return + +sw.bb104: ; preds = %sw.bb96 + br label %return + +sw.bb105: ; preds = %sw.bb96 + br label %return + +sw.default106: ; preds = %sw.bb96 + br label %return + +sw.bb112: ; preds = %if.end + %4 = trunc i64 %and10 to i32 + %conv113 = and i32 %4, 255 + switch i32 %conv113, label %return [ + i32 4, label %sw.bb114 + i32 6, label %sw.bb127 + ] + +sw.bb114: ; preds = %sw.bb112 + %conv115 = and i64 %and14, 255 + %cond5 = icmp eq i64 %conv115, 2 + %conv117 = and i64 %and18, 255 + %cond6 = icmp eq i64 %conv117, 1 + %or.cond = and i1 %cond5, %cond6 + %conv119 = and i64 %and22, 255 + %cond7 = icmp eq i64 %conv119, 2 + %or.cond39 = and i1 %or.cond, %cond7 + br i1 %or.cond39, label %sw.bb120, label %return + +sw.bb120: ; preds = %sw.bb114 + %conv121 = and i64 %and26, 255 + %cond8 = icmp eq i64 %conv121, 4 + %.37 = select i1 %cond8, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str514367, i32 0, i32 0), i8* getelementptr inbounds ([1 x i8], [1 x i8]* @.str2104, i32 0, i32 0) + br label %return + +sw.bb127: ; preds = %sw.bb112 + %conv128 = and i64 %and14, 255 + %cond = icmp eq i64 %conv128, 4 + %conv130 = and i64 %and18, 255 + %cond1 = icmp eq i64 %conv130, 1 + %or.cond40 = and i1 %cond, %cond1 + %conv132 = and i64 %and22, 255 + %cond2 = icmp eq i64 %conv132, 2 + %or.cond41 = and i1 %or.cond40, %cond2 + %conv134 = and i64 %and26, 255 + %cond3 = icmp eq i64 %conv134, 4 + %or.cond42 = and i1 %or.cond41, %cond3 + br i1 %or.cond42, label %sw.bb135, label %return + +sw.bb135: ; preds = %sw.bb127 + %conv136 = and i64 %and30, 255 + %cond4 = icmp eq i64 %conv136, 2 + %.38 = select i1 %cond4, i8* getelementptr inbounds ([12 x i8], [12 x i8]* @.str868214, i32 0, i32 0), i8* getelementptr inbounds ([1 x i8], [1 x i8]* @.str2104, i32 0, i32 0) + br label %return + +return: ; preds = %sw.bb135, %sw.bb127, %sw.bb120, %sw.bb114, %sw.bb112, %sw.default106, %sw.bb105, %sw.bb104, %sw.bb103, %sw.bb102, %sw.bb101, %sw.bb100, %sw.bb99, %sw.bb96, %sw.bb92, %sw.default91, %sw.bb90, %sw.bb89, %sw.bb88, %sw.bb87, %sw.bb84, %sw.default83, %sw.bb82, %sw.bb79, %sw.bb77, %sw.bb75, %sw.default74, %sw.bb68, %sw.bb68, %sw.bb68, %sw.bb68, %sw.bb66, %sw.bb64, %sw.default61, %sw.bb60, %sw.bb56, %sw.default54, %sw.bb53, %sw.bb52, %sw.bb44, %sw.bb44, %sw.bb44, %sw.bb44, %sw.bb44, %sw.bb44, %sw.bb42, %sw.bb39, %sw.bb37, %sw.bb35, %sw.bb, %if.end, %lor.lhs.false, %entry + %retval.0 = phi i8* [ getelementptr inbounds ([1 x i8], [1 x i8]* @.str2104, i32 0, i32 0), %sw.default106 ], [ getelementptr inbounds ([44 x i8], [44 x i8]* @.str858213, i32 0, i32 0), %sw.bb105 ], [ getelementptr inbounds ([29 x i8], [29 x i8]* @.str848212, i32 0, i32 0), %sw.bb104 ], [ getelementptr inbounds ([31 x i8], [31 x i8]* @.str838211, i32 0, i32 0), %sw.bb103 ], [ getelementptr 
inbounds ([34 x i8], [34 x i8]* @.str828210, i32 0, i32 0), %sw.bb102 ], [ getelementptr inbounds ([20 x i8], [20 x i8]* @.str818209, i32 0, i32 0), %sw.bb101 ], [ getelementptr inbounds ([24 x i8], [24 x i8]* @.str808208, i32 0, i32 0), %sw.bb100 ], [ getelementptr inbounds ([25 x i8], [25 x i8]* @.str798207, i32 0, i32 0), %sw.bb99 ], [ getelementptr inbounds ([1 x i8], [1 x i8]* @.str2104, i32 0, i32 0), %sw.default91 ], [ getelementptr inbounds ([8 x i8], [8 x i8]* @.str768204, i32 0, i32 0), %sw.bb90 ], [ getelementptr inbounds ([21 x i8], [21 x i8]* @.str758203, i32 0, i32 0), %sw.bb89 ], [ getelementptr inbounds ([26 x i8], [26 x i8]* @.str748202, i32 0, i32 0), %sw.bb88 ], [ getelementptr inbounds ([21 x i8], [21 x i8]* @.str738201, i32 0, i32 0), %sw.bb87 ], [ getelementptr inbounds ([1 x i8], [1 x i8]* @.str2104, i32 0, i32 0), %sw.default83 ], [ getelementptr inbounds ([9 x i8], [9 x i8]* @.str718199, i32 0, i32 0), %sw.bb82 ], [ getelementptr inbounds ([1 x i8], [1 x i8]* @.str2104, i32 0, i32 0), %sw.default74 ], [ getelementptr inbounds ([1 x i8], [1 x i8]* @.str2104, i32 0, i32 0), %sw.default61 ], [ getelementptr inbounds ([5 x i8], [5 x i8]* @.str514367, i32 0, i32 0), %sw.bb60 ], [ getelementptr inbounds ([1 x i8], [1 x i8]* @.str2104, i32 0, i32 0), %sw.default54 ], [ getelementptr inbounds ([4 x i8], [4 x i8]* @.str2417235, i32 0, i32 0), %sw.bb53 ], [ getelementptr inbounds ([14 x i8], [14 x i8]* @.str2317234, i32 0, i32 0), %sw.bb52 ], [ getelementptr inbounds ([1 x i8], [1 x i8]* @.str2104, i32 0, i32 0), %lor.lhs.false ], [ getelementptr inbounds ([1 x i8], [1 x i8]* @.str2104, i32 0, i32 0), %entry ], [ %., %sw.bb39 ], [ getelementptr inbounds ([11 x i8], [11 x i8]* @.str2017231, i32 0, i32 0), %sw.bb44 ], [ getelementptr inbounds ([11 x i8], [11 x i8]* @.str2017231, i32 0, i32 0), %sw.bb44 ], [ getelementptr inbounds ([11 x i8], [11 x i8]* @.str2017231, i32 0, i32 0), %sw.bb44 ], [ getelementptr inbounds ([11 x i8], [11 x i8]* @.str2017231, i32 0, i32 0), %sw.bb44 ], [ getelementptr inbounds ([11 x i8], [11 x i8]* @.str2017231, i32 0, i32 0), %sw.bb44 ], [ getelementptr inbounds ([11 x i8], [11 x i8]* @.str2017231, i32 0, i32 0), %sw.bb44 ], [ getelementptr inbounds ([3 x i8], [3 x i8]* @.str20216493, i32 0, i32 0), %sw.bb42 ], [ %.35, %sw.bb56 ], [ getelementptr inbounds ([1 x i8], [1 x i8]* @.str2104, i32 0, i32 0), %sw.bb37 ], [ getelementptr inbounds ([1 x i8], [1 x i8]* @.str2104, i32 0, i32 0), %sw.bb35 ], [ getelementptr inbounds ([4 x i8], [4 x i8]* @.str3217243, i32 0, i32 0), %sw.bb68 ], [ getelementptr inbounds ([4 x i8], [4 x i8]* @.str3217243, i32 0, i32 0), %sw.bb68 ], [ getelementptr inbounds ([4 x i8], [4 x i8]* @.str3217243, i32 0, i32 0), %sw.bb68 ], [ getelementptr inbounds ([4 x i8], [4 x i8]* @.str3217243, i32 0, i32 0), %sw.bb68 ], [ getelementptr inbounds ([6 x i8], [6 x i8]* @.str398167, i32 0, i32 0), %sw.bb79 ], [ getelementptr inbounds ([5 x i8], [5 x i8]* @.str2717238, i32 0, i32 0), %sw.bb84 ], [ %.36, %sw.bb92 ], [ getelementptr inbounds ([22 x i8], [22 x i8]* @.str788206, i32 0, i32 0), %sw.bb96 ], [ getelementptr inbounds ([1 x i8], [1 x i8]* @.str2104, i32 0, i32 0), %sw.bb77 ], [ getelementptr inbounds ([1 x i8], [1 x i8]* @.str2104, i32 0, i32 0), %sw.bb75 ], [ getelementptr inbounds ([1 x i8], [1 x i8]* @.str2104, i32 0, i32 0), %sw.bb66 ], [ getelementptr inbounds ([1 x i8], [1 x i8]* @.str2104, i32 0, i32 0), %sw.bb64 ], [ getelementptr inbounds ([1 x i8], [1 x i8]* @.str2104, i32 0, i32 0), %sw.bb ], [ %.37, %sw.bb120 ], [ 
getelementptr inbounds ([1 x i8], [1 x i8]* @.str2104, i32 0, i32 0), %sw.bb114 ], [ %.38, %sw.bb135 ], [ getelementptr inbounds ([1 x i8], [1 x i8]* @.str2104, i32 0, i32 0), %sw.bb127 ], [ getelementptr inbounds ([1 x i8], [1 x i8]* @.str2104, i32 0, i32 0), %sw.bb112 ], [ getelementptr inbounds ([1 x i8], [1 x i8]* @.str2104, i32 0, i32 0), %if.end ] + ret i8* %retval.0 +} + +; Function Attrs: nounwind readonly +define hidden i8* @_ZN12MediaInfoLib27Mxf_Sequence_DataDefinitionEN6ZenLib7uint128E(%"class.ZenLib::uint128"* nocapture readonly %DataDefinition) #0 { +entry: + %lo = getelementptr inbounds %"class.ZenLib::uint128", %"class.ZenLib::uint128"* %DataDefinition, i32 0, i32 0 + %0 = load i64, i64* %lo, align 1 + %and = lshr i64 %0, 32 + %conv = trunc i64 %and to i32 + %and2 = lshr i64 %0, 24 + %conv5 = and i32 %conv, 255 + switch i32 %conv5, label %return [ + i32 1, label %sw.bb + i32 2, label %sw.bb9 + ] + +sw.bb: ; preds = %entry + %conv4 = trunc i64 %and2 to i32 + %conv6 = and i32 %conv4, 255 + switch i32 %conv6, label %sw.default [ + i32 1, label %return + i32 2, label %return + i32 3, label %return + i32 16, label %sw.bb8 + ] + +sw.bb8: ; preds = %sw.bb + br label %return + +sw.default: ; preds = %sw.bb + br label %return + +sw.bb9: ; preds = %entry + %1 = trunc i64 %and2 to i32 + %conv10 = and i32 %1, 255 + switch i32 %conv10, label %sw.default14 [ + i32 1, label %return + i32 2, label %sw.bb12 + i32 3, label %sw.bb13 + ] + +sw.bb12: ; preds = %sw.bb9 + br label %return + +sw.bb13: ; preds = %sw.bb9 + br label %return + +sw.default14: ; preds = %sw.bb9 + br label %return + +return: ; preds = %sw.default14, %sw.bb13, %sw.bb12, %sw.bb9, %sw.default, %sw.bb8, %sw.bb, %sw.bb, %sw.bb, %entry + %retval.0 = phi i8* [ getelementptr inbounds ([1 x i8], [1 x i8]* @.str2104, i32 0, i32 0), %sw.default14 ], [ getelementptr inbounds ([5 x i8], [5 x i8]* @.str1717689, i32 0, i32 0), %sw.bb13 ], [ getelementptr inbounds ([6 x i8], [6 x i8]* @.str928220, i32 0, i32 0), %sw.bb12 ], [ getelementptr inbounds ([1 x i8], [1 x i8]* @.str2104, i32 0, i32 0), %sw.default ], [ getelementptr inbounds ([21 x i8], [21 x i8]* @.str908218, i32 0, i32 0), %sw.bb8 ], [ getelementptr inbounds ([5 x i8], [5 x i8]* @.str9210864, i32 0, i32 0), %sw.bb ], [ getelementptr inbounds ([5 x i8], [5 x i8]* @.str9210864, i32 0, i32 0), %sw.bb ], [ getelementptr inbounds ([5 x i8], [5 x i8]* @.str9210864, i32 0, i32 0), %sw.bb ], [ getelementptr inbounds ([8 x i8], [8 x i8]* @.str918219, i32 0, i32 0), %sw.bb9 ], [ getelementptr inbounds ([1 x i8], [1 x i8]* @.str2104, i32 0, i32 0), %entry ] + ret i8* %retval.0 +} + + +define i32 @main() { +entry: + %retval = alloca i32, align 4 + store i32 0, i32* %retval + %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str, i32 0, i32 0)) + ret i32 0 +} + +declare i32 @printf(i8*, ...) + +attributes #0 = { nounwind readonly } + diff --git a/test/CodeGen/JS/dead-prototypes.ll b/test/CodeGen/JS/dead-prototypes.ll new file mode 100644 index 000000000000..6d57b5a791b2 --- /dev/null +++ b/test/CodeGen/JS/dead-prototypes.ll @@ -0,0 +1,12 @@ +; RUN: llc < %s | not grep printf + +; llc shouldn't emit any code or bookkeeping for unused declarations. + +target datalayout = "e-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-p:32:32:32-v128:32:128-n32-S128" +target triple = "asmjs-unknown-emscripten" + +define void @foo() { + ret void +} + +declare i32 @printf(i8* nocapture, ...) 
diff --git a/test/CodeGen/JS/expand-i64.ll b/test/CodeGen/JS/expand-i64.ll new file mode 100644 index 000000000000..30971c5ba3dd --- /dev/null +++ b/test/CodeGen/JS/expand-i64.ll @@ -0,0 +1,271 @@ +; RUN: llc < %s | FileCheck %s + +target datalayout = "e-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-p:32:32:32-v128:32:128-n32-S128" +target triple = "asmjs-unknown-emscripten" + +; CHECK: function _add($0,$1,$2,$3) { +; CHECK: $4 = (_i64Add(($0|0),($1|0),($2|0),($3|0))|0); +; CHECK: $5 = tempRet0; +; CHECK: } +define i64 @add(i64 %a, i64 %b) { + %c = add i64 %a, %b + ret i64 %c +} + +; CHECK: function _sub($0,$1,$2,$3) { +; CHECK: $4 = (_i64Subtract(($0|0),($1|0),($2|0),($3|0))|0); +; CHECK: $5 = tempRet0; +; CHECK: } +define i64 @sub(i64 %a, i64 %b) { + %c = sub i64 %a, %b + ret i64 %c +} + +; CHECK: function _mul($0,$1,$2,$3) { +; CHECK: $4 = (___muldi3(($0|0),($1|0),($2|0),($3|0))|0); +; CHECK: $5 = tempRet0; +; CHECK: } +define i64 @mul(i64 %a, i64 %b) { + %c = mul i64 %a, %b + ret i64 %c +} + +; CHECK: function _sdiv($0,$1,$2,$3) { +; CHECK: $4 = (___divdi3(($0|0),($1|0),($2|0),($3|0))|0); +; CHECK: $5 = tempRet0; +; CHECK: } +define i64 @sdiv(i64 %a, i64 %b) { + %c = sdiv i64 %a, %b + ret i64 %c +} + +; CHECK: function _udiv($0,$1,$2,$3) { +; CHECK: $4 = (___udivdi3(($0|0),($1|0),($2|0),($3|0))|0); +; CHECK: $5 = tempRet0; +; CHECK: } +define i64 @udiv(i64 %a, i64 %b) { + %c = udiv i64 %a, %b + ret i64 %c +} + +; CHECK: function _srem($0,$1,$2,$3) { +; CHECK: $4 = (___remdi3(($0|0),($1|0),($2|0),($3|0))|0); +; CHECK: $5 = tempRet0; +; CHECK: } +define i64 @srem(i64 %a, i64 %b) { + %c = srem i64 %a, %b + ret i64 %c +} + +; CHECK: function _urem($0,$1,$2,$3) { +; CHECK: $4 = (___uremdi3(($0|0),($1|0),($2|0),($3|0))|0); +; CHECK: $5 = tempRet0; +; CHECK: } +define i64 @urem(i64 %a, i64 %b) { + %c = urem i64 %a, %b + ret i64 %c +} + +; CHECK: function _and($0,$1,$2,$3) { +; CHECK: $4 = $0 & $2; +; CHECK: $5 = $1 & $3; +; CHECK: } +define i64 @and(i64 %a, i64 %b) { + %c = and i64 %a, %b + ret i64 %c +} + +; CHECK: function _or($0,$1,$2,$3) { +; CHECK: $4 = $0 | $2; +; CHECK: $5 = $1 | $3; +; CHECK: } +define i64 @or(i64 %a, i64 %b) { + %c = or i64 %a, %b + ret i64 %c +} + +; CHECK: function _xor($0,$1,$2,$3) { +; CHECK: $4 = $0 ^ $2; +; CHECK: $5 = $1 ^ $3; +; CHECK: } +define i64 @xor(i64 %a, i64 %b) { + %c = xor i64 %a, %b + ret i64 %c +} + +; CHECK: function _lshr($0,$1,$2,$3) { +; CHECK: $4 = (_bitshift64Lshr(($0|0),($1|0),($2|0))|0); +; CHECK: $5 = tempRet0; +; CHECK: } +define i64 @lshr(i64 %a, i64 %b) { + %c = lshr i64 %a, %b + ret i64 %c +} + +; CHECK: function _ashr($0,$1,$2,$3) { +; CHECK: $4 = (_bitshift64Ashr(($0|0),($1|0),($2|0))|0); +; CHECK: $5 = tempRet0; +; CHECK: } +define i64 @ashr(i64 %a, i64 %b) { + %c = ashr i64 %a, %b + ret i64 %c +} + +; CHECK: function _shl($0,$1,$2,$3) { +; CHECK: $4 = (_bitshift64Shl(($0|0),($1|0),($2|0))|0); +; CHECK: $5 = tempRet0; +; CHECK: } +define i64 @shl(i64 %a, i64 %b) { + %c = shl i64 %a, %b + ret i64 %c +} + +; CHECK: function _icmp_eq($0,$1,$2,$3) { +; CHECK: $4 = ($0|0)==($2|0); +; CHECK: $5 = ($1|0)==($3|0); +; CHECK: $6 = $4 & $5; +; CHECK: } +define i32 @icmp_eq(i64 %a, i64 %b) { + %c = icmp eq i64 %a, %b + %d = zext i1 %c to i32 + ret i32 %d +} + +; CHECK: function _icmp_ne($0,$1,$2,$3) { +; CHECK: $4 = ($0|0)!=($2|0); +; CHECK: $5 = ($1|0)!=($3|0); +; CHECK: $6 = $4 | $5; +; CHECK: } +define i32 @icmp_ne(i64 %a, i64 %b) { + %c = icmp ne i64 %a, %b + %d = zext i1 %c to i32 + ret i32 %d +} + +; CHECK: function 
_icmp_slt($0,$1,$2,$3) { +; CHECK: $4 = ($1|0)<($3|0); +; CHECK: $5 = ($0>>>0)<($2>>>0); +; CHECK: $6 = ($1|0)==($3|0); +; CHECK: $7 = $6 & $5; +; CHECK: $8 = $4 | $7; +; CHECK: } +define i32 @icmp_slt(i64 %a, i64 %b) { + %c = icmp slt i64 %a, %b + %d = zext i1 %c to i32 + ret i32 %d +} + +; CHECK: function _icmp_ult($0,$1,$2,$3) { +; CHECK: $4 = ($1>>>0)<($3>>>0); +; CHECK: $5 = ($0>>>0)<($2>>>0); +; CHECK: $6 = ($1|0)==($3|0); +; CHECK: $7 = $6 & $5; +; CHECK: $8 = $4 | $7; +; CHECK: } +define i32 @icmp_ult(i64 %a, i64 %b) { + %c = icmp ult i64 %a, %b + %d = zext i1 %c to i32 + ret i32 %d +} + +; CHECK: function _load($a) { +; CHECK: $0 = $a; +; CHECK: $1 = $0; +; CHECK: $2 = HEAP32[$1>>2]|0; +; CHECK: $3 = (($0) + 4)|0; +; CHECK: $4 = $3; +; CHECK: $5 = HEAP32[$4>>2]|0; +; CHECK: } +define i64 @load(i64 *%a) { + %c = load i64, i64* %a + ret i64 %c +} + +; CHECK: function _aligned_load($a) { +; CHECK: $0 = $a; +; CHECK: $1 = $0; +; CHECK: $2 = HEAP32[$1>>2]|0; +; CHECK: $3 = (($0) + 4)|0; +; CHECK: $4 = $3; +; CHECK: $5 = HEAP32[$4>>2]|0; +; CHECK: } +define i64 @aligned_load(i64 *%a) { + %c = load i64, i64* %a, align 16 + ret i64 %c +} + +; CHECK: function _store($a,$0,$1) { +; CHECK: $2 = $a; +; CHECK: $3 = $2; +; CHECK: HEAP32[$3>>2] = $0; +; CHECK: $4 = (($2) + 4)|0; +; CHECK: $5 = $4; +; CHECK: HEAP32[$5>>2] = $1; +; CHECK: } +define void @store(i64 *%a, i64 %b) { + store i64 %b, i64* %a + ret void +} + +; CHECK: function _aligned_store($a,$0,$1) { +; CHECK: $2 = $a; +; CHECK: $3 = $2; +; CHECK: HEAP32[$3>>2] = $0; +; CHECK: $4 = (($2) + 4)|0; +; CHECK: $5 = $4; +; CHECK: HEAP32[$5>>2] = $1; +; CHECK: } +define void @aligned_store(i64 *%a, i64 %b) { + store i64 %b, i64* %a, align 16 + ret void +} + +; CHECK: function _call($0,$1) { +; CHECK: $2 = (_foo(($0|0),($1|0))|0); +; CHECK: } +declare i64 @foo(i64 %arg) +define i64 @call(i64 %arg) { + %ret = call i64 @foo(i64 %arg) + ret i64 %ret +} + +; CHECK: function _trunc($0,$1) { +; CHECK: return ($0|0); +; CHECK: } +define i32 @trunc(i64 %x) { + %y = trunc i64 %x to i32 + ret i32 %y +} + +; CHECK: function _zext($x) { +; CHECK: tempRet0 = (0); +; CHECL: return ($x|0); +; CHECK: } +define i64 @zext(i32 %x) { + %y = zext i32 %x to i64 + ret i64 %y +} + +; CHECK: function _sext($x) { +; CHECK: $0 = ($x|0)<(0); +; CHECK: $1 = $0 << 31 >> 31; +; CHECK: tempRet0 = ($1); +; CHECK: return ($x|0); +; CHECK: } +define i64 @sext(i32 %x) { + %y = sext i32 %x to i64 + ret i64 %y +} + +; CHECK: function _unreachable_blocks($p) { +; CHECK: } +define void @unreachable_blocks(i64* %p) { + ret void + +dead: + %t = load i64, i64* %p + %s = add i64 %t, 1 + store i64 %s, i64* %p + ret void +} + diff --git a/test/CodeGen/JS/expand-insertextract.ll b/test/CodeGen/JS/expand-insertextract.ll new file mode 100644 index 000000000000..984da571bde1 --- /dev/null +++ b/test/CodeGen/JS/expand-insertextract.ll @@ -0,0 +1,31 @@ +; RUN: llc < %s | FileCheck %s + +target datalayout = "e-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-p:32:32:32-v128:32:128-n32-S128" +target triple = "asmjs-unknown-emscripten" + +; CHECK: sp = STACKTOP; +; CHECK: STACKTOP = STACKTOP + 16|0; +; CHECK: $0 = sp; +; CHECK: SIMD_Float32x4_store(HEAPU8, temp_Float32x4_ptr, $p); +; CHECK: $1 = (($0) + ($i<<2)|0); +; CHECK: $2 = +HEAPF32[$1>>2]; +; CHECK: STACKTOP = sp;return (+$2); +; CHECK: } +define float @ext(<4 x float> %p, i32 %i) { + %f = extractelement <4 x float> %p, i32 %i + ret float %f +} + +; CHECK: sp = STACKTOP; +; CHECK: STACKTOP = STACKTOP + 16|0; +; CHECK: $0 = 
sp; +; CHECK: SIMD_Float32x4_store(HEAPU8, temp_Float32x4_ptr, $p); +; CHECK: $1 = (($0) + ($i<<2)|0); +; CHECK: HEAPF32[$1>>2] = $f; +; CHECK: $2 = SIMD_Float32x4_load(HEAPU8, $0); +; CHECK: STACKTOP = sp;return (SIMD_Float32x4_check($2)); +; CHECK: } +define <4 x float> @ins(<4 x float> %p, float %f, i32 %i) { + %v = insertelement <4 x float> %p, float %f, i32 %i + ret <4 x float> %v +} diff --git a/test/CodeGen/JS/expect-intrinsics.ll b/test/CodeGen/JS/expect-intrinsics.ll new file mode 100644 index 000000000000..6d2cba459260 --- /dev/null +++ b/test/CodeGen/JS/expect-intrinsics.ll @@ -0,0 +1,30 @@ +; RUN: llc < %s | FileCheck %s + +; Handle the llvm.expect intrinsic. + +target datalayout = "e-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-p:32:32:32-v128:32:128-n32-S128" +target triple = "asmjs-unknown-emscripten" + +; CHECK: $expval = $x; +; CHECK: $tobool = ($expval|0)!=(0); + +define void @foo(i32 %x) { +entry: + %expval = call i32 @llvm.expect.i32(i32 %x, i32 0) + %tobool = icmp ne i32 %expval, 0 + br i1 %tobool, label %if.then, label %if.end + +if.then: + call void @callee() + br label %if.end + +if.end: + ret void +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.expect.i32(i32, i32) #0 + +declare void @callee() + +attributes #0 = { nounwind readnone } diff --git a/test/CodeGen/JS/ffis-f32.ll b/test/CodeGen/JS/ffis-f32.ll new file mode 100644 index 000000000000..a6b8cf14d462 --- /dev/null +++ b/test/CodeGen/JS/ffis-f32.ll @@ -0,0 +1,90 @@ +; RUN: llc -emscripten-precise-f32 < %s | FileCheck %s + +; Use proper types to ffi calls, with float32 + +target datalayout = "e-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-p:32:32:32-v128:32:128-n32-S128" +target triple = "asmjs-unknown-emscripten" + +declare void @blackbox.float(float %a) +declare void @blackbox.double(double %a) + +; CHECK: (+Math_sqrt(+1)); +; CHECK-NEXT: (Math_fround(Math_sqrt(Math_fround(+1)))); +; CHECK-NEXT: (+Math_sqrt((+$d))); +; CHECK-NEXT: (Math_fround(Math_sqrt((Math_fround($f))))); +; CHECK-NEXT: (+Math_ceil(+1)); +; CHECK-NEXT: (Math_fround(Math_ceil(Math_fround(+1)))); +; CHECK-NEXT: (+Math_floor(+1)); +; CHECK-NEXT: (Math_fround(Math_floor(Math_fround(+1)))); +; CHECK-NEXT: (+_min(+1,+1)); +; CHECK-NEXT: (Math_fround(+(_fmin(+1,+1)))); +; CHECK-NEXT: (+_max(+1,+1)); +; CHECK-NEXT: (Math_fround(+(_fmax(+1,+1)))); +; CHECK-NEXT: (+Math_abs(+1)); +; CHECK-NEXT: (Math_fround(+(_absf(+1)))); +; CHECK-NEXT: (+Math_sin(+1)); +; CHECK-NEXT: (Math_fround(+(Math_sin(+1)))); +define void @foo(i32 %x) { +entry: + %f = fadd float 1.0, 2.0 + %d = fadd double 1.0, 2.0 + + %sqrtd = call double @sqrt(double 1.0) + %sqrtf = call float @sqrtf(float 1.0) + %sqrtdv = call double @sqrt(double %d) ; check vars too + %sqrtfv = call float @sqrtf(float %f) + + %ceild = call double @ceil(double 1.0) + %ceilf = call float @ceilf(float 1.0) + + %floord = call double @floor(double 1.0) + %floorf = call float @floorf(float 1.0) + + ; these could be optimized in theory + + %mind = call double @min(double 1.0, double 1.0) + %minf = call float @fmin(float 1.0, float 1.0) + + %maxd = call double @max(double 1.0, double 1.0) + %maxf = call float @fmax(float 1.0, float 1.0) + + %absd = call double @abs(double 1.0) + %absf = call float @absf(float 1.0) + + ; sin is NOT optimizable with floats + + %sind = call double @sin(double 1.0) + %sinf = call float @sinf(float 1.0) + + call void @blackbox.float(float %sqrtf) + call void @blackbox.double(double %sqrtd) + + call void @blackbox.float(float %sinf) + 
call void @blackbox.double(double %sind) + + ret void +} + +declare double @sqrt(double %x) +declare float @sqrtf(float %x) + +declare double @ceil(double %x) +declare float @ceilf(float %x) + +declare double @floor(double %x) +declare float @floorf(float %x) + +declare double @min(double %x, double %y) +declare float @fmin(float %x, float %y) + +declare double @max(double %x, double %y) +declare float @fmax(float %x, float %y) + +declare double @abs(double %x) +declare float @absf(float %x) + +declare double @sin(double %x) +declare float @sinf(float %x) + +attributes #0 = { nounwind readnone } + diff --git a/test/CodeGen/JS/ffis.ll b/test/CodeGen/JS/ffis.ll new file mode 100644 index 000000000000..09f2fb536091 --- /dev/null +++ b/test/CodeGen/JS/ffis.ll @@ -0,0 +1,90 @@ +; RUN: llc < %s | FileCheck %s + +; Use proper types to ffi calls, no float32 + +target datalayout = "e-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-p:32:32:32-v128:32:128-n32-S128" +target triple = "asmjs-unknown-emscripten" + +declare void @blackbox.float(float %a) +declare void @blackbox.double(double %a) + +; CHECK: (+Math_sqrt(+1)); +; CHECK-NEXT: (+Math_sqrt(+1)); +; CHECK-NEXT: (+Math_sqrt((+$d))); +; CHECK-NEXT: (+Math_sqrt((+$f))); +; CHECK-NEXT: (+Math_ceil(+1)); +; CHECK-NEXT: (+Math_ceil(+1)); +; CHECK-NEXT: (+Math_floor(+1)); +; CHECK-NEXT: (+Math_floor(+1)); +; CHECK-NEXT: (+_min(+1,+1)); +; CHECK-NEXT: (+_fmin(+1,+1)); +; CHECK-NEXT: (+_max(+1,+1)); +; CHECK-NEXT: (+_fmax(+1,+1)); +; CHECK-NEXT: (+Math_abs(+1)); +; CHECK-NEXT: (+_absf(+1)); +; CHECK-NEXT: (+Math_sin(+1)); +; CHECK-NEXT: (+Math_sin(+1)); +define void @foo(i32 %x) { +entry: + %f = fadd float 1.0, 2.0 + %d = fadd double 1.0, 2.0 + + %sqrtd = call double @sqrt(double 1.0) + %sqrtf = call float @sqrtf(float 1.0) + %sqrtdv = call double @sqrt(double %d) ; check vars too + %sqrtfv = call float @sqrtf(float %f) + + %ceild = call double @ceil(double 1.0) + %ceilf = call float @ceilf(float 1.0) + + %floord = call double @floor(double 1.0) + %floorf = call float @floorf(float 1.0) + + ; these could be optimized in theory + + %mind = call double @min(double 1.0, double 1.0) + %minf = call float @fmin(float 1.0, float 1.0) + + %maxd = call double @max(double 1.0, double 1.0) + %maxf = call float @fmax(float 1.0, float 1.0) + + %absd = call double @abs(double 1.0) + %absf = call float @absf(float 1.0) + + ; sin is NOT optimizable with floats + + %sind = call double @sin(double 1.0) + %sinf = call float @sinf(float 1.0) + + call void @blackbox.float(float %sqrtf) + call void @blackbox.double(double %sqrtd) + + call void @blackbox.float(float %sinf) + call void @blackbox.double(double %sind) + + ret void +} + +declare double @sqrt(double %x) +declare float @sqrtf(float %x) + +declare double @ceil(double %x) +declare float @ceilf(float %x) + +declare double @floor(double %x) +declare float @floorf(float %x) + +declare double @min(double %x, double %y) +declare float @fmin(float %x, float %y) + +declare double @max(double %x, double %y) +declare float @fmax(float %x, float %y) + +declare double @abs(double %x) +declare float @absf(float %x) + +declare double @sin(double %x) +declare float @sinf(float %x) + +attributes #0 = { nounwind readnone } + diff --git a/test/CodeGen/JS/getelementptr.ll b/test/CodeGen/JS/getelementptr.ll new file mode 100644 index 000000000000..1b1e15cf3f83 --- /dev/null +++ b/test/CodeGen/JS/getelementptr.ll @@ -0,0 +1,47 @@ +; RUN: llc < %s | FileCheck %s + +; Test simple getelementptr codegen. 
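+; Note: the byte offsets expected below (588, the +4 field offset, and the <<1
+; element scaling) follow from the 32-bit pointers, 4-byte i32 and 2-byte i16
+; layout given by the datalayout string.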
+ +target datalayout = "e-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-p:32:32:32-v128:32:128-n32-S128" +target triple = "asmjs-unknown-emscripten" + +; Test that trailing indices are folded. + +; CHECK: function _getelementptr([[VAL_P:\$[a-z_]+]]) { +; CHECK: [[GEP:\$[a-z_]+]] = ((([[GEPINT:\$[a-z_]+]])) + 588|0); +define i32* @getelementptr([10 x [12 x i32] ]* %p) { + %t = getelementptr [10 x [12 x i32]], [10 x [12 x i32]]* %p, i32 1, i32 2, i32 3 + ret i32* %t +} + +%struct.A = type { i32, [34 x i16] } + +@global = global [72 x i8] zeroinitializer, align 4 + +; Fold globals into getelementptr addressing. + +; CHECK: function _fold_global($i) { +; CHECK: $add = (($i) + 34)|0; +; CHECK: $arrayidx = (12 + ($add<<1)|0); +; CHECK: $t0 = HEAP16[$arrayidx>>1]|0; +define i16 @fold_global(i32 %i) { + %add = add i32 %i, 34 + %arrayidx = getelementptr %struct.A, %struct.A* bitcast ([72 x i8]* @global to %struct.A*), i32 0, i32 1, i32 %add + %t0 = load volatile i16, i16* %arrayidx, align 2 + ret i16 %t0 +} + +; Don't reassociate the indices of a getelementptr, which would increase +; the chances of creating out-of-bounds intermediate values. + +; CHECK: function _no_reassociate($p,$i) { +; CHECK: $add = (($i) + 34)|0; +; CHECK: $arrayidx = (((($p)) + 4|0) + ($add<<1)|0); +; CHECK: $t0 = HEAP16[$arrayidx>>1]|0; +define i16 @no_reassociate(%struct.A* %p, i32 %i) { + %add = add i32 %i, 34 + %arrayidx = getelementptr %struct.A, %struct.A* %p, i32 0, i32 1, i32 %add + %t0 = load volatile i16, i16* %arrayidx, align 2 + ret i16 %t0 +} + diff --git a/test/CodeGen/JS/global-alias.ll b/test/CodeGen/JS/global-alias.ll new file mode 100644 index 000000000000..3049216196f4 --- /dev/null +++ b/test/CodeGen/JS/global-alias.ll @@ -0,0 +1,59 @@ +; RUN: llc < %s | FileCheck %s + +; Handle global aliases of various kinds. + +target datalayout = "e-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-p:32:32:32-v128:32:128-n32-S128" +target triple = "asmjs-unknown-emscripten" + +@pri = internal global [60 x i8] zeroinitializer +@pub = global [60 x i8] zeroinitializer + +@pri_int = internal alias [60 x i8], [60 x i8]* @pri +@pri_wea = weak alias [60 x i8], [60 x i8]* @pri +@pri_nor = alias [60 x i8], [60 x i8]* @pri + +@pub_int = internal alias [60 x i8], [60 x i8]* @pub +@pub_wea = weak alias [60 x i8], [60 x i8]* @pub +@pub_nor = alias [60 x i8], [60 x i8]* @pub + +; CHECK: test0( +; CHECK: return ([[PRI:[0-9]+]]|0); +define [60 x i8]* @test0() { + ret [60 x i8]* @pri +} +; CHECK: test1( +; CHECK: return ([[PRI]]|0); +define [60 x i8]* @test1() { + ret [60 x i8]* @pri_int +} +; CHECK: test2( +; CHECK: return ([[PRI]]|0); +define [60 x i8]* @test2() { + ret [60 x i8]* @pri_wea +} +; CHECK: test3( +; CHECK: return ([[PRI]]|0); +define [60 x i8]* @test3() { + ret [60 x i8]* @pri_nor +} + +; CHECK: test4( +; CHECK: return ([[PUB:[0-9]+]]|0); +define [60 x i8]* @test4() { + ret [60 x i8]* @pub +} +; CHECK: test5( +; CHECK: return ([[PUB]]|0); +define [60 x i8]* @test5() { + ret [60 x i8]* @pub_int +} +; CHECK: test6( +; CHECK: return ([[PUB]]|0); +define [60 x i8]* @test6() { + ret [60 x i8]* @pub_wea +} +; CHECK: test7( +; CHECK: return ([[PUB]]|0); +define [60 x i8]* @test7() { + ret [60 x i8]* @pub_nor +} diff --git a/test/CodeGen/JS/globals.ll b/test/CodeGen/JS/globals.ll new file mode 100644 index 000000000000..42e57457722f --- /dev/null +++ b/test/CodeGen/JS/globals.ll @@ -0,0 +1,45 @@ +; RUN: llc < %s | FileCheck %s + +; Test simple global variable codegen. 
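+; Globals live at fixed addresses in the flat HEAP, so the checks below index
+; HEAP32/HEAPF64/HEAP8 with constants, and the initial values are emitted as a
+; byte array handed to allocate() at Runtime.GLOBAL_BASE.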
+ +target datalayout = "e-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-p:32:32:32-v128:32:128-n32-S128" +target triple = "asmjs-unknown-emscripten" + +; CHECK: function _loads() { +; CHECK: [[VAR_t:\$[a-z]+]] = HEAP32[4]|0; +; CHECK: [[VAR_s:\$[a-z]+]] = +HEAPF64[1]; +; CHECK: [[VAR_u:\$[a-z]+]] = HEAP8[20]|0; +; CHECK: [[VAR_a:\$[a-z]+]] = (~~(([[VAR_s:\$[a-z]+]]))>>>0); +; CHECK: [[VAR_b:\$[a-z]+]] = [[VAR_u:\$[a-z]+]] << 24 >> 24; +; CHECK: [[VAR_c:\$[a-z]+]] = (([[VAR_t:\$[a-z]+]]) + ([[VAR_a:\$[a-z]+]]))|0; +; CHECK: [[VAR_d:\$[a-z]+]] = (([[VAR_c:\$[a-z]+]]) + ([[VAR_b:\$[a-z]+]]))|0; +; CHECK: return ([[VAR_d:\$[a-z]+]]|0); +define i32 @loads() { + %t = load i32, i32* @A + %s = load double, double* @B + %u = load i8, i8* @C + %a = fptoui double %s to i32 + %b = sext i8 %u to i32 + %c = add i32 %t, %a + %d = add i32 %c, %b + ret i32 %d +} + +; CHECK: function _stores([[VAR_m:\$[a-z]+]],[[VAR_n:\$[a-z]+]],[[VAR_o:\$[a-z]+]]) { +; CHECK: [[VAR_m:\$[a-z]+]] = [[VAR_m:\$[a-z]+]]|0; +; CHECK: [[VAR_n:\$[a-z]+]] = [[VAR_n:\$[a-z]+]]|0; +; CHECK: [[VAR_o:\$[a-z]+]] = +[[VAR_o:\$[a-z]+]]; +; CHECK: HEAP32[4] = [[VAR_n:\$[a-z]+]]; +; CHECK: HEAPF64[1] = [[VAR_o:\$[a-z]+]]; +; CHECK: HEAP8[20] = [[VAR_m:\$[a-z]+]]; +define void @stores(i8 %m, i32 %n, double %o) { + store i32 %n, i32* @A + store double %o, double* @B + store i8 %m, i8* @C + ret void +} + +; CHECK: allocate([205,204,204,204,204,76,55,64,133,26,0,0,2], "i8", ALLOC_NONE, Runtime.GLOBAL_BASE); +@A = global i32 6789 +@B = global double 23.3 +@C = global i8 2 diff --git a/test/CodeGen/JS/insertelement-chains.ll b/test/CodeGen/JS/insertelement-chains.ll new file mode 100644 index 000000000000..e28bfb49399d --- /dev/null +++ b/test/CodeGen/JS/insertelement-chains.ll @@ -0,0 +1,99 @@ +; RUN: llc -emscripten-precise-f32 < %s | FileCheck %s + +target datalayout = "e-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-p:32:32:32-v128:32:128-n32-S128" +target triple = "asmjs-unknown-emscripten" + +; Basic constructor. + +; CHECK: function _test0($x,$y,$z,$w) { +; CHECK: $d = SIMD_Float32x4($x, $y, $z, $w) +; CHECK: } +define <4 x float> @test0(float %x, float %y, float %z, float %w) { + %a = insertelement <4 x float> undef, float %x, i32 0 + %b = insertelement <4 x float> %a, float %y, i32 1 + %c = insertelement <4 x float> %b, float %z, i32 2 + %d = insertelement <4 x float> %c, float %w, i32 3 + ret <4 x float> %d +} + +; Same as test0 but elements inserted in a different order. + +; CHECK: function _test1($x,$y,$z,$w) { +; CHECK: $d = SIMD_Float32x4($x, $y, $z, $w) +; CHECK: } +define <4 x float> @test1(float %x, float %y, float %z, float %w) { + %a = insertelement <4 x float> undef, float %w, i32 3 + %b = insertelement <4 x float> %a, float %y, i32 1 + %c = insertelement <4 x float> %b, float %z, i32 2 + %d = insertelement <4 x float> %c, float %x, i32 0 + ret <4 x float> %d +} + +; Overwriting elements. 
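+; The last insertelement to each lane wins, so the whole chain still folds
+; into a single SIMD_Float32x4 constructor.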
+ +; CHECK: function _test2($x,$y,$z,$w) { +; CHECK: $h = SIMD_Float32x4($x, $y, $z, $w) +; CHECK: } +define <4 x float> @test2(float %x, float %y, float %z, float %w) { + %a = insertelement <4 x float> undef, float %z, i32 0 + %b = insertelement <4 x float> %a, float %x, i32 0 + %c = insertelement <4 x float> %b, float %w, i32 1 + %d = insertelement <4 x float> %c, float %y, i32 1 + %e = insertelement <4 x float> %d, float %x, i32 2 + %f = insertelement <4 x float> %e, float %z, i32 2 + %g = insertelement <4 x float> %f, float %y, i32 3 + %h = insertelement <4 x float> %g, float %w, i32 3 + ret <4 x float> %h +} + +; Basic splat testcase. + +; CHECK: function _test3($x) { +; CHECK: $d = SIMD_Float32x4_splat($x) +; CHECK: } +define <4 x float> @test3(float %x) { + %a = insertelement <4 x float> undef, float %x, i32 0 + %b = insertelement <4 x float> %a, float %x, i32 1 + %c = insertelement <4 x float> %b, float %x, i32 2 + %d = insertelement <4 x float> %c, float %x, i32 3 + ret <4 x float> %d +} + +; Same as test3 but elements inserted in a different order. + +; CHECK: function _test4($x) { +; CHECK: $d = SIMD_Float32x4_splat($x) +; CHECK: } +define <4 x float> @test4(float %x) { + %a = insertelement <4 x float> undef, float %x, i32 3 + %b = insertelement <4 x float> %a, float %x, i32 1 + %c = insertelement <4 x float> %b, float %x, i32 2 + %d = insertelement <4 x float> %c, float %x, i32 0 + ret <4 x float> %d +} + +; Insert chain. + +; CHECK: function _test5($x,$y,$z,$w) { +; CHECK: $f = SIMD_Float32x4_replaceLane(SIMD_Float32x4_replaceLane(SIMD_Float32x4_replaceLane(SIMD_Float32x4_splat(Math_fround(0)),0,$x),1,$y),2,$z) +; CHECK: } +define <4 x float> @test5(float %x, float %y, float %z, float %w) { + %a = insertelement <4 x float> undef, float %z, i32 0 + %b = insertelement <4 x float> %a, float %x, i32 0 + %c = insertelement <4 x float> %b, float %w, i32 1 + %d = insertelement <4 x float> %c, float %y, i32 1 + %e = insertelement <4 x float> %d, float %x, i32 2 + %f = insertelement <4 x float> %e, float %z, i32 2 + ret <4 x float> %f +} + +; Splat via insert+shuffle. + +; CHECK: function _test6($x) { +; CHECK: $b = SIMD_Float32x4_splat($x) +; CHECK: } +define <4 x float> @test6(float %x) { + %a = insertelement <4 x float> undef, float %x, i32 0 + %b = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> zeroinitializer + ret <4 x float> %b +} diff --git a/test/CodeGen/JS/invariant-intrinsics.ll b/test/CodeGen/JS/invariant-intrinsics.ll new file mode 100644 index 000000000000..121804868094 --- /dev/null +++ b/test/CodeGen/JS/invariant-intrinsics.ll @@ -0,0 +1,20 @@ +; RUN: llc < %s | not grep invariant + +; llc currently emits no code or bookkeeping for invariant intrinsic calls +; or declarations. 
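+; The RUN line pipes the output through `not grep invariant`, so the test
+; fails if the string "invariant" shows up anywhere in the emitted code.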
+ +target datalayout = "e-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-p:32:32:32-v128:32:128-n32-S128" +target triple = "asmjs-unknown-emscripten" + +declare void @bar(i8*) + +define void @foo() { + %p = alloca i8 + %i = call {}* @llvm.invariant.start.p0i8(i64 1, i8* %p) + call void @bar(i8* %p) + call void @llvm.invariant.end.p0i8({}* %i, i64 1, i8* %p) + ret void +} + +declare {}* @llvm.invariant.start.p0i8(i64, i8* nocapture) +declare void @llvm.invariant.end.p0i8({}*, i64, i8* nocapture) diff --git a/test/CodeGen/JS/lifetime-intrinsics.ll b/test/CodeGen/JS/lifetime-intrinsics.ll new file mode 100644 index 000000000000..46f613bfa3d9 --- /dev/null +++ b/test/CodeGen/JS/lifetime-intrinsics.ll @@ -0,0 +1,20 @@ +; RUN: llc < %s | not grep lifetime + +; llc currently emits no code or bookkeeping for lifetime intrinsic calls +; or declarations. + +target datalayout = "e-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-p:32:32:32-v128:32:128-n32-S128" +target triple = "asmjs-unknown-emscripten" + +declare void @bar(i8*) + +define void @foo() { + %p = alloca i8 + call void @llvm.lifetime.start(i64 1, i8* %p) + call void @bar(i8* %p) + call void @llvm.lifetime.end(i64 1, i8* %p) + ret void +} + +declare void @llvm.lifetime.start(i64, i8* nocapture) +declare void @llvm.lifetime.end(i64, i8* nocapture) diff --git a/test/CodeGen/JS/lit.local.cfg b/test/CodeGen/JS/lit.local.cfg new file mode 100644 index 000000000000..ee9b61f930fe --- /dev/null +++ b/test/CodeGen/JS/lit.local.cfg @@ -0,0 +1,6 @@ +config.suffixes = ['.ll', '.c', '.cpp'] + +targets = set(config.root.targets_to_build.split()) +if not 'JSBackend' in targets: + config.unsupported = True + diff --git a/test/CodeGen/JS/mem-intrinsics.ll b/test/CodeGen/JS/mem-intrinsics.ll new file mode 100644 index 000000000000..f0e21fc78d45 --- /dev/null +++ b/test/CodeGen/JS/mem-intrinsics.ll @@ -0,0 +1,56 @@ +; RUN: llc < %s | FileCheck %s + +; llc should emit small aligned memcpy and memset inline. 
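+; As exercised below, a 4-byte-aligned 32-byte copy or fill is expected to be
+; unrolled into straight-line HEAP32 accesses, a 64-byte one to become a
+; word-per-iteration loop, and a 64 KiB one to fall back to a call to the
+; memcpy/memset runtime helpers.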
+ +target datalayout = "e-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-p:32:32:32-v128:32:128-n32-S128" +target triple = "asmjs-unknown-emscripten" + +; CHECK: test_unrolled_memcpy +; CHECK: HEAP32[$d>>2]=HEAP32[$s>>2]|0;HEAP32[$d+4>>2]=HEAP32[$s+4>>2]|0;HEAP32[$d+8>>2]=HEAP32[$s+8>>2]|0;HEAP32[$d+12>>2]=HEAP32[$s+12>>2]|0;HEAP32[$d+16>>2]=HEAP32[$s+16>>2]|0;HEAP32[$d+20>>2]=HEAP32[$s+20>>2]|0;HEAP32[$d+24>>2]=HEAP32[$s+24>>2]|0;HEAP32[$d+28>>2]=HEAP32[$s+28>>2]|0; +define void @test_unrolled_memcpy(i8* %d, i8* %s) { + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %d, i8* %s, i32 32, i32 4, i1 false) + ret void +} + +; CHECK: test_loop_memcpy +; CHECK: dest=$d; src=$s; stop=dest+64|0; do { HEAP32[dest>>2]=HEAP32[src>>2]|0; dest=dest+4|0; src=src+4|0; } while ((dest|0) < (stop|0)) +define void @test_loop_memcpy(i8* %d, i8* %s) { + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %d, i8* %s, i32 64, i32 4, i1 false) + ret void +} + +; CHECK: test_call_memcpy +; CHECK: memcpy(($d|0),($s|0),65536) +define void @test_call_memcpy(i8* %d, i8* %s) { + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %d, i8* %s, i32 65536, i32 4, i1 false) + ret void +} + +; CHECK: test_unrolled_memset +; CHECK: HEAP32[$d>>2]=0|0;HEAP32[$d+4>>2]=0|0;HEAP32[$d+8>>2]=0|0;HEAP32[$d+12>>2]=0|0;HEAP32[$d+16>>2]=0|0;HEAP32[$d+20>>2]=0|0;HEAP32[$d+24>>2]=0|0;HEAP32[$d+28>>2]=0|0; +define void @test_unrolled_memset(i8* %d, i8* %s) { + call void @llvm.memset.p0i8.i32(i8* %d, i8 0, i32 32, i32 4, i1 false) + ret void +} + +; CHECK: test_loop_memset +; CHECK: dest=$d; stop=dest+64|0; do { HEAP32[dest>>2]=0|0; dest=dest+4|0; } while ((dest|0) < (stop|0)); +define void @test_loop_memset(i8* %d, i8* %s) { + call void @llvm.memset.p0i8.i32(i8* %d, i8 0, i32 64, i32 4, i1 false) + ret void +} + +; CHECK: test_call_memset +; CHECK: memset(($d|0),0,65536) +define void @test_call_memset(i8* %d, i8* %s) { + call void @llvm.memset.p0i8.i32(i8* %d, i8 0, i32 65536, i32 4, i1 false) + ret void +} + +; Also, don't emit declarations for the intrinsic functions. +; CHECK-NOT: p0i8 + +declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) #0 +declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) #0 + +attributes #0 = { nounwind } diff --git a/test/CodeGen/JS/phi.ll b/test/CodeGen/JS/phi.ll new file mode 100644 index 000000000000..cef01ce2f4d2 --- /dev/null +++ b/test/CodeGen/JS/phi.ll @@ -0,0 +1,25 @@ +; RUN: llc < %s | FileCheck %s + +; Phi lowering should check for dependency cycles, including looking through +; bitcasts, and emit extra copies as needed. 
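+; In the loop below, %j and %k swap values on every iteration (through no-op
+; bitcasts), so emitting the phi assignments naively would clobber one value
+; before the other is read; the $j$phi/$k$phi temporaries checked for here are
+; the extra copies that break the cycle.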
+ +target datalayout = "e-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-p:32:32:32-v128:32:128-n32-S128" +target triple = "asmjs-unknown-emscripten" + +; CHECK: while(1) { +; CHECK: $k$phi = $j;$j$phi = $k;$k = $k$phi;$j = $j$phi; +; CHECK: } +define void @foo(float* nocapture %p, i32* %j.init, i32* %k.init) { +entry: + br label %for.body + +for.body: + %j = phi i32* [ %j.init, %entry ], [ %k.cast, %more ] + %k = phi i32* [ %k.init, %entry ], [ %j.cast, %more ] + br label %more + +more: + %j.cast = bitcast i32* %j to i32* + %k.cast = bitcast i32* %k to i32* + br label %for.body +} diff --git a/test/CodeGen/JS/simd-fcmp.ll b/test/CodeGen/JS/simd-fcmp.ll new file mode 100644 index 000000000000..f6df20acd2ab --- /dev/null +++ b/test/CodeGen/JS/simd-fcmp.ll @@ -0,0 +1,37 @@ +; RUN: llc < %s | FileCheck %s + +target datalayout = "e-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-p:32:32:32-v128:32:128-n32-S128" +target triple = "asmjs-unknown-emscripten" + +; CHECK: function _test_ueq($a,$b) { +; CHECK: $a = SIMD_Float32x4_check($a); +; CHECK: $b = SIMD_Float32x4_check($b); +; CHECK: $c = SIMD_Int32x4_notEqual(SIMD_Bool32x4_or(SIMD_Bool32x4_or(SIMD_Int32x4_select(SIMD_Float32x4_notEqual($a,$a), SIMD_Int32x4_splat(-1), SIMD_Int32x4_splat(0)),SIMD_Int32x4_select(SIMD_Float32x4_notEqual($b,$b), SIMD_Int32x4_splat(-1), SIMD_Int32x4_splat(0)),SIMD_Int32x4_select(SIMD_Float32x4_equal($a,$b), SIMD_Int32x4_splat(-1), SIMD_Int32x4_splat(0))), SIMD_Int32x4_splat(0)); +; CHECK: return (SIMD_Bool32x4_check($c)); +; CHECK:} +define <4 x i1> @test_ueq(<4 x float> %a, <4 x float> %b) { + %c = fcmp ueq <4 x float> %a, %b + ret <4 x i1> %c +} + +; CHECK: function _test_ord($a,$b) { +; CHECK: $a = SIMD_Float32x4_check($a); +; CHECK: $b = SIMD_Float32x4_check($b); +; CHECK: $c = SIMD_Int32x4_notEqual(SIMD_Bool32x4_or(SIMD_Bool32x4_or(SIMD_Int32x4_select(SIMD_Float32x4_notEqual($a,$a), SIMD_Int32x4_splat(-1), SIMD_Int32x4_splat(0)),SIMD_Int32x4_select(SIMD_Float32x4_notEqual($b,$b), SIMD_Int32x4_splat(-1), SIMD_Int32x4_splat(0)),SIMD_Int32x4_select(SIMD_Float32x4_equal($a,$b), SIMD_Int32x4_splat(-1), SIMD_Int32x4_splat(0))), SIMD_Int32x4_splat(0)); +; CHECK: return (SIMD_Bool32x4_check($c)); +; CHECK:} +define <4 x i1> @test_ord(<4 x float> %a, <4 x float> %b) { + %c = fcmp ueq <4 x float> %a, %b + ret <4 x i1> %c +} + +; CHECK:function _test_uno($a,$b) { +; CHECK: $a = SIMD_Float32x4_check($a); +; CHECK: $b = SIMD_Float32x4_check($b); +; CHECK: $c = SIMD_Int32x4_notEqual(SIMD_Bool32x4_or(SIMD_Bool32x4_or(SIMD_Int32x4_select(SIMD_Float32x4_notEqual($a,$a), SIMD_Int32x4_splat(-1), SIMD_Int32x4_splat(0)),SIMD_Int32x4_select(SIMD_Float32x4_notEqual($b,$b), SIMD_Int32x4_splat(-1), SIMD_Int32x4_splat(0)),SIMD_Int32x4_select(SIMD_Float32x4_equal($a,$b), SIMD_Int32x4_splat(-1), SIMD_Int32x4_splat(0))), SIMD_Int32x4_splat(0)); +; CHECK: return (SIMD_Bool32x4_check($c)); +; CHECK:} +define <4 x i1> @test_uno(<4 x float> %a, <4 x float> %b) { + %c = fcmp ueq <4 x float> %a, %b + ret <4 x i1> %c +} diff --git a/test/CodeGen/JS/simd-loadstore.ll b/test/CodeGen/JS/simd-loadstore.ll new file mode 100644 index 000000000000..6955d7ec6a82 --- /dev/null +++ b/test/CodeGen/JS/simd-loadstore.ll @@ -0,0 +1,68 @@ +; RUN: llc < %s | FileCheck %s + +target datalayout = "e-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-p:32:32:32-v128:32:128-n32-S128" +target triple = "asmjs-unknown-emscripten" + +; CHECK: function _fx1($p) { +; CHECK: $p = $p|0; +; CHECK: var $s = 
SIMD_Float32x4(0,0,0,0), $t = SIMD_Float32x4(0,0,0,0), label = 0, sp = 0, temp_Float32x4_ptr = 0;
+; CHECK: $t = SIMD_Float32x4_load1(HEAPU8, $p);
+; CHECK: $s = SIMD_Float32x4_add($t,SIMD_Float32x4_splat(Math_fround(+0.5)));
+; CHECK: temp_Float32x4_ptr = $p;SIMD_Float32x4_store1(HEAPU8, temp_Float32x4_ptr, $s);
+; CHECK: return;
+; CHECK: }
+define void @fx1(i8* %p) {
+ %q = bitcast i8* %p to <1 x float>*
+ %t = load <1 x float>, <1 x float>* %q
+ %s = fadd <1 x float> %t, <float 0.5>
+ store <1 x float> %s, <1 x float>* %q
+ ret void
+}
+
+; CHECK: function _fx2($p) {
+; CHECK: $p = $p|0;
+; CHECK: $s = SIMD_Float32x4(0,0,0,0), $t = SIMD_Float32x4(0,0,0,0), label = 0, sp = 0, temp_Float32x4_ptr = 0;
+; CHECK: $t = SIMD_Float32x4_load2(HEAPU8, $p);
+; CHECK: $s = SIMD_Float32x4_add($t,SIMD_Float32x4(Math_fround(+3.5),Math_fround(+7.5),Math_fround(+0),Math_fround(+0)));
+; CHECK: temp_Float32x4_ptr = $p;SIMD_Float32x4_store2(HEAPU8, temp_Float32x4_ptr, $s);
+; CHECK: return;
+; CHECK: }
+define void @fx2(i8* %p) {
+ %q = bitcast i8* %p to <2 x float>*
+ %t = load <2 x float>, <2 x float>* %q
+ %s = fadd <2 x float> %t, <float 3.5, float 7.5>
+ store <2 x float> %s, <2 x float>* %q
+ ret void
+}
+
+; CHECK: function _fx3($p) {
+; CHECK: $p = $p|0;
+; CHECK: var $s = SIMD_Float32x4(0,0,0,0), $t = SIMD_Float32x4(0,0,0,0), label = 0, sp = 0, temp_Float32x4_ptr = 0;
+; CHECK: $t = SIMD_Float32x4_load3(HEAPU8, $p);
+; CHECK: $s = SIMD_Float32x4_add($t,SIMD_Float32x4(Math_fround(+1.5),Math_fround(+4.5),Math_fround(+6.5),Math_fround(+0)));
+; CHECK: temp_Float32x4_ptr = $p;SIMD_Float32x4_store3(HEAPU8, temp_Float32x4_ptr, $s);
+; CHECK: return;
+; CHECK: }
+define void @fx3(i8* %p) {
+ %q = bitcast i8* %p to <3 x float>*
+ %t = load <3 x float>, <3 x float>* %q
+ %s = fadd <3 x float> %t, <float 1.5, float 4.5, float 6.5>
+ store <3 x float> %s, <3 x float>* %q
+ ret void
+}
+
+; CHECK: function _fx4($p) {
+; CHECK: $p = $p|0;
+; CHECK: var $s = SIMD_Float32x4(0,0,0,0), $t = SIMD_Float32x4(0,0,0,0), label = 0, sp = 0, temp_Float32x4_ptr = 0;
+; CHECK: $t = SIMD_Float32x4_load(HEAPU8, $p);
+; CHECK: $s = SIMD_Float32x4_add($t,SIMD_Float32x4(Math_fround(+9.5),Math_fround(+5.5),Math_fround(+1.5),Math_fround(+-3.5)));
+; CHECK: temp_Float32x4_ptr = $p;SIMD_Float32x4_store(HEAPU8, temp_Float32x4_ptr, $s);
+; CHECK: return;
+; CHECK: }
+define void @fx4(i8* %p) {
+ %q = bitcast i8* %p to <4 x float>*
+ %t = load <4 x float>, <4 x float>* %q
+ %s = fadd <4 x float> %t, <float 9.5, float 5.5, float 1.5, float -3.5>
+ store <4 x float> %s, <4 x float>* %q
+ ret void
+}
diff --git a/test/CodeGen/JS/simd-misc.ll b/test/CodeGen/JS/simd-misc.ll
new file mode 100644
index 000000000000..99a47131c661
--- /dev/null
+++ b/test/CodeGen/JS/simd-misc.ll
@@ -0,0 +1,41 @@
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-p:32:32:32-v128:32:128-n32-S128"
+target triple = "asmjs-unknown-emscripten"
+
+declare <4 x float> @emscripten_float32x4_reciprocalApproximation(<4 x float>)
+declare <4 x float> @emscripten_float32x4_reciprocalSqrtApproximation(<4 x float>)
+
+; CHECK: function _test_rcp($a) {
+; CHECK: $a = SIMD_Float32x4_check($a);
+; CHECK: SIMD_Float32x4_reciprocalApproximation
+; CHECK:}
+define <4 x float> @test_rcp(<4 x float> %a) {
+ %c = call <4 x float> @emscripten_float32x4_reciprocalApproximation(<4 x float> %a)
+ ret <4 x float> %c
+}
+
+; CHECK: function _test_rsqrt($a) {
+; CHECK: $a = SIMD_Float32x4_check($a);
+; CHECK: SIMD_Float32x4_reciprocalSqrtApproximation
+; CHECK:}
+define <4 x float> @test_rsqrt(<4 x float> %a) {
+ %c = call <4 x 
float> @emscripten_float32x4_reciprocalSqrtApproximation(<4 x float> %a) + ret <4 x float> %c +} + +; CHECK: function _sext_vec($a) { +; CHECK: $b = SIMD_Int32x4_select($a, SIMD_Int32x4_splat(-1), SIMD_Int32x4_splat(0)); +; CHECK: } +define <4 x i32> @sext_vec(<4 x i1> %a) { + %b = sext <4 x i1> %a to <4 x i32> + ret <4 x i32> %b +} + +; CHECK: function _zext_vec($a) { +; CHECK: $b = SIMD_Int32x4_select($a, SIMD_Int32x4_splat(1), SIMD_Int32x4_splat(0)); +; CHECK: } +define <4 x i32> @zext_vec(<4 x i1> %a) { + %b = zext <4 x i1> %a to <4 x i32> + ret <4 x i32> %b +} diff --git a/test/CodeGen/JS/simd-select.ll b/test/CodeGen/JS/simd-select.ll new file mode 100644 index 000000000000..d3f133428792 --- /dev/null +++ b/test/CodeGen/JS/simd-select.ll @@ -0,0 +1,56 @@ +; RUN: llc < %s | FileCheck %s + +target datalayout = "e-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-p:32:32:32-v128:32:128-n32-S128" +target triple = "asmjs-unknown-emscripten" + +; CHECK: function _test0($a,$b,$cond) { +; CHECK: $a = SIMD_Int32x4_check($a); +; CHECK: $b = SIMD_Int32x4_check($b); +; CHECK: $cond = SIMD_Bool32x4_check($cond); +; CHECK: $cmp = SIMD_Int32x4_select($cond,$a,$b); +; CHECK: return (SIMD_Int32x4_check($cmp)); +; CHECK: } +define <4 x i32> @test0(<4 x i32> %a, <4 x i32> %b, <4 x i1> %cond) nounwind { +entry: + %cmp = select <4 x i1> %cond, <4 x i32> %a, <4 x i32> %b + ret <4 x i32> %cmp +} + +; CHECK: function _test1($a,$b,$cond) { +; CHECK: $a = SIMD_Float32x4_check($a); +; CHECK: $b = SIMD_Float32x4_check($b); +; CHECK: $cond = SIMD_Bool32x4_check($cond); +; CHECK: $cmp = SIMD_Float32x4_select($cond,$a,$b); +; CHECK: return (SIMD_Float32x4_check($cmp)); +; CHECK: } +define <4 x float> @test1(<4 x float> %a, <4 x float> %b, <4 x i1> %cond) nounwind { +entry: + %cmp = select <4 x i1> %cond, <4 x float> %a, <4 x float> %b + ret <4 x float> %cmp +} + +; CHECK: function _test2($a,$b,$cond) { +; CHECK: $a = SIMD_Int32x4_check($a); +; CHECK: $b = SIMD_Int32x4_check($b); +; CHECK: $cond = $cond|0; +; CHECK: $cmp = $cond ? $a : $b; +; CHECK: return (SIMD_Int32x4_check($cmp)); +; CHECK: } +define <4 x i32> @test2(<4 x i32> %a, <4 x i32> %b, i1 %cond) nounwind { +entry: + %cmp = select i1 %cond, <4 x i32> %a, <4 x i32> %b + ret <4 x i32> %cmp +} + +; CHECK: function _test3($a,$b,$cond) { +; CHECK: $a = SIMD_Float32x4_check($a); +; CHECK: $b = SIMD_Float32x4_check($b); +; CHECK: $cond = $cond|0; +; CHECK: $cmp = $cond ? 
$a : $b; +; CHECK: return (SIMD_Float32x4_check($cmp)); +; CHECK: } +define <4 x float> @test3(<4 x float> %a, <4 x float> %b, i1 %cond) nounwind { +entry: + %cmp = select i1 %cond, <4 x float> %a, <4 x float> %b + ret <4 x float> %cmp +} diff --git a/test/CodeGen/JS/simd-shift.ll b/test/CodeGen/JS/simd-shift.ll new file mode 100644 index 000000000000..7b1d9809c770 --- /dev/null +++ b/test/CodeGen/JS/simd-shift.ll @@ -0,0 +1,142 @@ +; RUN: llc < %s | FileCheck %s + +target datalayout = "e-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-p:32:32:32-v128:32:128-n32-S128" +target triple = "asmjs-unknown-emscripten" + +; CHECK: function _test0($a) { +; CHECK: $a = SIMD_Int32x4_check($a); +; CHECK: $shl = SIMD_Int32x4_shiftLeftByScalar($a, 3); +; CHECK: return (SIMD_Int32x4_check($shl)); +; CHECK: } +define <4 x i32> @test0(<4 x i32> %a) { +entry: + %shl = shl <4 x i32> %a, + ret <4 x i32> %shl +} + +; CHECK: function _test1($a,$b) { +; CHECK: $a = SIMD_Int32x4_check($a); +; CHECK: $b = $b|0; +; CHECK: SIMD_Int32x4_shiftLeftByScalar($a, $b); +; CHECK: return (SIMD_Int32x4_check($shl)); +; CHECK: } +define <4 x i32> @test1(<4 x i32> %a, i32 %b) { +entry: + %vecinit = insertelement <4 x i32> undef, i32 %b, i32 0 + %vecinit1 = insertelement <4 x i32> %vecinit, i32 %b, i32 1 + %vecinit2 = insertelement <4 x i32> %vecinit1, i32 %b, i32 2 + %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %b, i32 3 + %shl = shl <4 x i32> %a, %vecinit3 + ret <4 x i32> %shl +} + +; CHECK: function _test2($a,$b,$c) { +; CHECK: $a = SIMD_Int32x4_check($a); +; CHECK: $b = $b|0; +; CHECK: $c = $c|0; +; CHECK: var $shl = SIMD_Int32x4(0,0,0,0), $vecinit3 = SIMD_Int32x4(0,0,0,0), label = 0, sp = 0; +; CHECK: $vecinit3 = SIMD_Int32x4($b, $b, $c, $b); +; CHECK: $shl = SIMD_Int32x4((SIMD_Int32x4_extractLane($a,0)|0) << (SIMD_Int32x4_extractLane($vecinit3,0)|0)|0, (SIMD_Int32x4_extractLane($a,1)|0) << (SIMD_Int32x4_extractLane($vecinit3,1)|0)|0, (SIMD_Int32x4_extractLane($a,2)|0) << (SIMD_Int32x4_extractLane($vecinit3,2)|0)|0, (SIMD_Int32x4_extractLane($a,3)|0) << (SIMD_Int32x4_extractLane($vecinit3,3)|0)|0); +; CHECK: return (SIMD_Int32x4_check($shl)); +; CHECK: } +define <4 x i32> @test2(<4 x i32> %a, i32 %b, i32 %c) { +entry: + %vecinit = insertelement <4 x i32> undef, i32 %b, i32 0 + %vecinit1 = insertelement <4 x i32> %vecinit, i32 %b, i32 1 + %vecinit2 = insertelement <4 x i32> %vecinit1, i32 %c, i32 2 + %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %b, i32 3 + %shl = shl <4 x i32> %a, %vecinit3 + ret <4 x i32> %shl +} + +; CHECK: function _test3($a) { +; CHECK: $a = SIMD_Int32x4_check($a); +; CHECK: SIMD_Int32x4_shiftRightArithmeticByScalar($a, 3); +; CHECK: return (SIMD_Int32x4_check($shr)); +; CHECK: } +define <4 x i32> @test3(<4 x i32> %a) { +entry: + %shr = ashr <4 x i32> %a, + ret <4 x i32> %shr +} + +; CHECK: function _test4($a,$b) { +; CHECK: $a = SIMD_Int32x4_check($a); +; CHECK: $b = $b|0; +; CHECK: SIMD_Int32x4_shiftRightArithmeticByScalar($a, $b); +; CHECK: return (SIMD_Int32x4_check($shr)); +; CHECK: } +define <4 x i32> @test4(<4 x i32> %a, i32 %b) { +entry: + %vecinit = insertelement <4 x i32> undef, i32 %b, i32 0 + %vecinit1 = insertelement <4 x i32> %vecinit, i32 %b, i32 1 + %vecinit2 = insertelement <4 x i32> %vecinit1, i32 %b, i32 2 + %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %b, i32 3 + %shr = ashr <4 x i32> %a, %vecinit3 + ret <4 x i32> %shr +} + +; CHECK: function _test5($a,$b,$c) { +; CHECK: $a = SIMD_Int32x4_check($a); +; CHECK: $b = $b|0; +; CHECK: $c = $c|0; +; CHECK: 
var $shr = SIMD_Int32x4(0,0,0,0), $vecinit3 = SIMD_Int32x4(0,0,0,0), label = 0, sp = 0; +; CHECK: $vecinit3 = SIMD_Int32x4($b, $c, $b, $b); +; CHECK: $shr = SIMD_Int32x4((SIMD_Int32x4_extractLane($a,0)|0) >> (SIMD_Int32x4_extractLane($vecinit3,0)|0)|0, (SIMD_Int32x4_extractLane($a,1)|0) >> (SIMD_Int32x4_extractLane($vecinit3,1)|0)|0, (SIMD_Int32x4_extractLane($a,2)|0) >> (SIMD_Int32x4_extractLane($vecinit3,2)|0)|0, (SIMD_Int32x4_extractLane($a,3)|0) >> (SIMD_Int32x4_extractLane($vecinit3,3)|0)|0); +; CHECK: return (SIMD_Int32x4_check($shr)); +; CHECK: } +define <4 x i32> @test5(<4 x i32> %a, i32 %b, i32 %c) { +entry: + %vecinit = insertelement <4 x i32> undef, i32 %b, i32 0 + %vecinit1 = insertelement <4 x i32> %vecinit, i32 %c, i32 1 + %vecinit2 = insertelement <4 x i32> %vecinit1, i32 %b, i32 2 + %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %b, i32 3 + %shr = ashr <4 x i32> %a, %vecinit3 + ret <4 x i32> %shr +} + +; CHECK: function _test6($a) { +; CHECK: $a = SIMD_Int32x4_check($a); +; CHECK: SIMD_Int32x4_shiftRightLogicalByScalar($a, 3); +; CHECK: return (SIMD_Int32x4_check($lshr)); +; CHECK: } +define <4 x i32> @test6(<4 x i32> %a) { +entry: + %lshr = lshr <4 x i32> %a, + ret <4 x i32> %lshr +} + +; CHECK: function _test7($a,$b) { +; CHECK: $a = SIMD_Int32x4_check($a); +; CHECK: $b = $b|0; +; CHECK: $lshr = SIMD_Int32x4_shiftRightLogicalByScalar($a, $b); +; CHECK: return (SIMD_Int32x4_check($lshr)); +; CHECK: } +define <4 x i32> @test7(<4 x i32> %a, i32 %b) { +entry: + %vecinit = insertelement <4 x i32> undef, i32 %b, i32 0 + %vecinit1 = insertelement <4 x i32> %vecinit, i32 %b, i32 1 + %vecinit2 = insertelement <4 x i32> %vecinit1, i32 %b, i32 2 + %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %b, i32 3 + %lshr = lshr <4 x i32> %a, %vecinit3 + ret <4 x i32> %lshr +} + +; CHECK: function _test8($a,$b,$c) { +; CHECK: $a = SIMD_Int32x4_check($a); +; CHECK: $b = $b|0; +; CHECK: $c = $c|0; +; CHECK: var $lshr = SIMD_Int32x4(0,0,0,0), $vecinit3 = SIMD_Int32x4(0,0,0,0), label = 0, sp = 0; +; CHECK: $vecinit3 = SIMD_Int32x4($b, $b, $b, $c); +; CHECK: $lshr = SIMD_Int32x4((SIMD_Int32x4_extractLane($a,0)|0) >>> (SIMD_Int32x4_extractLane($vecinit3,0)|0)|0, (SIMD_Int32x4_extractLane($a,1)|0) >>> (SIMD_Int32x4_extractLane($vecinit3,1)|0)|0, (SIMD_Int32x4_extractLane($a,2)|0) >>> (SIMD_Int32x4_extractLane($vecinit3,2)|0)|0, (SIMD_Int32x4_extractLane($a,3)|0) >>> (SIMD_Int32x4_extractLane($vecinit3,3)|0)|0); +; CHECK: return (SIMD_Int32x4_check($lshr)); +; CHECK: } +define <4 x i32> @test8(<4 x i32> %a, i32 %b, i32 %c) { +entry: + %vecinit = insertelement <4 x i32> undef, i32 %b, i32 0 + %vecinit1 = insertelement <4 x i32> %vecinit, i32 %b, i32 1 + %vecinit2 = insertelement <4 x i32> %vecinit1, i32 %b, i32 2 + %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %c, i32 3 + %lshr = lshr <4 x i32> %a, %vecinit3 + ret <4 x i32> %lshr +} diff --git a/test/CodeGen/JS/simd-shuffle.ll b/test/CodeGen/JS/simd-shuffle.ll new file mode 100644 index 000000000000..88a7aa63bfb0 --- /dev/null +++ b/test/CodeGen/JS/simd-shuffle.ll @@ -0,0 +1,524 @@ +; RUN: llc < %s | FileCheck %s + +target datalayout = "e-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-p:32:32:32-v128:32:128-n32-S128" +target triple = "asmjs-unknown-emscripten" + +; CHECK: function _splat_int32x4($a,$b) { +; CHECK: $a = SIMD_Int32x4_check($a); +; CHECK: $b = SIMD_Int32x4_check($b); +; CHECK: var $sel = SIMD_Int32x4(0,0,0,0) +; CHECK: $sel = SIMD_Int32x4_swizzle($a, 0, 0, 0, 0); +; CHECK: return (SIMD_Int32x4_check($sel)); 
+; CHECK: } +define <4 x i32> @splat_int32x4(<4 x i32> %a, <4 x i32> %b) nounwind { +entry: + %sel = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + ret <4 x i32> %sel +} + +; CHECK: function _swizzle_int32x4($a,$b) { +; CHECK: $a = SIMD_Int32x4_check($a); +; CHECK: $b = SIMD_Int32x4_check($b); +; CHECK: var $sel = SIMD_Int32x4(0,0,0,0) +; CHECK: $sel = SIMD_Int32x4_swizzle($a, 0, 3, 1, 2); +; CHECK: return (SIMD_Int32x4_check($sel)); +; CHECK: } +define <4 x i32> @swizzle_int32x4(<4 x i32> %a, <4 x i32> %b) nounwind { +entry: + %sel = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + ret <4 x i32> %sel +} + +; CHECK: function _swizzlehi_int32x4($a,$b) { +; CHECK: $a = SIMD_Int32x4_check($a); +; CHECK: $b = SIMD_Int32x4_check($b); +; CHECK: var $sel = SIMD_Int32x4(0,0,0,0) +; CHECK: $sel = SIMD_Int32x4_swizzle($b, 2, 1, 3, 0); +; CHECK: return (SIMD_Int32x4_check($sel)); +; CHECK: } +define <4 x i32> @swizzlehi_int32x4(<4 x i32> %a, <4 x i32> %b) nounwind { +entry: + %sel = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + ret <4 x i32> %sel +} + +; CHECK: function _shuffleXY_float32x4to3($a,$b) { +; CHECK: $a = SIMD_Float32x4_check($a); +; CHECK: $b = SIMD_Float32x4_check($b); +; CHECK: var $sel = SIMD_Float32x4(0,0,0,0) +; CHECK: $sel = SIMD_Float32x4_shuffle($a, $b, 7, 0, 0, 0); +; CHECK: return (SIMD_Float32x4_check($sel)); +; CHECK: } +define <3 x float> @shuffleXY_float32x4to3(<4 x float> %a, <4 x float> %b) nounwind { +entry: + %sel = shufflevector <4 x float> %a, <4 x float> %b, <3 x i32> + ret <3 x float> %sel +} + +; CHECK: function _shuffle_int32x4($a,$b) { +; CHECK: $a = SIMD_Int32x4_check($a); +; CHECK: $b = SIMD_Int32x4_check($b); +; CHECK: var $sel = SIMD_Int32x4(0,0,0,0) +; CHECK: $sel = SIMD_Int32x4_shuffle($a, $b, 7, 0, 5, 3); +; CHECK: return (SIMD_Int32x4_check($sel)); +; CHECK: } +define <4 x i32> @shuffle_int32x4(<4 x i32> %a, <4 x i32> %b) nounwind { +entry: + %sel = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + ret <4 x i32> %sel +} + +; CHECK: function _shuffleXY_int32x4($a,$b) { +; CHECK: $a = SIMD_Int32x4_check($a); +; CHECK: $b = SIMD_Int32x4_check($b); +; CHECK: var $sel = SIMD_Int32x4(0,0,0,0) +; CHECK: $sel = SIMD_Int32x4_shuffle($a, $b, 7, 0, 0, 0); +; CHECK: return (SIMD_Int32x4_check($sel)); +; CHECK: } +define <4 x i32> @shuffleXY_int32x4(<4 x i32> %a, <4 x i32> %b) nounwind { +entry: + %sel = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> + ret <4 x i32> %sel +} + +; CHECK: function _splat_int32x3($a,$b) { +; CHECK: $a = SIMD_Int32x4_check($a); +; CHECK: $b = SIMD_Int32x4_check($b); +; CHECK: var $sel = SIMD_Int32x4(0,0,0,0) +; CHECK: $sel = SIMD_Int32x4_swizzle($a, 0, 0, 0, 0); +; CHECK: return (SIMD_Int32x4_check($sel)); +; CHECK: } +define <3 x i32> @splat_int32x3(<3 x i32> %a, <3 x i32> %b) nounwind { +entry: + %sel = shufflevector <3 x i32> %a, <3 x i32> %b, <3 x i32> + ret <3 x i32> %sel +} + +; CHECK: function _swizzle_int32x3($a,$b) { +; CHECK: $a = SIMD_Int32x4_check($a); +; CHECK: $b = SIMD_Int32x4_check($b); +; CHECK: var $sel = SIMD_Int32x4(0,0,0,0) +; CHECK: $sel = SIMD_Int32x4_swizzle($a, 0, 2, 1, 0); +; CHECK: return (SIMD_Int32x4_check($sel)); +; CHECK: } +define <3 x i32> @swizzle_int32x3(<3 x i32> %a, <3 x i32> %b) nounwind { +entry: + %sel = shufflevector <3 x i32> %a, <3 x i32> %b, <3 x i32> + ret <3 x i32> %sel +} + +; CHECK: function _swizzlehi_int32x3($a,$b) { +; CHECK: $a = SIMD_Int32x4_check($a); +; CHECK: $b = SIMD_Int32x4_check($b); +; CHECK: var $sel = SIMD_Int32x4(0,0,0,0) +; CHECK: $sel = 
SIMD_Int32x4_swizzle($b, 0, 2, 1, 0); +; CHECK: return (SIMD_Int32x4_check($sel)); +; CHECK: } +define <3 x i32> @swizzlehi_int32x3(<3 x i32> %a, <3 x i32> %b) nounwind { +entry: + %sel = shufflevector <3 x i32> %a, <3 x i32> %b, <3 x i32> + ret <3 x i32> %sel +} + +; CHECK: function _shuffle_int32x3($a,$b) { +; CHECK: $a = SIMD_Int32x4_check($a); +; CHECK: $b = SIMD_Int32x4_check($b); +; CHECK: var $sel = SIMD_Int32x4(0,0,0,0) +; CHECK: $sel = SIMD_Int32x4_shuffle($a, $b, 6, 0, 5, 0); +; CHECK: return (SIMD_Int32x4_check($sel)); +; CHECK: } +define <3 x i32> @shuffle_int32x3(<3 x i32> %a, <3 x i32> %b) nounwind { +entry: + %sel = shufflevector <3 x i32> %a, <3 x i32> %b, <3 x i32> + ret <3 x i32> %sel +} + +; CHECK: function _shuffleXY_int32x3($a,$b) { +; CHECK: $a = SIMD_Int32x4_check($a); +; CHECK: $b = SIMD_Int32x4_check($b); +; CHECK: var $sel = SIMD_Int32x4(0,0,0,0) +; CHECK: $sel = SIMD_Int32x4_shuffle($a, $b, 6, 0, 0, 0); +; CHECK: return (SIMD_Int32x4_check($sel)); +; CHECK: } +define <3 x i32> @shuffleXY_int32x3(<3 x i32> %a, <3 x i32> %b) nounwind { +entry: + %sel = shufflevector <3 x i32> %a, <3 x i32> %b, <3 x i32> + ret <3 x i32> %sel +} + +; CHECK: function _splat_int32x3to4($a,$b) { +; CHECK: $a = SIMD_Int32x4_check($a); +; CHECK: $b = SIMD_Int32x4_check($b); +; CHECK: var $sel = SIMD_Int32x4(0,0,0,0) +; CHECK: $sel = SIMD_Int32x4_swizzle($a, 0, 0, 0, 0); +; CHECK: return (SIMD_Int32x4_check($sel)); +; CHECK: } +define <4 x i32> @splat_int32x3to4(<3 x i32> %a, <3 x i32> %b) nounwind { +entry: + %sel = shufflevector <3 x i32> %a, <3 x i32> %b, <4 x i32> + ret <4 x i32> %sel +} + +; CHECK: function _swizzle_int32x3to4($a,$b) { +; CHECK: $a = SIMD_Int32x4_check($a); +; CHECK: $b = SIMD_Int32x4_check($b); +; CHECK: var $sel = SIMD_Int32x4(0,0,0,0) +; CHECK: $sel = SIMD_Int32x4_swizzle($a, 0, 2, 1, 2); +; CHECK: return (SIMD_Int32x4_check($sel)); +; CHECK: } +define <4 x i32> @swizzle_int32x3to4(<3 x i32> %a, <3 x i32> %b) nounwind { +entry: + %sel = shufflevector <3 x i32> %a, <3 x i32> %b, <4 x i32> + ret <4 x i32> %sel +} + +; CHECK: function _swizzlehi_int32x3to4($a,$b) { +; CHECK: $a = SIMD_Int32x4_check($a); +; CHECK: $b = SIMD_Int32x4_check($b); +; CHECK: var $sel = SIMD_Int32x4(0,0,0,0) +; CHECK: $sel = SIMD_Int32x4_swizzle($b, 2, 1, 0, 2); +; CHECK: return (SIMD_Int32x4_check($sel)); +; CHECK: } +define <4 x i32> @swizzlehi_int32x3to4(<3 x i32> %a, <3 x i32> %b) nounwind { +entry: + %sel = shufflevector <3 x i32> %a, <3 x i32> %b, <4 x i32> + ret <4 x i32> %sel +} + +; CHECK: function _shuffle_int32x3to4($a,$b) { +; CHECK: $a = SIMD_Int32x4_check($a); +; CHECK: $b = SIMD_Int32x4_check($b); +; CHECK: var $sel = SIMD_Int32x4(0,0,0,0) +; CHECK: $sel = SIMD_Int32x4_shuffle($a, $b, 6, 0, 5, 2); +; CHECK: return (SIMD_Int32x4_check($sel)); +; CHECK: } +define <4 x i32> @shuffle_int32x3to4(<3 x i32> %a, <3 x i32> %b) nounwind { +entry: + %sel = shufflevector <3 x i32> %a, <3 x i32> %b, <4 x i32> + ret <4 x i32> %sel +} + +; CHECK: function _shuffleXY_int32x3to4($a,$b) { +; CHECK: $a = SIMD_Int32x4_check($a); +; CHECK: $b = SIMD_Int32x4_check($b); +; CHECK: var $sel = SIMD_Int32x4(0,0,0,0) +; CHECK: $sel = SIMD_Int32x4_shuffle($a, $b, 6, 0, 0, 0); +; CHECK: return (SIMD_Int32x4_check($sel)); +; CHECK: } +define <4 x i32> @shuffleXY_int32x3to4(<3 x i32> %a, <3 x i32> %b) nounwind { +entry: + %sel = shufflevector <3 x i32> %a, <3 x i32> %b, <4 x i32> + ret <4 x i32> %sel +} + +; CHECK: function _splat_int32x4to3($a,$b) { +; CHECK: $a = SIMD_Int32x4_check($a); +; CHECK: $b = 
SIMD_Int32x4_check($b); +; CHECK: var $sel = SIMD_Int32x4(0,0,0,0) +; CHECK: $sel = SIMD_Int32x4_swizzle($a, 0, 0, 0, 0); +; CHECK: return (SIMD_Int32x4_check($sel)); +; CHECK: } +define <3 x i32> @splat_int32x4to3(<4 x i32> %a, <4 x i32> %b) nounwind { +entry: + %sel = shufflevector <4 x i32> %a, <4 x i32> %b, <3 x i32> + ret <3 x i32> %sel +} + +; CHECK: function _swizzle_int32x4to3($a,$b) { +; CHECK: $a = SIMD_Int32x4_check($a); +; CHECK: $b = SIMD_Int32x4_check($b); +; CHECK: var $sel = SIMD_Int32x4(0,0,0,0) +; CHECK: $sel = SIMD_Int32x4_swizzle($a, 0, 3, 1, 0); +; CHECK: return (SIMD_Int32x4_check($sel)); +; CHECK: } +define <3 x i32> @swizzle_int32x4to3(<4 x i32> %a, <4 x i32> %b) nounwind { +entry: + %sel = shufflevector <4 x i32> %a, <4 x i32> %b, <3 x i32> + ret <3 x i32> %sel +} + +; CHECK: function _swizzlehi_int32x4to3($a,$b) { +; CHECK: $a = SIMD_Int32x4_check($a); +; CHECK: $b = SIMD_Int32x4_check($b); +; CHECK: var $sel = SIMD_Int32x4(0,0,0,0) +; CHECK: $sel = SIMD_Int32x4_swizzle($b, 2, 1, 3, 0); +; CHECK: return (SIMD_Int32x4_check($sel)); +; CHECK: } +define <3 x i32> @swizzlehi_int32x4to3(<4 x i32> %a, <4 x i32> %b) nounwind { +entry: + %sel = shufflevector <4 x i32> %a, <4 x i32> %b, <3 x i32> + ret <3 x i32> %sel +} + +; CHECK: function _shuffle_int32x4to3($a,$b) { +; CHECK: $a = SIMD_Int32x4_check($a); +; CHECK: $b = SIMD_Int32x4_check($b); +; CHECK: var $sel = SIMD_Int32x4(0,0,0,0) +; CHECK: $sel = SIMD_Int32x4_shuffle($a, $b, 7, 0, 5, 0); +; CHECK: return (SIMD_Int32x4_check($sel)); +; CHECK: } +define <3 x i32> @shuffle_int32x4to3(<4 x i32> %a, <4 x i32> %b) nounwind { +entry: + %sel = shufflevector <4 x i32> %a, <4 x i32> %b, <3 x i32> + ret <3 x i32> %sel +} + +; CHECK: function _shuffleXY_int32x4to3($a,$b) { +; CHECK: $a = SIMD_Int32x4_check($a); +; CHECK: $b = SIMD_Int32x4_check($b); +; CHECK: var $sel = SIMD_Int32x4(0,0,0,0) +; CHECK: $sel = SIMD_Int32x4_shuffle($a, $b, 7, 0, 0, 0); +; CHECK: return (SIMD_Int32x4_check($sel)); +; CHECK: } +define <3 x i32> @shuffleXY_int32x4to3(<4 x i32> %a, <4 x i32> %b) nounwind { +entry: + %sel = shufflevector <4 x i32> %a, <4 x i32> %b, <3 x i32> + ret <3 x i32> %sel +} + +; CHECK: function _splat_float32x4($a,$b) { +; CHECK: $a = SIMD_Float32x4_check($a); +; CHECK: $b = SIMD_Float32x4_check($b); +; CHECK: var $sel = SIMD_Float32x4(0,0,0,0) +; CHECK: $sel = SIMD_Float32x4_swizzle($a, 0, 0, 0, 0); +; CHECK: return (SIMD_Float32x4_check($sel)); +; CHECK: } +define <4 x float> @splat_float32x4(<4 x float> %a, <4 x float> %b) nounwind { +entry: + %sel = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + ret <4 x float> %sel +} + +; CHECK: function _swizzle_float32x4($a,$b) { +; CHECK: $a = SIMD_Float32x4_check($a); +; CHECK: $b = SIMD_Float32x4_check($b); +; CHECK: var $sel = SIMD_Float32x4(0,0,0,0) +; CHECK: $sel = SIMD_Float32x4_swizzle($a, 0, 3, 1, 2); +; CHECK: return (SIMD_Float32x4_check($sel)); +; CHECK: } +define <4 x float> @swizzle_float32x4(<4 x float> %a, <4 x float> %b) nounwind { +entry: + %sel = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + ret <4 x float> %sel +} + +; CHECK: function _swizzlehi_float32x4($a,$b) { +; CHECK: $a = SIMD_Float32x4_check($a); +; CHECK: $b = SIMD_Float32x4_check($b); +; CHECK: var $sel = SIMD_Float32x4(0,0,0,0) +; CHECK: $sel = SIMD_Float32x4_swizzle($b, 2, 1, 3, 0); +; CHECK: return (SIMD_Float32x4_check($sel)); +; CHECK: } +define <4 x float> @swizzlehi_float32x4(<4 x float> %a, <4 x float> %b) nounwind { +entry: + %sel = shufflevector <4 x float> %a, <4 x float> 
%b, <4 x i32> + ret <4 x float> %sel +} + +; CHECK: function _shuffle_float32x4($a,$b) { +; CHECK: $a = SIMD_Float32x4_check($a); +; CHECK: $b = SIMD_Float32x4_check($b); +; CHECK: var $sel = SIMD_Float32x4(0,0,0,0) +; CHECK: $sel = SIMD_Float32x4_shuffle($a, $b, 7, 0, 5, 3); +; CHECK: return (SIMD_Float32x4_check($sel)); +; CHECK: } +define <4 x float> @shuffle_float32x4(<4 x float> %a, <4 x float> %b) nounwind { +entry: + %sel = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + ret <4 x float> %sel +} + +; CHECK: function _shuffleXY_float32x4($a,$b) { +; CHECK: $a = SIMD_Float32x4_check($a); +; CHECK: $b = SIMD_Float32x4_check($b); +; CHECK: var $sel = SIMD_Float32x4(0,0,0,0) +; CHECK: $sel = SIMD_Float32x4_shuffle($a, $b, 7, 0, 0, 0); +; CHECK: return (SIMD_Float32x4_check($sel)); +; CHECK: } +define <4 x float> @shuffleXY_float32x4(<4 x float> %a, <4 x float> %b) nounwind { +entry: + %sel = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + ret <4 x float> %sel +} + +; CHECK: function _splat_float32x3($a,$b) { +; CHECK: $a = SIMD_Float32x4_check($a); +; CHECK: $b = SIMD_Float32x4_check($b); +; CHECK: var $sel = SIMD_Float32x4(0,0,0,0) +; CHECK: $sel = SIMD_Float32x4_swizzle($a, 0, 0, 0, 0); +; CHECK: return (SIMD_Float32x4_check($sel)); +; CHECK: } +define <3 x float> @splat_float32x3(<3 x float> %a, <3 x float> %b) nounwind { +entry: + %sel = shufflevector <3 x float> %a, <3 x float> %b, <3 x i32> + ret <3 x float> %sel +} + +; CHECK: function _swizzle_float32x3($a,$b) { +; CHECK: $a = SIMD_Float32x4_check($a); +; CHECK: $b = SIMD_Float32x4_check($b); +; CHECK: var $sel = SIMD_Float32x4(0,0,0,0) +; CHECK: $sel = SIMD_Float32x4_swizzle($a, 0, 2, 1, 0); +; CHECK: return (SIMD_Float32x4_check($sel)); +; CHECK: } +define <3 x float> @swizzle_float32x3(<3 x float> %a, <3 x float> %b) nounwind { +entry: + %sel = shufflevector <3 x float> %a, <3 x float> %b, <3 x i32> + ret <3 x float> %sel +} + +; CHECK: function _swizzlehi_float32x3($a,$b) { +; CHECK: $a = SIMD_Float32x4_check($a); +; CHECK: $b = SIMD_Float32x4_check($b); +; CHECK: var $sel = SIMD_Float32x4(0,0,0,0) +; CHECK: $sel = SIMD_Float32x4_swizzle($b, 0, 2, 1, 0); +; CHECK: return (SIMD_Float32x4_check($sel)); +; CHECK: } +define <3 x float> @swizzlehi_float32x3(<3 x float> %a, <3 x float> %b) nounwind { +entry: + %sel = shufflevector <3 x float> %a, <3 x float> %b, <3 x i32> + ret <3 x float> %sel +} + +; CHECK: function _shuffle_float32x3($a,$b) { +; CHECK: $a = SIMD_Float32x4_check($a); +; CHECK: $b = SIMD_Float32x4_check($b); +; CHECK: var $sel = SIMD_Float32x4(0,0,0,0) +; CHECK: $sel = SIMD_Float32x4_shuffle($a, $b, 6, 0, 5, 0); +; CHECK: return (SIMD_Float32x4_check($sel)); +; CHECK: } +define <3 x float> @shuffle_float32x3(<3 x float> %a, <3 x float> %b) nounwind { +entry: + %sel = shufflevector <3 x float> %a, <3 x float> %b, <3 x i32> + ret <3 x float> %sel +} + +; CHECK: function _shuffleXY_float32x3($a,$b) { +; CHECK: $a = SIMD_Float32x4_check($a); +; CHECK: $b = SIMD_Float32x4_check($b); +; CHECK: var $sel = SIMD_Float32x4(0,0,0,0) +; CHECK: $sel = SIMD_Float32x4_shuffle($a, $b, 6, 0, 0, 0); +; CHECK: return (SIMD_Float32x4_check($sel)); +; CHECK: } +define <3 x float> @shuffleXY_float32x3(<3 x float> %a, <3 x float> %b) nounwind { +entry: + %sel = shufflevector <3 x float> %a, <3 x float> %b, <3 x i32> + ret <3 x float> %sel +} + +; CHECK: function _splat_float32x3to4($a,$b) { +; CHECK: $a = SIMD_Float32x4_check($a); +; CHECK: $b = SIMD_Float32x4_check($b); +; CHECK: var $sel = SIMD_Float32x4(0,0,0,0) +; 
CHECK: $sel = SIMD_Float32x4_swizzle($a, 0, 0, 0, 0); +; CHECK: return (SIMD_Float32x4_check($sel)); +; CHECK: } +define <4 x float> @splat_float32x3to4(<3 x float> %a, <3 x float> %b) nounwind { +entry: + %sel = shufflevector <3 x float> %a, <3 x float> %b, <4 x i32> + ret <4 x float> %sel +} + +; CHECK: function _swizzle_float32x3to4($a,$b) { +; CHECK: $a = SIMD_Float32x4_check($a); +; CHECK: $b = SIMD_Float32x4_check($b); +; CHECK: var $sel = SIMD_Float32x4(0,0,0,0) +; CHECK: $sel = SIMD_Float32x4_swizzle($a, 0, 2, 1, 2); +; CHECK: return (SIMD_Float32x4_check($sel)); +; CHECK: } +define <4 x float> @swizzle_float32x3to4(<3 x float> %a, <3 x float> %b) nounwind { +entry: + %sel = shufflevector <3 x float> %a, <3 x float> %b, <4 x i32> + ret <4 x float> %sel +} + +; CHECK: function _swizzlehi_float32x3to4($a,$b) { +; CHECK: $a = SIMD_Float32x4_check($a); +; CHECK: $b = SIMD_Float32x4_check($b); +; CHECK: var $sel = SIMD_Float32x4(0,0,0,0) +; CHECK: $sel = SIMD_Float32x4_swizzle($b, 2, 1, 0, 2); +; CHECK: return (SIMD_Float32x4_check($sel)); +; CHECK: } +define <4 x float> @swizzlehi_float32x3to4(<3 x float> %a, <3 x float> %b) nounwind { +entry: + %sel = shufflevector <3 x float> %a, <3 x float> %b, <4 x i32> + ret <4 x float> %sel +} + +; CHECK: function _shuffle_float32x3to4($a,$b) { +; CHECK: $a = SIMD_Float32x4_check($a); +; CHECK: $b = SIMD_Float32x4_check($b); +; CHECK: var $sel = SIMD_Float32x4(0,0,0,0) +; CHECK: $sel = SIMD_Float32x4_shuffle($a, $b, 6, 0, 5, 2); +; CHECK: return (SIMD_Float32x4_check($sel)); +; CHECK: } +define <4 x float> @shuffle_float32x3to4(<3 x float> %a, <3 x float> %b) nounwind { +entry: + %sel = shufflevector <3 x float> %a, <3 x float> %b, <4 x i32> + ret <4 x float> %sel +} + +; CHECK: function _shuffleXY_float32x3to4($a,$b) { +; CHECK: $a = SIMD_Float32x4_check($a); +; CHECK: $b = SIMD_Float32x4_check($b); +; CHECK: var $sel = SIMD_Float32x4(0,0,0,0) +; CHECK: $sel = SIMD_Float32x4_shuffle($a, $b, 6, 0, 0, 0); +; CHECK: return (SIMD_Float32x4_check($sel)); +; CHECK: } +define <4 x float> @shuffleXY_float32x3to4(<3 x float> %a, <3 x float> %b) nounwind { +entry: + %sel = shufflevector <3 x float> %a, <3 x float> %b, <4 x i32> + ret <4 x float> %sel +} + +; CHECK: function _splat_float32x4to3($a,$b) { +; CHECK: $a = SIMD_Float32x4_check($a); +; CHECK: $b = SIMD_Float32x4_check($b); +; CHECK: var $sel = SIMD_Float32x4(0,0,0,0) +; CHECK: $sel = SIMD_Float32x4_swizzle($a, 0, 0, 0, 0); +; CHECK: return (SIMD_Float32x4_check($sel)); +; CHECK: } +define <3 x float> @splat_float32x4to3(<4 x float> %a, <4 x float> %b) nounwind { +entry: + %sel = shufflevector <4 x float> %a, <4 x float> %b, <3 x i32> + ret <3 x float> %sel +} + +; CHECK: function _swizzle_float32x4to3($a,$b) { +; CHECK: $a = SIMD_Float32x4_check($a); +; CHECK: $b = SIMD_Float32x4_check($b); +; CHECK: var $sel = SIMD_Float32x4(0,0,0,0) +; CHECK: $sel = SIMD_Float32x4_swizzle($a, 0, 3, 1, 0); +; CHECK: return (SIMD_Float32x4_check($sel)); +; CHECK: } +define <3 x float> @swizzle_float32x4to3(<4 x float> %a, <4 x float> %b) nounwind { +entry: + %sel = shufflevector <4 x float> %a, <4 x float> %b, <3 x i32> + ret <3 x float> %sel +} + +; CHECK: function _swizzlehi_float32x4to3($a,$b) { +; CHECK: $a = SIMD_Float32x4_check($a); +; CHECK: $b = SIMD_Float32x4_check($b); +; CHECK: var $sel = SIMD_Float32x4(0,0,0,0) +; CHECK: $sel = SIMD_Float32x4_swizzle($b, 2, 1, 3, 0); +; CHECK: return (SIMD_Float32x4_check($sel)); +; CHECK: } +define <3 x float> @swizzlehi_float32x4to3(<4 x float> %a, <4 x float> %b) 
nounwind { +entry: + %sel = shufflevector <4 x float> %a, <4 x float> %b, <3 x i32> + ret <3 x float> %sel +} + +; CHECK: function _shuffle_float32x4to3($a,$b) { +; CHECK: $a = SIMD_Float32x4_check($a); +; CHECK: $b = SIMD_Float32x4_check($b); +; CHECK: var $sel = SIMD_Float32x4(0,0,0,0) +; CHECK: $sel = SIMD_Float32x4_shuffle($a, $b, 7, 0, 5, 0); +; CHECK: return (SIMD_Float32x4_check($sel)); +; CHECK: } +define <3 x float> @shuffle_float32x4to3(<4 x float> %a, <4 x float> %b) nounwind { +entry: + %sel = shufflevector <4 x float> %a, <4 x float> %b, <3 x i32> + ret <3 x float> %sel +} diff --git a/test/CodeGen/JS/splat-precise-f32.ll b/test/CodeGen/JS/splat-precise-f32.ll new file mode 100644 index 000000000000..d3f5e08ff186 --- /dev/null +++ b/test/CodeGen/JS/splat-precise-f32.ll @@ -0,0 +1,46 @@ +; RUN: llc -emscripten-precise-f32=false < %s | FileCheck %s +; RUN: llc -emscripten-precise-f32=true < %s | FileCheck --check-prefix=CHECK-PRECISE_F32 %s + +target datalayout = "e-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-p:32:32:32-v128:32:128-n32-S128" +target triple = "asmjs-unknown-emscripten" + +; SIMD_Float32x4_splat needs a float32 input even if we're not in precise-f32 mode. + +; CHECK: test( +; CHECK: $d = SIMD_Float32x4_splat(Math_fround($f)); +; CHECK-PRECISE_F32: test( +; CHECK-PRECISE_F32: $f = Math_fround($f); +; CHECK-PRECISE_F32: $d = SIMD_Float32x4_splat($f); +define <4 x float> @test(float %f) { + %a = insertelement <4 x float> undef, float %f, i32 0 + %b = insertelement <4 x float> %a, float %f, i32 1 + %c = insertelement <4 x float> %b, float %f, i32 2 + %d = insertelement <4 x float> %c, float %f, i32 3 + ret <4 x float> %d +} + +; CHECK: test_insert( +; CHECK: $a = SIMD_Float32x4_replaceLane($v,0,Math_fround($g)); +; CHECK-PRECISE_F32: test_insert( +; CHECK-PRECISE_F32: $g = Math_fround($g); +; CHECK-PRECISE_F32: $a = SIMD_Float32x4_replaceLane($v,0,$g); +define <4 x float> @test_insert(<4 x float> %v, float %g) { + %a = insertelement <4 x float> %v, float %g, i32 0 + ret <4 x float> %a +} + +; CHECK: test_ctor( +; CHECK: $d = SIMD_Float32x4(Math_fround($x), Math_fround($y), Math_fround($z), Math_fround($w)); +; CHECK-PRECISE_F32: test_ctor( +; CHECK-PRECISE_F32: $x = Math_fround($x); +; CHECK-PRECISE_F32: $y = Math_fround($y); +; CHECK-PRECISE_F32: $z = Math_fround($z); +; CHECK-PRECISE_F32: $w = Math_fround($w); +; CHECK-PRECISE_F32: $d = SIMD_Float32x4($x, $y, $z, $w); +define <4 x float> @test_ctor(<4 x float> %v, float %x, float %y, float %z, float %w) { + %a = insertelement <4 x float> undef, float %x, i32 0 + %b = insertelement <4 x float> %a, float %y, i32 1 + %c = insertelement <4 x float> %b, float %z, i32 2 + %d = insertelement <4 x float> %c, float %w, i32 3 + ret <4 x float> %d +} diff --git a/test/CodeGen/JS/unrolled-simd.ll b/test/CodeGen/JS/unrolled-simd.ll new file mode 100644 index 000000000000..1d169a4a76ab --- /dev/null +++ b/test/CodeGen/JS/unrolled-simd.ll @@ -0,0 +1,28 @@ +; RUN: llc < %s | FileCheck %s + +target datalayout = "e-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-p:32:32:32-v128:32:128-n32-S128" +target triple = "asmjs-unknown-emscripten" + +; CHECK: SIMD_Int32x4((SIMD_Int32x4_extractLane($a,0)|0) / (SIMD_Int32x4_extractLane($b,0)|0)|0, (SIMD_Int32x4_extractLane($a,1)|0) / (SIMD_Int32x4_extractLane($b,1)|0)|0, (SIMD_Int32x4_extractLane($a,2)|0) / (SIMD_Int32x4_extractLane($b,2)|0)|0, (SIMD_Int32x4_extractLane($a,3)|0) / (SIMD_Int32x4_extractLane($b,3)|0)|0); +define <4 x i32> @signed_div(<4 x i32> 
%a, <4 x i32> %b) { + %c = sdiv <4 x i32> %a, %b + ret <4 x i32> %c +} + +; CHECK: SIMD_Int32x4((SIMD_Int32x4_extractLane($a,0)>>>0) / (SIMD_Int32x4_extractLane($b,0)>>>0)>>>0, (SIMD_Int32x4_extractLane($a,1)>>>0) / (SIMD_Int32x4_extractLane($b,1)>>>0)>>>0, (SIMD_Int32x4_extractLane($a,2)>>>0) / (SIMD_Int32x4_extractLane($b,2)>>>0)>>>0, (SIMD_Int32x4_extractLane($a,3)>>>0) / (SIMD_Int32x4_extractLane($b,3)>>>0)>>>0); +define <4 x i32> @un_div(<4 x i32> %a, <4 x i32> %b) { + %c = udiv <4 x i32> %a, %b + ret <4 x i32> %c +} + +; CHECK: SIMD_Int32x4((SIMD_Int32x4_extractLane($a,0)|0) % (SIMD_Int32x4_extractLane($b,0)|0)|0, (SIMD_Int32x4_extractLane($a,1)|0) % (SIMD_Int32x4_extractLane($b,1)|0)|0, (SIMD_Int32x4_extractLane($a,2)|0) % (SIMD_Int32x4_extractLane($b,2)|0)|0, (SIMD_Int32x4_extractLane($a,3)|0) % (SIMD_Int32x4_extractLane($b,3)|0)|0); +define <4 x i32> @signed_rem(<4 x i32> %a, <4 x i32> %b) { + %c = srem <4 x i32> %a, %b + ret <4 x i32> %c +} + +; CHECK: SIMD_Int32x4((SIMD_Int32x4_extractLane($a,0)>>>0) % (SIMD_Int32x4_extractLane($b,0)>>>0)>>>0, (SIMD_Int32x4_extractLane($a,1)>>>0) % (SIMD_Int32x4_extractLane($b,1)>>>0)>>>0, (SIMD_Int32x4_extractLane($a,2)>>>0) % (SIMD_Int32x4_extractLane($b,2)>>>0)>>>0, (SIMD_Int32x4_extractLane($a,3)>>>0) % (SIMD_Int32x4_extractLane($b,3)>>>0)>>>0); +define <4 x i32> @un_rem(<4 x i32> %a, <4 x i32> %b) { + %c = urem <4 x i32> %a, %b + ret <4 x i32> %c +} diff --git a/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll index 1db0256e8e38..2e115df096df 100644 --- a/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll +++ b/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll @@ -388,7 +388,7 @@ define void @test_x86_sse2_storeu_dq(i8* %a0, <16 x i8> %a1) { ; CHECK-LABEL: test_x86_sse2_storeu_dq: ; CHECK: ## BB#0: ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: vpaddb LCPI34_0, %xmm0, %xmm0 +; CHECK-NEXT: vpaddb LCPI32_0, %xmm0, %xmm0 ; CHECK-NEXT: vmovdqu %xmm0, (%eax) ; CHECK-NEXT: retl %a2 = add <16 x i8> %a1, diff --git a/test/Transforms/GVN/hoist-pr28606.ll b/test/Transforms/GVN/hoist-pr28606.ll new file mode 100644 index 000000000000..2c588283ea91 --- /dev/null +++ b/test/Transforms/GVN/hoist-pr28606.ll @@ -0,0 +1,50 @@ +; RUN: opt -gvn-hoist -S < %s | FileCheck %s + +target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32" +target triple = "i686-pc-windows-msvc18.0.0" + +%struct.S = type { i8* } + +declare void @f(<{ %struct.S }>* inalloca) + + +; Check that we don't clone the %x alloca and insert it in the live range of +; %argmem, which would break the inalloca contract. +; +; CHECK-LABEL: @test +; CHECK: alloca i8 +; CHECK: stacksave +; CHECK: alloca inalloca +; CHECK-NOT: alloca i8 + +; Check that store instructions are hoisted. 
+; CHECK: store i8 +; CHECK-NOT: store i8 +; CHECK: stackrestore + +define void @test(i1 %b) { +entry: + %x = alloca i8 + %inalloca.save = call i8* @llvm.stacksave() + %argmem = alloca inalloca <{ %struct.S }>, align 4 + %0 = getelementptr inbounds <{ %struct.S }>, <{ %struct.S }>* %argmem, i32 0, i32 0 + br i1 %b, label %true, label %false + +true: + %p = getelementptr inbounds %struct.S, %struct.S* %0, i32 0, i32 0 + store i8* %x, i8** %p, align 4 + br label %exit + +false: + %p2 = getelementptr inbounds %struct.S, %struct.S* %0, i32 0, i32 0 + store i8* %x, i8** %p2, align 4 + br label %exit + +exit: + call void @f(<{ %struct.S }>* inalloca %argmem) + call void @llvm.stackrestore(i8* %inalloca.save) + ret void +} + +declare i8* @llvm.stacksave() +declare void @llvm.stackrestore(i8*) diff --git a/test/Transforms/GVN/pr28626.ll b/test/Transforms/GVN/pr28626.ll new file mode 100644 index 000000000000..7930e6948253 --- /dev/null +++ b/test/Transforms/GVN/pr28626.ll @@ -0,0 +1,42 @@ +; RUN: opt -S -gvn-hoist < %s | FileCheck %s +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define void @test1(i1 %a, i1** %d) { +entry: + %0 = load i1*, i1** %d, align 8 + br i1 %a, label %if.then, label %if.else + +if.then: ; preds = %entry + br label %if.end + +if.else: ; preds = %entry + br label %if.end + +if.end: ; preds = %if.else, %if.then + %c.0 = phi i1 [ 1, %if.then ], [ 0, %if.else ] + br i1 %c.0, label %if.then2, label %if.else3 + +if.then2: ; preds = %if.end + %rc = getelementptr inbounds i1, i1* %0, i64 0 + store i1 %c.0, i1* %rc, align 4 + br label %if.end6 + +if.else3: ; preds = %if.end + %rc5 = getelementptr inbounds i1, i1* %0, i64 0 + store i1 %c.0, i1* %rc5, align 4 + br label %if.end6 + +if.end6: ; preds = %if.else3, %if.then2 + ret void +} + +; CHECK-LABEL: define void @test1( +; CHECK: %[[load:.*]] = load i1*, i1** %d, align 8 +; CHECK: %[[phi:.*]] = phi i1 [ true, {{.*}} ], [ false, {{.*}} ] + +; CHECK: %[[gep0:.*]] = getelementptr inbounds i1, i1* %[[load]], i64 0 +; CHECK: store i1 %[[phi]], i1* %[[gep0]], align 4 + +; Check that store instructions are hoisted. 
+; CHECK-NOT: store \ No newline at end of file diff --git a/test/Transforms/GlobalOpt/integer-bool.ll b/test/Transforms/GlobalOpt/integer-bool.ll index 617febdc0166..3bdf92273b77 100644 --- a/test/Transforms/GlobalOpt/integer-bool.ll +++ b/test/Transforms/GlobalOpt/integer-bool.ll @@ -4,17 +4,17 @@ @G = internal addrspace(1) global i32 0 ; CHECK: @G ; CHECK: addrspace(1) -; CHECK: global i1 false +; CHECK: global i32 0 define void @set1() { store i32 0, i32 addrspace(1)* @G -; CHECK: store i1 false +; CHECK: store i32 0 ret void } define void @set2() { store i32 1, i32 addrspace(1)* @G -; CHECK: store i1 true +; CHECK: store i32 1 ret void } diff --git a/test/Transforms/NaCl/add-pnacl-external-decls.ll b/test/Transforms/NaCl/add-pnacl-external-decls.ll new file mode 100644 index 000000000000..1f525a9268cd --- /dev/null +++ b/test/Transforms/NaCl/add-pnacl-external-decls.ll @@ -0,0 +1,6 @@ +; RUN: opt < %s -add-pnacl-external-decls -S | FileCheck %s + +declare void @foobar(i32) + +; CHECK: declare i32 @setjmp(i8*) +; CHECK: declare void @longjmp(i8*, i32) diff --git a/test/Transforms/NaCl/atomic/atomic-seq-cst-only.ll b/test/Transforms/NaCl/atomic/atomic-seq-cst-only.ll new file mode 100644 index 000000000000..4c620a6d3bbc --- /dev/null +++ b/test/Transforms/NaCl/atomic/atomic-seq-cst-only.ll @@ -0,0 +1,51 @@ +; RUN: opt -nacl-rewrite-atomics -pnacl-memory-order-seq-cst-only=true -S < %s | FileCheck %s +; +; Verify that -pnacl-memory-order-seq-cst-only=true ensures all atomic memory +; orderings become seq_cst (enum value 6). +; +; Note that monotonic doesn't exist in C11/C++11, and consume isn't implemented +; in LLVM yet. + +target datalayout = "p:32:32:32" + +; CHECK-LABEL: @test_atomic_store_monotonic_i32 +define void @test_atomic_store_monotonic_i32(i32* %ptr, i32 %value) { + ; CHECK-NEXT: call void @llvm.nacl.atomic.store.i32(i32 %value, i32* %ptr, i32 6) + store atomic i32 %value, i32* %ptr monotonic, align 4 + ret void ; CHECK-NEXT: ret void +} + +; CHECK-LABEL: @test_atomic_store_unordered_i32 +define void @test_atomic_store_unordered_i32(i32* %ptr, i32 %value) { + ; CHECK-NEXT: call void @llvm.nacl.atomic.store.i32(i32 %value, i32* %ptr, i32 6) + store atomic i32 %value, i32* %ptr unordered, align 4 + ret void ; CHECK-NEXT: ret void +} + +; CHECK-LABEL: @test_atomic_load_acquire_i32 +define i32 @test_atomic_load_acquire_i32(i32* %ptr) { + ; CHECK-NEXT: %res = call i32 @llvm.nacl.atomic.load.i32(i32* %ptr, i32 6) + %res = load atomic i32, i32* %ptr acquire, align 4 + ret i32 %res ; CHECK-NEXT: ret i32 %res +} + +; CHECK-LABEL: @test_atomic_store_release_i32 +define void @test_atomic_store_release_i32(i32* %ptr, i32 %value) { + ; CHECK-NEXT: call void @llvm.nacl.atomic.store.i32(i32 %value, i32* %ptr, i32 6) + store atomic i32 %value, i32* %ptr release, align 4 + ret void ; CHECK-NEXT: ret void +} + +; CHECK-LABEL: @test_fetch_and_add_i32 +define i32 @test_fetch_and_add_i32(i32* %ptr, i32 %value) { + ; CHECK-NEXT: %res = call i32 @llvm.nacl.atomic.rmw.i32(i32 1, i32* %ptr, i32 %value, i32 6) + %res = atomicrmw add i32* %ptr, i32 %value acq_rel + ret i32 %res ; CHECK-NEXT: ret i32 %res +} + +; CHECK-LABEL: @test_atomic_store_seq_cst_i32 +define void @test_atomic_store_seq_cst_i32(i32* %ptr, i32 %value) { + ; CHECK-NEXT: call void @llvm.nacl.atomic.store.i32(i32 %value, i32* %ptr, i32 6) + store atomic i32 %value, i32* %ptr seq_cst, align 4 + ret void ; CHECK-NEXT: ret void +} diff --git a/test/Transforms/NaCl/atomic/atomic_others.ll b/test/Transforms/NaCl/atomic/atomic_others.ll new 
file mode 100644 index 000000000000..ae2498340fdf --- /dev/null +++ b/test/Transforms/NaCl/atomic/atomic_others.ll @@ -0,0 +1,130 @@ +; RUN: opt -nacl-rewrite-atomics -S < %s | FileCheck %s +; +; Validate that atomic non-{acquire/release/acq_rel/seq_cst} loads/stores get +; rewritten into NaCl atomic builtins with sequentially consistent memory +; ordering (enum value 6), and that acquire/release/acq_rel remain as-is (enum +; values 3/4/5). +; +; Note that monotonic doesn't exist in C11/C++11, and consume isn't implemented +; in LLVM yet. + +target datalayout = "p:32:32:32" + +; CHECK-LABEL: @test_atomic_load_monotonic_i32 +define i32 @test_atomic_load_monotonic_i32(i32* %ptr) { + ; CHECK-NEXT: %res = call i32 @llvm.nacl.atomic.load.i32(i32* %ptr, i32 6) + %res = load atomic i32, i32* %ptr monotonic, align 4 + ret i32 %res ; CHECK-NEXT: ret i32 %res +} + +; CHECK-LABEL: @test_atomic_store_monotonic_i32 +define void @test_atomic_store_monotonic_i32(i32* %ptr, i32 %value) { + ; CHECK-NEXT: call void @llvm.nacl.atomic.store.i32(i32 %value, i32* %ptr, i32 6) + store atomic i32 %value, i32* %ptr monotonic, align 4 + ret void ; CHECK-NEXT: ret void +} + +; CHECK-LABEL: @test_atomic_load_unordered_i32 +define i32 @test_atomic_load_unordered_i32(i32* %ptr) { + ; CHECK-NEXT: %res = call i32 @llvm.nacl.atomic.load.i32(i32* %ptr, i32 6) + %res = load atomic i32, i32* %ptr unordered, align 4 + ret i32 %res ; CHECK-NEXT: ret i32 %res +} + +; CHECK-LABEL: @test_atomic_store_unordered_i32 +define void @test_atomic_store_unordered_i32(i32* %ptr, i32 %value) { + ; CHECK-NEXT: call void @llvm.nacl.atomic.store.i32(i32 %value, i32* %ptr, i32 6) + store atomic i32 %value, i32* %ptr unordered, align 4 + ret void ; CHECK-NEXT: ret void +} + +; CHECK-LABEL: @test_atomic_load_acquire_i32 +define i32 @test_atomic_load_acquire_i32(i32* %ptr) { + ; CHECK-NEXT: %res = call i32 @llvm.nacl.atomic.load.i32(i32* %ptr, i32 3) + %res = load atomic i32, i32* %ptr acquire, align 4 + ret i32 %res ; CHECK-NEXT: ret i32 %res +} + +; CHECK-LABEL: @test_atomic_store_release_i32 +define void @test_atomic_store_release_i32(i32* %ptr, i32 %value) { + ; CHECK-NEXT: call void @llvm.nacl.atomic.store.i32(i32 %value, i32* %ptr, i32 4) + store atomic i32 %value, i32* %ptr release, align 4 + ret void ; CHECK-NEXT: ret void +} + +; CHECK-LABEL: @test_fetch_and_add_i32 +define i32 @test_fetch_and_add_i32(i32* %ptr, i32 %value) { + ; CHECK-NEXT: %res = call i32 @llvm.nacl.atomic.rmw.i32(i32 1, i32* %ptr, i32 %value, i32 5) + %res = atomicrmw add i32* %ptr, i32 %value acq_rel + ret i32 %res ; CHECK-NEXT: ret i32 %res +} + +; Test all the valid cmpxchg orderings for success and failure. + +; CHECK-LABEL: @test_cmpxchg_seqcst_seqcst +define { i32, i1 } @test_cmpxchg_seqcst_seqcst(i32* %ptr, i32 %value) { + ; CHECK-NEXT: %res = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* %ptr, i32 0, i32 %value, i32 6, i32 6) + %res = cmpxchg i32* %ptr, i32 0, i32 %value seq_cst seq_cst + ret { i32, i1 } %res +} + +; CHECK-LABEL: @test_cmpxchg_seqcst_acquire +define { i32, i1 } @test_cmpxchg_seqcst_acquire(i32* %ptr, i32 %value) { + ; CHECK-NEXT: %res = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* %ptr, i32 0, i32 %value, i32 6, i32 3) + %res = cmpxchg i32* %ptr, i32 0, i32 %value seq_cst acquire + ret { i32, i1 } %res +} + +; CHECK-LABEL: @test_cmpxchg_seqcst_relaxed +define { i32, i1 } @test_cmpxchg_seqcst_relaxed(i32* %ptr, i32 %value) { + ; Failure ordering is upgraded. 
+ ; CHECK-NEXT: %res = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* %ptr, i32 0, i32 %value, i32 6, i32 6) + %res = cmpxchg i32* %ptr, i32 0, i32 %value seq_cst monotonic + ret { i32, i1 } %res +} + +; CHECK-LABEL: @test_cmpxchg_acqrel_acquire +define { i32, i1 } @test_cmpxchg_acqrel_acquire(i32* %ptr, i32 %value) { + ; CHECK-NEXT: %res = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* %ptr, i32 0, i32 %value, i32 5, i32 3) + %res = cmpxchg i32* %ptr, i32 0, i32 %value acq_rel acquire + ret { i32, i1 } %res +} + +; CHECK-LABEL: @test_cmpxchg_acqrel_relaxed +define { i32, i1 } @test_cmpxchg_acqrel_relaxed(i32* %ptr, i32 %value) { + ; Success and failure ordering are upgraded. + ; CHECK-NEXT: %res = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* %ptr, i32 0, i32 %value, i32 6, i32 6) + %res = cmpxchg i32* %ptr, i32 0, i32 %value acq_rel monotonic + ret { i32, i1 } %res +} + +; CHECK-LABEL: @test_cmpxchg_release_relaxed +define { i32, i1 } @test_cmpxchg_release_relaxed(i32* %ptr, i32 %value) { + ; Success and failure ordering are upgraded. + ; CHECK-NEXT: %res = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* %ptr, i32 0, i32 %value, i32 6, i32 6) + %res = cmpxchg i32* %ptr, i32 0, i32 %value release monotonic + ret { i32, i1 } %res +} + +; CHECK-LABEL: @test_cmpxchg_acquire_acquire +define { i32, i1 } @test_cmpxchg_acquire_acquire(i32* %ptr, i32 %value) { + ; CHECK-NEXT: %res = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* %ptr, i32 0, i32 %value, i32 3, i32 3) + %res = cmpxchg i32* %ptr, i32 0, i32 %value acquire acquire + ret { i32, i1 } %res +} + +; CHECK-LABEL: @test_cmpxchg_acquire_relaxed +define { i32, i1 } @test_cmpxchg_acquire_relaxed(i32* %ptr, i32 %value) { + ; Failure ordering is upgraded. + ; CHECK-NEXT: %res = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* %ptr, i32 0, i32 %value, i32 3, i32 3) + %res = cmpxchg i32* %ptr, i32 0, i32 %value acquire monotonic + ret { i32, i1 } %res +} + +; CHECK-LABEL: @test_cmpxchg_relaxed_relaxed +define { i32, i1 } @test_cmpxchg_relaxed_relaxed(i32* %ptr, i32 %value) { + ; Failure ordering is upgraded. + ; CHECK-NEXT: %res = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* %ptr, i32 0, i32 %value, i32 6, i32 6) + %res = cmpxchg i32* %ptr, i32 0, i32 %value monotonic monotonic + ret { i32, i1 } %res +} diff --git a/test/Transforms/NaCl/atomic/atomic_seq_cst.ll b/test/Transforms/NaCl/atomic/atomic_seq_cst.ll new file mode 100644 index 000000000000..a6125a4da685 --- /dev/null +++ b/test/Transforms/NaCl/atomic/atomic_seq_cst.ll @@ -0,0 +1,99 @@ +; RUN: opt -nacl-rewrite-atomics -S < %s | FileCheck %s +; +; Validate that sequentially consistent atomic loads/stores get rewritten into +; NaCl atomic builtins with sequentially-consistent memory ordering (enum value +; 6). 
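+;
+; The pointer-typed cases below additionally check that pointer loads and
+; stores are lowered through the i32 builtin, with the address bitcast to
+; i32* and the value converted via ptrtoint/inttoptr.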
+ +target datalayout = "p:32:32:32" + +; CHECK-LABEL: @test_atomic_load_i8 +define zeroext i8 @test_atomic_load_i8(i8* %ptr) { + ; CHECK-NEXT: %res = call i8 @llvm.nacl.atomic.load.i8(i8* %ptr, i32 6) + %res = load atomic i8, i8* %ptr seq_cst, align 1 + ret i8 %res ; CHECK-NEXT: ret i8 %res +} + +; CHECK-LABEL: @test_atomic_store_i8 +define void @test_atomic_store_i8(i8* %ptr, i8 zeroext %value) { + ; CHECK-NEXT: call void @llvm.nacl.atomic.store.i8(i8 %value, i8* %ptr, i32 6) + store atomic i8 %value, i8* %ptr seq_cst, align 1 + ret void ; CHECK-NEXT: ret void +} + +; CHECK-LABEL: @test_atomic_load_i16 +define zeroext i16 @test_atomic_load_i16(i16* %ptr) { + ; CHECK-NEXT: %res = call i16 @llvm.nacl.atomic.load.i16(i16* %ptr, i32 6) + %res = load atomic i16, i16* %ptr seq_cst, align 2 + ret i16 %res ; CHECK-NEXT: ret i16 %res +} + +; CHECK-LABEL: @test_atomic_store_i16 +define void @test_atomic_store_i16(i16* %ptr, i16 zeroext %value) { + ; CHECK-NEXT: call void @llvm.nacl.atomic.store.i16(i16 %value, i16* %ptr, i32 6) + store atomic i16 %value, i16* %ptr seq_cst, align 2 + ret void ; CHECK-NEXT: ret void +} + +; CHECK-LABEL: @test_atomic_load_i32 +define i32 @test_atomic_load_i32(i32* %ptr) { + ; CHECK-NEXT: %res = call i32 @llvm.nacl.atomic.load.i32(i32* %ptr, i32 6) + %res = load atomic i32, i32* %ptr seq_cst, align 4 + ret i32 %res ; CHECK-NEXT: ret i32 %res +} + +; CHECK-LABEL: @test_atomic_store_i32 +define void @test_atomic_store_i32(i32* %ptr, i32 %value) { + ; CHECK-NEXT: call void @llvm.nacl.atomic.store.i32(i32 %value, i32* %ptr, i32 6) + store atomic i32 %value, i32* %ptr seq_cst, align 4 + ret void ; CHECK-NEXT: ret void +} + +; CHECK-LABEL: @test_atomic_load_i64 +define i64 @test_atomic_load_i64(i64* %ptr) { + ; CHECK-NEXT: %res = call i64 @llvm.nacl.atomic.load.i64(i64* %ptr, i32 6) + %res = load atomic i64, i64* %ptr seq_cst, align 8 + ret i64 %res ; CHECK-NEXT: ret i64 %res +} + +; CHECK-LABEL: @test_atomic_store_i64 +define void @test_atomic_store_i64(i64* %ptr, i64 %value) { + ; CHECK-NEXT: call void @llvm.nacl.atomic.store.i64(i64 %value, i64* %ptr, i32 6) + store atomic i64 %value, i64* %ptr seq_cst, align 8 + ret void ; CHECK-NEXT: ret void +} + +; CHECK-LABEL: @test_atomic_load_i32_pointer +define i32* @test_atomic_load_i32_pointer(i32** %ptr) { + ; CHECK-NEXT: %ptr.cast = bitcast i32** %ptr to i32* + ; CHECK-NEXT: %res = call i32 @llvm.nacl.atomic.load.i32(i32* %ptr.cast, i32 6) + ; CHECK-NEXT: %res.cast = inttoptr i32 %res to i32* + %res = load atomic i32*, i32** %ptr seq_cst, align 4 + ret i32* %res ; CHECK-NEXT: ret i32* %res.cast +} + +; CHECK-LABEL: @test_atomic_store_i32_pointer +define void @test_atomic_store_i32_pointer(i32** %ptr, i32* %value) { + ; CHECK-NEXT: %ptr.cast = bitcast i32** %ptr to i32* + ; CHECK-NEXT: %value.cast = ptrtoint i32* %value to i32 + ; CHECK-NEXT: call void @llvm.nacl.atomic.store.i32(i32 %value.cast, i32* %ptr.cast, i32 6) + store atomic i32* %value, i32** %ptr seq_cst, align 4 + ret void ; CHECK-NEXT: ret void +} + +; CHECK-LABEL: @test_atomic_load_double_pointer +define double* @test_atomic_load_double_pointer(double** %ptr) { + ; CHECK-NEXT: %ptr.cast = bitcast double** %ptr to i32* + ; CHECK-NEXT: %res = call i32 @llvm.nacl.atomic.load.i32(i32* %ptr.cast, i32 6) + ; CHECK-NEXT: %res.cast = inttoptr i32 %res to double* + %res = load atomic double*, double** %ptr seq_cst, align 4 + ret double* %res ; CHECK-NEXT: ret double* %res.cast +} + +; CHECK-LABEL: @test_atomic_store_double_pointer +define void 
@test_atomic_store_double_pointer(double** %ptr, double* %value) { + ; CHECK-NEXT: %ptr.cast = bitcast double** %ptr to i32* + ; CHECK-NEXT: %value.cast = ptrtoint double* %value to i32 + ; CHECK-NEXT: call void @llvm.nacl.atomic.store.i32(i32 %value.cast, i32* %ptr.cast, i32 6) + store atomic double* %value, double** %ptr seq_cst, align 4 + ret void ; CHECK-NEXT: ret void +} diff --git a/test/Transforms/NaCl/atomic/fetch_and_.ll b/test/Transforms/NaCl/atomic/fetch_and_.ll new file mode 100644 index 000000000000..62f7d8c10d19 --- /dev/null +++ b/test/Transforms/NaCl/atomic/fetch_and_.ll @@ -0,0 +1,154 @@ +; RUN: opt -nacl-rewrite-atomics -S < %s | FileCheck %s + +; Each of these tests validates that the corresponding legacy GCC-style builtins +; are properly rewritten to NaCl atomic builtins. Only the GCC-style builtins +; that have corresponding primitives in C11/C++11 and which emit different code +; are tested. These legacy GCC-builtins only support sequential-consistency +; (enum value 6). +; +; test_* tests the corresponding __sync_* builtin. See: +; http://gcc.gnu.org/onlinedocs/gcc-4.8.1/gcc/_005f_005fsync-Builtins.html + +target datalayout = "p:32:32:32" + +; CHECK-LABEL: @test_fetch_and_add_i8 +define zeroext i8 @test_fetch_and_add_i8(i8* %ptr, i8 zeroext %value) { + ; CHECK-NEXT: %res = call i8 @llvm.nacl.atomic.rmw.i8(i32 1, i8* %ptr, i8 %value, i32 6) + %res = atomicrmw add i8* %ptr, i8 %value seq_cst + ret i8 %res ; CHECK-NEXT: ret i8 %res +} + +; CHECK-LABEL: @test_fetch_and_add_i16 +define zeroext i16 @test_fetch_and_add_i16(i16* %ptr, i16 zeroext %value) { + ; CHECK-NEXT: %res = call i16 @llvm.nacl.atomic.rmw.i16(i32 1, i16* %ptr, i16 %value, i32 6) + %res = atomicrmw add i16* %ptr, i16 %value seq_cst + ret i16 %res ; CHECK-NEXT: ret i16 %res +} + +; CHECK-LABEL: @test_fetch_and_add_i32 +define i32 @test_fetch_and_add_i32(i32* %ptr, i32 %value) { + ; CHECK-NEXT: %res = call i32 @llvm.nacl.atomic.rmw.i32(i32 1, i32* %ptr, i32 %value, i32 6) + %res = atomicrmw add i32* %ptr, i32 %value seq_cst + ret i32 %res ; CHECK-NEXT: ret i32 %res +} + +; CHECK-LABEL: @test_fetch_and_add_i64 +define i64 @test_fetch_and_add_i64(i64* %ptr, i64 %value) { + ; CHECK-NEXT: %res = call i64 @llvm.nacl.atomic.rmw.i64(i32 1, i64* %ptr, i64 %value, i32 6) + %res = atomicrmw add i64* %ptr, i64 %value seq_cst + ret i64 %res ; CHECK-NEXT: ret i64 %res +} + +; CHECK-LABEL: @test_fetch_and_sub_i8 +define zeroext i8 @test_fetch_and_sub_i8(i8* %ptr, i8 zeroext %value) { + ; CHECK-NEXT: %res = call i8 @llvm.nacl.atomic.rmw.i8(i32 2, i8* %ptr, i8 %value, i32 6) + %res = atomicrmw sub i8* %ptr, i8 %value seq_cst + ret i8 %res ; CHECK-NEXT: ret i8 %res +} + +; CHECK-LABEL: @test_fetch_and_sub_i16 +define zeroext i16 @test_fetch_and_sub_i16(i16* %ptr, i16 zeroext %value) { + ; CHECK-NEXT: %res = call i16 @llvm.nacl.atomic.rmw.i16(i32 2, i16* %ptr, i16 %value, i32 6) + %res = atomicrmw sub i16* %ptr, i16 %value seq_cst + ret i16 %res ; CHECK-NEXT: ret i16 %res +} + +; CHECK-LABEL: @test_fetch_and_sub_i32 +define i32 @test_fetch_and_sub_i32(i32* %ptr, i32 %value) { + ; CHECK-NEXT: %res = call i32 @llvm.nacl.atomic.rmw.i32(i32 2, i32* %ptr, i32 %value, i32 6) + %res = atomicrmw sub i32* %ptr, i32 %value seq_cst + ret i32 %res ; CHECK-NEXT: ret i32 %res +} + +; CHECK-LABEL: @test_fetch_and_sub_i64 +define i64 @test_fetch_and_sub_i64(i64* %ptr, i64 %value) { + ; CHECK-NEXT: %res = call i64 @llvm.nacl.atomic.rmw.i64(i32 2, i64* %ptr, i64 %value, i32 6) + %res = atomicrmw sub i64* %ptr, i64 %value seq_cst + ret i64 
%res ; CHECK-NEXT: ret i64 %res +} + +; CHECK-LABEL: @test_fetch_and_or_i8 +define zeroext i8 @test_fetch_and_or_i8(i8* %ptr, i8 zeroext %value) { + ; CHECK-NEXT: %res = call i8 @llvm.nacl.atomic.rmw.i8(i32 3, i8* %ptr, i8 %value, i32 6) + %res = atomicrmw or i8* %ptr, i8 %value seq_cst + ret i8 %res ; CHECK-NEXT: ret i8 %res +} + +; CHECK-LABEL: @test_fetch_and_or_i16 +define zeroext i16 @test_fetch_and_or_i16(i16* %ptr, i16 zeroext %value) { + ; CHECK-NEXT: %res = call i16 @llvm.nacl.atomic.rmw.i16(i32 3, i16* %ptr, i16 %value, i32 6) + %res = atomicrmw or i16* %ptr, i16 %value seq_cst + ret i16 %res ; CHECK-NEXT: ret i16 %res +} + +; CHECK-LABEL: @test_fetch_and_or_i32 +define i32 @test_fetch_and_or_i32(i32* %ptr, i32 %value) { + ; CHECK-NEXT: %res = call i32 @llvm.nacl.atomic.rmw.i32(i32 3, i32* %ptr, i32 %value, i32 6) + %res = atomicrmw or i32* %ptr, i32 %value seq_cst + ret i32 %res ; CHECK-NEXT: ret i32 %res +} + +; CHECK-LABEL: @test_fetch_and_or_i64 +define i64 @test_fetch_and_or_i64(i64* %ptr, i64 %value) { + ; CHECK-NEXT: %res = call i64 @llvm.nacl.atomic.rmw.i64(i32 3, i64* %ptr, i64 %value, i32 6) + %res = atomicrmw or i64* %ptr, i64 %value seq_cst + ret i64 %res ; CHECK-NEXT: ret i64 %res +} + +; CHECK-LABEL: @test_fetch_and_and_i8 +define zeroext i8 @test_fetch_and_and_i8(i8* %ptr, i8 zeroext %value) { + ; CHECK-NEXT: %res = call i8 @llvm.nacl.atomic.rmw.i8(i32 4, i8* %ptr, i8 %value, i32 6) + %res = atomicrmw and i8* %ptr, i8 %value seq_cst + ret i8 %res ; CHECK-NEXT: ret i8 %res +} + +; CHECK-LABEL: @test_fetch_and_and_i16 +define zeroext i16 @test_fetch_and_and_i16(i16* %ptr, i16 zeroext %value) { + ; CHECK-NEXT: %res = call i16 @llvm.nacl.atomic.rmw.i16(i32 4, i16* %ptr, i16 %value, i32 6) + %res = atomicrmw and i16* %ptr, i16 %value seq_cst + ret i16 %res ; CHECK-NEXT: ret i16 %res +} + +; CHECK-LABEL: @test_fetch_and_and_i32 +define i32 @test_fetch_and_and_i32(i32* %ptr, i32 %value) { + ; CHECK-NEXT: %res = call i32 @llvm.nacl.atomic.rmw.i32(i32 4, i32* %ptr, i32 %value, i32 6) + %res = atomicrmw and i32* %ptr, i32 %value seq_cst + ret i32 %res ; CHECK-NEXT: ret i32 %res +} + +; CHECK-LABEL: @test_fetch_and_and_i64 +define i64 @test_fetch_and_and_i64(i64* %ptr, i64 %value) { + ; CHECK-NEXT: %res = call i64 @llvm.nacl.atomic.rmw.i64(i32 4, i64* %ptr, i64 %value, i32 6) + %res = atomicrmw and i64* %ptr, i64 %value seq_cst + ret i64 %res ; CHECK-NEXT: ret i64 %res + +} + +; CHECK-LABEL: @test_fetch_and_xor_i8 +define zeroext i8 @test_fetch_and_xor_i8(i8* %ptr, i8 zeroext %value) { + ; CHECK-NEXT: %res = call i8 @llvm.nacl.atomic.rmw.i8(i32 5, i8* %ptr, i8 %value, i32 6) + %res = atomicrmw xor i8* %ptr, i8 %value seq_cst + ret i8 %res ; CHECK-NEXT: ret i8 %res + +} + +; CHECK-LABEL: @test_fetch_and_xor_i16 +define zeroext i16 @test_fetch_and_xor_i16(i16* %ptr, i16 zeroext %value) { + ; CHECK-NEXT: %res = call i16 @llvm.nacl.atomic.rmw.i16(i32 5, i16* %ptr, i16 %value, i32 6) + %res = atomicrmw xor i16* %ptr, i16 %value seq_cst + ret i16 %res ; CHECK-NEXT: ret i16 %res +} + +; CHECK-LABEL: @test_fetch_and_xor_i32 +define i32 @test_fetch_and_xor_i32(i32* %ptr, i32 %value) { + ; CHECK-NEXT: %res = call i32 @llvm.nacl.atomic.rmw.i32(i32 5, i32* %ptr, i32 %value, i32 6) + %res = atomicrmw xor i32* %ptr, i32 %value seq_cst + ret i32 %res ; CHECK-NEXT: ret i32 %res +} + +; CHECK-LABEL: @test_fetch_and_xor_i64 +define i64 @test_fetch_and_xor_i64(i64* %ptr, i64 %value) { + ; CHECK-NEXT: %res = call i64 @llvm.nacl.atomic.rmw.i64(i32 5, i64* %ptr, i64 %value, i32 6) + %res = 
atomicrmw xor i64* %ptr, i64 %value seq_cst + ret i64 %res ; CHECK-NEXT: ret i64 %res +} diff --git a/test/Transforms/NaCl/atomic/lock_.ll b/test/Transforms/NaCl/atomic/lock_.ll new file mode 100644 index 000000000000..6f66f6706122 --- /dev/null +++ b/test/Transforms/NaCl/atomic/lock_.ll @@ -0,0 +1,68 @@ +; RUN: opt -nacl-rewrite-atomics -S < %s | FileCheck %s + +; Each of these tests validates that the corresponding legacy GCC-style builtins +; are properly rewritten to NaCl atomic builtins. Only the GCC-style builtins +; that have corresponding primitives in C11/C++11 and which emit different code +; are tested. These legacy GCC-builtins only support sequential-consistency +; (enum value 6). +; +; test_* tests the corresponding __sync_* builtin. See: +; http://gcc.gnu.org/onlinedocs/gcc-4.8.1/gcc/_005f_005fsync-Builtins.html + +target datalayout = "p:32:32:32" + +; CHECK-LABEL: @test_lock_test_and_set_i8 +define zeroext i8 @test_lock_test_and_set_i8(i8* %ptr, i8 zeroext %value) { + ; CHECK-NEXT: %res = call i8 @llvm.nacl.atomic.rmw.i8(i32 6, i8* %ptr, i8 %value, i32 6) + %res = atomicrmw xchg i8* %ptr, i8 %value seq_cst + ret i8 %res ; CHECK-NEXT: ret i8 %res +} + +; CHECK-LABEL: @test_lock_release_i8 +define void @test_lock_release_i8(i8* %ptr) { + ; CHECK-NEXT: call void @llvm.nacl.atomic.store.i8(i8 0, i8* %ptr, i32 4) + store atomic i8 0, i8* %ptr release, align 1 + ret void ; CHECK-NEXT: ret void +} + +; CHECK-LABEL: @test_lock_test_and_set_i16 +define zeroext i16 @test_lock_test_and_set_i16(i16* %ptr, i16 zeroext %value) { + ; CHECK-NEXT: %res = call i16 @llvm.nacl.atomic.rmw.i16(i32 6, i16* %ptr, i16 %value, i32 6) + %res = atomicrmw xchg i16* %ptr, i16 %value seq_cst + ret i16 %res ; CHECK-NEXT: ret i16 %res +} + +; CHECK-LABEL: @test_lock_release_i16 +define void @test_lock_release_i16(i16* %ptr) { + ; CHECK-NEXT: call void @llvm.nacl.atomic.store.i16(i16 0, i16* %ptr, i32 4) + store atomic i16 0, i16* %ptr release, align 2 + ret void ; CHECK-NEXT: ret void +} + +; CHECK-LABEL: @test_lock_test_and_set_i32 +define i32 @test_lock_test_and_set_i32(i32* %ptr, i32 %value) { + ; CHECK-NEXT: %res = call i32 @llvm.nacl.atomic.rmw.i32(i32 6, i32* %ptr, i32 %value, i32 6) + %res = atomicrmw xchg i32* %ptr, i32 %value seq_cst + ret i32 %res ; CHECK-NEXT: ret i32 %res +} + +; CHECK-LABEL: @test_lock_release_i32 +define void @test_lock_release_i32(i32* %ptr) { + ; CHECK-NEXT: call void @llvm.nacl.atomic.store.i32(i32 0, i32* %ptr, i32 4) + store atomic i32 0, i32* %ptr release, align 4 + ret void ; CHECK-NEXT: ret void +} + +; CHECK-LABEL: @test_lock_test_and_set_i64 +define i64 @test_lock_test_and_set_i64(i64* %ptr, i64 %value) { + ; CHECK-NEXT: %res = call i64 @llvm.nacl.atomic.rmw.i64(i32 6, i64* %ptr, i64 %value, i32 6) + %res = atomicrmw xchg i64* %ptr, i64 %value seq_cst + ret i64 %res ; CHECK-NEXT: ret i64 %res +} + +; CHECK-LABEL: @test_lock_release_i64 +define void @test_lock_release_i64(i64* %ptr) { + ; CHECK-NEXT: call void @llvm.nacl.atomic.store.i64(i64 0, i64* %ptr, i32 4) + store atomic i64 0, i64* %ptr release, align 8 + ret void ; CHECK-NEXT: ret void +} diff --git a/test/Transforms/NaCl/atomic/sync_synchronize.ll b/test/Transforms/NaCl/atomic/sync_synchronize.ll new file mode 100644 index 000000000000..c9ef9029d3f6 --- /dev/null +++ b/test/Transforms/NaCl/atomic/sync_synchronize.ll @@ -0,0 +1,51 @@ +; RUN: opt -nacl-rewrite-atomics -remove-asm-memory -S < %s | FileCheck %s + +; Each of these tests validates that the corresponding legacy GCC-style builtins +; are properly 
rewritten to NaCl atomic builtins. Only the GCC-style builtins +; that have corresponding primitives in C11/C++11 and which emit different code +; are tested. These legacy GCC-builtins only support sequential-consistency +; (enum value 6). +; +; test_* tests the corresponding __sync_* builtin. See: +; http://gcc.gnu.org/onlinedocs/gcc-4.8.1/gcc/_005f_005fsync-Builtins.html + +target datalayout = "p:32:32:32" + +; This patterns gets emitted by C11/C++11 atomic thread fences. +; +; CHECK-LABEL: @test_c11_fence +define void @test_c11_fence() { + ; CHECK-NEXT: call void @llvm.nacl.atomic.fence(i32 6) + fence seq_cst + ret void ; CHECK-NEXT: ret void +} + +; This pattern gets emitted for ``__sync_synchronize`` and +; ``asm("":::"memory")`` when Clang is configured for NaCl. +; +; CHECK-LABEL: @test_synchronize +define void @test_synchronize() { + ; CHECK-NEXT: call void @llvm.nacl.atomic.fence.all() + call void asm sideeffect "", "~{memory}"() + fence seq_cst + call void asm sideeffect "", "~{memory}"() + ret void ; CHECK-NEXT: ret void +} + +; Make sure the above pattern is respected and not partially-matched. +; +; CHECK-LABEL: @test_synchronize_bad1 +define void @test_synchronize_bad1() { + ; CHECK-NOT: call void @llvm.nacl.atomic.fence.all() + call void asm sideeffect "", "~{memory}"() + fence seq_cst + ret void +} + +; CHECK-LABEL: @test_synchronize_bad2 +define void @test_synchronize_bad2() { + ; CHECK-NOT: call void @llvm.nacl.atomic.fence.all() + fence seq_cst + call void asm sideeffect "", "~{memory}"() + ret void +} diff --git a/test/Transforms/NaCl/atomic/val_compare_and_swap.ll b/test/Transforms/NaCl/atomic/val_compare_and_swap.ll new file mode 100644 index 000000000000..d30eba2ebdd0 --- /dev/null +++ b/test/Transforms/NaCl/atomic/val_compare_and_swap.ll @@ -0,0 +1,112 @@ +; RUN: opt -nacl-rewrite-atomics -S < %s | FileCheck %s + +; Each of these tests validates that the corresponding legacy GCC-style builtins +; are properly rewritten to NaCl atomic builtins. Only the GCC-style builtins +; that have corresponding primitives in C11/C++11 and which emit different code +; are tested. These legacy GCC-builtins only support sequential-consistency +; (enum value 6). +; +; test_* tests the corresponding __sync_* builtin. 
See: +; http://gcc.gnu.org/onlinedocs/gcc-4.8.1/gcc/_005f_005fsync-Builtins.html + +target datalayout = "p:32:32:32" + +; __sync_val_compare_and_swap + +; CHECK-LABEL: @test_val_compare_and_swap_i8 +define zeroext i8 @test_val_compare_and_swap_i8(i8* %ptr, i8 zeroext %oldval, i8 zeroext %newval) { + ; CHECK-NEXT: %res = call i8 @llvm.nacl.atomic.cmpxchg.i8(i8* %ptr, i8 %oldval, i8 %newval, i32 6, i32 6) + ; CHECK-NEXT: %success = icmp eq i8 %res, %oldval + ; CHECK-NEXT: %res.insert.value = insertvalue { i8, i1 } undef, i8 %res, 0 + ; CHECK-NEXT: %res.insert.success = insertvalue { i8, i1 } %res.insert.value, i1 %success, 1 + ; CHECK-NEXT: %val = extractvalue { i8, i1 } %res.insert.success, 0 + %res = cmpxchg i8* %ptr, i8 %oldval, i8 %newval seq_cst seq_cst + %val = extractvalue { i8, i1 } %res, 0 + ret i8 %val ; CHECK-NEXT: ret i8 %val +} + +; CHECK-LABEL: @test_val_compare_and_swap_i16 +define zeroext i16 @test_val_compare_and_swap_i16(i16* %ptr, i16 zeroext %oldval, i16 zeroext %newval) { + ; CHECK-NEXT: %res = call i16 @llvm.nacl.atomic.cmpxchg.i16(i16* %ptr, i16 %oldval, i16 %newval, i32 6, i32 6) + ; CHECK-NEXT: %success = icmp eq i16 %res, %oldval + ; CHECK-NEXT: %res.insert.value = insertvalue { i16, i1 } undef, i16 %res, 0 + ; CHECK-NEXT: %res.insert.success = insertvalue { i16, i1 } %res.insert.value, i1 %success, 1 + ; CHECK-NEXT: %val = extractvalue { i16, i1 } %res.insert.success, 0 + %res = cmpxchg i16* %ptr, i16 %oldval, i16 %newval seq_cst seq_cst + %val = extractvalue { i16, i1 } %res, 0 + ret i16 %val ; CHECK-NEXT: ret i16 %val +} + +; CHECK-LABEL: @test_val_compare_and_swap_i32 +define i32 @test_val_compare_and_swap_i32(i32* %ptr, i32 %oldval, i32 %newval) { + ; CHECK-NEXT: %res = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* %ptr, i32 %oldval, i32 %newval, i32 6, i32 6) + ; CHECK-NEXT: %success = icmp eq i32 %res, %oldval + ; CHECK-NEXT: %res.insert.value = insertvalue { i32, i1 } undef, i32 %res, 0 + ; CHECK-NEXT: %res.insert.success = insertvalue { i32, i1 } %res.insert.value, i1 %success, 1 + ; CHECK-NEXT: %val = extractvalue { i32, i1 } %res.insert.success, 0 + %res = cmpxchg i32* %ptr, i32 %oldval, i32 %newval seq_cst seq_cst + %val = extractvalue { i32, i1 } %res, 0 + ret i32 %val ; CHECK-NEXT: ret i32 %val +} + +; CHECK-LABEL: @test_val_compare_and_swap_i64 +define i64 @test_val_compare_and_swap_i64(i64* %ptr, i64 %oldval, i64 %newval) { + ; CHECK-NEXT: %res = call i64 @llvm.nacl.atomic.cmpxchg.i64(i64* %ptr, i64 %oldval, i64 %newval, i32 6, i32 6) + ; CHECK-NEXT: %success = icmp eq i64 %res, %oldval + ; CHECK-NEXT: %res.insert.value = insertvalue { i64, i1 } undef, i64 %res, 0 + ; CHECK-NEXT: %res.insert.success = insertvalue { i64, i1 } %res.insert.value, i1 %success, 1 + ; CHECK-NEXT: %val = extractvalue { i64, i1 } %res.insert.success, 0 + %res = cmpxchg i64* %ptr, i64 %oldval, i64 %newval seq_cst seq_cst + %val = extractvalue { i64, i1 } %res, 0 + ret i64 %val ; CHECK-NEXT: ret i64 %val +} + +; __sync_bool_compare_and_swap + +; CHECK-LABEL: @test_bool_compare_and_swap_i8 +define zeroext i1 @test_bool_compare_and_swap_i8(i8* %ptr, i8 zeroext %oldval, i8 zeroext %newval) { + ; CHECK-NEXT: %res = call i8 @llvm.nacl.atomic.cmpxchg.i8(i8* %ptr, i8 %oldval, i8 %newval, i32 6, i32 6) + ; CHECK-NEXT: %success = icmp eq i8 %res, %oldval + ; CHECK-NEXT: %res.insert.value = insertvalue { i8, i1 } undef, i8 %res, 0 + ; CHECK-NEXT: %res.insert.success = insertvalue { i8, i1 } %res.insert.value, i1 %success, 1 + ; CHECK-NEXT: %suc = extractvalue { i8, i1 } 
%res.insert.success, 1 + %res = cmpxchg i8* %ptr, i8 %oldval, i8 %newval seq_cst seq_cst + %suc = extractvalue { i8, i1 } %res, 1 + ret i1 %suc ; CHECK-NEXT: ret i1 %suc +} + +; CHECK-LABEL: @test_bool_compare_and_swap_i16 +define zeroext i1 @test_bool_compare_and_swap_i16(i16* %ptr, i16 zeroext %oldval, i16 zeroext %newval) { + ; CHECK-NEXT: %res = call i16 @llvm.nacl.atomic.cmpxchg.i16(i16* %ptr, i16 %oldval, i16 %newval, i32 6, i32 6) + ; CHECK-NEXT: %success = icmp eq i16 %res, %oldval + ; CHECK-NEXT: %res.insert.value = insertvalue { i16, i1 } undef, i16 %res, 0 + ; CHECK-NEXT: %res.insert.success = insertvalue { i16, i1 } %res.insert.value, i1 %success, 1 + ; CHECK-NEXT: %suc = extractvalue { i16, i1 } %res.insert.success, 1 + %res = cmpxchg i16* %ptr, i16 %oldval, i16 %newval seq_cst seq_cst + %suc = extractvalue { i16, i1 } %res, 1 + ret i1 %suc ; CHECK-NEXT: ret i1 %suc +} + +; CHECK-LABEL: @test_bool_compare_and_swap_i32 +define i1 @test_bool_compare_and_swap_i32(i32* %ptr, i32 %oldval, i32 %newval) { + ; CHECK-NEXT: %res = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* %ptr, i32 %oldval, i32 %newval, i32 6, i32 6) + ; CHECK-NEXT: %success = icmp eq i32 %res, %oldval + ; CHECK-NEXT: %res.insert.value = insertvalue { i32, i1 } undef, i32 %res, 0 + ; CHECK-NEXT: %res.insert.success = insertvalue { i32, i1 } %res.insert.value, i1 %success, 1 + ; CHECK-NEXT: %suc = extractvalue { i32, i1 } %res.insert.success, 1 + %res = cmpxchg i32* %ptr, i32 %oldval, i32 %newval seq_cst seq_cst + %suc = extractvalue { i32, i1 } %res, 1 + ret i1 %suc ; CHECK-NEXT: ret i1 %suc +} + +; CHECK-LABEL: @test_bool_compare_and_swap_i64 +define i1 @test_bool_compare_and_swap_i64(i64* %ptr, i64 %oldval, i64 %newval) { + ; CHECK-NEXT: %res = call i64 @llvm.nacl.atomic.cmpxchg.i64(i64* %ptr, i64 %oldval, i64 %newval, i32 6, i32 6) + ; CHECK-NEXT: %success = icmp eq i64 %res, %oldval + ; CHECK-NEXT: %res.insert.value = insertvalue { i64, i1 } undef, i64 %res, 0 + ; CHECK-NEXT: %res.insert.success = insertvalue { i64, i1 } %res.insert.value, i1 %success, 1 + ; CHECK-NEXT: %suc = extractvalue { i64, i1 } %res.insert.success, 1 + %res = cmpxchg i64* %ptr, i64 %oldval, i64 %newval seq_cst seq_cst + %suc = extractvalue { i64, i1 } %res, 1 + ret i1 %suc ; CHECK-NEXT: ret i1 %suc +} diff --git a/test/Transforms/NaCl/atomic/volatile.ll b/test/Transforms/NaCl/atomic/volatile.ll new file mode 100644 index 000000000000..58a3d9aecfe3 --- /dev/null +++ b/test/Transforms/NaCl/atomic/volatile.ll @@ -0,0 +1,171 @@ +; RUN: opt -nacl-rewrite-atomics -S < %s | FileCheck %s +; +; Validate that volatile loads/stores get rewritten into NaCl atomic builtins. +; The memory ordering for volatile loads/stores could technically be constrained +; to sequential consistency (enum value 6), or left as relaxed. 
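As a sketch of what the CHECK lines below encode for non-integer types (illustrative only, not part of the patch; the function name is made up): there is no NaCl builtin for float, so the pass bitcasts the pointer to the same-width integer type, calls the i32 builtin with seq_cst ordering, and bitcasts the result back.

define float @example_volatile_load_float(float* %ptr) {
  ; Expected shape after -nacl-rewrite-atomics:
  ;   %ptr.cast = bitcast float* %ptr to i32*
  ;   %res = call i32 @llvm.nacl.atomic.load.i32(i32* %ptr.cast, i32 6)
  ;   %res.cast = bitcast i32 %res to float
  %res = load volatile float, float* %ptr, align 4
  ret float %res
}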
+ +target datalayout = "p:32:32:32" + +; CHECK-LABEL: @test_volatile_load_i8 +define zeroext i8 @test_volatile_load_i8(i8* %ptr) { + ; CHECK-NEXT: %res = call i8 @llvm.nacl.atomic.load.i8(i8* %ptr, i32 6) + %res = load volatile i8, i8* %ptr, align 1 + ret i8 %res ; CHECK-NEXT: ret i8 %res +} + +; CHECK-LABEL: @test_volatile_store_i8 +define void @test_volatile_store_i8(i8* %ptr, i8 zeroext %value) { + ; CHECK-NEXT: call void @llvm.nacl.atomic.store.i8(i8 %value, i8* %ptr, i32 6) + store volatile i8 %value, i8* %ptr, align 1 + ret void ; CHECK-NEXT: ret void +} + +; CHECK-LABEL: @test_volatile_load_i16 +define zeroext i16 @test_volatile_load_i16(i16* %ptr) { + ; CHECK-NEXT: %res = call i16 @llvm.nacl.atomic.load.i16(i16* %ptr, i32 6) + %res = load volatile i16, i16* %ptr, align 2 + ret i16 %res ; CHECK-NEXT: ret i16 %res +} + +; CHECK-LABEL: @test_volatile_store_i16 +define void @test_volatile_store_i16(i16* %ptr, i16 zeroext %value) { + ; CHECK-NEXT: call void @llvm.nacl.atomic.store.i16(i16 %value, i16* %ptr, i32 6) + store volatile i16 %value, i16* %ptr, align 2 + ret void ; CHECK-NEXT: ret void +} + +; CHECK-LABEL: @test_volatile_load_i32 +define i32 @test_volatile_load_i32(i32* %ptr) { + ; CHECK-NEXT: %res = call i32 @llvm.nacl.atomic.load.i32(i32* %ptr, i32 6) + %res = load volatile i32, i32* %ptr, align 4 + ret i32 %res ; CHECK-NEXT: ret i32 %res +} + +; CHECK-LABEL: @test_volatile_store_i32 +define void @test_volatile_store_i32(i32* %ptr, i32 %value) { + ; CHECK-NEXT: call void @llvm.nacl.atomic.store.i32(i32 %value, i32* %ptr, i32 6) + store volatile i32 %value, i32* %ptr, align 4 + ret void ; CHECK-NEXT: ret void +} + +; CHECK-LABEL: @test_volatile_load_i64 +define i64 @test_volatile_load_i64(i64* %ptr) { + ; CHECK-NEXT: %res = call i64 @llvm.nacl.atomic.load.i64(i64* %ptr, i32 6) + %res = load volatile i64, i64* %ptr, align 8 + ret i64 %res ; CHECK-NEXT: ret i64 %res +} + +; CHECK-LABEL: @test_volatile_store_i64 +define void @test_volatile_store_i64(i64* %ptr, i64 %value) { + ; CHECK-NEXT: call void @llvm.nacl.atomic.store.i64(i64 %value, i64* %ptr, i32 6) + store volatile i64 %value, i64* %ptr, align 8 + ret void ; CHECK-NEXT: ret void +} + +; CHECK-LABEL: @test_volatile_load_float +define float @test_volatile_load_float(float* %ptr) { + ; CHECK-NEXT: %ptr.cast = bitcast float* %ptr to i32* + ; CHECK-NEXT: %res = call i32 @llvm.nacl.atomic.load.i32(i32* %ptr.cast, i32 6) + ; CHECK-NEXT: %res.cast = bitcast i32 %res to float + %res = load volatile float, float* %ptr, align 4 + ret float %res ; CHECK-NEXT: ret float %res.cast +} + +; CHECK-LABEL: @test_volatile_store_float +define void @test_volatile_store_float(float* %ptr, float %value) { + ; CHECK-NEXT: %ptr.cast = bitcast float* %ptr to i32* + ; CHECK-NEXT: %value.cast = bitcast float %value to i32 + ; CHECK-NEXT: call void @llvm.nacl.atomic.store.i32(i32 %value.cast, i32* %ptr.cast, i32 6) + store volatile float %value, float* %ptr, align 4 + ret void ; CHECK-NEXT: ret void +} + +; CHECK-LABEL: @test_volatile_load_double +define double @test_volatile_load_double(double* %ptr) { + ; CHECK-NEXT: %ptr.cast = bitcast double* %ptr to i64* + ; CHECK-NEXT: %res = call i64 @llvm.nacl.atomic.load.i64(i64* %ptr.cast, i32 6) + ; CHECK-NEXT: %res.cast = bitcast i64 %res to double + %res = load volatile double, double* %ptr, align 8 + ret double %res ; CHECK-NEXT: ret double %res.cast +} + +; CHECK-LABEL: @test_volatile_store_double +define void @test_volatile_store_double(double* %ptr, double %value) { + ; CHECK-NEXT: %ptr.cast = bitcast 
double* %ptr to i64* + ; CHECK-NEXT: %value.cast = bitcast double %value to i64 + ; CHECK-NEXT: call void @llvm.nacl.atomic.store.i64(i64 %value.cast, i64* %ptr.cast, i32 6) + store volatile double %value, double* %ptr, align 8 + ret void ; CHECK-NEXT: ret void +} + +; CHECK-LABEL: @test_volatile_load_i32_pointer +define i32* @test_volatile_load_i32_pointer(i32** %ptr) { + ; CHECK-NEXT: %ptr.cast = bitcast i32** %ptr to i32* + ; CHECK-NEXT: %res = call i32 @llvm.nacl.atomic.load.i32(i32* %ptr.cast, i32 6) + ; CHECK-NEXT: %res.cast = inttoptr i32 %res to i32* + %res = load volatile i32*, i32** %ptr, align 4 + ret i32* %res ; CHECK-NEXT: ret i32* %res.cast +} + +; CHECK-LABEL: @test_volatile_store_i32_pointer +define void @test_volatile_store_i32_pointer(i32** %ptr, i32* %value) { + ; CHECK-NEXT: %ptr.cast = bitcast i32** %ptr to i32* + ; CHECK-NEXT: %value.cast = ptrtoint i32* %value to i32 + ; CHECK-NEXT: call void @llvm.nacl.atomic.store.i32(i32 %value.cast, i32* %ptr.cast, i32 6) + store volatile i32* %value, i32** %ptr, align 4 + ret void ; CHECK-NEXT: ret void +} + +; CHECK-LABEL: @test_volatile_load_double_pointer +define double* @test_volatile_load_double_pointer(double** %ptr) { + ; CHECK-NEXT: %ptr.cast = bitcast double** %ptr to i32* + ; CHECK-NEXT: %res = call i32 @llvm.nacl.atomic.load.i32(i32* %ptr.cast, i32 6) + ; CHECK-NEXT: %res.cast = inttoptr i32 %res to double* + %res = load volatile double*, double** %ptr, align 4 + ret double* %res ; CHECK-NEXT: ret double* %res.cast +} + +; CHECK-LABEL: @test_volatile_store_double_pointer +define void @test_volatile_store_double_pointer(double** %ptr, double* %value) { + ; CHECK-NEXT: %ptr.cast = bitcast double** %ptr to i32* + ; CHECK-NEXT: %value.cast = ptrtoint double* %value to i32 + ; CHECK-NEXT: call void @llvm.nacl.atomic.store.i32(i32 %value.cast, i32* %ptr.cast, i32 6) + store volatile double* %value, double** %ptr, align 4 + ret void ; CHECK-NEXT: ret void +} + +; CHECK-LABEL: @test_volatile_load_v4i8 +define <4 x i8> @test_volatile_load_v4i8(<4 x i8>* %ptr) { + ; CHECK-NEXT: %ptr.cast = bitcast <4 x i8>* %ptr to i32* + ; CHECK-NEXT: %res = call i32 @llvm.nacl.atomic.load.i32(i32* %ptr.cast, i32 6) + ; CHECK-NEXT: %res.cast = bitcast i32 %res to <4 x i8> + %res = load volatile <4 x i8>, <4 x i8>* %ptr, align 8 + ret <4 x i8> %res ; CHECK-NEXT: ret <4 x i8> %res.cast +} + +; CHECK-LABEL: @test_volatile_store_v4i8 +define void @test_volatile_store_v4i8(<4 x i8>* %ptr, <4 x i8> %value) { + ; CHECK-NEXT: %ptr.cast = bitcast <4 x i8>* %ptr to i32* + ; CHECK-NEXT: %value.cast = bitcast <4 x i8> %value to i32 + ; CHECK-NEXT: call void @llvm.nacl.atomic.store.i32(i32 %value.cast, i32* %ptr.cast, i32 6) + store volatile <4 x i8> %value, <4 x i8>* %ptr, align 8 + ret void ; CHECK-NEXT: ret void +} + +; CHECK-LABEL: @test_volatile_load_v4i16 +define <4 x i16> @test_volatile_load_v4i16(<4 x i16>* %ptr) { + ; CHECK-NEXT: %ptr.cast = bitcast <4 x i16>* %ptr to i64* + ; CHECK-NEXT: %res = call i64 @llvm.nacl.atomic.load.i64(i64* %ptr.cast, i32 6) + ; CHECK-NEXT: %res.cast = bitcast i64 %res to <4 x i16> + %res = load volatile <4 x i16>, <4 x i16>* %ptr, align 8 + ret <4 x i16> %res ; CHECK-NEXT: ret <4 x i16> %res.cast +} + +; CHECK-LABEL: @test_volatile_store_v4i16 +define void @test_volatile_store_v4i16(<4 x i16>* %ptr, <4 x i16> %value) { + ; CHECK-NEXT: %ptr.cast = bitcast <4 x i16>* %ptr to i64* + ; CHECK-NEXT: %value.cast = bitcast <4 x i16> %value to i64 + ; CHECK-NEXT: call void @llvm.nacl.atomic.store.i64(i64 %value.cast, i64* 
%ptr.cast, i32 6) + store volatile <4 x i16> %value, <4 x i16>* %ptr, align 8 + ret void ; CHECK-NEXT: ret void +} diff --git a/test/Transforms/NaCl/canonicalize-mem-intrinsics.ll b/test/Transforms/NaCl/canonicalize-mem-intrinsics.ll new file mode 100644 index 000000000000..9c263fd15e71 --- /dev/null +++ b/test/Transforms/NaCl/canonicalize-mem-intrinsics.ll @@ -0,0 +1,45 @@ +; RUN: opt %s -canonicalize-mem-intrinsics -S | FileCheck %s +; RUN: opt %s -canonicalize-mem-intrinsics -S \ +; RUN: | FileCheck %s -check-prefix=CLEANED + +declare void @llvm.memset.p0i8.i64(i8*, i8, i64, i32, i1) +declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8*, i64, i32, i1) +declare void @llvm.memmove.p0i8.p0i8.i64(i8*, i8*, i64, i32, i1) +; CLEANED-NOT: @llvm.mem{{.*}}i64 + + +define void @memset_caller(i8* %dest, i8 %char, i64 %size) { + call void @llvm.memset.p0i8.i64(i8* %dest, i8 %char, i64 %size, i32 1, i1 0) + ret void +} +; CHECK: define void @memset_caller +; CHECK-NEXT: %mem_len_truncate = trunc i64 %size to i32 +; CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* %dest, i8 %char, i32 %mem_len_truncate, i32 1, i1 false) + + +define void @memcpy_caller(i8* %dest, i8* %src, i64 %size) { + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dest, i8* %src, i64 %size, i32 1, i1 0) + ret void +} +; CHECK: define void @memcpy_caller +; CHECK-NEXT: %mem_len_truncate = trunc i64 %size to i32 +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %src, i32 %mem_len_truncate, i32 1, i1 false) + + +define void @memmove_caller(i8* %dest, i8* %src, i64 %size) { + call void @llvm.memmove.p0i8.p0i8.i64(i8* %dest, i8* %src, i64 %size, i32 1, i1 0) + ret void +} +; CHECK: define void @memmove_caller +; CHECK-NEXT: %mem_len_truncate = trunc i64 %size to i32 +; CHECK-NEXT: call void @llvm.memmove.p0i8.p0i8.i32(i8* %dest, i8* %src, i32 %mem_len_truncate, i32 1, i1 false) + + +; Check that constant sizes remain as constants. + +define void @memset_caller_const(i8* %dest, i8 %char) { + call void @llvm.memset.p0i8.i64(i8* %dest, i8 %char, i64 123, i32 1, i1 0) + ret void +} +; CHECK: define void @memset_caller +; CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* %dest, i8 %char, i32 123, i32 1, i1 false) diff --git a/test/Transforms/NaCl/cleanup-used-globals-metadata.ll b/test/Transforms/NaCl/cleanup-used-globals-metadata.ll new file mode 100644 index 000000000000..4c9d13ba6b0e --- /dev/null +++ b/test/Transforms/NaCl/cleanup-used-globals-metadata.ll @@ -0,0 +1,15 @@ +; RUN: opt %s -cleanup-used-globals-metadata -S | FileCheck %s + +target datalayout = "e-p:32:32-i64:64" +target triple = "le32-unknown-nacl" + +@llvm.used = appending global [1 x i8*] [i8* bitcast (void ()* @foo to i8*)], section "llvm.metadata" +; The used list is removed. +; CHECK-NOT: @llvm.used + + +define internal void @foo() #0 { + ret void +} +; The global (@foo) is still present. +; CHECK-LABEL: define internal void @foo diff --git a/test/Transforms/NaCl/combine-shuffle-vector.ll b/test/Transforms/NaCl/combine-shuffle-vector.ll new file mode 100644 index 000000000000..70cc17efc118 --- /dev/null +++ b/test/Transforms/NaCl/combine-shuffle-vector.ll @@ -0,0 +1,69 @@ +; RUN: opt -expand-shufflevector %s -S | \ +; RUN: opt -backend-canonicalize -S | FileCheck %s + +; Test that shufflevector is re-created after having been expanded to +; insertelement / extractelement: shufflevector isn't part of the stable +; PNaCl ABI but insertelement / extractelement are. Re-creating +; shufflevector allows the backend to generate more efficient code. 
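A minimal sketch of the round-trip these tests exercise (illustrative only, not part of the patch; the function name is made up): -expand-shufflevector lowers a shuffle such as the splat below into per-lane extractelement/insertelement pairs, and -backend-canonicalize is then expected to fold that sequence back into a single shufflevector, as the splat CHECK lines in this file show.

define <4 x i32> @example_splat(<4 x i32> %v) {
  ; After expansion and re-canonicalization this should come back as a single
  ; shufflevector with a zeroinitializer mask.
  %res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
  ret <4 x i32> %res
}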
+; +; TODO(jfb) Narrow and widen aren't tested since the underlying types +; are currently not supported by the PNaCl ABI. + +; The datalayout is needed to fold global constants. +target datalayout = "e-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-p:32:32:32-v128:32:32" + +define <4 x i32> @test_splat_lo_4xi32(<4 x i32> %lhs, <4 x i32> %rhs) { + ; CHECK-LABEL: test_splat_lo_4xi32 + ; CHECK-NEXT: %[[R:[0-9]+]] = shufflevector <4 x i32> %lhs, <4 x i32> undef, <4 x i32> zeroinitializer + %res = shufflevector <4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> + ; CHECK-NEXT: ret <4 x i32> %[[R]] + ret <4 x i32> %res +} + +define <4 x i32> @test_splat_hi_4xi32(<4 x i32> %lhs, <4 x i32> %rhs) { + ; CHECK-LABEL: test_splat_hi_4xi32 + ; CHECK-NEXT: %[[R:[0-9]+]] = shufflevector <4 x i32> %rhs, <4 x i32> undef, <4 x i32> zeroinitializer + %res = shufflevector <4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> + ; CHECK-NEXT: ret <4 x i32> %[[R]] + ret <4 x i32> %res +} + +define <4 x i32> @test_id_lo_4xi32(<4 x i32> %lhs, <4 x i32> %rhs) { + ; CHECK-LABEL: test_id_lo_4xi32 + ; CHECK-NEXT: %[[R:[0-9]+]] = shufflevector <4 x i32> %lhs, <4 x i32> undef, <4 x i32> + %res = shufflevector <4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> + ; CHECK-NEXT: ret <4 x i32> %[[R]] + ret <4 x i32> %res +} + +define <4 x i32> @test_id_hi_4xi32(<4 x i32> %lhs, <4 x i32> %rhs) { + ; CHECK-LABEL: test_id_hi_4xi32 + ; CHECK-NEXT: %[[R:[0-9]+]] = shufflevector <4 x i32> %rhs, <4 x i32> undef, <4 x i32> + %res = shufflevector <4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> + ; CHECK-NEXT: ret <4 x i32> %[[R]] + ret <4 x i32> %res +} + +define <4 x i32> @test_interleave_lo_4xi32(<4 x i32> %lhs, <4 x i32> %rhs) { + ; CHECK-LABEL: test_interleave_lo_4xi32 + ; CHECK-NEXT: %[[R:[0-9]+]] = shufflevector <4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> + %res = shufflevector <4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> + ; CHECK-NEXT: ret <4 x i32> %[[R]] + ret <4 x i32> %res +} + +define <4 x i32> @test_interleave_hi_4xi32(<4 x i32> %lhs, <4 x i32> %rhs) { + ; CHECK-LABEL: test_interleave_hi_4xi32 + ; CHECK-NEXT: %[[R:[0-9]+]] = shufflevector <4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> + %res = shufflevector <4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> + ; CHECK-NEXT: ret <4 x i32> %[[R]] + ret <4 x i32> %res +} + +define <4 x i32> @test_with_constant(<4 x i32> %lhs, <4 x i32> %rhs) { + ; CHECK-LABEL: test_with_constant + ; CHECK-NEXT: %[[R:[0-9]+]] = shufflevector <4 x i32> %lhs, <4 x i32> , <4 x i32> + %res = shufflevector <4 x i32> %lhs, <4 x i32> , <4 x i32> + ; CHECK-NEXT: ret <4 x i32> %[[R]] + ret <4 x i32> %res +} diff --git a/test/Transforms/NaCl/constant-insert-extract-element-index.ll b/test/Transforms/NaCl/constant-insert-extract-element-index.ll new file mode 100644 index 000000000000..4c53afa50940 --- /dev/null +++ b/test/Transforms/NaCl/constant-insert-extract-element-index.ll @@ -0,0 +1,425 @@ +; RUN: opt -constant-insert-extract-element-index %s -S | FileCheck %s + +; The datalayout is needed to determine the alignment of the load/stores. +target datalayout = "e-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-p:32:32:32-v128:32:32" + + +; The following insert/extract elements are all indexed with an in-range +; constant, they should remain unchanged. 
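For contrast with the in-range cases that follow, here is a rough sketch of the two rewrites checked later in this file (illustrative only, not part of the patch; the function name is made up): an out-of-range constant index is reduced modulo the vector width, and a variable index is lowered through a stack slot.

define i32 @example_variable_index(<4 x i32> %in, i32 %idx) {
  ; -constant-insert-extract-element-index rewrites this extractelement into an
  ; alloca of 4 x i32, a vector store through a bitcast of that alloca, a
  ; getelementptr indexed by %idx, and a scalar load, as the CHECK lines for
  ; the *_variable tests below spell out.
  %e = extractelement <4 x i32> %in, i32 %idx
  ret i32 %e
}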
+ +define void @test_16xi1_unchanged(<16 x i1> %in) { + ; CHECK-LABEL: test_16xi1_unchanged + ; CHECK-NOT: alloca + ; CHECK: extractelement + %e.0 = extractelement <16 x i1> %in, i32 0 + %e.1 = extractelement <16 x i1> %in, i32 1 + %e.2 = extractelement <16 x i1> %in, i32 2 + %e.3 = extractelement <16 x i1> %in, i32 3 + %e.4 = extractelement <16 x i1> %in, i32 4 + %e.5 = extractelement <16 x i1> %in, i32 5 + %e.6 = extractelement <16 x i1> %in, i32 6 + %e.7 = extractelement <16 x i1> %in, i32 7 + %e.8 = extractelement <16 x i1> %in, i32 8 + %e.9 = extractelement <16 x i1> %in, i32 9 + %e.10 = extractelement <16 x i1> %in, i32 10 + %e.11 = extractelement <16 x i1> %in, i32 11 + %e.12 = extractelement <16 x i1> %in, i32 12 + %e.13 = extractelement <16 x i1> %in, i32 13 + %e.14 = extractelement <16 x i1> %in, i32 14 + %e.15 = extractelement <16 x i1> %in, i32 15 + ; CHECK-NOT: alloca + ; CHECK: insertelement + %i.0 = insertelement <16 x i1> %in, i1 true, i32 0 + %i.1 = insertelement <16 x i1> %in, i1 true, i32 1 + %i.2 = insertelement <16 x i1> %in, i1 true, i32 2 + %i.3 = insertelement <16 x i1> %in, i1 true, i32 3 + %i.4 = insertelement <16 x i1> %in, i1 true, i32 4 + %i.5 = insertelement <16 x i1> %in, i1 true, i32 5 + %i.6 = insertelement <16 x i1> %in, i1 true, i32 6 + %i.7 = insertelement <16 x i1> %in, i1 true, i32 7 + %i.8 = insertelement <16 x i1> %in, i1 true, i32 8 + %i.9 = insertelement <16 x i1> %in, i1 true, i32 9 + %i.10 = insertelement <16 x i1> %in, i1 true, i32 10 + %i.11 = insertelement <16 x i1> %in, i1 true, i32 11 + %i.12 = insertelement <16 x i1> %in, i1 true, i32 12 + %i.13 = insertelement <16 x i1> %in, i1 true, i32 13 + %i.14 = insertelement <16 x i1> %in, i1 true, i32 14 + %i.15 = insertelement <16 x i1> %in, i1 true, i32 15 + ; CHECK-NOT: alloca + ret void +} + +define void @test_8xi1_unchanged(<8 x i1> %in) { + ; CHECK-LABEL: test_8xi1_unchanged + ; CHECK-NOT: alloca + ; CHECK: extractelement + %e.0 = extractelement <8 x i1> %in, i32 0 + %e.1 = extractelement <8 x i1> %in, i32 1 + %e.2 = extractelement <8 x i1> %in, i32 2 + %e.3 = extractelement <8 x i1> %in, i32 3 + %e.4 = extractelement <8 x i1> %in, i32 4 + %e.5 = extractelement <8 x i1> %in, i32 5 + %e.6 = extractelement <8 x i1> %in, i32 6 + %e.7 = extractelement <8 x i1> %in, i32 7 + ; CHECK-NOT: alloca + ; CHECK: insertelement + %i.0 = insertelement <8 x i1> %in, i1 true, i32 0 + %i.1 = insertelement <8 x i1> %in, i1 true, i32 1 + %i.2 = insertelement <8 x i1> %in, i1 true, i32 2 + %i.3 = insertelement <8 x i1> %in, i1 true, i32 3 + %i.4 = insertelement <8 x i1> %in, i1 true, i32 4 + %i.5 = insertelement <8 x i1> %in, i1 true, i32 5 + %i.6 = insertelement <8 x i1> %in, i1 true, i32 6 + %i.7 = insertelement <8 x i1> %in, i1 true, i32 7 + ; CHECK-NOT: alloca + ret void +} + +define void @test_4xi1_unchanged(<4 x i1> %in) { + ; CHECK-LABEL: test_4xi1_unchanged + ; CHECK-NOT: alloca + ; CHECK: extractelement + %e.0 = extractelement <4 x i1> %in, i32 0 + %e.1 = extractelement <4 x i1> %in, i32 1 + %e.2 = extractelement <4 x i1> %in, i32 2 + %e.3 = extractelement <4 x i1> %in, i32 3 + ; CHECK-NOT: alloca + ; CHECK: insertelement + %i.0 = insertelement <4 x i1> %in, i1 true, i32 0 + %i.1 = insertelement <4 x i1> %in, i1 true, i32 1 + %i.2 = insertelement <4 x i1> %in, i1 true, i32 2 + %i.3 = insertelement <4 x i1> %in, i1 true, i32 3 + ; CHECK-NOT: alloca + ret void +} + +define void @test_16xi8_unchanged(<16 x i8> %in) { + ; CHECK-LABEL: test_16xi8_unchanged + ; CHECK-NOT: alloca + ; CHECK: extractelement + %e.0 
= extractelement <16 x i8> %in, i32 0 + %e.1 = extractelement <16 x i8> %in, i32 1 + %e.2 = extractelement <16 x i8> %in, i32 2 + %e.3 = extractelement <16 x i8> %in, i32 3 + %e.4 = extractelement <16 x i8> %in, i32 4 + %e.5 = extractelement <16 x i8> %in, i32 5 + %e.6 = extractelement <16 x i8> %in, i32 6 + %e.7 = extractelement <16 x i8> %in, i32 7 + %e.8 = extractelement <16 x i8> %in, i32 8 + %e.9 = extractelement <16 x i8> %in, i32 9 + %e.10 = extractelement <16 x i8> %in, i32 10 + %e.11 = extractelement <16 x i8> %in, i32 11 + %e.12 = extractelement <16 x i8> %in, i32 12 + %e.13 = extractelement <16 x i8> %in, i32 13 + %e.14 = extractelement <16 x i8> %in, i32 14 + %e.15 = extractelement <16 x i8> %in, i32 15 + ; CHECK-NOT: alloca + ; CHECK: insertelement + %i.0 = insertelement <16 x i8> %in, i8 42, i32 0 + %i.1 = insertelement <16 x i8> %in, i8 42, i32 1 + %i.2 = insertelement <16 x i8> %in, i8 42, i32 2 + %i.3 = insertelement <16 x i8> %in, i8 42, i32 3 + %i.4 = insertelement <16 x i8> %in, i8 42, i32 4 + %i.5 = insertelement <16 x i8> %in, i8 42, i32 5 + %i.6 = insertelement <16 x i8> %in, i8 42, i32 6 + %i.7 = insertelement <16 x i8> %in, i8 42, i32 7 + %i.8 = insertelement <16 x i8> %in, i8 42, i32 8 + %i.9 = insertelement <16 x i8> %in, i8 42, i32 9 + %i.10 = insertelement <16 x i8> %in, i8 42, i32 10 + %i.11 = insertelement <16 x i8> %in, i8 42, i32 11 + %i.12 = insertelement <16 x i8> %in, i8 42, i32 12 + %i.13 = insertelement <16 x i8> %in, i8 42, i32 13 + %i.14 = insertelement <16 x i8> %in, i8 42, i32 14 + %i.15 = insertelement <16 x i8> %in, i8 42, i32 15 + ; CHECK-NOT: alloca + ret void +} + +define void @test_8xi16_unchanged(<8 x i16> %in) { + ; CHECK-LABEL: test_8xi16_unchanged + ; CHECK-NOT: alloca + ; CHECK: extractelement + %e.0 = extractelement <8 x i16> %in, i32 0 + %e.1 = extractelement <8 x i16> %in, i32 1 + %e.2 = extractelement <8 x i16> %in, i32 2 + %e.3 = extractelement <8 x i16> %in, i32 3 + %e.4 = extractelement <8 x i16> %in, i32 4 + %e.5 = extractelement <8 x i16> %in, i32 5 + %e.6 = extractelement <8 x i16> %in, i32 6 + %e.7 = extractelement <8 x i16> %in, i32 7 + ; CHECK-NOT: alloca + ; CHECK: insertelement + %i.0 = insertelement <8 x i16> %in, i16 42, i32 0 + %i.1 = insertelement <8 x i16> %in, i16 42, i32 1 + %i.2 = insertelement <8 x i16> %in, i16 42, i32 2 + %i.3 = insertelement <8 x i16> %in, i16 42, i32 3 + %i.4 = insertelement <8 x i16> %in, i16 42, i32 4 + %i.5 = insertelement <8 x i16> %in, i16 42, i32 5 + %i.6 = insertelement <8 x i16> %in, i16 42, i32 6 + %i.7 = insertelement <8 x i16> %in, i16 42, i32 7 + ; CHECK-NOT: alloca + ret void +} + +define void @test_4xi32_unchanged(<4 x i32> %in) { + ; CHECK-LABEL: test_4xi32_unchanged + ; CHECK-NOT: alloca + ; CHECK: extractelement + %e.0 = extractelement <4 x i32> %in, i32 0 + %e.1 = extractelement <4 x i32> %in, i32 1 + %e.2 = extractelement <4 x i32> %in, i32 2 + %e.3 = extractelement <4 x i32> %in, i32 3 + ; CHECK-NOT: alloca + ; CHECK: insertelement + %i.0 = insertelement <4 x i32> %in, i32 42, i32 0 + %i.1 = insertelement <4 x i32> %in, i32 42, i32 1 + %i.2 = insertelement <4 x i32> %in, i32 42, i32 2 + %i.3 = insertelement <4 x i32> %in, i32 42, i32 3 + ; CHECK-NOT: alloca + ret void +} + +define void @test_4xfloat_unchanged(<4 x float> %in) { + ; CHECK-LABEL: test_4xfloat_unchanged + ; CHECK-NOT: alloca + ; CHECK: extractelement + %e.0 = extractelement <4 x float> %in, i32 0 + %e.1 = extractelement <4 x float> %in, i32 1 + %e.2 = extractelement <4 x float> %in, i32 2 + %e.3 = 
extractelement <4 x float> %in, i32 3 + ; CHECK-NOT: alloca + ; CHECK: insertelement + %i.0 = insertelement <4 x float> %in, float 42.0, i32 0 + %i.1 = insertelement <4 x float> %in, float 42.0, i32 1 + %i.2 = insertelement <4 x float> %in, float 42.0, i32 2 + %i.3 = insertelement <4 x float> %in, float 42.0, i32 3 + ; CHECK-NOT: alloca + ret void +} + + +; The following insert/extract elements are all indexed with an +; out-of-range constant, they should get modified so that the constant +; is now in-range. + +define <16 x i1> @test_16xi1_out_of_range(<16 x i1> %in) { + ; CHECK-LABEL: test_16xi1_out_of_range + ; CHECK-NEXT: extractelement <16 x i1> %in, i32 0 + %e.16 = extractelement <16 x i1> %in, i32 16 + ; CHECK-NEXT: %i.16 = insertelement <16 x i1> %in, i1 %e.16, i32 0 + %i.16 = insertelement <16 x i1> %in, i1 %e.16, i32 16 + ; CHECK-NEXT: ret <16 x i1> %i.16 + ret <16 x i1> %i.16 +} + +define <8 x i1> @test_8xi1_out_of_range(<8 x i1> %in) { + ; CHECK-LABEL: test_8xi1_out_of_range + ; CHECK-NEXT: %e.8 = extractelement <8 x i1> %in, i32 0 + %e.8 = extractelement <8 x i1> %in, i32 8 + ; CHECK-NEXT: %i.8 = insertelement <8 x i1> %in, i1 %e.8, i32 0 + %i.8 = insertelement <8 x i1> %in, i1 %e.8, i32 8 + ; CHECK-NEXT: ret <8 x i1> %i.8 + ret <8 x i1> %i.8 +} + +define <4 x i1> @test_4xi1_out_of_range(<4 x i1> %in) { + ; CHECK-LABEL: test_4xi1_out_of_range + ; CHECK-NEXT: %e.4 = extractelement <4 x i1> %in, i32 0 + %e.4 = extractelement <4 x i1> %in, i32 4 + ; CHECK-NEXT: %i.4 = insertelement <4 x i1> %in, i1 %e.4, i32 0 + %i.4 = insertelement <4 x i1> %in, i1 %e.4, i32 4 + ; CHECK-NEXT: ret <4 x i1> %i.4 + ret <4 x i1> %i.4 +} + +define <16 x i8> @test_16xi8_out_of_range(<16 x i8> %in) { + ; CHECK-LABEL: test_16xi8_out_of_range + ; CHECK-NEXT: %e.16 = extractelement <16 x i8> %in, i32 0 + %e.16 = extractelement <16 x i8> %in, i32 16 + ; CHECK-NEXT: %i.16 = insertelement <16 x i8> %in, i8 %e.16, i32 0 + %i.16 = insertelement <16 x i8> %in, i8 %e.16, i32 16 + ; CHECK-NEXT: ret <16 x i8> %i.16 + ret <16 x i8> %i.16 +} + +define <8 x i16> @test_8xi16_out_of_range(<8 x i16> %in) { + ; CHECK-LABEL: test_8xi16_out_of_range + ; CHECK-NEXT: %e.8 = extractelement <8 x i16> %in, i32 0 + %e.8 = extractelement <8 x i16> %in, i32 8 + ; CHECK-NEXT: %i.8 = insertelement <8 x i16> %in, i16 %e.8, i32 0 + %i.8 = insertelement <8 x i16> %in, i16 %e.8, i32 8 + ; CHECK-NEXT: ret <8 x i16> %i.8 + ret <8 x i16> %i.8 +} + +define <4 x i32> @test_4xi32_out_of_range(<4 x i32> %in) { + ; CHECK-LABEL: test_4xi32_out_of_range + ; CHECK-NEXT: %e.4 = extractelement <4 x i32> %in, i32 0 + %e.4 = extractelement <4 x i32> %in, i32 4 + ; CHECK-NEXT: %i.4 = insertelement <4 x i32> %in, i32 %e.4, i32 0 + %i.4 = insertelement <4 x i32> %in, i32 %e.4, i32 4 + ; CHECK-NEXT: ret <4 x i32> %i.4 + ret <4 x i32> %i.4 +} + +define <4 x float> @test_4xfloat_out_of_range(<4 x float> %in) { + ; CHECK-LABEL: test_4xfloat_out_of_range + ; CHECK-NEXT: %e.4 = extractelement <4 x float> %in, i32 0 + %e.4 = extractelement <4 x float> %in, i32 4 + ; CHECK-NEXT: %i.4 = insertelement <4 x float> %in, float %e.4, i32 0 + %i.4 = insertelement <4 x float> %in, float %e.4, i32 4 + ; CHECK-NEXT: ret <4 x float> %i.4 + ret <4 x float> %i.4 +} + +define <4 x i32> @test_4xi32_out_of_range_urem(<4 x i32> %in) { + ; CHECK-LABEL: test_4xi32_out_of_range_urem + %e.4 = extractelement <4 x i32> %in, i32 4 ; CHECK-NEXT: {{.*}} extractelement {{.*}} i32 0 + %e.5 = extractelement <4 x i32> %in, i32 5 ; CHECK-NEXT: {{.*}} extractelement {{.*}} i32 1 + %e.6 = 
extractelement <4 x i32> %in, i32 6 ; CHECK-NEXT: {{.*}} extractelement {{.*}} i32 2 + %e.7 = extractelement <4 x i32> %in, i32 7 ; CHECK-NEXT: {{.*}} extractelement {{.*}} i32 3 + %e.8 = extractelement <4 x i32> %in, i32 8 ; CHECK-NEXT: {{.*}} extractelement {{.*}} i32 0 + %i.4 = insertelement <4 x i32> %in, i32 %e.4, i32 4 ; CHECK-NEXT: {{.*}} insertelement {{.*}} i32 0 + %i.5 = insertelement <4 x i32> %in, i32 %e.5, i32 5 ; CHECK-NEXT: {{.*}} insertelement {{.*}} i32 1 + %i.6 = insertelement <4 x i32> %in, i32 %e.6, i32 6 ; CHECK-NEXT: {{.*}} insertelement {{.*}} i32 2 + %i.7 = insertelement <4 x i32> %in, i32 %e.7, i32 7 ; CHECK-NEXT: {{.*}} insertelement {{.*}} i32 3 + %i.8 = insertelement <4 x i32> %in, i32 %e.8, i32 8 ; CHECK-NEXT: {{.*}} insertelement {{.*}} i32 0 + ; CHECK-NEXT: ret <4 x i32> %i.4 + ret <4 x i32> %i.4 +} + +; The following insert/extract elements are all indexed with a variable, +; they should get modified. + +define <16 x i1> @test_16xi1_variable(<16 x i1> %in, i32 %idx) { + ; CHECK-LABEL: test_16xi1_variable + ; CHECK-NEXT: %[[EALLOCA:[0-9]+]] = alloca i1, i32 16, align 16 + ; CHECK-NEXT: %[[ECAST:[0-9]+]] = bitcast i1* %[[EALLOCA]] to <16 x i1>* + ; CHECK-NEXT: store <16 x i1> %in, <16 x i1>* %[[ECAST]], align 16 + ; CHECK-NEXT: %[[EGEP:[0-9]+]] = getelementptr i1, i1* %[[EALLOCA]], i32 %idx + ; CHECK-NEXT: %[[ELOAD:[0-9]+]] = load i1, i1* %[[EGEP]], align 1 + %e.16 = extractelement <16 x i1> %in, i32 %idx + ; CHECK-NEXT: %[[IALLOCA:[0-9]+]] = alloca i1, i32 16, align 16 + ; CHECK-NEXT: %[[ICAST:[0-9]+]] = bitcast i1* %[[IALLOCA]] to <16 x i1>* + ; CHECK-NEXT: store <16 x i1> %in, <16 x i1>* %[[ICAST]], align 16 + ; CHECK-NEXT: %[[IGEP:[0-9]+]] = getelementptr i1, i1* %[[IALLOCA]], i32 %idx + ; CHECK-NEXT: store i1 %[[ELOAD]], i1* %[[IGEP]], align 1 + ; CHECK-NEXT: %[[ILOAD:[0-9]+]] = load <16 x i1>, <16 x i1>* %[[ICAST]], align 16 + %i.16 = insertelement <16 x i1> %in, i1 %e.16, i32 %idx + ; CHECK-NEXT: ret <16 x i1> %[[ILOAD]] + ret <16 x i1> %i.16 +} + +define <8 x i1> @test_8xi1_variable(<8 x i1> %in, i32 %idx) { + ; CHECK-LABEL: test_8xi1_variable + ; CHECK-NEXT: %[[EALLOCA:[0-9]+]] = alloca i1, i32 8, align 8 + ; CHECK-NEXT: %[[ECAST:[0-9]+]] = bitcast i1* %[[EALLOCA]] to <8 x i1>* + ; CHECK-NEXT: store <8 x i1> %in, <8 x i1>* %[[ECAST]], align 8 + ; CHECK-NEXT: %[[EGEP:[0-9]+]] = getelementptr i1, i1* %[[EALLOCA]], i32 %idx + ; CHECK-NEXT: %[[ELOAD:[0-9]+]] = load i1, i1* %[[EGEP]], align 1 + %e.8 = extractelement <8 x i1> %in, i32 %idx + ; CHECK-NEXT: %[[IALLOCA:[0-9]+]] = alloca i1, i32 8, align 8 + ; CHECK-NEXT: %[[ICAST:[0-9]+]] = bitcast i1* %[[IALLOCA]] to <8 x i1>* + ; CHECK-NEXT: store <8 x i1> %in, <8 x i1>* %[[ICAST]], align 8 + ; CHECK-NEXT: %[[IGEP:[0-9]+]] = getelementptr i1, i1* %[[IALLOCA]], i32 %idx + ; CHECK-NEXT: store i1 %[[ELOAD]], i1* %[[IGEP]], align 1 + ; CHECK-NEXT: %[[ILOAD:[0-9]+]] = load <8 x i1>, <8 x i1>* %[[ICAST]], align 8 + %i.8 = insertelement <8 x i1> %in, i1 %e.8, i32 %idx + ; CHECK-NEXT: ret <8 x i1> %[[ILOAD]] + ret <8 x i1> %i.8 +} + +define <4 x i1> @test_4xi1_variable(<4 x i1> %in, i32 %idx) { + ; CHECK-LABEL: test_4xi1_variable + ; CHECK-NEXT: %[[EALLOCA:[0-9]+]] = alloca i1, i32 4, align 4 + ; CHECK-NEXT: %[[ECAST:[0-9]+]] = bitcast i1* %[[EALLOCA]] to <4 x i1>* + ; CHECK-NEXT: store <4 x i1> %in, <4 x i1>* %[[ECAST]], align 4 + ; CHECK-NEXT: %[[EGEP:[0-9]+]] = getelementptr i1, i1* %[[EALLOCA]], i32 %idx + ; CHECK-NEXT: %[[ELOAD:[0-9]+]] = load i1, i1* %[[EGEP]], align 1 + %e.4 = extractelement <4 x i1> %in, i32 
%idx + ; CHECK-NEXT: %[[IALLOCA:[0-9]+]] = alloca i1, i32 4, align 4 + ; CHECK-NEXT: %[[ICAST:[0-9]+]] = bitcast i1* %[[IALLOCA]] to <4 x i1>* + ; CHECK-NEXT: store <4 x i1> %in, <4 x i1>* %[[ICAST]], align 4 + ; CHECK-NEXT: %[[IGEP:[0-9]+]] = getelementptr i1, i1* %[[IALLOCA]], i32 %idx + ; CHECK-NEXT: store i1 %[[ELOAD]], i1* %[[IGEP]], align 1 + ; CHECK-NEXT: %[[ILOAD:[0-9]+]] = load <4 x i1>, <4 x i1>* %[[ICAST]], align 4 + %i.4 = insertelement <4 x i1> %in, i1 %e.4, i32 %idx + ; CHECK-NEXT: ret <4 x i1> %[[ILOAD]] + ret <4 x i1> %i.4 +} + +define <16 x i8> @test_16xi8_variable(<16 x i8> %in, i32 %idx) { + ; CHECK-LABEL: test_16xi8_variable + ; CHECK-NEXT: %[[EALLOCA:[0-9]+]] = alloca i8, i32 16, align 4 + ; CHECK-NEXT: %[[ECAST:[0-9]+]] = bitcast i8* %[[EALLOCA]] to <16 x i8>* + ; CHECK-NEXT: store <16 x i8> %in, <16 x i8>* %[[ECAST]], align 4 + ; CHECK-NEXT: %[[EGEP:[0-9]+]] = getelementptr i8, i8* %[[EALLOCA]], i32 %idx + ; CHECK-NEXT: %[[ELOAD:[0-9]+]] = load i8, i8* %[[EGEP]], align 1 + %e.16 = extractelement <16 x i8> %in, i32 %idx + ; CHECK-NEXT: %[[IALLOCA:[0-9]+]] = alloca i8, i32 16, align 4 + ; CHECK-NEXT: %[[ICAST:[0-9]+]] = bitcast i8* %[[IALLOCA]] to <16 x i8>* + ; CHECK-NEXT: store <16 x i8> %in, <16 x i8>* %[[ICAST]], align 4 + ; CHECK-NEXT: %[[IGEP:[0-9]+]] = getelementptr i8, i8* %[[IALLOCA]], i32 %idx + ; CHECK-NEXT: store i8 %[[ELOAD]], i8* %[[IGEP]], align 1 + ; CHECK-NEXT: %[[ILOAD:[0-9]+]] = load <16 x i8>, <16 x i8>* %[[ICAST]], align 4 + %i.16 = insertelement <16 x i8> %in, i8 %e.16, i32 %idx + ; CHECK-NEXT: ret <16 x i8> %[[ILOAD]] + ret <16 x i8> %i.16 +} + +define <8 x i16> @test_8xi16_variable(<8 x i16> %in, i32 %idx) { + ; CHECK-LABEL: test_8xi16_variable + ; CHECK-NEXT: %[[EALLOCA:[0-9]+]] = alloca i16, i32 8, align 4 + ; CHECK-NEXT: %[[ECAST:[0-9]+]] = bitcast i16* %[[EALLOCA]] to <8 x i16>* + ; CHECK-NEXT: store <8 x i16> %in, <8 x i16>* %[[ECAST]], align 4 + ; CHECK-NEXT: %[[EGEP:[0-9]+]] = getelementptr i16, i16* %[[EALLOCA]], i32 %idx + ; CHECK-NEXT: %[[ELOAD:[0-9]+]] = load i16, i16* %[[EGEP]], align 2 + %e.8 = extractelement <8 x i16> %in, i32 %idx + ; CHECK-NEXT: %[[IALLOCA:[0-9]+]] = alloca i16, i32 8, align 4 + ; CHECK-NEXT: %[[ICAST:[0-9]+]] = bitcast i16* %[[IALLOCA]] to <8 x i16>* + ; CHECK-NEXT: store <8 x i16> %in, <8 x i16>* %[[ICAST]], align 4 + ; CHECK-NEXT: %[[IGEP:[0-9]+]] = getelementptr i16, i16* %[[IALLOCA]], i32 %idx + ; CHECK-NEXT: store i16 %[[ELOAD]], i16* %[[IGEP]], align 2 + ; CHECK-NEXT: %[[ILOAD:[0-9]+]] = load <8 x i16>, <8 x i16>* %[[ICAST]], align 4 + %i.8 = insertelement <8 x i16> %in, i16 %e.8, i32 %idx + ; CHECK-NEXT: ret <8 x i16> %[[ILOAD]] + ret <8 x i16> %i.8 +} + +define <4 x i32> @test_4xi32_variable(<4 x i32> %in, i32 %idx) { + ; CHECK-LABEL: test_4xi32_variable + ; CHECK-NEXT: %[[EALLOCA:[0-9]+]] = alloca i32, i32 4, align 4 + ; CHECK-NEXT: %[[ECAST:[0-9]+]] = bitcast i32* %[[EALLOCA]] to <4 x i32>* + ; CHECK-NEXT: store <4 x i32> %in, <4 x i32>* %[[ECAST]], align 4 + ; CHECK-NEXT: %[[EGEP:[0-9]+]] = getelementptr i32, i32* %[[EALLOCA]], i32 %idx + ; CHECK-NEXT: %[[ELOAD:[0-9]+]] = load i32, i32* %[[EGEP]], align 4 + %e.4 = extractelement <4 x i32> %in, i32 %idx + ; CHECK-NEXT: %[[IALLOCA:[0-9]+]] = alloca i32, i32 4, align 4 + ; CHECK-NEXT: %[[ICAST:[0-9]+]] = bitcast i32* %[[IALLOCA]] to <4 x i32>* + ; CHECK-NEXT: store <4 x i32> %in, <4 x i32>* %[[ICAST]], align 4 + ; CHECK-NEXT: %[[IGEP:[0-9]+]] = getelementptr i32, i32* %[[IALLOCA]], i32 %idx + ; CHECK-NEXT: store i32 %[[ELOAD]], i32* %[[IGEP]], align 4 + 
; CHECK-NEXT: %[[ILOAD:[0-9]+]] = load <4 x i32>, <4 x i32>* %[[ICAST]], align 4 + %i.4 = insertelement <4 x i32> %in, i32 %e.4, i32 %idx + ; CHECK-NEXT: ret <4 x i32> %[[ILOAD]] + ret <4 x i32> %i.4 +} + +define <4 x float> @test_4xfloat_variable(<4 x float> %in, i32 %idx) { + ; CHECK-LABEL: test_4xfloat_variable + ; CHECK-NEXT: %[[EALLOCA:[0-9]+]] = alloca float, i32 4, align 4 + ; CHECK-NEXT: %[[ECAST:[0-9]+]] = bitcast float* %[[EALLOCA]] to <4 x float>* + ; CHECK-NEXT: store <4 x float> %in, <4 x float>* %[[ECAST]], align 4 + ; CHECK-NEXT: %[[EGEP:[0-9]+]] = getelementptr float, float* %[[EALLOCA]], i32 %idx + ; CHECK-NEXT: %[[ELOAD:[0-9]+]] = load float, float* %[[EGEP]], align 4 + %e.4 = extractelement <4 x float> %in, i32 %idx + ; CHECK-NEXT: %[[IALLOCA:[0-9]+]] = alloca float, i32 4, align 4 + ; CHECK-NEXT: %[[ICAST:[0-9]+]] = bitcast float* %[[IALLOCA]] to <4 x float>* + ; CHECK-NEXT: store <4 x float> %in, <4 x float>* %[[ICAST]], align 4 + ; CHECK-NEXT: %[[IGEP:[0-9]+]] = getelementptr float, float* %[[IALLOCA]], i32 %idx + ; CHECK-NEXT: store float %[[ELOAD]], float* %[[IGEP]], align 4 + ; CHECK-NEXT: %[[ILOAD:[0-9]+]] = load <4 x float>, <4 x float>* %[[ICAST]], align 4 + %i.4 = insertelement <4 x float> %in, float %e.4, i32 %idx + ; CHECK-NEXT: ret <4 x float> %[[ILOAD]] + ret <4 x float> %i.4 +} diff --git a/test/Transforms/NaCl/constant-vector-rematerialization.ll b/test/Transforms/NaCl/constant-vector-rematerialization.ll new file mode 100644 index 000000000000..625c8c5817e5 --- /dev/null +++ b/test/Transforms/NaCl/constant-vector-rematerialization.ll @@ -0,0 +1,44 @@ +; RUN: opt < %s -backend-canonicalize -S | FileCheck %s + +; Test that constant vectors that were globalized get rematerialized properly. + +; The datalayout is needed to determine the alignment of the globals. 
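A minimal sketch of the globalized pattern being reconstituted (illustrative only, not part of the patch; the global and function names are made up): the constant vector lives in an internal global array and is reached through a bitcast and a load, and -backend-canonicalize is expected to fold that load back into an immediate vector constant, as in the zeroinitializer test below.

@example_vec_zero = internal constant [4 x i32] zeroinitializer

define <4 x i32> @example_remat() {
  ; After -backend-canonicalize this function should simply return
  ; <4 x i32> zeroinitializer.
  %bc = bitcast [4 x i32]* @example_vec_zero to <4 x i32>*
  %v = load <4 x i32>, <4 x i32>* %bc
  ret <4 x i32> %v
}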
+target datalayout = "e-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-p:32:32:32-v128:32:32" + +@veci32 = internal constant [4 x i32] [i32 1, i32 2, i32 3, i32 4] +@veci32zero = internal constant [4 x i32] zeroinitializer + +@veci8 = internal constant [16 x i8] [i8 255, i8 255, i8 255, i8 0, i8 255, i8 255, i8 0, i8 255, i8 255, i8 0, i8 255, i8 255, i8 0, i8 255, i8 255, i8 255] +@veci8zero = internal constant [16 x i8] zeroinitializer + +define <4 x i32> @test_vec_i32() { + %bc = bitcast [4 x i32]* @veci32 to <4 x i32>* + %v = load <4 x i32>, <4 x i32>* %bc + ret <4 x i32> %v +} +; CHECK-LABEL: @test_vec_i32( +; CHECK-NEXT: ret <4 x i32> + +define <4 x i32> @test_vec_i32_zero() { + %bc = bitcast [4 x i32]* @veci32zero to <4 x i32>* + %v = load <4 x i32>, <4 x i32>* %bc + ret <4 x i32> %v +} +; CHECK-LABEL: @test_vec_i32_zero( +; CHECK-NEXT: ret <4 x i32> zeroinitializer + +define <4 x i32> @test_vec_i8() { + %bc = bitcast [16 x i8]* @veci8 to <4 x i32>* + %v = load <4 x i32>, <4 x i32>* %bc + ret <4 x i32> %v +} +; CHECK-LABEL: @test_vec_i8( +; CHECK-NEXT: ret <4 x i32> + +define <4 x i32> @test_vec_i8_zero() { + %bc = bitcast [16 x i8]* @veci8zero to <4 x i32>* + %v = load <4 x i32>, <4 x i32>* %bc + ret <4 x i32> %v +} +; CHECK-LABEL: @test_vec_i8_zero( +; CHECK-NEXT: ret <4 x i32> zeroinitializer diff --git a/test/Transforms/NaCl/expand-arith-with-overflow.ll b/test/Transforms/NaCl/expand-arith-with-overflow.ll new file mode 100644 index 000000000000..228e6646da9a --- /dev/null +++ b/test/Transforms/NaCl/expand-arith-with-overflow.ll @@ -0,0 +1,299 @@ +; RUN: opt %s -expand-arith-with-overflow -expand-struct-regs -S | FileCheck %s +; RUN: opt %s -expand-arith-with-overflow -expand-struct-regs -S | \ +; RUN: FileCheck %s -check-prefix=CLEANUP + +declare {i8, i1} @llvm.sadd.with.overflow.i8(i8, i8) +declare {i8, i1} @llvm.ssub.with.overflow.i8(i8, i8) +declare {i16, i1} @llvm.uadd.with.overflow.i16(i16, i16) +declare {i16, i1} @llvm.usub.with.overflow.i16(i16, i16) +declare {i32, i1} @llvm.umul.with.overflow.i32(i32, i32) +declare {i64, i1} @llvm.umul.with.overflow.i64(i64, i64) +declare {i64, i1} @llvm.smul.with.overflow.i64(i64, i64) + +; CLEANUP-NOT: with.overflow +; CLEANUP-NOT: extractvalue +; CLEANUP-NOT: insertvalue + + +define void @umul32_by_zero(i32 %x, i32* %result_val, i1* %result_overflow) { + %pair = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %x, i32 0) + %val = extractvalue {i32, i1} %pair, 0 + %overflow = extractvalue {i32, i1} %pair, 1 + + store i32 %val, i32* %result_val + store i1 %overflow, i1* %result_overflow + ret void +} +; Make sure it doesn't segfault because of a division by zero. 
+; CHECK: define void @umul32_by_zero( +; CHECK-NEXT: %pair.arith = mul i32 %x, 0 +; CHECK-NEXT: store i32 %pair.arith, i32* %result_val +; CHECK-NEXT: store i1 false, i1* %result_overflow + + +define void @umul32_by_const(i32 %x, i32* %result_val, i1* %result_overflow) { + %pair = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %x, i32 256) + %val = extractvalue {i32, i1} %pair, 0 + %overflow = extractvalue {i32, i1} %pair, 1 + + store i32 %val, i32* %result_val + store i1 %overflow, i1* %result_overflow + ret void +} +; The bound is 16777215 == 0xffffff == ((1 << 32) - 1) / 256 +; CHECK: define void @umul32_by_const( +; CHECK-NEXT: %pair.arith = mul i32 %x, 256 +; CHECK-NEXT: %pair.overflow = icmp ugt i32 %x, 16777215 +; CHECK-NEXT: store i32 %pair.arith, i32* %result_val +; CHECK-NEXT: store i1 %pair.overflow, i1* %result_overflow + + +; Check that the pass can expand multiple uses of the same intrinsic. +define void @umul32_by_const2(i32 %x, i32* %result_val, i1* %result_overflow) { + %pair = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %x, i32 65536) + %val = extractvalue {i32, i1} %pair, 0 + ; Check that the pass can expand multiple uses of %pair. + %overflow1 = extractvalue {i32, i1} %pair, 1 + %overflow2 = extractvalue {i32, i1} %pair, 1 + + store i32 %val, i32* %result_val + store i1 %overflow1, i1* %result_overflow + store i1 %overflow2, i1* %result_overflow + ret void +} +; CHECK: define void @umul32_by_const2( +; CHECK-NEXT: %pair.arith = mul i32 %x, 65536 +; CHECK-NEXT: %pair.overflow = icmp ugt i32 %x, 65535 +; CHECK-NEXT: store i32 %pair.arith, i32* %result_val +; CHECK-NEXT: store i1 %pair.overflow, i1* %result_overflow +; CHECK-NEXT: store i1 %pair.overflow, i1* %result_overflow + + +define void @umul64_by_const(i64 %x, i64* %result_val, i1* %result_overflow) { + ; Multiply by 1 << 55. 
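+  ; (36028797018963968 == 1 << 55, so the overflow bound checked below is
+  ; ((1 << 64) - 1) / (1 << 55) == 511.)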
+ %pair = call {i64, i1} @llvm.umul.with.overflow.i64(i64 36028797018963968, i64 %x) + %val = extractvalue {i64, i1} %pair, 0 + %overflow = extractvalue {i64, i1} %pair, 1 + + store i64 %val, i64* %result_val + store i1 %overflow, i1* %result_overflow + ret void +} +; CHECK: define void @umul64_by_const( +; CHECK-NEXT: %pair.arith = mul i64 36028797018963968, %x +; CHECK-NEXT: %pair.overflow = icmp ugt i64 %x, 511 +; CHECK-NEXT: store i64 %pair.arith, i64* %result_val +; CHECK-NEXT: store i1 %pair.overflow, i1* %result_overflow + + +define void @umul64_by_var(i64 %x, i64 %y, i64* %result_val, i1* %result_overflow) { + %pair = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %x, i64 %y) + %val = extractvalue {i64, i1} %pair, 0 + %overflow = extractvalue {i64, i1} %pair, 1 + + store i64 %val, i64* %result_val + store i1 %overflow, i1* %result_overflow + ret void +} +; CHECK: define void @umul64_by_var( +; CHECK-NEXT: %pair.arith = mul i64 %x, %y +; CHECK-NEXT: %pair.iszero = icmp eq i64 %y, 0 +; CHECK-NEXT: %pair.denom = select i1 %pair.iszero, i64 1, i64 %y +; CHECK-NEXT: %pair.div = udiv i64 %pair.arith, %pair.denom +; CHECK-NEXT: %pair.same = icmp ne i64 %pair.div, %x +; CHECK-NEXT: %pair.overflow = select i1 %pair.iszero, i1 false, i1 %pair.same +; CHECK-NEXT: store i64 %pair.arith, i64* %result_val +; CHECK-NEXT: store i1 %pair.overflow, i1* %result_overflow + + +define void @smul64_by_var(i64 %x, i64 %y, i64* %result_val, i1* %result_overflow) { + %pair = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %x, i64 %y) + %val = extractvalue {i64, i1} %pair, 0 + %overflow = extractvalue {i64, i1} %pair, 1 + + store i64 %val, i64* %result_val + store i1 %overflow, i1* %result_overflow + ret void +} +; CHECK: define void @smul64_by_var( +; CHECK-NEXT: %pair.arith = mul i64 %x, %y +; CHECK-NEXT: %pair.iszero = icmp eq i64 %y, 0 +; CHECK-NEXT: %pair.denom = select i1 %pair.iszero, i64 1, i64 %y +; CHECK-NEXT: %pair.div = sdiv i64 %pair.arith, %pair.denom +; CHECK-NEXT: %pair.same = icmp ne i64 %pair.div, %x +; CHECK-NEXT: %pair.overflow = select i1 %pair.iszero, i1 false, i1 %pair.same +; CHECK-NEXT: store i64 %pair.arith, i64* %result_val +; CHECK-NEXT: store i1 %pair.overflow, i1* %result_overflow + + +define void @uadd16_with_const(i16 %x, i16* %result_val, i1* %result_overflow) { + %pair = call {i16, i1} @llvm.uadd.with.overflow.i16(i16 %x, i16 35) + %val = extractvalue {i16, i1} %pair, 0 + %overflow = extractvalue {i16, i1} %pair, 1 + + store i16 %val, i16* %result_val + store i1 %overflow, i1* %result_overflow + ret void +} +; CHECK: define void @uadd16_with_const( +; CHECK-NEXT: %pair.arith = add i16 %x, 35 +; CHECK-NEXT: %pair.overflow = icmp ugt i16 %x, -36 +; CHECK-NEXT: store i16 %pair.arith, i16* %result_val +; CHECK-NEXT: store i1 %pair.overflow, i1* %result_overflow + + +define void @uadd16_with_var(i16 %x, i16 %y, i16* %result_val, i1* %result_overflow) { + %pair = call {i16, i1} @llvm.uadd.with.overflow.i16(i16 %x, i16 %y) + %val = extractvalue {i16, i1} %pair, 0 + %overflow = extractvalue {i16, i1} %pair, 1 + + store i16 %val, i16* %result_val + store i1 %overflow, i1* %result_overflow + ret void +} +; CHECK: define void @uadd16_with_var( +; CHECK-NEXT: %pair.arith = add i16 %x, %y +; CHECK-NEXT: %pair.overflow = icmp ult i16 %pair.arith, %x +; CHECK-NEXT: store i16 %pair.arith, i16* %result_val +; CHECK-NEXT: store i1 %pair.overflow, i1* %result_overflow + +define void @usub16_with_const(i16 %x, i16* %result_val, i1* %result_overflow) { + %pair = call {i16, i1} 
@llvm.usub.with.overflow.i16(i16 %x, i16 35) + %val = extractvalue {i16, i1} %pair, 0 + %overflow = extractvalue {i16, i1} %pair, 1 + + store i16 %val, i16* %result_val + store i1 %overflow, i1* %result_overflow + ret void +} +; CHECK: define void @usub16_with_const( +; CHECK-NEXT: %pair.arith = sub i16 %x, 35 +; CHECK-NEXT: %pair.overflow = icmp ult i16 %x, 35 +; CHECK-NEXT: store i16 %pair.arith, i16* %result_val +; CHECK-NEXT: store i1 %pair.overflow, i1* %result_overflow + + +define void @usub16_with_var(i16 %x, i16 %y, i16* %result_val, i1* %result_overflow) { + %pair = call {i16, i1} @llvm.usub.with.overflow.i16(i16 %x, i16 %y) + %val = extractvalue {i16, i1} %pair, 0 + %overflow = extractvalue {i16, i1} %pair, 1 + + store i16 %val, i16* %result_val + store i1 %overflow, i1* %result_overflow + ret void +} +; CHECK: define void @usub16_with_var( +; CHECK-NEXT: %pair.arith = sub i16 %x, %y +; CHECK-NEXT: %pair.overflow = icmp ult i16 %x, %y +; CHECK-NEXT: store i16 %pair.arith, i16* %result_val +; CHECK-NEXT: store i1 %pair.overflow, i1* %result_overflow + +define void @sadd8_with_const(i8 %x, i8* %result_val, i1* %result_overflow) { + %pair = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %x, i8 35) + %val = extractvalue {i8, i1} %pair, 0 + %overflow = extractvalue {i8, i1} %pair, 1 + + store i8 %val, i8* %result_val + store i1 %overflow, i1* %result_overflow + ret void +} +; CHECK: define void @sadd8_with_const( +; CHECK-NEXT: %pair.arith = add i8 %x, 35 +; CHECK-NEXT: %pair.postemp = add i8 %x, -128 +; CHECK-NEXT: %pair.negtemp = add i8 %x, 127 +; CHECK-NEXT: %pair.poscheck = icmp slt i8 %pair.arith, %pair.postemp +; CHECK-NEXT: %pair.negcheck = icmp sgt i8 %pair.arith, %pair.negtemp +; CHECK-NEXT: %pair.ispos = icmp sge i8 %x, 0 +; CHECK-NEXT: %pair.select = select i1 %pair.ispos, i1 %pair.poscheck, i1 %pair.negcheck +; CHECK-NEXT: store i8 %pair.arith, i8* %result_val +; CHECK-NEXT: store i1 %pair.select, i1* %result_overflow + + +define void @sadd8_with_const_min(i8* %result_val, i1* %result_overflow) { + %pair = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 0, i8 -128) + %val = extractvalue {i8, i1} %pair, 0 + %overflow = extractvalue {i8, i1} %pair, 1 + + store i8 %val, i8* %result_val + store i1 %overflow, i1* %result_overflow + ret void +} +; CHECK: define void @sadd8_with_const_min( +; CHECK-NEXT: store i8 -128, i8* %result_val +; CHECK-NEXT: store i1 false, i1* %result_overflow + + +define void @sadd8_with_var(i8 %x, i8 %y, i8* %result_val, i1* %result_overflow) { + %pair = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %x, i8 %y) + %val = extractvalue {i8, i1} %pair, 0 + %overflow = extractvalue {i8, i1} %pair, 1 + + store i8 %val, i8* %result_val + store i1 %overflow, i1* %result_overflow + ret void +} +; CHECK: define void @sadd8_with_var( +; CHECK-NEXT: %pair.arith = add i8 %x, %y +; CHECK-NEXT: %pair.postemp = add i8 %x, -128 +; CHECK-NEXT: %pair.negtemp = add i8 %x, 127 +; CHECK-NEXT: %pair.poscheck = icmp slt i8 %pair.arith, %pair.postemp +; CHECK-NEXT: %pair.negcheck = icmp sgt i8 %pair.arith, %pair.negtemp +; CHECK-NEXT: %pair.ispos = icmp sge i8 %x, 0 +; CHECK-NEXT: %pair.select = select i1 %pair.ispos, i1 %pair.poscheck, i1 %pair.negcheck +; CHECK-NEXT: store i8 %pair.arith, i8* %result_val +; CHECK-NEXT: store i1 %pair.select, i1* %result_overflow + + +define void @ssub8_with_const(i8 %x, i8* %result_val, i1* %result_overflow) { + %pair = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %x, i8 35) + %val = extractvalue {i8, i1} %pair, 0 + %overflow = extractvalue {i8, 
i1} %pair, 1 + + store i8 %val, i8* %result_val + store i1 %overflow, i1* %result_overflow + ret void +} +; CHECK: define void @ssub8_with_const( +; CHECK-NEXT: %pair.arith = sub i8 %x, 35 +; CHECK-NEXT: %pair.postemp = add i8 %x, -127 +; CHECK-NEXT: %pair.negtemp = add i8 %x, -128 +; CHECK-NEXT: %pair.poscheck = icmp slt i8 %pair.arith, %pair.postemp +; CHECK-NEXT: %pair.negcheck = icmp sgt i8 %pair.arith, %pair.negtemp +; CHECK-NEXT: %pair.ispos = icmp sge i8 %x, 0 +; CHECK-NEXT: %pair.select = select i1 %pair.ispos, i1 %pair.poscheck, i1 %pair.negcheck +; CHECK-NEXT: store i8 %pair.arith, i8* %result_val +; CHECK-NEXT: store i1 %pair.select, i1* %result_overflow + + +define void @ssub8_with_const_min(i8* %result_val, i1* %result_overflow) { + %pair = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 0, i8 -128) + %val = extractvalue {i8, i1} %pair, 0 + %overflow = extractvalue {i8, i1} %pair, 1 + + store i8 %val, i8* %result_val + store i1 %overflow, i1* %result_overflow + ret void +} +; CHECK: define void @ssub8_with_const_min( +; CHECK: store i1 true, i1* %result_overflow + + +define void @ssub8_with_var(i8 %x, i8 %y, i8* %result_val, i1* %result_overflow) { + %pair = call {i8, i1} @llvm.ssub.with.overflow.i8(i8 %x, i8 %y) + %val = extractvalue {i8, i1} %pair, 0 + %overflow = extractvalue {i8, i1} %pair, 1 + + store i8 %val, i8* %result_val + store i1 %overflow, i1* %result_overflow + ret void +} +; CHECK: define void @ssub8_with_var( +; CHECK-NEXT: %pair.arith = sub i8 %x, %y +; CHECK-NEXT: %pair.postemp = add i8 %x, -127 +; CHECK-NEXT: %pair.negtemp = add i8 %x, -128 +; CHECK-NEXT: %pair.poscheck = icmp slt i8 %pair.arith, %pair.postemp +; CHECK-NEXT: %pair.negcheck = icmp sgt i8 %pair.arith, %pair.negtemp +; CHECK-NEXT: %pair.ispos = icmp sge i8 %x, 0 +; CHECK-NEXT: %pair.select = select i1 %pair.ispos, i1 %pair.poscheck, i1 %pair.negcheck +; CHECK-NEXT: store i8 %pair.arith, i8* %result_val +; CHECK-NEXT: store i1 %pair.select, i1* %result_overflow diff --git a/test/Transforms/NaCl/expand-byval.ll b/test/Transforms/NaCl/expand-byval.ll new file mode 100644 index 000000000000..a526173ef0c3 --- /dev/null +++ b/test/Transforms/NaCl/expand-byval.ll @@ -0,0 +1,122 @@ +; RUN: opt -expand-byval %s -S | FileCheck %s + +target datalayout = "p:32:32:32" + +%MyStruct = type { i32, i8, i32 } +%AlignedStruct = type { double, double } + + +; Removal of "byval" attribute for passing structs arguments by value + +declare void @ext_func(%MyStruct*) + +define void @byval_receiver(%MyStruct* byval align 32 %ptr) { + call void @ext_func(%MyStruct* %ptr) + ret void +} +; Strip the "byval" and "align" attributes. 
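+; (As the CHECKs below show, byval pointer arguments become plain noalias
+; pointers, and callers make the copy explicit with an alloca plus memcpy.)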
+; CHECK: define void @byval_receiver(%MyStruct* noalias %ptr) { +; CHECK-NEXT: call void @ext_func(%MyStruct* %ptr) + + +declare void @ext_byval_func(%MyStruct* byval) +; CHECK: declare void @ext_byval_func(%MyStruct* noalias) + +define void @byval_caller(%MyStruct* %ptr) { + call void @ext_byval_func(%MyStruct* byval %ptr) + ret void +} +; CHECK: define void @byval_caller(%MyStruct* %ptr) { +; CHECK-NEXT: %ptr.byval_copy = alloca %MyStruct, align 4 +; CHECK: call void @llvm.lifetime.start(i64 12, i8* %{{.*}}) +; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %{{.*}}, i8* %{{.*}}, i64 12, i32 4, i1 false) +; CHECK-NEXT: call void @ext_byval_func(%MyStruct* noalias %ptr.byval_copy) + + +define void @byval_tail_caller(%MyStruct* %ptr) { + tail call void @ext_byval_func(%MyStruct* byval %ptr) + ret void +} +; CHECK: define void @byval_tail_caller(%MyStruct* %ptr) { +; CHECK: {{^}} call void @ext_byval_func(%MyStruct* noalias %ptr.byval_copy) + + +define void @byval_invoke(%MyStruct* %ptr) { + invoke void @ext_byval_func(%MyStruct* byval align 32 %ptr) + to label %cont unwind label %lpad +cont: + ret void +lpad: + %lp = landingpad { i8*, i32 } personality i8* null cleanup + ret void +} +; CHECK: define void @byval_invoke(%MyStruct* %ptr) { +; CHECK: %ptr.byval_copy = alloca %MyStruct, align 32 +; CHECK: call void @llvm.lifetime.start(i64 12, i8* %{{.*}}) +; CHECK: invoke void @ext_byval_func(%MyStruct* noalias %ptr.byval_copy) +; CHECK: cont: +; CHECK: call void @llvm.lifetime.end(i64 12, i8* %{{.*}}) +; CHECK: lpad: +; CHECK: call void @llvm.lifetime.end(i64 12, i8* %{{.*}}) + + +; Check handling of alignment + +; Check that "align" is stripped for declarations too. +declare void @ext_byval_func_align(%MyStruct* byval align 32) +; CHECK: declare void @ext_byval_func_align(%MyStruct* noalias) + +define void @byval_caller_align_via_attr(%MyStruct* %ptr) { + call void @ext_byval_func(%MyStruct* byval align 32 %ptr) + ret void +} +; CHECK: define void @byval_caller_align_via_attr(%MyStruct* %ptr) { +; CHECK-NEXT: %ptr.byval_copy = alloca %MyStruct, align 32 +; The memcpy may assume that %ptr is 32-byte-aligned. +; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %3, i64 12, i32 32, i1 false) + +declare void @ext_byval_func_align_via_type(%AlignedStruct* byval) + +; %AlignedStruct contains a double so requires an alignment of 8 bytes. 
+; Looking at the alignment of %AlignedStruct is a workaround for a bug +; in pnacl-clang: +; https://code.google.com/p/nativeclient/issues/detail?id=3403 +define void @byval_caller_align_via_type(%AlignedStruct* %ptr) { + call void @ext_byval_func_align_via_type(%AlignedStruct* byval %ptr) + ret void +} +; CHECK: define void @byval_caller_align_via_type(%AlignedStruct* %ptr) { +; CHECK-NEXT: %ptr.byval_copy = alloca %AlignedStruct, align 8 +; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %{{.*}}, i8* %{{.*}}, i64 16, i32 8, i1 false) + + +; Removal of "sret" attribute for returning structs by value + +declare void @ext_sret_func(%MyStruct* sret align 32) +; CHECK: declare void @ext_sret_func(%MyStruct*) + +define void @sret_func(%MyStruct* sret align 32 %buf) { + ret void +} +; CHECK: define void @sret_func(%MyStruct* %buf) { + +define void @sret_caller(%MyStruct* %buf) { + call void @ext_sret_func(%MyStruct* sret align 32 %buf) + ret void +} +; CHECK: define void @sret_caller(%MyStruct* %buf) { +; CHECK-NEXT: call void @ext_sret_func(%MyStruct* %buf) + + +; Check that other attributes are preserved + +define void @inreg_attr(%MyStruct* inreg %ptr) { + ret void +} +; CHECK: define void @inreg_attr(%MyStruct* inreg %ptr) { + +declare void @func_attrs() #0 +; CHECK: declare void @func_attrs() #0 + +attributes #0 = { noreturn nounwind } +; CHECK: attributes #0 = { noreturn nounwind } diff --git a/test/Transforms/NaCl/expand-constantexpr.ll b/test/Transforms/NaCl/expand-constantexpr.ll new file mode 100644 index 000000000000..e8786d4cac7a --- /dev/null +++ b/test/Transforms/NaCl/expand-constantexpr.ll @@ -0,0 +1,109 @@ +; RUN: opt < %s -expand-constant-expr -S | FileCheck %s + +@global_var1 = global i32 123 +@global_var2 = global i32 123 + + +define i8* @constantexpr_bitcast() { + ret i8* bitcast (i32* @global_var1 to i8*) +} +; CHECK: @constantexpr_bitcast +; CHECK: %expanded = bitcast i32* @global_var1 to i8* +; CHECK: ret i8* %expanded + + +define i32 @constantexpr_nested() { + ret i32 add (i32 ptrtoint (i32* @global_var1 to i32), + i32 ptrtoint (i32* @global_var2 to i32)) +} +; CHECK: @constantexpr_nested +; CHECK: %expanded1 = ptrtoint i32* @global_var1 to i32 +; CHECK: %expanded2 = ptrtoint i32* @global_var2 to i32 +; CHECK: %expanded = add i32 %expanded1, %expanded2 +; CHECK: ret i32 %expanded + + +define i32 @constantexpr_nested2() { + ret i32 mul (i32 add (i32 ptrtoint (i32* @global_var1 to i32), + i32 ptrtoint (i32* @global_var2 to i32)), i32 2) +} +; CHECK: @constantexpr_nested2 +; CHECK: %expanded2 = ptrtoint i32* @global_var1 to i32 +; CHECK: %expanded3 = ptrtoint i32* @global_var2 to i32 +; CHECK: %expanded1 = add i32 %expanded2, %expanded3 +; CHECK: %expanded = mul i32 %expanded1, 2 +; CHECK: ret i32 %expanded + + +define i32 @constantexpr_phi() { +entry: + br label %label +label: + %result = phi i32 [ ptrtoint (i32* @global_var1 to i32), %entry ] + ret i32 %result +} +; CHECK: @constantexpr_phi +; CHECK: entry: +; CHECK: %expanded = ptrtoint i32* @global_var1 to i32 +; CHECK: br label %label +; CHECK: label: +; CHECK: %result = phi i32 [ %expanded, %entry ] + + +; This tests that ExpandConstantExpr correctly handles a PHI node that +; contains the same ConstantExpr twice. +; Using replaceAllUsesWith() is not correct on a PHI node when the +; new instruction has to be added to an incoming block. 
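+; Instead, the expanded instruction is emitted separately in each incoming
+; block, which is why the CHECKs below expect %expanded in %iftrue and
+; %expanded1 in %iffalse.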
+define i32 @constantexpr_phi_twice(i1 %arg) { + br i1 %arg, label %iftrue, label %iffalse +iftrue: + br label %exit +iffalse: + br label %exit +exit: + %result = phi i32 [ ptrtoint (i32* @global_var1 to i32), %iftrue ], + [ ptrtoint (i32* @global_var1 to i32), %iffalse ] + ret i32 %result +} +; CHECK: @constantexpr_phi_twice +; CHECK: iftrue: +; CHECK: %expanded = ptrtoint i32* @global_var1 to i32 +; CHECK: iffalse: +; CHECK: %expanded1 = ptrtoint i32* @global_var1 to i32 +; CHECK: exit: + + +define i32 @constantexpr_phi_multiple_entry(i1 %arg) { +entry: + br i1 %arg, label %done, label %done +done: + %result = phi i32 [ ptrtoint (i32* @global_var1 to i32), %entry ], + [ ptrtoint (i32* @global_var1 to i32), %entry ] + ret i32 %result +} +; CHECK: @constantexpr_phi_multiple_entry +; CHECK: entry: +; CHECK: %expanded = ptrtoint i32* @global_var1 to i32 +; CHECK: br i1 %arg, label %done, label %done +; CHECK: done: +; CHECK: %result = phi i32 [ %expanded, %entry ], [ %expanded, %entry ] + + + +declare void @external_func() +declare void @personality_func() + +define void @test_landingpad() { + invoke void @external_func() to label %ok unwind label %onerror +ok: + ret void +onerror: + %lp = landingpad i32 + personality i8* bitcast (void ()* @personality_func to i8*) + catch i32* null + ret void +} +; landingpad can only accept a ConstantExpr, so this should remain +; unmodified. +; CHECK: @test_landingpad +; CHECK: personality i8* bitcast (void ()* @personality_func to i8*) diff --git a/test/Transforms/NaCl/expand-ctors-empty.ll b/test/Transforms/NaCl/expand-ctors-empty.ll new file mode 100644 index 000000000000..f0788a0873e4 --- /dev/null +++ b/test/Transforms/NaCl/expand-ctors-empty.ll @@ -0,0 +1,12 @@ +; Currently we do not define __{init,fini}_array_end as named aliases. +; RUN: opt < %s -nacl-expand-ctors -S | FileCheck %s -check-prefix=NO_CTORS +; NO_CTORS-NOT: __init_array_end +; NO_CTORS-NOT: __fini_array_end + +; RUN: opt < %s -nacl-expand-ctors -S | FileCheck %s + +; If llvm.global_ctors is not present, it is treated as if it is an +; empty array, and __{init,fini}_array_start are defined anyway. + +; CHECK: @__init_array_start = internal constant [0 x void ()*] zeroinitializer +; CHECK: @__fini_array_start = internal constant [0 x void ()*] zeroinitializer diff --git a/test/Transforms/NaCl/expand-ctors-emptylist.ll b/test/Transforms/NaCl/expand-ctors-emptylist.ll new file mode 100644 index 000000000000..6ab68852b9d3 --- /dev/null +++ b/test/Transforms/NaCl/expand-ctors-emptylist.ll @@ -0,0 +1,13 @@ +; RUN: opt %s -nacl-expand-ctors -S | FileCheck %s -check-prefix=NO_CTORS +; NO_CTORS-NOT: __init_array_end +; NO_CTORS-NOT: __fini_array_end +; NO_CTORS-NOT: llvm.global_ctors + +; RUN: opt %s -nacl-expand-ctors -S | FileCheck %s + +; Check that the pass works when the initializer is "[]", which gets +; converted into "undef" by the reader. +@llvm.global_ctors = appending global [0 x { i32, void ()* }] [] + +; CHECK: @__init_array_start = internal constant [0 x void ()*] zeroinitializer +; CHECK: @__fini_array_start = internal constant [0 x void ()*] zeroinitializer diff --git a/test/Transforms/NaCl/expand-ctors-zeroinit.ll b/test/Transforms/NaCl/expand-ctors-zeroinit.ll new file mode 100644 index 000000000000..824b2b23b72d --- /dev/null +++ b/test/Transforms/NaCl/expand-ctors-zeroinit.ll @@ -0,0 +1,17 @@ +; Currently we do not define __{init,fini}_array_end as named aliases. 
+; RUN: opt < %s -nacl-expand-ctors -S | FileCheck %s -check-prefix=NO_CTORS +; NO_CTORS-NOT: __init_array_end +; NO_CTORS-NOT: __fini_array_end + +; We expect this symbol to be removed: +; RUN: opt < %s -nacl-expand-ctors -S | not grep llvm.global_ctors + +; RUN: opt < %s -nacl-expand-ctors -S | FileCheck %s + +; If llvm.global_ctors is zeroinitializer, it should be treated the +; same as an empty array. + +@llvm.global_ctors = appending global [0 x { i32, void ()* }] zeroinitializer + +; CHECK: @__init_array_start = internal constant [0 x void ()*] zeroinitializer +; CHECK: @__fini_array_start = internal constant [0 x void ()*] zeroinitializer diff --git a/test/Transforms/NaCl/expand-ctors.ll b/test/Transforms/NaCl/expand-ctors.ll new file mode 100644 index 000000000000..89aeda00a663 --- /dev/null +++ b/test/Transforms/NaCl/expand-ctors.ll @@ -0,0 +1,37 @@ +; We expect these symbol names to be removed: +; RUN: opt < %s -nacl-expand-ctors -S | FileCheck %s -check-prefix=NO_CTORS +; NO_CTORS-NOT: llvm.global.ctors +; NO_CTORS-NOT: __init_array_end +; NO_CTORS-NOT: __fini_array_end + +; RUN: opt < %s -nacl-expand-ctors -S | FileCheck %s + +@llvm.global_ctors = appending global [3 x { i32, void ()* }] + [{ i32, void ()* } { i32 300, void ()* @init_func_A }, + { i32, void ()* } { i32 100, void ()* @init_func_B }, + { i32, void ()* } { i32 200, void ()* @init_func_C }] + +@__init_array_start = extern_weak global [0 x void ()*] +@__init_array_end = extern_weak global [0 x void ()*] + +; CHECK: @__init_array_start = internal constant [3 x void ()*] [void ()* @init_func_B, void ()* @init_func_C, void ()* @init_func_A] +; CHECK: @__fini_array_start = internal constant [0 x void ()*] zeroinitializer + +define void @init_func_A() { ret void } +define void @init_func_B() { ret void } +define void @init_func_C() { ret void } + +define [0 x void ()*]* @get_array_start() { + ret [0 x void ()*]* @__init_array_start; +} +; CHECK: @get_array_start() +; CHECK: ret {{.*}} @__init_array_start + +define [0 x void ()*]* @get_array_end() { + ret [0 x void ()*]* @__init_array_end; +} + +; @get_array_end() is converted to use a GetElementPtr that returns +; the end of the generated array: +; CHECK: @get_array_end() +; CHECK: ret {{.*}} bitcast ([3 x void ()*]* getelementptr inbounds ([3 x void ()*], [3 x void ()*]* @__init_array_start, i32 1) diff --git a/test/Transforms/NaCl/expand-getelementptr.ll b/test/Transforms/NaCl/expand-getelementptr.ll new file mode 100644 index 000000000000..cb849f8e0144 --- /dev/null +++ b/test/Transforms/NaCl/expand-getelementptr.ll @@ -0,0 +1,123 @@ +; RUN: opt < %s -expand-getelementptr -S | FileCheck %s + +target datalayout = "p:32:32:32" + +%MyStruct = type { i8, i32, i8 } +%MyArray = type { [100 x i64] } +%MyArrayOneByte = type { [100 x i8] } + + +; Test indexing struct field +define i8* @test_struct_field(%MyStruct* %ptr) { + %addr = getelementptr %MyStruct, %MyStruct* %ptr, i32 0, i32 2 + ret i8* %addr +} +; CHECK: @test_struct_field +; CHECK-NEXT: %gep_int = ptrtoint %MyStruct* %ptr to i32 +; CHECK-NEXT: %gep = add i32 %gep_int, 8 +; CHECK-NEXT: %addr = inttoptr i32 %gep to i8* +; CHECK-NEXT: ret i8* %addr + + +; Test non-constant index into an array +define i64* @test_array_index(%MyArray* %ptr, i32 %index) { + %addr = getelementptr %MyArray, %MyArray* %ptr, i32 0, i32 0, i32 %index + ret i64* %addr +} +; CHECK: @test_array_index +; CHECK-NEXT: %gep_int = ptrtoint %MyArray* %ptr to i32 +; CHECK-NEXT: %gep_array = mul i32 %index, 8 +; CHECK-NEXT: %gep = add i32 %gep_int, 
%gep_array +; CHECK-NEXT: %addr = inttoptr i32 %gep to i64* +; CHECK-NEXT: ret i64* %addr + + +; Test constant index into an array (as a pointer) +define %MyStruct* @test_ptr_add(%MyStruct* %ptr) { + %addr = getelementptr %MyStruct, %MyStruct* %ptr, i32 2 + ret %MyStruct* %addr +} +; CHECK: @test_ptr_add +; CHECK-NEXT: %gep_int = ptrtoint %MyStruct* %ptr to i32 +; CHECK-NEXT: %gep = add i32 %gep_int, 24 +; CHECK-NEXT: %addr = inttoptr i32 %gep to %MyStruct* +; CHECK-NEXT: ret %MyStruct* %addr + + +; Test that additions and multiplications are combined properly +define i64* @test_add_and_index(%MyArray* %ptr, i32 %index) { + %addr = getelementptr %MyArray, %MyArray* %ptr, i32 1, i32 0, i32 %index + ret i64* %addr +} +; CHECK: @test_add_and_index +; CHECK-NEXT: %gep_int = ptrtoint %MyArray* %ptr to i32 +; CHECK-NEXT: %gep = add i32 %gep_int, 800 +; CHECK-NEXT: %gep_array = mul i32 %index, 8 +; CHECK-NEXT: %gep1 = add i32 %gep, %gep_array +; CHECK-NEXT: %addr = inttoptr i32 %gep1 to i64* +; CHECK-NEXT: ret i64* %addr + + +; Test that we don't multiply by 1 unnecessarily +define i8* @test_add_and_index_one_byte(%MyArrayOneByte* %ptr, i32 %index) { + %addr = getelementptr %MyArrayOneByte, %MyArrayOneByte* %ptr, i32 1, i32 0, i32 %index + ret i8* %addr +} +; CHECK: @test_add_and_index +; CHECK-NEXT: %gep_int = ptrtoint %MyArrayOneByte* %ptr to i32 +; CHECK-NEXT: %gep = add i32 %gep_int, 100 +; CHECK-NEXT: %gep1 = add i32 %gep, %index +; CHECK-NEXT: %addr = inttoptr i32 %gep1 to i8* +; CHECK-NEXT: ret i8* %addr + + +; Test >32-bit array index +define i64* @test_array_index64(%MyArray* %ptr, i64 %index) { + %addr = getelementptr %MyArray, %MyArray* %ptr, i32 0, i32 0, i64 %index + ret i64* %addr +} +; CHECK: @test_array_index64 +; CHECK-NEXT: %gep_int = ptrtoint %MyArray* %ptr to i32 +; CHECK-NEXT: %gep_trunc = trunc i64 %index to i32 +; CHECK-NEXT: %gep_array = mul i32 %gep_trunc, 8 +; CHECK-NEXT: %gep = add i32 %gep_int, %gep_array +; CHECK-NEXT: %addr = inttoptr i32 %gep to i64* +; CHECK-NEXT: ret i64* %addr + + +; Test <32-bit array index +define i64* @test_array_index16(%MyArray* %ptr, i16 %index) { + %addr = getelementptr %MyArray, %MyArray* %ptr, i32 0, i32 0, i16 %index + ret i64* %addr +} +; CHECK: @test_array_index16 +; CHECK-NEXT: %gep_int = ptrtoint %MyArray* %ptr to i32 +; CHECK-NEXT: %gep_sext = sext i16 %index to i32 +; CHECK-NEXT: %gep_array = mul i32 %gep_sext, 8 +; CHECK-NEXT: %gep = add i32 %gep_int, %gep_array +; CHECK-NEXT: %addr = inttoptr i32 %gep to i64* +; CHECK-NEXT: ret i64* %addr + + +; Test >32-bit constant array index +define i64* @test_array_index64_const(%MyArray* %ptr) { + %addr = getelementptr %MyArray, %MyArray* %ptr, i32 0, i32 0, i64 100 + ret i64* %addr +} +; CHECK: @test_array_index64_const +; CHECK-NEXT: %gep_int = ptrtoint %MyArray* %ptr to i32 +; CHECK-NEXT: %gep = add i32 %gep_int, 800 +; CHECK-NEXT: %addr = inttoptr i32 %gep to i64* +; CHECK-NEXT: ret i64* %addr + + +; Test <32-bit constant array index -- test sign extension +define i64* @test_array_index16_const(%MyArray* %ptr) { + %addr = getelementptr %MyArray, %MyArray* %ptr, i32 0, i32 0, i16 -100 + ret i64* %addr +} +; CHECK: @test_array_index16_const +; CHECK-NEXT: %gep_int = ptrtoint %MyArray* %ptr to i32 +; CHECK-NEXT: %gep = add i32 %gep_int, -800 +; CHECK-NEXT: %addr = inttoptr i32 %gep to i64* +; CHECK-NEXT: ret i64* %addr diff --git a/test/Transforms/NaCl/expand-indirectbr.ll b/test/Transforms/NaCl/expand-indirectbr.ll new file mode 100644 index 000000000000..5ca53371700a --- /dev/null +++ 
b/test/Transforms/NaCl/expand-indirectbr.ll @@ -0,0 +1,62 @@ +; RUN: opt %s -expand-indirectbr -S | FileCheck %s + + +@addresses = global [2 x i8*] + [i8* blockaddress(@indirectbr_example, %label1), + i8* blockaddress(@indirectbr_example, %label2)] +; CHECK: @addresses = global [2 x i8*] [i8* inttoptr (i32 1 to i8*), i8* inttoptr (i32 2 to i8*)] + + +define i32 @indirectbr_example(i8* %addr) { + indirectbr i8* %addr, [label %label1, label %label2] +label1: + ret i32 100 +label2: + ret i32 200 +} +; CHECK: define i32 @indirectbr_example +; CHECK-NEXT: %indirectbr_cast = ptrtoint i8* %addr to i32 +; CHECK-NEXT: switch i32 %indirectbr_cast, label %indirectbr_default [ +; CHECK-NEXT: i32 1, label %label1 +; CHECK-NEXT: i32 2, label %label2 +; CHECK-NEXT: ] +; CHECK: indirectbr_default: +; CHECK-NEXT: unreachable + + +define i32 @label_appears_twice(i8* %addr) { +entry: + indirectbr i8* %addr, [label %label, label %label] +label: + %val = phi i32 [ 123, %entry ], [ 123, %entry ] + ret i32 %val +} +; CHECK: define i32 @label_appears_twice +; CHECK: switch i32 %indirectbr_cast, label %indirectbr_default [ +; CHECK-NEXT: i32 1, label %label +; CHECK-NEXT: ] +; CHECK: %val = phi i32 [ 123, %entry ] + + +define i8* @unused_blockaddress() { + ret i8* blockaddress (@unused_blockaddress, %dead_label) +dead_label: + ret i8* null +} +; CHECK: define i8* @unused_blockaddress +; CHECK-NEXT: ret i8* inttoptr (i32 -1 to i8*) + + +; Check that the label is given a consistent switch value across all +; indirectbr expansions. +define i32 @multiple_indirectbr(i8* %addr) { + indirectbr i8* %addr, [label %label] + indirectbr i8* %addr, [label %label] +label: + ret i32 100 +} +; CHECK: define i32 @multiple_indirectbr +; CHECK: switch i32 %indirectbr_cast{{[0-9]*}}, label %indirectbr_default [ +; CHECK-NEXT: i32 1, label %label +; CHECK: switch i32 %indirectbr_cast{{[0-9]*}}, label %indirectbr_default [ +; CHECK-NEXT: i32 1, label %label diff --git a/test/Transforms/NaCl/expand-integers.ll b/test/Transforms/NaCl/expand-integers.ll new file mode 100644 index 000000000000..d08483a3b35f --- /dev/null +++ b/test/Transforms/NaCl/expand-integers.ll @@ -0,0 +1,618 @@ +; RUN: opt < %s -nacl-expand-ints -S | FileCheck %s +; Test large integer expansion for operations required for large packed +; bitfields. 
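+; Values of illegal integer type are split into a 64-bit .lo part and a .hi
+; part holding the remaining bits; types wider than 128 bits are split
+; recursively (e.g. i256 -> i64 + i192 below).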
+ +; CHECK-LABEL: @simpleload +define void @simpleload(i32* %a) { +; CHECK: %a96.loty = bitcast i96* %a96 to i64* +; CHECK-NEXT: %load.lo = load i64, i64* %a96.loty +; CHECK-NEXT: %a96.hi.gep = getelementptr i64, i64* %a96.loty, i32 1 +; CHECK-NEXT: %a96.hity = bitcast i64* %a96.hi.gep to i32* +; CHECK-NEXT: %load.hi = load i32, i32* %a96.hity + %a96 = bitcast i32* %a to i96* + %load = load i96, i96* %a96 + +; CHECK: %a128.loty = bitcast i128* %a128 to i64* +; CHECK-NEXT: %load128.lo = load i64, i64* %a128.loty +; CHECK-NEXT: %a128.hi.gep = getelementptr i64, i64* %a128.loty, i32 1 +; CHECK-NEXT: %load128.hi = load i64, i64* %a128.hi.gep + %a128 = bitcast i32* %a to i128* + %load128 = load i128, i128* %a128 + +; CHECK: %a256.loty = bitcast i256* %a256 to i64* +; CHECK-NEXT: %load256.lo = load i64, i64* %a256.loty +; CHECK-NEXT: %a256.hi.gep = getelementptr i64, i64* %a256.loty, i32 1 +; CHECK-NEXT: %a256.hity = bitcast i64* %a256.hi.gep to i192* +; intermediate expansion: %load256.hi = load i192, i192* %a256.hity +; CHECK-NEXT: %a256.hity.loty = bitcast i192* %a256.hity to i64* +; CHECK-NEXT: %load256.hi.lo = load i64, i64* %a256.hity.loty +; CHECK-NEXT: %a256.hity.hi.gep = getelementptr i64, i64* %a256.hity.loty, i32 1 +; CHECK-NEXT: %a256.hity.hity = bitcast i64* %a256.hity.hi.gep to i128* +; intermediate expansion: %load256.hi.hi = load i128, i128* %a256.hity.hity +; CHECK-NEXT: %a256.hity.hity.loty = bitcast i128* %a256.hity.hity to i64* +; CHECK-NEXT: %load256.hi.hi.lo = load i64, i64* %a256.hity.hity.loty +; CHECK-NEXT: %a256.hity.hity.hi.gep = getelementptr i64, i64* %a256.hity.hity.loty, i32 1 +; CHECK-NEXT: %load256.hi.hi.hi = load i64, i64* %a256.hity.hity.hi.gep + %a256 = bitcast i32* %a to i256* + %load256 = load i256, i256* %a256 + ret void +} + +; CHECK-LABEL: @loadalign +define void @loadalign(i32* %a) { + %a96 = bitcast i32* %a to i96* + +; CHECK: %load.lo = load{{.*}}, align 16 +; CHECK: %load.hi = load{{.*}}, align 8 + %load = load i96, i96* %a96, align 16 + +; CHECK: %loadnoalign.lo = load{{.*}}, align 8 +; CHECK: %loadnoalign.hi = load{{.*}}, align 8 + %loadnoalign = load i96, i96* %a96 + +; CHECK: %load4.lo = load{{.*}}, align 4 +; CHECK: %load4.hi = load{{.*}}, align 4 + %load4 = load i96, i96* %a96, align 4 + + %a256 = bitcast i32* %a to i256* +; CHECK: %load256.lo = load{{.*}}, align 16 +; CHECK: %load256.hi.lo = load{{.*}}, align 8 +; CHECK: %load256.hi.hi.lo = load{{.*}}, align 8 +; CHECK: %load256.hi.hi.hi = load{{.*}}, align 8 + %load256 = load i256, i256* %a256, align 16 + ret void +} + +; CHECK-LABEL: @simplestore +define void @simplestore(i32* %a, i32* %b) { + %a96 = bitcast i32* %a to i96* + %b96 = bitcast i32* %b to i96* + %load96 = load i96, i96* %a96 +; CHECK: %b96.loty = bitcast i96* %b96 to i64* +; CHECK-NEXT: store i64 %load96.lo, i64* %b96.loty +; CHECK-NEXT: %b96.hi.gep = getelementptr i64, i64* %b96.loty, i32 1 +; CHECK-NEXT: %b96.hity = bitcast i64* %b96.hi.gep to i32* +; CHECK-NEXT: store i32 %load96.hi, i32* %b96.hity + store i96 %load96, i96* %b96 + + %a128 = bitcast i32* %a to i128* + %b128 = bitcast i32* %b to i128* + %load128 = load i128, i128* %a128 +; CHECK: %b128.loty = bitcast i128* %b128 to i64* +; CHECK-NEXT: store i64 %load128.lo, i64* %b128.loty +; CHECK-NEXT: %b128.hi.gep = getelementptr i64, i64* %b128.loty, i32 1 +; CHECK-NEXT: store i64 %load128.hi, i64* %b128.hi.gep + store i128 %load128, i128* %b128 + + %a256 = bitcast i32* %a to i256* + %b256 = bitcast i32* %b to i256* + %load256 = load i256, i256* %a256 + +; CHECK: %b256.loty 
= bitcast i256* %b256 to i64* +; CHECK-NEXT: store i64 %load256.lo, i64* %b256.loty +; CHECK-NEXT: %b256.hi.gep = getelementptr i64, i64* %b256.loty, i32 1 +; CHECK-NEXT: %b256.hity = bitcast i64* %b256.hi.gep to i192* +; CHECK-NEXT: %b256.hity.loty = bitcast i192* %b256.hity to i64* +; CHECK-NEXT: store i64 %load256.hi.lo, i64* %b256.hity.loty +; CHECK-NEXT: %b256.hity.hi.gep = getelementptr i64, i64* %b256.hity.loty, i32 1 +; CHECK-NEXT: %b256.hity.hity = bitcast i64* %b256.hity.hi.gep to i128* +; CHECK-NEXT: %b256.hity.hity.loty = bitcast i128* %b256.hity.hity to i64* +; CHECK-NEXT: store i64 %load256.hi.hi.lo, i64* %b256.hity.hity.loty +; CHECK-NEXT: %b256.hity.hity.hi.gep = getelementptr i64, i64* %b256.hity.hity.loty, i32 1 +; CHECK-NEXT: store i64 %load256.hi.hi.hi, i64* %b256.hity.hity.hi.gep + store i256 %load256, i256* %b256 + ret void +} + +; CHECK-LABEL: @storealign +define void @storealign(i32* %a, i32* %b) { + %a96 = bitcast i32* %a to i96* + %b96 = bitcast i32* %b to i96* + %load96 = load i96, i96* %a96 + +; CHECK: store i64 %load96.lo{{.*}}, align 16 +; CHECK: store i32 %load96.hi{{.*}}, align 8 + store i96 %load96, i96* %b96, align 16 + +; CHECK: store i64 %load96.lo{{.*}}, align 8 +; CHECK: store i32 %load96.hi{{.*}}, align 8 + store i96 %load96, i96* %b96 + +; CHECK: store i64 %load96.lo{{.*}}, align 4 +; CHECK: store i32 %load96.hi{{.*}}, align 4 + store i96 %load96, i96* %b96, align 4 + + %a256 = bitcast i32* %a to i256* + %b256 = bitcast i32* %b to i256* + %load256 = load i256, i256* %a256 +; CHECK: store i64 %load256.lo{{.*}}, align 16 +; CHECK: store i64 %load256.hi.lo{{.*}}, align 8 +; CHECK: store i64 %load256.hi.hi.lo{{.*}}, align 8 +; CHECK: store i64 %load256.hi.hi.hi{{.*}}, align 8 + store i256 %load256, i256* %b256, align 16 + ret void +} + + +; Check that forward references are handled. +; CHECK-LABEL: @fwdref +define void @fwdref(i32* %a, i32* %b) { +entry: + br label %block1 +block2: + %b96 = bitcast i32* %b to i96* +; CHECK: store i64 %load96.lo +; CHECK: store i32 %load96.hi + store i96 %load96, i96* %b96 + ret void +block1: + %a96 = bitcast i32* %a to i96* +; CHECK: load i64, i64* %a96.loty +; CHECK: load i32, i32* %a96.hity + %load96 = load i96, i96* %a96 + br label %block2 +} + +; The subsequent tests use loads and stores to produce and consume the expanded +; values from the opcodes under test. 
+; CHECK-LABEL: @zext +define void @zext(i32 %a, i64 %b, i8* %p) { + %p96 = bitcast i8* %p to i96* + %a96 = zext i32 %a to i96 +; CHECK: %a96.lo = zext i32 %a to i64 + store i96 %a96, i96* %p96 +; CHECK: store i64 %a96.lo, i64* %p96.loty +; CHECK: store i32 0, i32* %p96.hity + + %b96 = zext i64 %b to i96 +; CHECK: store i64 %b, i64* %p96.loty +; CHECK: store i32 0, i32* %p96.hity + store i96 %b96, i96* %p96 + + %p128 = bitcast i8* %p to i128* + %c96 = load i96, i96* %p96 +; CHECK: %a128.hi = zext i32 %c96.hi to i64 + %a128 = zext i96 %c96 to i128 +; CHECK: store i64 %c96.lo, i64* %p128.loty +; CHECK: store i64 %a128.hi, i64* %p128.hi.gep + store i128 %a128, i128* %p128 + + %p256 = bitcast i8* %p to i256* + +; CHECK: %b256.lo = zext i32 %a to i64 + %b256 = zext i32 %a to i256 +; CHECK: store i64 %b256.lo, i64* %p256.loty +; CHECK: store i64 0, i64* %p256.hity.loty +; CHECK: store i64 0, i64* %p256.hity.hity.loty +; CHECK: store i64 0, i64* %p256.hity.hity.hi.gep + store i256 %b256, i256* %p256 + +; CHECK: %c256.hi.lo = zext i32 %c96.hi to i64 + %c256 = zext i96 %c96 to i256 +; CHECK: store i64 %c96.lo, i64* %p256.loty +; CHECK: store i64 %c256.hi.lo, i64* %p256.hity9.loty +; CHECK: store i64 0, i64* %p256.hity9.hity.loty +; CHECK: store i64 0, i64* %p256.hity9.hity.hi.gep + store i256 %c256, i256* %p256 + ret void +} + + +; CHECK-LABEL: @bitwise +define void @bitwise(i32* %a) { + %a96p = bitcast i32* %a to i96* + %a96 = load i96, i96* %a96p + %b96 = load i96, i96* %a96p + +; CHECK: %c96.lo = and i64 %a96.lo, %b96.lo +; CHECK: %c96.hi = and i32 %a96.hi, %b96.hi + %c96 = and i96 %a96, %b96 +; CHECK: %d96.lo = or i64 %a96.lo, %c96.lo +; CHECK: %d96.hi = or i32 %a96.hi, %c96.hi + %d96 = or i96 %a96, %c96 + +; CHECK: %x96.lo = xor i64 %a96.lo, %c96.lo +; CHECK: %x96.hi = xor i32 %a96.hi, %c96.hi + %x96 = xor i96 %a96, %c96 + ret void +} + +; CHECK-LABEL: @truncs +define void @truncs(i32* %p) { + %p96 = bitcast i32* %p to i96* + %a96 = load i96, i96* %p96 + +; CHECK: %t32 = trunc i64 %a96.lo to i32 + %t32 = trunc i96 %a96 to i32 + + %b96 = load i96, i96* %p96 +; Check that t64 refers directly to the low loaded value from %p96 +; CHECK: %t64 = load i64, i64* %p96.loty + %t64 = trunc i96 %b96 to i64 + + %c96 = load i96, i96* %p96 +; Use the and to get a use of %t90.lo and check that it refers directly to +; %c96.lo +; CHECK: %t90.hi = trunc i32 %c96.hi to i26 +; CHECK: %a90.lo = and i64 %c96.lo, %c96.lo + %t90 = trunc i96 %c96 to i90 + %t90_2 = trunc i96 %c96 to i90 + %a90 = and i90 %t90, %t90_2 + ret void +} + +; CHECK-LABEL: @shls +define void @shls(i32* %p) { + %p96 = bitcast i32* %p to i96* + %a96 = load i96, i96* %p96 + %p128 = bitcast i32* %p to i128* + %a128 = load i128, i128* %p128 + %p192 = bitcast i32* %p to i192* + %a192 = load i192, i192* %p192 + +; CHECK: %b96.lo = shl i64 %a96.lo, 5 +; CHECK-NEXT: %b96.lo.shr = lshr i64 %a96.lo, 59 +; CHECK-NEXT: %b96.lo.ext = trunc i64 %b96.lo.shr to i32 +; CHECK-NEXT: %b96.hi.shl = shl i32 %a96.hi, 5 +; CHECK-NEXT: %b96.or = or i32 %b96.lo.ext, %b96.hi.shl + %b96 = shl i96 %a96, 5 + +; CHECK: %d96.lo = shl i64 %a96.lo, 35 +; CHECK-NEXT: %d96.lo.shr = lshr i64 %a96.lo, 29 +; CHECK-NEXT: %d96.lo.ext = trunc i64 %d96.lo.shr to i32 +; CHECK: store i64 %d96.lo, i64* %p96.loty1 +; CHECK: store i32 %d96.lo.ext, i32* %p96.hity + %d96 = shl i96 %a96, 35 + store i96 %d96, i96* %p96 + +; CHECK: %b128.lo = shl i64 %a128.lo, 35 +; CHECK-NEXT: %b128.lo.shr = lshr i64 %a128.lo, 29 +; CHECK-NEXT: %b128.hi.shl = shl i64 %a128.hi, 35 +; CHECK-NEXT: %b128.or = or i64 
%b128.lo.shr, %b128.hi.shl + %b128 = shl i128 %a128, 35 + +; CHECK: %c96.lo.ext = trunc i64 %a96.lo to i32 +; CHECK-NEXT: %c96.lo.shl = shl i32 %c96.lo.ext, 8 +; CHECK: store i64 0, i64* %p96.loty + %c96 = shl i96 %a96, 72 + store i96 %c96, i96* %p96 + +; CHECK: %c128.lo.shl = shl i64 %a128.lo, 36 +; CHECK: store i64 0, i64* %p128.loty + %c128 = shl i128 %a128, 100 + store i128 %c128, i128* %p128 + +; %b192.lo = shl i64 %a192.lo, 35 +; %b192.lo.shr = lshr i64 %a192.lo, 29 +; %b192.hi.shl.lo = shl i64 %a192.hi.lo, 35 +; %b192.hi.shl.lo.shr = lshr i64 %a192.hi.lo, 29 +; %b192.hi.shl.hi.shl = shl i64 %a192.hi.hi, 35 +; %b192.hi.shl.or = or i64 %b192.hi.shl.lo.shr, %b192.hi.shl.hi.shl +; %b192.or.lo = or i64 %b192.lo.shr, %b192.hi.shl.lo +; %b192.or.hi = or i64 0, %b192.hi.shl.or + %b192 = shl i192 %a192, 35 + store i192 %b192, i192* %p192 + +; %c192.lo.shl.lo = shl i64 %a192.lo, 36 +; %c192.lo.shl.lo.shr = lshr i64 %a192.lo, 28 +; %c192.hi.shl.lo.shl = shl i64 %a192.hi.lo, 36 +; %c192.or.lo = or i64 %c192.lo.shl.lo, 0 +; %c192.or.hi = or i64 %c192.lo.shl.lo.shr, %c192.hi.shl.lo.shl + %c192 = shl i192 %a192, 100 + store i192 %c192, i192* %p192 + + ret void +} + +; CHECK-LABEL: @lshrs +define void @lshrs(i32* %p) { + %p96 = bitcast i32* %p to i96* + %a96 = load i96, i96* %p96 + %p128 = bitcast i32* %p to i128* + %a128 = load i128, i128* %p128 + %p192 = bitcast i32* %p to i192* + %a192 = load i192, i192* %p192 + +; CHECK: %b96.hi.shr = lshr i32 %a96.hi, 3 +; CHECK-NEXT: %b96.lo.ext = zext i32 %b96.hi.shr to i64 +; CHECK: store i32 0, i32* %p96.hity + %b96 = lshr i96 %a96, 67 + store i96 %b96, i96* %p96 + +; CHECK: %c96.hi.ext = zext i32 %a96.hi to i64 +; CHECK-NEXT: %c96.hi.shl = shl i64 %c96.hi.ext, 19 +; CHECK-NEXT: %c96.lo.shr = lshr i64 %a96.lo, 45 +; CHECK-NEXT: %c96.lo = or i64 %c96.hi.shl, %c96.lo.shr +; CHECK: store i32 0, i32* %p96.hity + %c96 = lshr i96 %a96, 45 + store i96 %c96, i96* %p96 + +; CHECK: %b128.hi.shr = lshr i64 %a128.hi, 3 +; CHECK: store i64 0, i64* %p128.hi.gep + %b128 = lshr i128 %a128, 67 + store i128 %b128, i128* %p128 + +; CHECK: %d96.hi.ext = zext i32 %a96.hi to i64 +; CHECK-NEXT: %d96.hi.shl = shl i64 %d96.hi.ext, 47 +; CHECK-NEXT: %d96.lo.shr = lshr i64 %a96.lo, 17 +; CHECK-NEXT: %d96.lo = or i64 %d96.hi.shl, %d96.lo.shr +; CHECK-NEXT: %d96.hi = lshr i32 %a96.hi, 17 + %d96 = lshr i96 %a96, 17 + store i96 %d96, i96* %p96 + +; CHECK: %c128.hi.shl = shl i64 %a128.hi, 21 +; CHECK-NEXT: %c128.lo.shr = lshr i64 %a128.lo, 43 +; CHECK-NEXT: %c128.lo = or i64 %c128.hi.shl, %c128.lo.shr +; CHECK-NEXT: %c128.hi = lshr i64 %a128.hi, 43 + %c128 = lshr i128 %a128, 43 + store i128 %c128, i128* %p128 + + %b192 = lshr i192 %a192, 100 + store i192 %b192, i192* %p192 + + ret void +} + +; Make sure that the following doesn't assert out: it generates intermediate +; `trunc` instructions which get progressively smaller and smaller as the +; instructions are cut down. The final bitcode doesn't contain a `trunc` +; instruction. 
+; +; CHECK-LABEL: @lshr_big +define void @lshr_big(i32* %a) { + %p536 = bitcast i32* %a to i536* + %loaded = load i536, i536* %p536, align 4 + %shifted = lshr i536 %loaded, 161 + store i536 %shifted, i536* %p536 + ret void +} + +; CHECK-LABEL: @ashrs +define void @ashrs(i32* %p) { + %p96 = bitcast i32* %p to i96* + %a96 = load i96, i96* %p96 + %p128 = bitcast i32* %p to i128* + %a128 = load i128, i128* %p128 + +; CHECK: %b96.hi.shr = ashr i32 %a96.hi, 3 +; CHECK-NEXT: %b96.lo.ext = sext i32 %b96.hi.shr to i64 +; CHECK-NEXT: %b96.hi = ashr i32 %a96.hi, 31 + %b96 = ashr i96 %a96, 67 + store i96 %b96, i96* %p96 + +; CHECK: %c96.hi.ext = sext i32 %a96.hi to i64 +; CHECK-NEXT: %c96.hi.shl = shl i64 %c96.hi.ext, 19 +; CHECK-NEXT: %c96.lo.shr = lshr i64 %a96.lo, 45 +; CHECK-NEXT: %c96.lo = or i64 %c96.hi.shl, %c96.lo.shr +; CHECK-NEXT: %c96.hi = ashr i32 %a96.hi, 31 + %c96 = ashr i96 %a96, 45 + store i96 %c96, i96* %p96 + +; CHECK: %b128.hi.shr = ashr i64 %a128.hi, 3 +; CHECK-NEXT: %b128.hi = ashr i64 %a128.hi, 63 +; CHECK: store i64 %b128.hi, i64* %p128.hi.gep + %b128 = ashr i128 %a128, 67 + store i128 %b128, i128* %p128 + +; CHECK: %d96.hi.ext = sext i32 %a96.hi to i64 +; CHECK-NEXT: %d96.hi.shl = shl i64 %d96.hi.ext, 47 +; CHECK-NEXT: %d96.lo.shr = lshr i64 %a96.lo, 17 +; CHECK-NEXT: %d96.lo = or i64 %d96.hi.shl, %d96.lo.shr +; CHECK-NEXT: %d96.hi = ashr i32 %a96.hi, 17 + %d96 = ashr i96 %a96, 17 + store i96 %d96, i96* %p96 + +; CHECK: %c128.hi.shl = shl i64 %a128.hi, 21 +; CHECK-NEXT: %c128.lo.shr = lshr i64 %a128.lo, 43 +; CHECK-NEXT: %c128.lo = or i64 %c128.hi.shl, %c128.lo.shr +; CHECK-NEXT: %c128.hi = ashr i64 %a128.hi, 43 + %c128 = ashr i128 %a128, 43 + store i128 %c128, i128* %p128 + + ret void +} + +; CHECK-LABEL: @adds +define void @adds(i32 *%dest, i32* %lhs, i32* %rhs) { + %d = bitcast i32* %dest to i96* + %lp = bitcast i32* %lhs to i96* + %lv = load i96, i96* %lp + %rp = bitcast i32* %rhs to i96* + %rv = load i96, i96* %rp + +; CHECK: %result.lo = add i64 %lv.lo, %rv.lo +; CHECK-NEXT: %result.cmp = icmp ult i64 %lv.lo, %rv.lo +; CHECK-NEXT: %result.limit = select i1 %result.cmp, i64 %rv.lo, i64 %lv.lo +; CHECK-NEXT: %result.overflowed = icmp ult i64 %result.lo, %result.limit +; CHECK-NEXT: %result.carry = zext i1 %result.overflowed to i32 +; CHECK-NEXT: %result.hi = add i32 %lv.hi, %rv.hi +; CHECK-NEXT: %result.carried = add i32 %result.hi, %result.carry + %result = add i96 %lv, %rv + store i96 %result, i96* %d + ret void +} + +; CHECK-LABEL: @subs +define void @subs(i32 *%dest, i32* %lhs, i32* %rhs) { + %d = bitcast i32* %dest to i96* + %lp = bitcast i32* %lhs to i96* + %lv = load i96, i96* %lp + %rp = bitcast i32* %rhs to i96* + %rv = load i96, i96* %rp + +; CHECK: %result.borrow = icmp ult i64 %lv.lo, %rv.lo +; CHECK-NEXT: %result.borrowing = sext i1 %result.borrow to i32 +; CHECK-NEXT: %result.lo = sub i64 %lv.lo, %rv.lo +; CHECK-NEXT: %result.hi = sub i32 %lv.hi, %rv.hi +; CHECK-NEXT: %result.borrowed = add i32 %result.hi, %result.borrowing + %result = sub i96 %lv, %rv + store i96 %result, i96* %d + ret void +} + +; CHECK-LABEL: @icmp_equality +define void @icmp_equality(i32* %p) { + %p96 = bitcast i32* %p to i96* + %a96 = load i96, i96* %p96 + %b96 = load i96, i96* %p96 + +; CHECK: %eq.lo = icmp eq i64 %a96.lo, %b96.lo +; CHECK-NEXT: %eq.hi = icmp eq i32 %a96.hi, %b96.hi +; CHECK-NEXT: %eq = and i1 %eq.lo, %eq.hi + %eq = icmp eq i96 %a96, %b96 + +; CHECK: %ne.lo = icmp ne i64 %a96.lo, %b96.lo +; CHECK-NEXT: %ne.hi = icmp ne i32 %a96.hi, %b96.hi +; CHECK-NEXT: %ne = and i1 
%ne.lo, %ne.hi + %ne = icmp ne i96 %a96, %b96 + ret void +} + +; CHECK-LABEL: @icmp_uge +define void @icmp_uge(i32* %p) { + %p96 = bitcast i32* %p to i96* + %lv = load i96, i96* %p96 + %rv = load i96, i96* %p96 +; Do an add. +; CHECK: %uge.lo = add i64 %lv.lo, %rv.lo +; CHECK-NEXT: %uge.cmp = icmp ult i64 %lv.lo, %rv.lo +; CHECK-NEXT: %uge.limit = select i1 %uge.cmp, i64 %rv.lo, i64 %lv.lo +; CHECK-NEXT: %uge.overflowed = icmp ult i64 %uge.lo, %uge.limit +; CHECK-NEXT: %uge.carry = zext i1 %uge.overflowed to i32 +; CHECK-NEXT: %uge.hi = add i32 %lv.hi, %rv.hi +; CHECK-NEXT: %uge.carried = add i32 %uge.hi, %uge.carry +; Do the hi carry. +; CHECK-NEXT: %uge.cmp4 = icmp ult i32 %lv.hi, %rv.hi +; CHECK-NEXT: %uge.limit5 = select i1 %uge.cmp4, i32 %rv.hi, i32 %lv.hi +; CHECK-NEXT: %uge = icmp ult i32 %uge.carried, %uge.limit5 + %uge = icmp uge i96 %lv, %rv + ret void +} + +; CHECK-LABEL: @icmp_ule +define void @icmp_ule(i32* %p) { + %p96 = bitcast i32* %p to i96* + %lv = load i96, i96* %p96 + %rv = load i96, i96* %p96 +; Do an add. +; CHECK: %ule.lo = add i64 %lv.lo, %rv.lo +; CHECK-NEXT: %ule.cmp = icmp ult i64 %lv.lo, %rv.lo +; CHECK-NEXT: %ule.limit = select i1 %ule.cmp, i64 %rv.lo, i64 %lv.lo +; CHECK-NEXT: %ule.overflowed = icmp ult i64 %ule.lo, %ule.limit +; CHECK-NEXT: %ule.carry = zext i1 %ule.overflowed to i32 +; CHECK-NEXT: %ule.hi = add i32 %lv.hi, %rv.hi +; CHECK-NEXT: %ule.carried = add i32 %ule.hi, %ule.carry +; Do the hi carry. +; CHECK-NEXT: %ule.cmp4 = icmp ult i32 %lv.hi, %rv.hi +; CHECK-NEXT: %ule.limit5 = select i1 %ule.cmp4, i32 %rv.hi, i32 %lv.hi +; CHECK-NEXT: %ule.overflowed6 = icmp ult i32 %ule.carried, %ule.limit5 +; Invert the carry result. +; CHECK-NEXT: %ule = xor i1 %ule.overflowed6, true + %ule = icmp ule i96 %lv, %rv + ret void +} + +; CHECK-LABEL: @icmp_ugt +define void @icmp_ugt(i32* %p) { + %p96 = bitcast i32* %p to i96* + %lv = load i96, i96* %p96 + %rv = load i96, i96* %p96 +; Do an add. +; CHECK: %ugt.lo = add i64 %lv.lo, %rv.lo +; CHECK-NEXT: %ugt.cmp = icmp ult i64 %lv.lo, %rv.lo +; CHECK-NEXT: %ugt.limit = select i1 %ugt.cmp, i64 %rv.lo, i64 %lv.lo +; CHECK-NEXT: %ugt.overflowed = icmp ult i64 %ugt.lo, %ugt.limit +; CHECK-NEXT: %ugt.carry = zext i1 %ugt.overflowed to i32 +; CHECK-NEXT: %ugt.hi = add i32 %lv.hi, %rv.hi +; CHECK-NEXT: %ugt.carried = add i32 %ugt.hi, %ugt.carry +; Do the hi carry. +; CHECK-NEXT: %ugt.cmp4 = icmp ult i32 %lv.hi, %rv.hi +; CHECK-NEXT: %ugt.limit5 = select i1 %ugt.cmp4, i32 %rv.hi, i32 %lv.hi +; CHECK-NEXT: %ugt.overflowed6 = icmp ult i32 %ugt.carried, %ugt.limit5 +; Equality comparison. +; CHECK-NEXT: %ugt.lo7 = icmp eq i64 %lv.lo, %rv.lo +; CHECK-NEXT: %ugt.hi8 = icmp eq i32 %lv.hi, %rv.hi +; CHECK-NEXT: %ugt.eq = and i1 %ugt.lo7, %ugt.hi8 +; Merge the hi carry and equality comparison results. +; CHECK-NEXT: %ugt = and i1 %ugt.overflowed6, %ugt.eq + %ugt = icmp ugt i96 %lv, %rv + ret void +} + +; CHECK-LABEL: @icmp_ult +define void @icmp_ult(i32* %p) { + %p96 = bitcast i32* %p to i96* + %lv = load i96, i96* %p96 + %rv = load i96, i96* %p96 +; Do an add. +; CHECK: %ult.lo = add i64 %lv.lo, %rv.lo +; CHECK-NEXT: %ult.cmp = icmp ult i64 %lv.lo, %rv.lo +; CHECK-NEXT: %ult.limit = select i1 %ult.cmp, i64 %rv.lo, i64 %lv.lo +; CHECK-NEXT: %ult.overflowed = icmp ult i64 %ult.lo, %ult.limit +; CHECK-NEXT: %ult.carry = zext i1 %ult.overflowed to i32 +; CHECK-NEXT: %ult.hi = add i32 %lv.hi, %rv.hi +; CHECK-NEXT: %ult.carried = add i32 %ult.hi, %ult.carry +; Do the hi carry. 
+; CHECK-NEXT: %ult.cmp4 = icmp ult i32 %lv.hi, %rv.hi +; CHECK-NEXT: %ult.limit5 = select i1 %ult.cmp4, i32 %rv.hi, i32 %lv.hi +; CHECK-NEXT: %ult.overflowed6 = icmp ult i32 %ult.carried, %ult.limit5 +; Invert the carry result. +; CHECK-NEXT: %ult7 = xor i1 %ult.overflowed6, true +; Equality comparison. +; CHECK-NEXT: %ult.lo8 = icmp eq i64 %lv.lo, %rv.lo +; CHECK-NEXT: %ult.hi9 = icmp eq i32 %lv.hi, %rv.hi +; CHECK-NEXT: %ult.eq = and i1 %ult.lo8, %ult.hi9 +; Merge the hi carry and equality comparison results. +; CHECK-NEXT: %ult = and i1 %ult7, %ult.eq + %ult = icmp ult i96 %lv, %rv + ret void +} + +; CHECK-LABEL: @selects +define void @selects(i1 %c, i32* %pl, i32* %pr) { + %pl96 = bitcast i32* %pl to i96* + %pr96 = bitcast i32* %pr to i96* + %l = load i96, i96* %pl96 + %r = load i96, i96* %pr96 + +; CHECK: %result.lo = select i1 %c, i64 %l.lo, i64 %r.lo +; CHECK-NEXT: %result.hi = select i1 %c, i32 %l.hi, i32 %r.hi + %result = select i1 %c, i96 %l, i96 %r + ret void +} + +; CHECK-LABEL: @phis1 +define void @phis1() { +entry: + br label %label1 +label1: + br i1 undef, label %label2, label %end +label2: + br label %end +end: +; CHECK: %foo.lo = phi i64 [ undef, %label1 ], [ undef, %label2 ] +; CHECK-NEXT: %foo.hi = phi i8 [ undef, %label1 ], [ undef, %label2 ] +; CHECK-NEXT: %bar.lo = and i64 %foo.lo, 137438953472 +; CHECK-NEXT: %bar.hi = and i8 %foo.hi, 0 + %foo = phi i72 [ undef, %label1 ], [ undef, %label2 ] + %bar = and i72 %foo, 137438953472 + br i1 undef, label %label1, label %label2 +} + +; CHECK-LABEL: @phis2 +define void @phis2() { +entry: + br label %label1 +label1: +; CHECK: %foo.lo = phi i64 [ %bar.lo, %label2 ], [ undef, %entry ] +; CHECK-NEXT: %foo.hi = phi i8 [ %bar.hi, %label2 ], [ undef, %entry ] + %foo = phi i72 [ %bar, %label2 ], [ undef, %entry ] + br i1 undef, label %label2, label %end +label2: +; CHECK: %bar.lo = load i64, i64* undef, align 4 +; CHECK-NEXT: %bar.hi = load i8, i8* undef, align 4 + %bar = load i72, i72* undef, align 4 + br label %label1 +end: + ret void +} diff --git a/test/Transforms/NaCl/expand-shuffle-vector.ll b/test/Transforms/NaCl/expand-shuffle-vector.ll new file mode 100644 index 000000000000..3c274979abd9 --- /dev/null +++ b/test/Transforms/NaCl/expand-shuffle-vector.ll @@ -0,0 +1,138 @@ +; RUN: opt -expand-shufflevector %s -S | FileCheck %s + +; Test that shufflevector is expanded to insertelement / extractelement. 
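+; In each shufflevector mask, elements 0-3 select lanes of %lhs and elements
+; 4-7 select lanes of %rhs; the pass turns every result lane into an
+; extractelement feeding a chain of insertelements into undef.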
+
+define <4 x i32> @test_splat_lo_4xi32(<4 x i32> %lhs, <4 x i32> %rhs) {
+  ; CHECK-LABEL: test_splat_lo_4xi32
+  ; CHECK-NEXT: %1 = extractelement <4 x i32> %lhs, i32 0
+  ; CHECK-NEXT: %2 = extractelement <4 x i32> %lhs, i32 0
+  ; CHECK-NEXT: %3 = extractelement <4 x i32> %lhs, i32 0
+  ; CHECK-NEXT: %4 = extractelement <4 x i32> %lhs, i32 0
+  ; CHECK-NEXT: %5 = insertelement <4 x i32> undef, i32 %1, i32 0
+  ; CHECK-NEXT: %6 = insertelement <4 x i32> %5, i32 %2, i32 1
+  ; CHECK-NEXT: %7 = insertelement <4 x i32> %6, i32 %3, i32 2
+  ; CHECK-NEXT: %8 = insertelement <4 x i32> %7, i32 %4, i32 3
+  %res = shufflevector <4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
+  ; CHECK-NEXT: ret <4 x i32> %8
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_splat_hi_4xi32(<4 x i32> %lhs, <4 x i32> %rhs) {
+  ; CHECK-LABEL: test_splat_hi_4xi32
+  ; CHECK-NEXT: %1 = extractelement <4 x i32> %rhs, i32 0
+  ; CHECK-NEXT: %2 = extractelement <4 x i32> %rhs, i32 0
+  ; CHECK-NEXT: %3 = extractelement <4 x i32> %rhs, i32 0
+  ; CHECK-NEXT: %4 = extractelement <4 x i32> %rhs, i32 0
+  ; CHECK-NEXT: %5 = insertelement <4 x i32> undef, i32 %1, i32 0
+  ; CHECK-NEXT: %6 = insertelement <4 x i32> %5, i32 %2, i32 1
+  ; CHECK-NEXT: %7 = insertelement <4 x i32> %6, i32 %3, i32 2
+  ; CHECK-NEXT: %8 = insertelement <4 x i32> %7, i32 %4, i32 3
+  %res = shufflevector <4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> <i32 4, i32 4, i32 4, i32 4>
+  ; CHECK-NEXT: ret <4 x i32> %8
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_id_lo_4xi32(<4 x i32> %lhs, <4 x i32> %rhs) {
+  ; CHECK-LABEL: test_id_lo_4xi32
+  ; CHECK-NEXT: %1 = extractelement <4 x i32> %lhs, i32 0
+  ; CHECK-NEXT: %2 = extractelement <4 x i32> %lhs, i32 1
+  ; CHECK-NEXT: %3 = extractelement <4 x i32> %lhs, i32 2
+  ; CHECK-NEXT: %4 = extractelement <4 x i32> %lhs, i32 3
+  ; CHECK-NEXT: %5 = insertelement <4 x i32> undef, i32 %1, i32 0
+  ; CHECK-NEXT: %6 = insertelement <4 x i32> %5, i32 %2, i32 1
+  ; CHECK-NEXT: %7 = insertelement <4 x i32> %6, i32 %3, i32 2
+  ; CHECK-NEXT: %8 = insertelement <4 x i32> %7, i32 %4, i32 3
+  %res = shufflevector <4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ; CHECK-NEXT: ret <4 x i32> %8
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_id_hi_4xi32(<4 x i32> %lhs, <4 x i32> %rhs) {
+  ; CHECK-LABEL: test_id_hi_4xi32
+  ; CHECK-NEXT: %1 = extractelement <4 x i32> %rhs, i32 0
+  ; CHECK-NEXT: %2 = extractelement <4 x i32> %rhs, i32 1
+  ; CHECK-NEXT: %3 = extractelement <4 x i32> %rhs, i32 2
+  ; CHECK-NEXT: %4 = extractelement <4 x i32> %rhs, i32 3
+  ; CHECK-NEXT: %5 = insertelement <4 x i32> undef, i32 %1, i32 0
+  ; CHECK-NEXT: %6 = insertelement <4 x i32> %5, i32 %2, i32 1
+  ; CHECK-NEXT: %7 = insertelement <4 x i32> %6, i32 %3, i32 2
+  ; CHECK-NEXT: %8 = insertelement <4 x i32> %7, i32 %4, i32 3
+  %res = shufflevector <4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  ; CHECK-NEXT: ret <4 x i32> %8
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_interleave_lo_4xi32(<4 x i32> %lhs, <4 x i32> %rhs) {
+  ; CHECK-LABEL: test_interleave_lo_4xi32
+  ; CHECK-NEXT: %1 = extractelement <4 x i32> %lhs, i32 0
+  ; CHECK-NEXT: %2 = extractelement <4 x i32> %rhs, i32 0
+  ; CHECK-NEXT: %3 = extractelement <4 x i32> %lhs, i32 1
+  ; CHECK-NEXT: %4 = extractelement <4 x i32> %rhs, i32 1
+  ; CHECK-NEXT: %5 = insertelement <4 x i32> undef, i32 %1, i32 0
+  ; CHECK-NEXT: %6 = insertelement <4 x i32> %5, i32 %2, i32 1
+  ; CHECK-NEXT: %7 = insertelement <4 x i32> %6, i32 %3, i32 2
+  ; CHECK-NEXT: %8 = insertelement <4 x i32> %7, i32 %4, i32 3
+  %res = shufflevector <4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  ; CHECK-NEXT: ret <4 x i32> %8
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_interleave_hi_4xi32(<4 x i32> %lhs, <4 x i32> %rhs) {
+  ; CHECK-LABEL: test_interleave_hi_4xi32
+  ; CHECK-NEXT: %1 = extractelement <4 x i32> %lhs, i32 1
+  ; CHECK-NEXT: %2 = extractelement <4 x i32> %rhs, i32 1
+  ; CHECK-NEXT: %3 = extractelement <4 x i32> %lhs, i32 3
+  ; CHECK-NEXT: %4 = extractelement <4 x i32> %rhs, i32 3
+  ; CHECK-NEXT: %5 = insertelement <4 x i32> undef, i32 %1, i32 0
+  ; CHECK-NEXT: %6 = insertelement <4 x i32> %5, i32 %2, i32 1
+  ; CHECK-NEXT: %7 = insertelement <4 x i32> %6, i32 %3, i32 2
+  ; CHECK-NEXT: %8 = insertelement <4 x i32> %7, i32 %4, i32 3
+  %res = shufflevector <4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+  ; CHECK-NEXT: ret <4 x i32> %8
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_undef_4xi32(<4 x i32> %lhs, <4 x i32> %rhs) {
+  ; CHECK-LABEL: test_undef_4xi32
+  ; CHECK-NEXT: %1 = insertelement <4 x i32> undef, i32 undef, i32 0
+  ; CHECK-NEXT: %2 = insertelement <4 x i32> %1, i32 undef, i32 1
+  ; CHECK-NEXT: %3 = insertelement <4 x i32> %2, i32 undef, i32 2
+  ; CHECK-NEXT: %4 = insertelement <4 x i32> %3, i32 undef, i32 3
+  %res = shufflevector <4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> undef
+  ; CHECK-NEXT: ret <4 x i32> %4
+  ret <4 x i32> %res
+}
+
+define <2 x i32> @test_narrow_4xi32(<4 x i32> %lhs, <4 x i32> %rhs) {
+  ; CHECK-LABEL: test_narrow_4xi32
+  ; CHECK-NEXT: %1 = extractelement <4 x i32> %lhs, i32 0
+  ; CHECK-NEXT: %2 = extractelement <4 x i32> %rhs, i32 0
+  ; CHECK-NEXT: %3 = insertelement <2 x i32> undef, i32 %1, i32 0
+  ; CHECK-NEXT: %4 = insertelement <2 x i32> %3, i32 %2, i32 1
+  %res = shufflevector <4 x i32> %lhs, <4 x i32> %rhs, <2 x i32> <i32 0, i32 4>
+  ; CHECK-NEXT: ret <2 x i32> %4
+  ret <2 x i32> %res
+}
+
+define <8 x i32> @test_widen_4xi32(<4 x i32> %lhs, <4 x i32> %rhs) {
+  ; CHECK-LABEL: test_widen_4xi32
+  ; CHECK-NEXT: %1 = extractelement <4 x i32> %rhs, i32 3
+  ; CHECK-NEXT: %2 = extractelement <4 x i32> %rhs, i32 2
+  ; CHECK-NEXT: %3 = extractelement <4 x i32> %rhs, i32 1
+  ; CHECK-NEXT: %4 = extractelement <4 x i32> %rhs, i32 0
+  ; CHECK-NEXT: %5 = extractelement <4 x i32> %lhs, i32 3
+  ; CHECK-NEXT: %6 = extractelement <4 x i32> %lhs, i32 2
+  ; CHECK-NEXT: %7 = extractelement <4 x i32> %lhs, i32 1
+  ; CHECK-NEXT: %8 = extractelement <4 x i32> %lhs, i32 0
+  ; CHECK-NEXT: %9 = insertelement <8 x i32> undef, i32 %1, i32 0
+  ; CHECK-NEXT: %10 = insertelement <8 x i32> %9, i32 %2, i32 1
+  ; CHECK-NEXT: %11 = insertelement <8 x i32> %10, i32 %3, i32 2
+  ; CHECK-NEXT: %12 = insertelement <8 x i32> %11, i32 %4, i32 3
+  ; CHECK-NEXT: %13 = insertelement <8 x i32> %12, i32 %5, i32 4
+  ; CHECK-NEXT: %14 = insertelement <8 x i32> %13, i32 %6, i32 5
+  ; CHECK-NEXT: %15 = insertelement <8 x i32> %14, i32 %7, i32 6
+  ; CHECK-NEXT: %16 = insertelement <8 x i32> %15, i32 %8, i32 7
+  %res = shufflevector <4 x i32> %lhs, <4 x i32> %rhs, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  ; CHECK-NEXT: ret <8 x i32> %16
+  ret <8 x i32> %res
+}
diff --git a/test/Transforms/NaCl/expand-small-arguments.ll b/test/Transforms/NaCl/expand-small-arguments.ll
new file mode 100644
index 000000000000..e9d4f05b1c37
--- /dev/null
+++ b/test/Transforms/NaCl/expand-small-arguments.ll
@@ -0,0 +1,216 @@
+; RUN: opt %s -expand-small-arguments -S | FileCheck %s
+
+@var = global i8 0
+
+
+define void @small_arg(i8 %val) {
+  store i8 %val, i8* @var
+  ret void
+}
+; CHECK: define void @small_arg(i32 %val) {
+; CHECK-NEXT: %val.arg_trunc = trunc i32 %val to i8
+; CHECK-NEXT: store i8 %val.arg_trunc, i8* @var
+
+
+define i8
@small_result() { + %val = load i8, i8* @var + ret i8 %val +} +; CHECK: define i32 @small_result() { +; CHECK-NEXT: %val = load i8, i8* @var +; CHECK-NEXT: %val.ret_ext = zext i8 %val to i32 +; CHECK-NEXT: ret i32 %val.ret_ext + +define signext i8 @small_result_signext() { + %val = load i8, i8* @var + ret i8 %val +} +; CHECK: define signext i32 @small_result_signext() { +; CHECK-NEXT: %val = load i8, i8* @var +; CHECK-NEXT: %val.ret_ext = sext i8 %val to i32 +; CHECK-NEXT: ret i32 %val.ret_ext + + +define void @call_small_arg() { + call void @small_arg(i8 100) + ret void +} +; CHECK: define void @call_small_arg() { +; CHECK-NEXT: %arg_ext = zext i8 100 to i32 +; CHECK-NEXT: %.arg_cast = bitcast {{.*}} @small_arg +; CHECK-NEXT: call void %.arg_cast(i32 %arg_ext) + +define void @call_small_arg_signext() { + call void @small_arg(i8 signext 100) + ret void +} +; CHECK: define void @call_small_arg_signext() { +; CHECK-NEXT: %arg_ext = sext i8 100 to i32 +; CHECK-NEXT: %.arg_cast = bitcast {{.*}} @small_arg +; CHECK-NEXT: call void %.arg_cast(i32 signext %arg_ext) + + +define void @call_small_result() { + %r = call i8 @small_result() + store i8 %r, i8* @var + ret void +} +; CHECK: define void @call_small_result() { +; CHECK-NEXT: %r.arg_cast = bitcast {{.*}} @small_result +; CHECK-NEXT: %r = call i32 %r.arg_cast() +; CHECK-NEXT: %r.ret_trunc = trunc i32 %r to i8 +; CHECK-NEXT: store i8 %r.ret_trunc, i8* @var + + +; Check that various attributes are preserved. +define i1 @attributes(i8 %arg) nounwind { + %r = tail call fastcc i1 @attributes(i8 %arg) nounwind + ret i1 %r +} +; CHECK: define i32 @attributes(i32 %arg) [[NOUNWIND:#[0-9]+]] { +; CHECK: tail call fastcc i32 {{.*}} [[NOUNWIND]] + + +; These arguments and results should be left alone. +define i64 @larger_arguments(i32 %a, i64 %b, i8* %ptr, double %d) { + %r = call i64 @larger_arguments(i32 %a, i64 %b, i8* %ptr, double %d) + ret i64 %r +} +; CHECK: define i64 @larger_arguments(i32 %a, i64 %b, i8* %ptr, double %d) { +; CHECK-NEXT: %r = call i64 @larger_arguments(i32 %a, i64 %b, i8* %ptr, double %d) +; CHECK-NEXT: ret i64 %r + + +; Intrinsics must be left alone since the pass cannot change their types. 
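As a brief, illustrative aside (not part of the test file): intrinsic declarations have signatures fixed by LLVM itself, so -expand-small-arguments has to keep passing them their original narrow types instead of widening the call. A minimal sketch of that situation, using a hypothetical function name and mirroring the @intrinsic_call test that follows, looks roughly like this:

    declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1)

    define void @sketch_keeps_i8(i8* %ptr) {
      ; The i8 argument stays i8: the intrinsic's prototype cannot change.
      call void @llvm.memset.p0i8.i32(i8* %ptr, i8 42, i32 16, i32 1, i1 false)
      ret void
    }

The test below pins this behaviour down against the real pass output.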
+ +declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) +; CHECK: declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) + +define void @intrinsic_call(i8* %ptr) { + call void @llvm.memset.p0i8.i32(i8* %ptr, i8 99, i32 256, i32 1, i1 0) + ret void +} +; CHECK: define void @intrinsic_call +; CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* %ptr, i8 99, + +define void @invoking_small_arg(i8) { + invoke void @small_arg(i8 %0) + to label %cont unwind label %lpad +cont: + ret void +lpad: + %lp = landingpad { i8*, i32 } personality i8* null cleanup + ret void +} +; CHECK-LABEL: define void @invoking_small_arg(i32) +; CHECK-NEXT: %.arg_trunc = trunc i32 %0 to i8 +; CHECK-NEXT: %arg_ext = zext i8 %.arg_trunc to i32 +; CHECK-NEXT: %.arg_cast = bitcast void (i8)* bitcast (void (i32)* @small_arg to void (i8)*) to void (i32)* +; CHECK-NEXT: invoke void %.arg_cast(i32 %arg_ext) +; CHECK-NEXT: to label %cont unwind label %lpad + +; CHECK: cont: +; CHECK-NEXT: ret void + +; CHECK: lpad: +; CHECK-NEXT: %lp = landingpad { i8*, i32 } personality i8* null +; CHECK-NEXT: cleanup +; CHECK-NEXT: ret void + +define fastcc void @invoking_cc() { + invoke fastcc void @invoking_cc() + to label %cont unwind label %lpad +cont: + ret void +lpad: + %lp = landingpad { i8*, i32 } personality i8* null cleanup + ret void +} +; CHECK-LABEL: define fastcc void @invoking_cc() +; CHECK-NEXT: invoke fastcc void @invoking_cc() + +define void @invoking_attrs() noinline { + invoke void @invoking_attrs() noinline + to label %cont unwind label %lpad +cont: + ret void +lpad: + %lp = landingpad { i8*, i32 } personality i8* null cleanup + ret void +} +; CHECK: define void @invoking_attrs() [[NOINLINE:#[0-9]+]] +; CHECK: invoke void @invoking_attrs() [[NOINLINE]] + +define void @invoking_critical_edge() { +entry: + %a = invoke i8 @small_result() + to label %loop unwind label %lpad +loop: + %b = phi i8 [ %a, %entry ], [ %c, %loop ] + %c = add i8 1, %b + %d = icmp eq i8 %c, 5 + br i1 %d, label %exit, label %loop + +exit: + %aa = phi i8 [ 0, %lpad ], [ %c, %loop ] + ret void + +lpad: + %lp = landingpad { i8*, i32 } personality i8* null cleanup + br label %exit +} +; CHECK-LABEL: define void @invoking_critical_edge() +; CHECK: entry: +; CHECK-NEXT: %a.arg_cast = bitcast i8 ()* bitcast (i32 ()* @small_result to i8 ()*) to i32 ()* +; CHECK-NEXT: %a = invoke i32 %a.arg_cast() +; CHECK-NEXT: to label %entry.loop_crit_edge unwind label %lpad + +; CHECK: entry.loop_crit_edge: +; CHECK-NEXT: %a.ret_trunc = trunc i32 %a to i8 +; CHECK-NEXT: br label %loop + +; CHECK: loop: +; CHECK-NEXT: %b = phi i8 [ %a.ret_trunc, %entry.loop_crit_edge ], [ %c, %loop ] +; CHECK-NEXT: %c = add i8 1, %b +; CHECK-NEXT: %d = icmp eq i8 %c, 5 +; CHECK-NEXT: br i1 %d, label %exit, label %loop + +; CHECK: exit: +; CHECK-NEXT: %aa = phi i8 [ 0, %lpad ], [ %c, %loop ] +; CHECK-NEXT: ret void + +; CHECK: lpad: +; CHECK-NEXT: %lp = landingpad { i8*, i32 } personality i8* null +; CHECK-NEXT: cleanup +; CHECK-NEXT: br label %exit + +define i8 @invoking_small_result() { +entry: + %a = invoke i8 @small_result() + to label %cont unwind label %lpad +cont: + ret i8 %a +lpad: + %lp = landingpad { i8*, i32 } personality i8* null cleanup + ret i8 123 +} +; CHECK-LABEL: define i32 @invoking_small_result() +; CHECK: entry: +; CHECK-NEXT: %a.arg_cast = bitcast i8 ()* bitcast (i32 ()* @small_result to i8 ()*) to i32 ()* +; CHECK-NEXT: %a = invoke i32 %a.arg_cast() +; CHECK-NEXT: to label %cont unwind label %lpad + +; CHECK: cont: +; CHECK-NEXT: %a.ret_trunc = 
trunc i32 %a to i8 +; CHECK-NEXT: %a.ret_trunc.ret_ext = zext i8 %a.ret_trunc to i32 +; CHECK-NEXT: ret i32 %a.ret_trunc.ret_ext + +; CHECK: lpad: +; CHECK-NEXT: %lp = landingpad { i8*, i32 } personality i8* null +; CHECK-NEXT: cleanup +; CHECK-NEXT: %.ret_ext = zext i8 123 to i32 +; CHECK-NEXT: ret i32 %.ret_ext + + +; CHECK: attributes [[NOUNWIND]] = { nounwind } +; CHECK: attributes [[NOINLINE]] = { noinline } diff --git a/test/Transforms/NaCl/expand-struct-regs.ll b/test/Transforms/NaCl/expand-struct-regs.ll new file mode 100644 index 000000000000..ae8e263cb134 --- /dev/null +++ b/test/Transforms/NaCl/expand-struct-regs.ll @@ -0,0 +1,228 @@ +; RUN: opt %s -expand-struct-regs -S | FileCheck %s +; RUN: opt %s -expand-struct-regs -S | FileCheck %s -check-prefix=CLEANUP + +; These two instructions should not appear in the output: +; CLEANUP-NOT: extractvalue +; CLEANUP-NOT: insertvalue + +target datalayout = "p:32:32:32" + +%struct = type { i8, i32 } + + +define void @struct_load(%struct* %p, i8* %out0, i32* %out1) { + %val = load %struct, %struct* %p + %field0 = extractvalue %struct %val, 0 + %field1 = extractvalue %struct %val, 1 + store i8 %field0, i8* %out0 + store i32 %field1, i32* %out1 + ret void +} +; CHECK: define void @struct_load +; CHECK-NEXT: %val.index{{.*}} = getelementptr %struct, %struct* %p, i32 0, i32 0 +; CHECK-NEXT: %val.field{{.*}} = load i8, i8* %val.index{{.*}} +; CHECK-NEXT: %val.index{{.*}} = getelementptr %struct, %struct* %p, i32 0, i32 1 +; CHECK-NEXT: %val.field{{.*}} = load i32, i32* %val.index{{.*}} +; CHECK-NEXT: store i8 %val.field{{.*}}, i8* %out0 +; CHECK-NEXT: store i32 %val.field{{.*}}, i32* %out1 + + +define void @struct_store(%struct* %in_ptr, %struct* %out_ptr) { + %val = load %struct, %struct* %in_ptr + store %struct %val, %struct* %out_ptr + ret void +} +; CHECK: define void @struct_store +; CHECK-NEXT: %val.index{{.*}} = getelementptr %struct, %struct* %in_ptr, i32 0, i32 0 +; CHECK-NEXT: %val.field{{.*}} = load i8, i8* %val.index{{.*}} +; CHECK-NEXT: %val.index{{.*}} = getelementptr %struct, %struct* %in_ptr, i32 0, i32 1 +; CHECK-NEXT: %val.field{{.*}} = load i32, i32* %val.index{{.*}} +; CHECK-NEXT: %out_ptr.index{{.*}} = getelementptr %struct, %struct* %out_ptr, i32 0, i32 0 +; CHECK-NEXT: store i8 %val.field{{.*}}, i8* %out_ptr.index{{.*}} +; CHECK-NEXT: %out_ptr.index{{.*}} = getelementptr %struct, %struct* %out_ptr, i32 0, i32 1 +; CHECK-NEXT: store i32 %val.field{{.*}}, i32* %out_ptr.index{{.*}} + + +; Ensure that the pass works correctly across basic blocks. 
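An illustrative sketch before the test (hypothetical function name, not part of the patch): after -expand-struct-regs, the per-field scalar loads sit in the block that held the original struct load, so they dominate the per-field stores that replace the struct store in the successor block. Roughly:

    define void @sketch_across_blocks(%struct* %in, %struct* %out) {
      ; Per-field loads replace the struct load in the defining block...
      %f0.ptr = getelementptr %struct, %struct* %in, i32 0, i32 0
      %f0 = load i8, i8* %f0.ptr
      %f1.ptr = getelementptr %struct, %struct* %in, i32 0, i32 1
      %f1 = load i32, i32* %f1.ptr
      br label %bb
    bb:
      ; ...and per-field stores replace the struct store in the successor.
      %o0.ptr = getelementptr %struct, %struct* %out, i32 0, i32 0
      store i8 %f0, i8* %o0.ptr
      %o1.ptr = getelementptr %struct, %struct* %out, i32 0, i32 1
      store i32 %f1, i32* %o1.ptr
      ret void
    }

The @across_basic_block test below checks exactly this load/load, then store/store shape.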
+define void @across_basic_block(%struct* %in_ptr, %struct* %out_ptr) { + %val = load %struct, %struct* %in_ptr + br label %bb +bb: + store %struct %val, %struct* %out_ptr + ret void +} +; CHECK: define void @across_basic_block +; CHECK: load +; CHECK: load +; CHECK: bb: +; CHECK: store +; CHECK: store + + +define void @const_struct_store(%struct* %ptr) { + store %struct { i8 99, i32 1234 }, %struct* %ptr + ret void +} +; CHECK: define void @const_struct_store +; CHECK: store i8 99 +; CHECK: store i32 1234 + + +define void @struct_phi_node(%struct* %ptr) { +entry: + %val = load %struct, %struct* %ptr + br label %bb +bb: + %phi = phi %struct [ %val, %entry ] + ret void +} +; CHECK: bb: +; CHECK-NEXT: %phi.index{{.*}} = phi i8 [ %val.field{{.*}}, %entry ] +; CHECK-NEXT: %phi.index{{.*}} = phi i32 [ %val.field{{.*}}, %entry ] + + +define void @struct_phi_node_multiple_entry(i1 %arg, %struct* %ptr) { +entry: + %val = load %struct, %struct* %ptr + br i1 %arg, label %bb, label %bb +bb: + %phi = phi %struct [ %val, %entry ], [ %val, %entry ] + ret void +} +; CHECK: bb: +; CHECK-NEXT: %phi.index{{.*}} = phi i8 [ %val.field{{.*}}, %entry ], [ %val.field{{.*}}, %entry ] +; CHECK-NEXT: %phi.index{{.*}} = phi i32 [ %val.field{{.*}}, %entry ], [ %val.field{{.*}}, %entry ] + + +define void @struct_select_inst(i1 %cond, %struct* %ptr1, %struct* %ptr2) { + %val1 = load %struct, %struct* %ptr1 + %val2 = load %struct, %struct* %ptr2 + %select = select i1 %cond, %struct %val1, %struct %val2 + ret void +} +; CHECK: define void @struct_select_inst +; CHECK: %select.index{{.*}} = select i1 %cond, i8 %val1.field{{.*}}, i8 %val2.field{{.*}} +; CHECK-NEXT: %select.index{{.*}} = select i1 %cond, i32 %val1.field{{.*}}, i32 %val2.field{{.*}} + + +define void @insert_and_extract(i8* %out0, i32* %out1) { + %temp = insertvalue %struct undef, i8 100, 0 + %sval = insertvalue %struct %temp, i32 200, 1 + %field0 = extractvalue %struct %sval, 0 + %field1 = extractvalue %struct %sval, 1 + store i8 %field0, i8* %out0 + store i32 %field1, i32* %out1 + ret void +} +; CHECK: define void @insert_and_extract(i8* %out0, i32* %out1) { +; CHECK-NEXT: store i8 100, i8* %out0 +; CHECK-NEXT: store i32 200, i32* %out1 +; CHECK-NEXT: ret void + + +define i32 @extract_from_constant() { + %ev = extractvalue %struct { i8 99, i32 888 }, 1 + ret i32 %ev +} +; CHECK: define i32 @extract_from_constant() { +; CHECK-NEXT: ret i32 888 + +define void @nested_structs() { + %a1 = alloca i64 + %a2 = alloca i32 + %a3 = alloca { { i32, i64 } } + %a = insertvalue { i32, i64 } undef, i32 5, 0 + %b = insertvalue { i32, i64 } %a, i64 6, 1 + %c = insertvalue { { i32, i64 } } undef, { i32, i64 } %b, 0 + %d = insertvalue { { { i32, i64 } }, i64 } undef, { { i32, i64 } } %c, 0 + %e = insertvalue { { { i32, i64 } }, i64 } undef, { i32, i64 } %b, 0, 0 + + %f = extractvalue { { { i32, i64 } }, i64 } %d, 0, 0, 1 + %g = extractvalue { { { i32, i64 } }, i64 } %e, 0, 0, 0 + %h = extractvalue { { { i32, i64 } }, i64 } %e, 0 + store i64 %f, i64* %a1 + store i32 %g, i32* %a2 + store { { i32, i64 } } %h, { { i32, i64 } }* %a3 + ret void +} +; CHECK-LABEL: define void @nested_structs() +; CHECK-NEXT: %a1 = alloca i64 +; CHECK-NEXT: %a2 = alloca i32 +; CHECK-NEXT: %a3 = alloca { { i32, i64 } } +; CHECK-NEXT: store i64 6, i64* %a1 +; CHECK-NEXT: store i32 5, i32* %a2 +; CHECK-NEXT: %a3.index = getelementptr { { i32, i64 } }, { { i32, i64 } }* %a3, i32 0, i32 0 +; CHECK-NEXT: %a3.index.index = getelementptr { i32, i64 }, { i32, i64 }* %a3.index, i32 0, i32 0 +; CHECK-NEXT: 
store i32 5, i32* %a3.index.index +; CHECK-NEXT: %a3.index.index1 = getelementptr { i32, i64 }, { i32, i64 }* %a3.index, i32 0, i32 1 +; CHECK-NEXT: store i64 6, i64* %a3.index.index1 + +define void @load_another_pass() { + %a = alloca { { i8, i64 } } + %b = load { { i8, i64 } }, { { i8, i64 } }* %a + %c = load { { i8, i64 } }, { { i8, i64 } }* %a, align 16 + ret void +} +; CHECK-LABEL: define void @load_another_pass() +; CHECK: %b.field.field = load i8, i8* %b.field.index +; CHECK: %b.field.field{{.*}} = load i64, i64* %b.field.index{{.*}} +; CHECK: %c.field.field = load i8, i8* %c.field.index, align 16 +; CHECK: %c.field.field{{.*}} = load i64, i64* %c.field.index{{.*}}, align 4 + +define void @store_another_pass() { + %a = alloca { { i16, i64 } } + store { { i16, i64 } } undef, { { i16, i64 } }* %a + store { { i16, i64 } } undef, { { i16, i64 } }* %a, align 16 + ret void +} +; CHECK-LABEL: define void @store_another_pass() +; CHECK: store i16 undef, i16* %a.index.index +; CHECK: store i64 undef, i64* %a.index.index{{.*}} +; CHECK: store i16 undef, i16* %a.index1.index, align 16 +; CHECK: store i64 undef, i64* %a.index1.index{{.*}}, align 4 + +define void @select_another_pass() { + %a = load { { i8, i64 } }, { { i8, i64 } }* null + %b = load { { i8, i64 } }, { { i8, i64 } }* null + %c = select i1 undef, { { i8, i64 } } %a, { { i8, i64 } } %b + store { { i8, i64 } } %c, { { i8, i64 } }* null + ret void +} +; CHECK-LABEL: define void @select_another_pass() +; CHECK-NEXT: %a.index = getelementptr { { i8, i64 } }, { { i8, i64 } }* null, i32 0, i32 0 +; CHECK-NEXT: %a.field.index = getelementptr { i8, i64 }, { i8, i64 }* %a.index, i32 0, i32 0 +; CHECK-NEXT: %a.field.field = load i8, i8* %a.field.index +; CHECK-NEXT: %a.field.index2 = getelementptr { i8, i64 }, { i8, i64 }* %a.index, i32 0, i32 1 +; CHECK-NEXT: %a.field.field3 = load i64, i64* %a.field.index2 +; CHECK-NEXT: %b.index = getelementptr { { i8, i64 } }, { { i8, i64 } }* null, i32 0, i32 0 +; CHECK-NEXT: %b.field.index = getelementptr { i8, i64 }, { i8, i64 }* %b.index, i32 0, i32 0 +; CHECK-NEXT: %b.field.field = load i8, i8* %b.field.index +; CHECK-NEXT: %b.field.index5 = getelementptr { i8, i64 }, { i8, i64 }* %b.index, i32 0, i32 1 +; CHECK-NEXT: %b.field.field6 = load i64, i64* %b.field.index5 +; CHECK-NEXT: %c.index.index = select i1 undef, i8 %a.field.field, i8 %b.field.field +; CHECK-NEXT: %c.index.index11 = select i1 undef, i64 %a.field.field3, i64 %b.field.field6 +; CHECK-NEXT: %.index = getelementptr { { i8, i64 } }, { { i8, i64 } }* null, i32 0, i32 0 +; CHECK-NEXT: %.index.index = getelementptr { i8, i64 }, { i8, i64 }* %.index, i32 0, i32 0 +; CHECK-NEXT: store i8 %c.index.index, i8* %.index.index +; CHECK-NEXT: %.index.index13 = getelementptr { i8, i64 }, { i8, i64 }* %.index, i32 0, i32 1 +; CHECK-NEXT: store i64 %c.index.index11, i64* %.index.index13 +; CHECK-NEXT: ret void + +define void @phi_another_pass() { +entry: + br i1 false, label %next, label %not_next + +not_next: + %a = alloca { { i64, i16 }, i8* } + %b = load { { i64, i16 }, i8* }, { { i64, i16 }, i8* }* %a + br label %next + +next: + %c = phi { { i64, i16 }, i8* } [ undef, %entry ], [ %b, %not_next ] + store { { i64, i16 }, i8* } %c, { { i64, i16 }, i8* }* null + ret void +} +; CHECK-LABEL: define void @phi_another_pass() +; CHECK: %c.index.index = phi i64 [ undef, %entry ], [ %b.field.field, %not_next ] +; CHECK: %c.index.index{{.*}} = phi i16 [ undef, %entry ], [ %b.field.field{{.*}}, %not_next ] +; CHECK: %c.index{{.*}} = phi i8* [ undef, %entry ], 
[ %b.field{{.*}}, %not_next ] diff --git a/test/Transforms/NaCl/expand-tls-aligned.ll b/test/Transforms/NaCl/expand-tls-aligned.ll new file mode 100644 index 000000000000..75f03ba306ff --- /dev/null +++ b/test/Transforms/NaCl/expand-tls-aligned.ll @@ -0,0 +1,42 @@ +; RUN: opt < %s -nacl-expand-tls -S | FileCheck %s + +target datalayout = "p:32:32:32" + + +@var = global i32 123 + +; Put this first to check that the pass handles BSS variables last. +@bss_tvar_aligned = thread_local global i32 0, align 64 + +@tvar1 = thread_local global i16 234 +; Test a pointer to check we are getting the right pointer size. +@tvar2 = thread_local global i32* @var +@tvar_aligned = thread_local global i8 99, align 32 + + +; CHECK: %tls_init_template = type <{ i16, [2 x i8], i32*, [24 x i8], i8 }> +; CHECK: %tls_struct = type <{ %tls_init_template, %tls_bss_template }> + +; This struct type must be "packed" because the 31 byte padding here +; is followed by an i32. +; CHECK: %tls_bss_template = type <{ [31 x i8], i32, [60 x i8] }> + +; CHECK: @__tls_template_start = internal constant %tls_init_template <{ i16 234, [2 x i8] zeroinitializer, i32* @var, [24 x i8] zeroinitializer, i8 99 }> + +; CHECK: @__tls_template_alignment = internal constant i32 64 + + +; Create references to __tls_template_* to keep these live, otherwise +; the definition of %tls_struct (which we check for above) is removed +; from the output. + +@__tls_template_tdata_end = external global i8 +@__tls_template_end = external global i8 + +define i8* @get_tls_template_tdata_end() { + ret i8* @__tls_template_tdata_end +} + +define i8* @get_tls_template_end() { + ret i8* @__tls_template_end +} diff --git a/test/Transforms/NaCl/expand-tls-bss.ll b/test/Transforms/NaCl/expand-tls-bss.ll new file mode 100644 index 000000000000..82e7e41fef96 --- /dev/null +++ b/test/Transforms/NaCl/expand-tls-bss.ll @@ -0,0 +1,17 @@ +; RUN: opt < %s -nacl-expand-tls -S | FileCheck %s + + +@tvar_bss1 = thread_local global i64 0 +@tvar_bss2 = thread_local global i32 0 + + +; CHECK: %tls_struct = type <{ %tls_init_template, %tls_bss_template }> +; CHECK: %tls_bss_template = type <{ i64, i32, [4 x i8] }> + + +define i64* @get_tvar_bss1() { + ret i64* @tvar_bss1 +} +; CHECK: define i64* @get_tvar_bss1() +; CHECK: %field = getelementptr %tls_struct, %tls_struct* %tls_struct, i32 -1, i32 1, i32 0 +; CHECK: ret i64* %field diff --git a/test/Transforms/NaCl/expand-tls-constexpr-alias.ll b/test/Transforms/NaCl/expand-tls-constexpr-alias.ll new file mode 100644 index 000000000000..2b3d7546d877 --- /dev/null +++ b/test/Transforms/NaCl/expand-tls-constexpr-alias.ll @@ -0,0 +1,28 @@ +; RUN: opt < %s -nacl-expand-tls-constant-expr -S | FileCheck %s + +@real_tvar = thread_local global i32 123 +@tvar_alias = alias i32* @real_tvar +@tvar_alias2 = alias i32* getelementptr (i32, i32* @real_tvar, i32 100) + + +define i32* @get_tvar() { + ret i32* @tvar_alias +} +; CHECK: define i32* @get_tvar() +; CHECK: ret i32* @real_tvar + + +define i32* @get_tvar2() { + ret i32* @tvar_alias2 +} +; CHECK: define i32* @get_tvar2() +; CHECK: %expanded = getelementptr i32, i32* @real_tvar, i32 100 +; CHECK: ret i32* %expanded + + +define i32* @get_tvar3() { + ret i32* getelementptr (i32, i32* @tvar_alias2, i32 100) +} +; CHECK: define i32* @get_tvar3() +; CHECK: %expanded = getelementptr i32, i32* @real_tvar, i32 200 +; CHECK: ret i32* %expanded diff --git a/test/Transforms/NaCl/expand-tls-constexpr.ll b/test/Transforms/NaCl/expand-tls-constexpr.ll new file mode 100644 index 000000000000..fc441d354c3f 
--- /dev/null +++ b/test/Transforms/NaCl/expand-tls-constexpr.ll @@ -0,0 +1,152 @@ +; RUN: opt < %s -nacl-expand-tls-constant-expr -S | FileCheck %s + +@tvar = thread_local global i32 0 + + +define i32 @test_converting_ptrtoint() { + ret i32 ptrtoint (i32* @tvar to i32) +} +; CHECK: define i32 @test_converting_ptrtoint() +; CHECK: %expanded = ptrtoint i32* @tvar to i32 +; CHECK: ret i32 %expanded + + +define i32 @test_converting_add() { + ret i32 add (i32 ptrtoint (i32* @tvar to i32), i32 4) +} +; CHECK: define i32 @test_converting_add() +; CHECK: %expanded1 = ptrtoint i32* @tvar to i32 +; CHECK: %expanded = add i32 %expanded1, 4 +; CHECK: ret i32 %expanded + + +define i32 @test_converting_multiple_operands() { + ret i32 add (i32 ptrtoint (i32* @tvar to i32), + i32 ptrtoint (i32* @tvar to i32)) +} +; CHECK: define i32 @test_converting_multiple_operands() +; CHECK: %expanded1 = ptrtoint i32* @tvar to i32 +; CHECK: %expanded = add i32 %expanded1, %expanded1 +; CHECK: ret i32 %expanded + + +define i32 @test_allocating_new_var_name(i32 %expanded) { + %result = add i32 %expanded, ptrtoint (i32* @tvar to i32) + ret i32 %result +} +; CHECK: define i32 @test_allocating_new_var_name(i32 %expanded) +; CHECK: %expanded1 = ptrtoint i32* @tvar to i32 +; CHECK: %result = add i32 %expanded, %expanded1 +; CHECK: ret i32 %result + + +define i8* @test_converting_bitcast() { + ret i8* bitcast (i32* @tvar to i8*) +} +; CHECK: define i8* @test_converting_bitcast() +; CHECK: %expanded = bitcast i32* @tvar to i8* +; CHECK: ret i8* %expanded + + +define i32* @test_converting_getelementptr() { + ; Use an index >1 to ensure that "inbounds" is not added automatically. + ret i32* getelementptr (i32, i32* @tvar, i32 2) +} +; CHECK: define i32* @test_converting_getelementptr() +; CHECK: %expanded = getelementptr i32, i32* @tvar, i32 2 +; CHECK: ret i32* %expanded + + +; This is identical to @test_converting_getelementptr(). +; We need to check that both copies of getelementptr are fixed. +define i32* @test_converting_getelementptr_copy() { + ret i32* getelementptr (i32, i32* @tvar, i32 2) +} +; CHECK: define i32* @test_converting_getelementptr_copy() +; CHECK: %expanded = getelementptr i32, i32* @tvar, i32 2 +; CHECK: ret i32* %expanded + + +define i32* @test_converting_getelementptr_inbounds() { + ret i32* getelementptr inbounds (i32, i32* @tvar, i32 2) +} +; CHECK: define i32* @test_converting_getelementptr_inbounds() +; CHECK: %expanded = getelementptr inbounds i32, i32* @tvar, i32 2 +; CHECK: ret i32* %expanded + + +define i32* @test_converting_phi(i1 %cmp) { +entry: + br i1 %cmp, label %return, label %else + +else: + br label %return + +return: + %result = phi i32* [ getelementptr (i32, i32* @tvar, i32 1), %entry ], [ null, %else ] + ret i32* %result +} +; The converted ConstantExprs get pushed back into the PHI node's +; incoming block, which might be suboptimal but works in all cases. 
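A short aside on why the expansion lands there (a sketch only; @sketch_phi is a hypothetical name): a PHI's incoming value is consumed on the edge from the corresponding predecessor, so the instruction that replaces the ConstantExpr has to exist in that predecessor rather than next to the PHI itself. The shape the CHECK lines below verify is roughly:

    define i32* @sketch_phi(i1 %cmp) {
    entry:
      ; The expanded GEP is materialized in the incoming block %entry,
      ; so its value is available on the edge into %return.
      %expanded = getelementptr inbounds i32, i32* @tvar, i32 1
      br i1 %cmp, label %return, label %else
    else:
      br label %return
    return:
      %result = phi i32* [ %expanded, %entry ], [ null, %else ]
      ret i32* %result
    }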
+; CHECK: define i32* @test_converting_phi(i1 %cmp) +; CHECK: entry: +; CHECK: %expanded = getelementptr inbounds i32, i32* @tvar, i32 1 +; CHECK: else: +; CHECK: return: +; CHECK: %result = phi i32* [ %expanded, %entry ], [ null, %else ] + + +@addr1 = global i8* blockaddress(@test_converting_phi_with_indirectbr, %return) +@addr2 = global i8* blockaddress(@test_converting_phi_with_indirectbr, %else) +define i32* @test_converting_phi_with_indirectbr(i8* %addr) { +entry: + indirectbr i8* %addr, [ label %return, label %else ] + +else: + br label %return + +return: + %result = phi i32* [ getelementptr (i32, i32* @tvar, i32 1), %entry ], [ null, %else ] + ret i32* %result +} +; CHECK: define i32* @test_converting_phi_with_indirectbr(i8* %addr) +; CHECK: entry: +; CHECK: %expanded = getelementptr inbounds i32, i32* @tvar, i32 1 +; CHECK: return: +; CHECK: %result = phi i32* [ %expanded, %entry ], [ null, %else ] + + +; This tests that ExpandTlsConstantExpr correctly handles a PHI node +; that contains the same ConstantExpr twice. Using +; replaceAllUsesWith() is not correct on a PHI node when the new +; instruction has to be added to an incoming block. +define i32 @test_converting_phi_twice(i1 %arg) { + br i1 %arg, label %iftrue, label %iffalse +iftrue: + br label %exit +iffalse: + br label %exit +exit: + %result = phi i32 [ ptrtoint (i32* @tvar to i32), %iftrue ], + [ ptrtoint (i32* @tvar to i32), %iffalse ] + ret i32 %result +} +; CHECK: define i32 @test_converting_phi_twice(i1 %arg) +; CHECK: iftrue: +; CHECK: %expanded{{.*}} = ptrtoint i32* @tvar to i32 +; CHECK: iffalse: +; CHECK: %expanded{{.*}} = ptrtoint i32* @tvar to i32 +; CHECK: exit: +; CHECK: %result = phi i32 [ %expanded1, %iftrue ], [ %expanded, %iffalse ] + + +define i32 @test_converting_phi_multiple_entry(i1 %arg) { +entry: + br i1 %arg, label %done, label %done +done: + %result = phi i32 [ ptrtoint (i32* @tvar to i32), %entry ], + [ ptrtoint (i32* @tvar to i32), %entry ] + ret i32 %result +} +; CHECK: define i32 @test_converting_phi_multiple_entry(i1 %arg) +; CHECK: %result = phi i32 [ %expanded, %entry ], [ %expanded, %entry ] diff --git a/test/Transforms/NaCl/expand-tls-constexpr2.ll b/test/Transforms/NaCl/expand-tls-constexpr2.ll new file mode 100644 index 000000000000..9f1bbe88cb48 --- /dev/null +++ b/test/Transforms/NaCl/expand-tls-constexpr2.ll @@ -0,0 +1,12 @@ +; RUN: opt < %s -nacl-expand-tls -S | FileCheck %s + +@tvar = thread_local global i32 0 + +define i32 @get_tvar() { + ret i32 ptrtoint (i32* @tvar to i32) +} +; CHECK: %tls_raw = call i8* @llvm.nacl.read.tp() +; CHECK: %tls_struct = bitcast i8* %tls_raw to %tls_struct* +; CHECK: %field = getelementptr %tls_struct, %tls_struct* %tls_struct, i32 -1, i32 1, i32 0 +; CHECK: %expanded = ptrtoint i32* %field to i32 +; CHECK: ret i32 %expanded diff --git a/test/Transforms/NaCl/expand-tls-phi.ll b/test/Transforms/NaCl/expand-tls-phi.ll new file mode 100644 index 000000000000..6c2715b8b965 --- /dev/null +++ b/test/Transforms/NaCl/expand-tls-phi.ll @@ -0,0 +1,60 @@ +; RUN: opt < %s -nacl-expand-tls -S | FileCheck %s + + +@tvar = thread_local global i32 123 + +define i32* @get_tvar(i1 %cmp) { +entry: + br i1 %cmp, label %return, label %else + +else: + br label %return + +return: + %result = phi i32* [ @tvar, %entry ], [ null, %else ] + ret i32* %result +} +; The TLS access gets pushed back into the PHI node's incoming block, +; which might be suboptimal but works in all cases. 
+; CHECK: define i32* @get_tvar(i1 %cmp) { +; CHECK: entry: +; CHECK: %field = getelementptr %tls_struct, %tls_struct* %tls_struct, i32 -1, i32 0, i32 0 +; CHECK: else: +; CHECK: return: +; CHECK: %result = phi i32* [ %field, %entry ], [ null, %else ] + + +; This tests that ExpandTls correctly handles a PHI node that contains +; the same TLS variable twice. Using replaceAllUsesWith() is not +; correct on a PHI node when the new instruction has to be added to an +; incoming block. +define i32* @tls_phi_twice(i1 %arg) { + br i1 %arg, label %iftrue, label %iffalse +iftrue: + br label %exit +iffalse: + br label %exit +exit: + %result = phi i32* [ @tvar, %iftrue ], [ @tvar, %iffalse ] + ret i32* %result +} +; CHECK: define i32* @tls_phi_twice(i1 %arg) { +; CHECK: iftrue: +; CHECK: %field{{.*}} = getelementptr %tls_struct, %tls_struct* %tls_struct{{.*}}, i32 -1, i32 0, i32 0 +; CHECK: iffalse: +; CHECK: %field{{.*}} = getelementptr %tls_struct, %tls_struct* %tls_struct{{.*}}, i32 -1, i32 0, i32 0 +; CHECK: exit: +; CHECK: %result = phi i32* [ %field{{.*}}, %iftrue ], [ %field{{.*}}, %iffalse ] + + +; In this corner case, ExpandTls must expand out @tvar only once, +; otherwise it will produce invalid IR. +define i32* @tls_phi_multiple_entry(i1 %arg) { +entry: + br i1 %arg, label %done, label %done +done: + %result = phi i32* [ @tvar, %entry ], [ @tvar, %entry ] + ret i32* %result +} +; CHECK: define i32* @tls_phi_multiple_entry(i1 %arg) { +; CHECK: %result = phi i32* [ %field, %entry ], [ %field, %entry ] diff --git a/test/Transforms/NaCl/expand-tls.ll b/test/Transforms/NaCl/expand-tls.ll new file mode 100644 index 000000000000..b1159729544b --- /dev/null +++ b/test/Transforms/NaCl/expand-tls.ll @@ -0,0 +1,86 @@ +; RUN: opt < %s -nacl-expand-tls -S | FileCheck %s + +; All thread-local variables should be removed +; RUN: opt < %s -nacl-expand-tls -S | FileCheck %s -check-prefix=NO_TLS + +; NO_TLS-NOT: thread_local + +@tvar1 = thread_local global i64 123 +@tvar2 = thread_local global i32 456 + + +; CHECK: %tls_init_template = type <{ i64, i32 }> +; CHECK: %tls_struct = type <{ %tls_init_template, %tls_bss_template }> +; CHECK: %tls_bss_template = type <{ [4 x i8] }> + + +; CHECK: @__tls_template_start = internal constant %tls_init_template <{ i64 123, i32 456 }> + +; CHECK: @__tls_template_alignment = internal constant i32 8 + + +define i64* @get_tvar1() { + ret i64* @tvar1 +} +; CHECK: define i64* @get_tvar1() +; CHECK: %tls_raw = call i8* @llvm.nacl.read.tp() +; CHECK: %tls_struct = bitcast i8* %tls_raw to %tls_struct* +; CHECK: %field = getelementptr %tls_struct, %tls_struct* %tls_struct, i32 -1, i32 0, i32 0 +; CHECK: ret i64* %field + + +define i32* @get_tvar2() { + ret i32* @tvar2 +} +; Much the same as for get_tvar1. 
+; CHECK: define i32* @get_tvar2() +; CHECK: %field = getelementptr %tls_struct, %tls_struct* %tls_struct, i32 -1, i32 0, i32 1 + + +; Check that we define global variables for TLS templates + +@__tls_template_start = external global i8 +@__tls_template_tdata_end = external global i8 +@__tls_template_end = external global i8 + +define i8* @get_tls_template_start() { + ret i8* @__tls_template_start +} +; CHECK: define i8* @get_tls_template_start() +; CHECK: ret i8* bitcast (%tls_init_template* @__tls_template_start to i8*) + +define i8* @get_tls_template_tdata_end() { + ret i8* @__tls_template_tdata_end +} +; CHECK: define i8* @get_tls_template_tdata_end() +; CHECK: ret i8* bitcast (%tls_init_template* getelementptr inbounds (%tls_init_template, %tls_init_template* @__tls_template_start, i32 1) to i8*) + +define i8* @get_tls_template_end() { + ret i8* @__tls_template_end +} +; CHECK: define i8* @get_tls_template_end() +; CHECK: ret i8* bitcast (%tls_struct* getelementptr (%tls_struct, %tls_struct* bitcast (%tls_init_template* @__tls_template_start to %tls_struct*), i32 1) to i8*) + + +; Check that we define the TLS layout functions + +declare i32 @__nacl_tp_tls_offset(i32) +declare i32 @__nacl_tp_tdb_offset(i32) + +define i32 @test_get_tp_tls_offset(i32 %tls_size) { + %offset = call i32 @__nacl_tp_tls_offset(i32 %tls_size) + ret i32 %offset +} +; Uses of the intrinsic are replaced with uses of a regular function. +; CHECK: define i32 @test_get_tp_tls_offset +; CHECK: call i32 @nacl_tp_tls_offset +; NO_TLS-NOT: __nacl_tp_tls_offset + +define i32 @test_get_tp_tdb_offset(i32 %tdb_size) { + %offset = call i32 @__nacl_tp_tdb_offset(i32 %tdb_size) + ret i32 %offset +} +; Uses of the intrinsic are replaced with uses of a regular function. +; CHECK: define i32 @test_get_tp_tdb_offset +; CHECK: call i32 @nacl_tp_tdb_offset +; NO_TLS-NOT: __nacl_tp_tdb_offset diff --git a/test/Transforms/NaCl/expand-varargs-attrs.ll b/test/Transforms/NaCl/expand-varargs-attrs.ll new file mode 100644 index 000000000000..17061abd3cef --- /dev/null +++ b/test/Transforms/NaCl/expand-varargs-attrs.ll @@ -0,0 +1,72 @@ +; RUN: opt < %s -expand-varargs -S | FileCheck %s + +declare i32 @varargs_func(i32 %arg, ...) + + +; Check that attributes such as "byval" are preserved on fixed arguments. + +%MyStruct = type { i64, i64 } + +define void @func_with_arg_attrs(%MyStruct* byval, ...) { + ret void +} +; CHECK-LABEL: define void @func_with_arg_attrs(%MyStruct* byval, i8* noalias %varargs) { + + +declare void @take_struct_arg(%MyStruct* byval %s, ...) + +define void @call_with_arg_attrs(%MyStruct* %s) { + call void (%MyStruct*, ...) @take_struct_arg(%MyStruct* byval %s) + ret void +} +; CHECK-LABEL: @call_with_arg_attrs( +; CHECK: call void bitcast (void (%MyStruct*, i8*)* @take_struct_arg to void (%MyStruct*, { i32 }*)*)(%MyStruct* byval %s, { i32 }* %vararg_buffer) + + +; The "byval" attribute here should be dropped. +define i32 @pass_struct_via_vararg1(%MyStruct* %s) { + %result = call i32 (i32, ...) @varargs_func(i32 111, %MyStruct* byval %s) + ret i32 %result +} +; CHECK-LABEL: @pass_struct_via_vararg1( +; CHECK: %result = call i32 bitcast (i32 (i32, i8*)* @varargs_func to i32 (i32, { %MyStruct }*)*)(i32 111, { %MyStruct }* %vararg_buffer) + + +; The "byval" attribute here should be dropped. +define i32 @pass_struct_via_vararg2(%MyStruct* %s) { + %result = call i32 (i32, ...) 
@varargs_func(i32 111, i32 2, %MyStruct* byval %s) + ret i32 %result +} +; CHECK-LABEL: @pass_struct_via_vararg2( +; CHECK: %result = call i32 bitcast (i32 (i32, i8*)* @varargs_func to i32 (i32, { i32, %MyStruct }*)*)(i32 111, { i32, %MyStruct }* %vararg_buffer) + + +; Check that return attributes such as "signext" are preserved. +define i32 @call_with_return_attr() { + %result = call signext i32 (i32, ...) @varargs_func(i32 111, i64 222) + ret i32 %result +} +; CHECK-LABEL: @call_with_return_attr( +; CHECK: %result = call signext i32 bitcast (i32 (i32, i8*)* @varargs_func to i32 (i32, { i64 }*)*)(i32 111, { i64 }* %vararg_buffer) + + +; Check that the "readonly" function attribute is preserved. +define i32 @call_readonly() { + %result = call i32 (i32, ...) @varargs_func(i32 111, i64 222) readonly + ret i32 %result +} +; CHECK-LABEL: @call_readonly( +; CHECK: %result = call i32 bitcast (i32 (i32, i8*)* @varargs_func to i32 (i32, { i64 }*)*)(i32 111, { i64 }* %vararg_buffer) #1 + + +; Check that the "tail" attribute gets removed, because the callee +; reads space alloca'd by the caller. +define i32 @tail_call() { + %result = tail call i32 (i32, ...) @varargs_func(i32 111, i64 222) + ret i32 %result +} +; CHECK-LABEL: @tail_call( +; CHECK: %result = call i32 bitcast (i32 (i32, i8*)* @varargs_func to i32 (i32, { i64 }*)*)(i32 111, { i64 }* %vararg_buffer) + + +; CHECK: attributes #1 = { readonly } diff --git a/test/Transforms/NaCl/expand-varargs-emscripten.ll b/test/Transforms/NaCl/expand-varargs-emscripten.ll new file mode 100644 index 000000000000..ae19c519f11f --- /dev/null +++ b/test/Transforms/NaCl/expand-varargs-emscripten.ll @@ -0,0 +1,28 @@ +; RUN: opt < %s -mtriple=asmjs-unknown-emscripten -expand-varargs -S | FileCheck %s + +target datalayout = "p:32:32:32" + +%va_list = type i8* + +declare void @llvm.va_start(i8*) +declare void @llvm.va_end(i8*) +declare void @llvm.va_copy(i8*, i8*) + +declare void @emscripten_asm_const_int(...) +declare void @emscripten_asm_const_double(...) +declare void @emscripten_landingpad(...) +declare void @emscripten_resume(...) + +define void @test(i32 %arg) { + call void (...) @emscripten_asm_const_int(i32 %arg) + call void (...) @emscripten_asm_const_double(i32 %arg) + call void (...) @emscripten_landingpad(i32 %arg) + call void (...) @emscripten_resume(i32 %arg) + ret void +} +; CHECK-LABEL: define void @test( +; CHECK-NEXT: call void (...) @emscripten_asm_const_int(i32 %arg) +; CHECK-NEXT: call void (...) @emscripten_asm_const_double(i32 %arg) +; CHECK-NEXT: call void (...) @emscripten_landingpad(i32 %arg) +; CHECK-NEXT: call void (...) @emscripten_resume(i32 %arg) +; CHECK-NEXT: ret void diff --git a/test/Transforms/NaCl/expand-varargs-struct.ll b/test/Transforms/NaCl/expand-varargs-struct.ll new file mode 100644 index 000000000000..755c9e81021f --- /dev/null +++ b/test/Transforms/NaCl/expand-varargs-struct.ll @@ -0,0 +1,17 @@ +; RUN: opt < %s -expand-varargs -S | FileCheck %s + +declare i32 @varargs_func(i32 %arg, ...) + + +%MyStruct = type { i64, i64 } + +; Test passing a struct by value. +define i32 @varargs_call_struct(%MyStruct* %ptr) { + %result = call i32 (i32, ...) 
@varargs_func(i32 111, i64 222, %MyStruct* byval %ptr) + ret i32 %result +} +; CHECK-LABEL: @varargs_call_struct( +; CHECK: %vararg_ptr1 = getelementptr inbounds { i64, %MyStruct }, { i64, %MyStruct }* %vararg_buffer, i32 0, i32 1 +; CHECK: %1 = bitcast %MyStruct* %vararg_ptr1 to i8* +; CHECK: %2 = bitcast %MyStruct* %ptr to i8* +; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* %2, i64 16, i32 1, i1 false) diff --git a/test/Transforms/NaCl/expand-varargs.ll b/test/Transforms/NaCl/expand-varargs.ll new file mode 100644 index 000000000000..814e13a5857c --- /dev/null +++ b/test/Transforms/NaCl/expand-varargs.ll @@ -0,0 +1,250 @@ +; RUN: opt < %s -expand-varargs -S | FileCheck %s + +target datalayout = "p:32:32:32" + +%va_list = type i8* + +declare void @llvm.va_start(i8*) +declare void @llvm.va_end(i8*) +declare void @llvm.va_copy(i8*, i8*) + +declare i32 @outside_func(i32 %arg, %va_list* %args) + +define i32 @varargs_func(i32 %arg, ...) { + %arglist_alloc = alloca %va_list + %arglist = bitcast %va_list* %arglist_alloc to i8* + + call void @llvm.va_start(i8* %arglist) + %result = call i32 @outside_func(i32 %arg, %va_list* %arglist_alloc) + call void @llvm.va_end(i8* %arglist) + ret i32 %result +} +; CHECK-LABEL: define i32 @varargs_func(i32 %arg, i8* noalias %varargs) { +; CHECK-NEXT: %arglist_alloc = alloca i8* +; CHECK-NEXT: %arglist = bitcast i8** %arglist_alloc to i8* +; CHECK-NEXT: %arglist1 = bitcast i8* %arglist to i8** +; CHECK-NEXT: store i8* %varargs, i8** %arglist1 +; CHECK-NEXT: %result = call i32 @outside_func(i32 %arg, i8** %arglist_alloc) +; CHECK-NEXT: ret i32 %result + + +; Obtain each argument in the va_list according to its type (known from fmt). +; This function ensures that each argument is loaded with the same alignment as +; if it were inside a struct: this is how the caller passed the arguments. +; +; Note that alignof is represented as a GEP off of nullptr to the second element +; of a struct with { i1, types_whose_alignment_is_desired }. +define void @varargs_func_2(i8* nocapture %o8, i8* nocapture readonly %fmt, ...) 
{ +; CHECK-LABEL: @varargs_func_2( +entry: + %o16 = bitcast i8* %o8 to i16* + %o32 = bitcast i8* %o8 to i32* + %o64 = bitcast i8* %o8 to i64* + %ofloat = bitcast i8* %o8 to float* + %odouble = bitcast i8* %o8 to double* + + %arglist_alloc = alloca [4 x i32], align 4 + %arglist = getelementptr inbounds [4 x i32], [4 x i32]* %arglist_alloc, i32 0, i32 0 + %arglist.i8 = bitcast [4 x i32]* %arglist_alloc to i8* + call void @llvm.va_start(i8* %arglist.i8) + br label %start + +start: + %idx = phi i32 [ 0, %entry ], [ %inc, %next ] + %fmt.gep = getelementptr inbounds i8, i8* %fmt, i32 %idx + %arg.type = load i8, i8* %fmt.gep + switch i8 %arg.type, label %next [ + i8 0, label %done + i8 1, label %type.i8 + i8 2, label %type.i16 + i8 3, label %type.i32 + i8 4, label %type.i64 + i8 5, label %type.float + i8 6, label %type.double + ] + +type.i8: ; CHECK: type.i8: + %i8 = va_arg i32* %arglist, i8 + store i8 %i8, i8* %o8 + br label %next +; CHECK-NEXT: %arglist1 = bitcast i32* %arglist to i8** +; CHECK-NEXT: %arglist_current = load i8*, i8** %arglist1 +; CHECK-NEXT: %[[P2I:[0-9]+]] = ptrtoint i8* %arglist_current to i32 +; %A8 = (uintptr_t)Addr + Alignment - 1 +; CHECK-NEXT: %[[A8:[0-9]+]] = add nuw i32 %[[P2I]], sub nuw (i32 ptrtoint (i8* getelementptr ({ i1, i8 }, { i1, i8 }* null, i64 0, i32 1) to i32), i32 1) +; %B8 = %1 & ~(uintptr_t)(Alignment - 1) +; CHECK-NEXT: %[[B8:[0-9]+]] = and i32 %[[A8]], xor (i32 sub nuw (i32 ptrtoint (i8* getelementptr ({ i1, i8 }, { i1, i8 }* null, i64 0, i32 1) to i32), i32 1), i32 -1) +; CHECK-NEXT: %[[C8:[0-9]+]] = inttoptr i32 %[[B8]] to i8* +; CHECK-NEXT: %i8 = load i8, i8* %[[C8]] +; CHECK-NEXT: %arglist_next = getelementptr inbounds i8, i8* %[[C8]], i32 1 +; CHECK-NEXT: store i8* %arglist_next, i8** %arglist1 +; CHECK-NEXT: store i8 %i8, i8* %o8 +; CHECK-NEXT: br label %next + +type.i16: ; CHECK: type.i16: + %i16 = va_arg i32* %arglist, i16 + store i16 %i16, i16* %o16 + br label %next +; CHECK: %[[A16:[0-9]+]] = add nuw i32 %4, sub nuw (i32 ptrtoint (i16* getelementptr ({ i1, i16 }, { i1, i16 }* null, i64 0, i32 1) to i32), i32 1) +; CHECK-NEXT: %[[B16:[0-9]+]] = and i32 %[[A16]], xor (i32 sub nuw (i32 ptrtoint (i16* getelementptr ({ i1, i16 }, { i1, i16 }* null, i64 0, i32 1) to i32), i32 1), i32 -1) +; CHECK-NEXT: %[[C16:[0-9]+]] = inttoptr i32 %[[B16]] to i16* +; CHECK-NEXT: %i16 = load i16, i16* %[[C16]] + +type.i32: ; CHECK: type.i32: + %i32 = va_arg i32* %arglist, i32 + store i32 %i32, i32* %o32 + br label %next +; CHECK: %[[A32:[0-9]+]] = add nuw i32 %8, sub nuw (i32 ptrtoint (i32* getelementptr ({ i1, i32 }, { i1, i32 }* null, i64 0, i32 1) to i32), i32 1) +; CHECK-NEXT: %[[B32:[0-9]+]] = and i32 %[[A32]], xor (i32 sub nuw (i32 ptrtoint (i32* getelementptr ({ i1, i32 }, { i1, i32 }* null, i64 0, i32 1) to i32), i32 1), i32 -1) +; CHECK-NEXT: %[[C32:[0-9]+]] = inttoptr i32 %[[B32]] to i32* +; CHECK-NEXT: %i32 = load i32, i32* %[[C32]] + +type.i64: ; CHECK: type.i64: + %i64 = va_arg i32* %arglist, i64 + store i64 %i64, i64* %o64 + br label %next +; CHECK: %[[A64:[0-9]+]] = add nuw i32 %12, sub nuw (i32 ptrtoint (i64* getelementptr ({ i1, i64 }, { i1, i64 }* null, i64 0, i32 1) to i32), i32 1) +; CHECK-NEXT: %[[B64:[0-9]+]] = and i32 %[[A64]], xor (i32 sub nuw (i32 ptrtoint (i64* getelementptr ({ i1, i64 }, { i1, i64 }* null, i64 0, i32 1) to i32), i32 1), i32 -1) +; CHECK-NEXT: %[[C64:[0-9]+]] = inttoptr i32 %[[B64]] to i64* +; CHECK-NEXT: %i64 = load i64, i64* %[[C64]] + +type.float: ; CHECK: type.float: + %float = va_arg i32* %arglist, float + store 
float %float, float* %ofloat + br label %next +; CHECK: %[[AF:[0-9]+]] = add nuw i32 %16, sub nuw (i32 ptrtoint (float* getelementptr ({ i1, float }, { i1, float }* null, i64 0, i32 1) to i32), i32 1) +; CHECK-NEXT: %[[BF:[0-9]+]] = and i32 %[[AF]], xor (i32 sub nuw (i32 ptrtoint (float* getelementptr ({ i1, float }, { i1, float }* null, i64 0, i32 1) to i32), i32 1), i32 -1) +; CHECK-NEXT: %[[CF:[0-9]+]] = inttoptr i32 %[[BF]] to float* +; CHECK-NEXT: %float = load float, float* %[[CF]] + +type.double: ; CHECK: type.double: + %double = va_arg i32* %arglist, double + store double %double, double* %odouble + br label %next +; CHECK: %[[AD:[0-9]+]] = add nuw i32 %20, sub nuw (i32 ptrtoint (double* getelementptr ({ i1, double }, { i1, double }* null, i64 0, i32 1) to i32), i32 1) +; CHECK-NEXT: %[[BD:[0-9]+]] = and i32 %[[AD]], xor (i32 sub nuw (i32 ptrtoint (double* getelementptr ({ i1, double }, { i1, double }* null, i64 0, i32 1) to i32), i32 1), i32 -1) +; CHECK-NEXT: %[[CD:[0-9]+]] = inttoptr i32 %[[BD]] to double* +; CHECK-NEXT: %double = load double, double* %[[CD]] + +next: + %inc = add i32 %idx, 1 + br label %start + +done: + call void @llvm.va_end(i8* %arglist.i8) + ret void +} + + +define i32 @varargs_call1() { + %result = call i32 (i32, ...) @varargs_func(i32 111, i64 222, i32 333, double 4.0) + ret i32 %result +} +; CHECK-LABEL: @varargs_call1( +; CHECK-NEXT: %vararg_buffer = alloca { i64, i32, double } +; CHECK-NEXT: %vararg_lifetime_bitcast = bitcast { i64, i32, double }* %vararg_buffer to i8* +; CHECK-NEXT: call void @llvm.lifetime.start(i64 24, i8* %vararg_lifetime_bitcast) +; CHECK-NEXT: %vararg_ptr = getelementptr inbounds { i64, i32, double }, { i64, i32, double }* %vararg_buffer, i32 0, i32 0 +; CHECK-NEXT: store i64 222, i64* %vararg_ptr +; CHECK-NEXT: %vararg_ptr1 = getelementptr inbounds { i64, i32, double }, { i64, i32, double }* %vararg_buffer, i32 0, i32 1 +; CHECK-NEXT: store i32 333, i32* %vararg_ptr1 +; CHECK-NEXT: %vararg_ptr2 = getelementptr inbounds { i64, i32, double }, { i64, i32, double }* %vararg_buffer, i32 0, i32 2 +; CHECK-NEXT: store double 4.{{0*}}e+00, double* %vararg_ptr2 +; CHECK-NEXT: %result = call i32 bitcast (i32 (i32, i8*)* @varargs_func to i32 (i32, { i64, i32, double }*)*)(i32 111, { i64, i32, double }* %vararg_buffer) +; CHECK-NEXT: call void @llvm.lifetime.end(i64 24, i8* %vararg_lifetime_bitcast) +; CHECK-NEXT: ret i32 %result + + +; Check that the pass works when there are no variable arguments. +define i32 @call_with_zero_varargs() { + %result = call i32 (i32, ...) @varargs_func(i32 111) + ret i32 %result +} +; CHECK-LABEL: @call_with_zero_varargs( +; We have a dummy i32 field to deal with buggy programs: +; CHECK-NEXT: %vararg_buffer = alloca { i32 } +; CHECK-NEXT: %vararg_lifetime_bitcast = bitcast { i32 }* %vararg_buffer to i8* +; CHECK-NEXT: call void @llvm.lifetime.start(i64 4, i8* %vararg_lifetime_bitcast) +; CHECK-NEXT: %result = call i32 bitcast (i32 (i32, i8*)* @varargs_func to i32 (i32, { i32 }*)*)(i32 111, { i32 }* %vararg_buffer) +; CHECK-NEXT: call void @llvm.lifetime.end(i64 4, i8* %vararg_lifetime_bitcast) +; CHECK-NEXT: ret i32 %result + + +; Check that "invoke" instructions are expanded out too. 
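One detail worth noting before the test (an illustrative sketch with hypothetical names, not the pass's literal output): an invoke has two successors, so the lifetime of the on-stack argument buffer has to be ended on both the normal and the unwind path. Roughly:

    declare i32 @expanded_callee(i32, i8*)
    declare void @llvm.lifetime.start(i64, i8* nocapture)
    declare void @llvm.lifetime.end(i64, i8* nocapture)

    define i32 @sketch_invoke() {
      %vararg_buffer = alloca { i64 }
      %buf = bitcast { i64 }* %vararg_buffer to i8*
      call void @llvm.lifetime.start(i64 8, i8* %buf)
      %slot = getelementptr inbounds { i64 }, { i64 }* %vararg_buffer, i32 0, i32 0
      store i64 222, i64* %slot
      %r = invoke i32 @expanded_callee(i32 111, i8* %buf)
              to label %cont unwind label %lpad
    cont:
      ; The buffer's lifetime ends on the normal path...
      call void @llvm.lifetime.end(i64 8, i8* %buf)
      ret i32 %r
    lpad:
      %lp = landingpad { i8*, i32 } personality i8* null cleanup
      ; ...and on the unwind path as well.
      call void @llvm.lifetime.end(i64 8, i8* %buf)
      ret i32 0
    }

The @varargs_invoke test below checks the same lifetime.start/lifetime.end placement against the real pass output.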
+define i32 @varargs_invoke() { + %result = invoke i32 (i32, ...)* @varargs_func(i32 111, i64 222) + to label %cont unwind label %lpad +cont: + ret i32 %result +lpad: + %lp = landingpad { i8*, i32 } personality i8* null cleanup + ret i32 0 +} +; CHECK-LABEL: @varargs_invoke( +; CHECK: call void @llvm.lifetime.start(i64 8, i8* %vararg_lifetime_bitcast) +; CHECK: %result = invoke i32 bitcast (i32 (i32, i8*)* @varargs_func to i32 (i32, { i64 }*)*)(i32 111, { i64 }* %vararg_buffer) +; CHECK-NEXT: to label %cont unwind label %lpad +; CHECK: cont: +; CHECK-NEXT: call void @llvm.lifetime.end(i64 8, i8* %vararg_lifetime_bitcast) +; CHECK: lpad: +; CHECK: call void @llvm.lifetime.end(i64 8, i8* %vararg_lifetime_bitcast) + + +define void @varargs_multiple_calls() { + %call1 = call i32 (i32, ...) @varargs_func(i32 11, i64 22, i32 33) + %call2 = call i32 (i32, ...) @varargs_func(i32 44, i64 55, i32 66) + ret void +} +; CHECK-LABEL: @varargs_multiple_calls( +; The added allocas should appear at the start of the function. +; CHECK: %vararg_buffer{{.*}} = alloca { i64, i32 } +; CHECK: %vararg_buffer{{.*}} = alloca { i64, i32 } +; CHECK: %call1 = call i32 bitcast (i32 (i32, i8*)* @varargs_func to i32 (i32, { i64, i32 }*)*)(i32 11, { i64, i32 }* %vararg_buffer{{.*}}) +; CHECK: %call2 = call i32 bitcast (i32 (i32, i8*)* @varargs_func to i32 (i32, { i64, i32 }*)*)(i32 44, { i64, i32 }* %vararg_buffer{{.*}}) + + + +define i32 @va_arg_i32(i8* %arglist) { + %result = va_arg i8* %arglist, i32 + ret i32 %result +} +; CHECK-LABEL: define i32 @va_arg_i32(i8* %arglist) { +; CHECK-NEXT: %arglist1 = bitcast i8* %arglist to i32** +; CHECK-NEXT: %arglist_current = load i32*, i32** %arglist1 +; CHECK-NEXT: %1 = ptrtoint i32* %arglist_current to i32 +; CHECK-NEXT: %2 = add nuw i32 %1, sub nuw (i32 ptrtoint (i32* getelementptr ({ i1, i32 }, { i1, i32 }* null, i64 0, i32 1) to i32), i32 1) +; CHECK-NEXT: %3 = and i32 %2, xor (i32 sub nuw (i32 ptrtoint (i32* getelementptr ({ i1, i32 }, { i1, i32 }* null, i64 0, i32 1) to i32), i32 1), i32 -1) +; CHECK-NEXT: %4 = inttoptr i32 %3 to i32* +; CHECK-NEXT: %result = load i32, i32* %4 +; CHECK-NEXT: %arglist_next = getelementptr inbounds i32, i32* %4, i32 1 +; CHECK-NEXT: store i32* %arglist_next, i32** %arglist1 +; CHECK-NEXT: ret i32 %result + + +define i64 @va_arg_i64(i8* %arglist) { + %result = va_arg i8* %arglist, i64 + ret i64 %result +} +; CHECK-LABEL: define i64 @va_arg_i64(i8* %arglist) { +; CHECK-NEXT: %arglist1 = bitcast i8* %arglist to i64** +; CHECK-NEXT: %arglist_current = load i64*, i64** %arglist1 +; CHECK-NEXT: %1 = ptrtoint i64* %arglist_current to i32 +; CHECK-NEXT: %2 = add nuw i32 %1, sub nuw (i32 ptrtoint (i64* getelementptr ({ i1, i64 }, { i1, i64 }* null, i64 0, i32 1) to i32), i32 1) +; CHECK-NEXT: %3 = and i32 %2, xor (i32 sub nuw (i32 ptrtoint (i64* getelementptr ({ i1, i64 }, { i1, i64 }* null, i64 0, i32 1) to i32), i32 1), i32 -1) +; CHECK-NEXT: %4 = inttoptr i32 %3 to i64* +; CHECK-NEXT: %result = load i64, i64* %4 +; CHECK-NEXT: %arglist_next = getelementptr inbounds i64, i64* %4, i32 1 +; CHECK-NEXT: store i64* %arglist_next, i64** %arglist1 +; CHECK-NEXT: ret i64 %result + + +define void @do_va_copy(i8* %dest, i8* %src) { + call void @llvm.va_copy(i8* %dest, i8* %src) + ret void +} +; CHECK-LABEL: define void @do_va_copy( +; CHECK-NEXT: %vacopy_src = bitcast i8* %src to i8** +; CHECK-NEXT: %vacopy_dest = bitcast i8* %dest to i8** +; CHECK-NEXT: %vacopy_currentptr = load i8*, i8** %vacopy_src +; CHECK-NEXT: store i8* %vacopy_currentptr, i8** 
%vacopy_dest +; CHECK-NEXT: ret void diff --git a/test/Transforms/NaCl/fix-vector-load-store-alignment.ll b/test/Transforms/NaCl/fix-vector-load-store-alignment.ll new file mode 100644 index 000000000000..914da432782b --- /dev/null +++ b/test/Transforms/NaCl/fix-vector-load-store-alignment.ll @@ -0,0 +1,435 @@ +; RUN: opt -fix-vector-load-store-alignment %s -S | FileCheck %s + +; Test that vector load/store are always element-aligned when possible, and get +; converted to scalar load/store when not. + +; The datalayout is needed to determine the alignment of the load/stores. +target datalayout = "e-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-p:32:32:32-v128:32:32" + +; Load ========================================================================= + +define <4 x i1> @test_load_4xi1(<4 x i1>* %loc) { + ; CHECK-LABEL: test_load_4xi1 + ; CHECK-NEXT: %[[BASE:[0-9]+]] = bitcast <4 x i1>* %loc to i1* + ; CHECK-NEXT: %[[GEP0:[0-9]+]] = getelementptr inbounds i1, i1* %[[BASE]], i32 0 + ; CHECK-NEXT: %[[LD0:[0-9]+]] = load i1, i1* %[[GEP0]], align 4 + ; CHECK-NEXT: %[[INS0:[0-9]+]] = insertelement <4 x i1> undef, i1 %[[LD0]], i32 0 + ; CHECK-NEXT: %[[GEP1:[0-9]+]] = getelementptr inbounds i1, i1* %[[BASE]], i32 1 + ; CHECK-NEXT: %[[LD1:[0-9]+]] = load i1, i1* %[[GEP1]], align 1 + ; CHECK-NEXT: %[[INS1:[0-9]+]] = insertelement <4 x i1> %[[INS0]], i1 %[[LD1]], i32 1 + ; CHECK-NEXT: %[[GEP2:[0-9]+]] = getelementptr inbounds i1, i1* %[[BASE]], i32 2 + ; CHECK-NEXT: %[[LD2:[0-9]+]] = load i1, i1* %[[GEP2]], align 2 + ; CHECK-NEXT: %[[INS2:[0-9]+]] = insertelement <4 x i1> %[[INS1]], i1 %[[LD2]], i32 2 + ; CHECK-NEXT: %[[GEP3:[0-9]+]] = getelementptr inbounds i1, i1* %[[BASE]], i32 3 + ; CHECK-NEXT: %[[LD3:[0-9]+]] = load i1, i1* %[[GEP3]], align 1 + ; CHECK-NEXT: %[[INS3:[0-9]+]] = insertelement <4 x i1> %[[INS2]], i1 %[[LD3]], i32 3 + ; CHECK-NEXT: ret <4 x i1> %[[INS3]] + %loaded = load <4 x i1>, <4 x i1>* %loc + ret <4 x i1> %loaded +} + +define <8 x i1> @test_load_8xi1(<8 x i1>* %loc) { + ; CHECK-LABEL: test_load_8xi1 + ; CHECK-NEXT: %[[BASE:[0-9]+]] = bitcast <8 x i1>* %loc to i1* + ; CHECK-NEXT: %[[GEP0:[0-9]+]] = getelementptr inbounds i1, i1* %[[BASE]], i32 0 + ; CHECK-NEXT: %[[LD0:[0-9]+]] = load i1, i1* %[[GEP0]], align 8 + ; CHECK-NEXT: %[[INS0:[0-9]+]] = insertelement <8 x i1> undef, i1 %[[LD0]], i32 0 + ; CHECK-NEXT: %[[GEP1:[0-9]+]] = getelementptr inbounds i1, i1* %[[BASE]], i32 1 + ; CHECK-NEXT: %[[LD1:[0-9]+]] = load i1, i1* %[[GEP1]], align 1 + ; CHECK-NEXT: %[[INS1:[0-9]+]] = insertelement <8 x i1> %[[INS0]], i1 %[[LD1]], i32 1 + ; CHECK-NEXT: %[[GEP2:[0-9]+]] = getelementptr inbounds i1, i1* %[[BASE]], i32 2 + ; CHECK-NEXT: %[[LD2:[0-9]+]] = load i1, i1* %[[GEP2]], align 2 + ; CHECK-NEXT: %[[INS2:[0-9]+]] = insertelement <8 x i1> %[[INS1]], i1 %[[LD2]], i32 2 + ; CHECK-NEXT: %[[GEP3:[0-9]+]] = getelementptr inbounds i1, i1* %[[BASE]], i32 3 + ; CHECK-NEXT: %[[LD3:[0-9]+]] = load i1, i1* %[[GEP3]], align 1 + ; CHECK-NEXT: %[[INS3:[0-9]+]] = insertelement <8 x i1> %[[INS2]], i1 %[[LD3]], i32 3 + ; CHECK-NEXT: %[[GEP4:[0-9]+]] = getelementptr inbounds i1, i1* %[[BASE]], i32 4 + ; CHECK-NEXT: %[[LD4:[0-9]+]] = load i1, i1* %[[GEP4]], align 4 + ; CHECK-NEXT: %[[INS4:[0-9]+]] = insertelement <8 x i1> %[[INS3]], i1 %[[LD4]], i32 4 + ; CHECK-NEXT: %[[GEP5:[0-9]+]] = getelementptr inbounds i1, i1* %[[BASE]], i32 5 + ; CHECK-NEXT: %[[LD5:[0-9]+]] = load i1, i1* %[[GEP5]], align 1 + ; CHECK-NEXT: %[[INS5:[0-9]+]] = insertelement <8 x i1> %[[INS4]], i1 %[[LD5]], i32 5 + ; 
CHECK-NEXT: %[[GEP6:[0-9]+]] = getelementptr inbounds i1, i1* %[[BASE]], i32 6 + ; CHECK-NEXT: %[[LD6:[0-9]+]] = load i1, i1* %[[GEP6]], align 2 + ; CHECK-NEXT: %[[INS6:[0-9]+]] = insertelement <8 x i1> %[[INS5]], i1 %[[LD6]], i32 6 + ; CHECK-NEXT: %[[GEP7:[0-9]+]] = getelementptr inbounds i1, i1* %[[BASE]], i32 7 + ; CHECK-NEXT: %[[LD7:[0-9]+]] = load i1, i1* %[[GEP7]], align 1 + ; CHECK-NEXT: %[[INS7:[0-9]+]] = insertelement <8 x i1> %[[INS6]], i1 %[[LD7]], i32 7 + ; CHECK-NEXT: ret <8 x i1> %[[INS7]] + %loaded = load <8 x i1>, <8 x i1>* %loc + ret <8 x i1> %loaded +} + +define <16 x i1> @test_load_16xi1(<16 x i1>* %loc) { + ; CHECK-LABEL: test_load_16xi1 + ; CHECK-NEXT: %[[BASE:[0-9]+]] = bitcast <16 x i1>* %loc to i1* + ; CHECK-NEXT: %[[GEP0:[0-9]+]] = getelementptr inbounds i1, i1* %[[BASE]], i32 0 + ; CHECK-NEXT: %[[LD0:[0-9]+]] = load i1, i1* %[[GEP0]], align 16 + ; CHECK-NEXT: %[[INS0:[0-9]+]] = insertelement <16 x i1> undef, i1 %[[LD0]], i32 0 + ; CHECK-NEXT: %[[GEP1:[0-9]+]] = getelementptr inbounds i1, i1* %[[BASE]], i32 1 + ; CHECK-NEXT: %[[LD1:[0-9]+]] = load i1, i1* %[[GEP1]], align 1 + ; CHECK-NEXT: %[[INS1:[0-9]+]] = insertelement <16 x i1> %[[INS0]], i1 %[[LD1]], i32 1 + ; CHECK-NEXT: %[[GEP2:[0-9]+]] = getelementptr inbounds i1, i1* %[[BASE]], i32 2 + ; CHECK-NEXT: %[[LD2:[0-9]+]] = load i1, i1* %[[GEP2]], align 2 + ; CHECK-NEXT: %[[INS2:[0-9]+]] = insertelement <16 x i1> %[[INS1]], i1 %[[LD2]], i32 2 + ; CHECK-NEXT: %[[GEP3:[0-9]+]] = getelementptr inbounds i1, i1* %[[BASE]], i32 3 + ; CHECK-NEXT: %[[LD3:[0-9]+]] = load i1, i1* %[[GEP3]], align 1 + ; CHECK-NEXT: %[[INS3:[0-9]+]] = insertelement <16 x i1> %[[INS2]], i1 %[[LD3]], i32 3 + ; CHECK-NEXT: %[[GEP4:[0-9]+]] = getelementptr inbounds i1, i1* %[[BASE]], i32 4 + ; CHECK-NEXT: %[[LD4:[0-9]+]] = load i1, i1* %[[GEP4]], align 4 + ; CHECK-NEXT: %[[INS4:[0-9]+]] = insertelement <16 x i1> %[[INS3]], i1 %[[LD4]], i32 4 + ; CHECK-NEXT: %[[GEP5:[0-9]+]] = getelementptr inbounds i1, i1* %[[BASE]], i32 5 + ; CHECK-NEXT: %[[LD5:[0-9]+]] = load i1, i1* %[[GEP5]], align 1 + ; CHECK-NEXT: %[[INS5:[0-9]+]] = insertelement <16 x i1> %[[INS4]], i1 %[[LD5]], i32 5 + ; CHECK-NEXT: %[[GEP6:[0-9]+]] = getelementptr inbounds i1, i1* %[[BASE]], i32 6 + ; CHECK-NEXT: %[[LD6:[0-9]+]] = load i1, i1* %[[GEP6]], align 2 + ; CHECK-NEXT: %[[INS6:[0-9]+]] = insertelement <16 x i1> %[[INS5]], i1 %[[LD6]], i32 6 + ; CHECK-NEXT: %[[GEP7:[0-9]+]] = getelementptr inbounds i1, i1* %[[BASE]], i32 7 + ; CHECK-NEXT: %[[LD7:[0-9]+]] = load i1, i1* %[[GEP7]], align 1 + ; CHECK-NEXT: %[[INS7:[0-9]+]] = insertelement <16 x i1> %[[INS6]], i1 %[[LD7]], i32 7 + ; CHECK-NEXT: %[[GEP8:[0-9]+]] = getelementptr inbounds i1, i1* %[[BASE]], i32 8 + ; CHECK-NEXT: %[[LD8:[0-9]+]] = load i1, i1* %[[GEP8]], align 8 + ; CHECK-NEXT: %[[INS8:[0-9]+]] = insertelement <16 x i1> %[[INS7]], i1 %[[LD8]], i32 8 + ; CHECK-NEXT: %[[GEP9:[0-9]+]] = getelementptr inbounds i1, i1* %[[BASE]], i32 9 + ; CHECK-NEXT: %[[LD9:[0-9]+]] = load i1, i1* %[[GEP9]], align 1 + ; CHECK-NEXT: %[[INS9:[0-9]+]] = insertelement <16 x i1> %[[INS8]], i1 %[[LD9]], i32 9 + ; CHECK-NEXT: %[[GEP10:[0-9]+]] = getelementptr inbounds i1, i1* %[[BASE]], i32 10 + ; CHECK-NEXT: %[[LD10:[0-9]+]] = load i1, i1* %[[GEP10]], align 2 + ; CHECK-NEXT: %[[INS10:[0-9]+]] = insertelement <16 x i1> %[[INS9]], i1 %[[LD10]], i32 10 + ; CHECK-NEXT: %[[GEP11:[0-9]+]] = getelementptr inbounds i1, i1* %[[BASE]], i32 11 + ; CHECK-NEXT: %[[LD11:[0-9]+]] = load i1, i1* %[[GEP11]], align 1 + ; CHECK-NEXT: %[[INS11:[0-9]+]] = 
insertelement <16 x i1> %[[INS10]], i1 %[[LD11]], i32 11 + ; CHECK-NEXT: %[[GEP12:[0-9]+]] = getelementptr inbounds i1, i1* %[[BASE]], i32 12 + ; CHECK-NEXT: %[[LD12:[0-9]+]] = load i1, i1* %[[GEP12]], align 4 + ; CHECK-NEXT: %[[INS12:[0-9]+]] = insertelement <16 x i1> %[[INS11]], i1 %[[LD12]], i32 12 + ; CHECK-NEXT: %[[GEP13:[0-9]+]] = getelementptr inbounds i1, i1* %[[BASE]], i32 13 + ; CHECK-NEXT: %[[LD13:[0-9]+]] = load i1, i1* %[[GEP13]], align 1 + ; CHECK-NEXT: %[[INS13:[0-9]+]] = insertelement <16 x i1> %[[INS12]], i1 %[[LD13]], i32 13 + ; CHECK-NEXT: %[[GEP14:[0-9]+]] = getelementptr inbounds i1, i1* %[[BASE]], i32 14 + ; CHECK-NEXT: %[[LD14:[0-9]+]] = load i1, i1* %[[GEP14]], align 2 + ; CHECK-NEXT: %[[INS14:[0-9]+]] = insertelement <16 x i1> %[[INS13]], i1 %[[LD14]], i32 14 + ; CHECK-NEXT: %[[GEP15:[0-9]+]] = getelementptr inbounds i1, i1* %[[BASE]], i32 15 + ; CHECK-NEXT: %[[LD15:[0-9]+]] = load i1, i1* %[[GEP15]], align 1 + ; CHECK-NEXT: %[[INS15:[0-9]+]] = insertelement <16 x i1> %[[INS14]], i1 %[[LD15]], i32 15 + ; CHECK-NEXT: ret <16 x i1> %[[INS15]] + %loaded = load <16 x i1>, <16 x i1>* %loc + ret <16 x i1> %loaded +} + +define <4 x i32> @test_load_4xi32_align0(<4 x i32>* %loc) { + ; CHECK-LABEL: test_load_4xi32_align0 + ; CHECK-NEXT: %loaded = load <4 x i32>, <4 x i32>* %loc, align 4 + ; CHECK-NEXT: ret <4 x i32> %loaded + %loaded = load <4 x i32>, <4 x i32>* %loc + ret <4 x i32> %loaded +} + +define <4 x i32> @test_load_4xi32_align1(<4 x i32>* %loc) { + ; CHECK-LABEL: test_load_4xi32_align1 + ; CHECK-NEXT: %[[BASE:[0-9]+]] = bitcast <4 x i32>* %loc to i32* + ; CHECK-NEXT: %[[GEP0:[0-9]+]] = getelementptr inbounds i32, i32* %[[BASE]], i32 0 + ; CHECK-NEXT: %[[LD0:[0-9]+]] = load i32, i32* %[[GEP0]], align 1 + ; CHECK-NEXT: %[[INS0:[0-9]+]] = insertelement <4 x i32> undef, i32 %[[LD0]], i32 0 + ; CHECK-NEXT: %[[GEP1:[0-9]+]] = getelementptr inbounds i32, i32* %[[BASE]], i32 1 + ; CHECK-NEXT: %[[LD1:[0-9]+]] = load i32, i32* %[[GEP1]], align 1 + ; CHECK-NEXT: %[[INS1:[0-9]+]] = insertelement <4 x i32> %[[INS0]], i32 %[[LD1]], i32 1 + ; CHECK-NEXT: %[[GEP2:[0-9]+]] = getelementptr inbounds i32, i32* %[[BASE]], i32 2 + ; CHECK-NEXT: %[[LD2:[0-9]+]] = load i32, i32* %[[GEP2]], align 1 + ; CHECK-NEXT: %[[INS2:[0-9]+]] = insertelement <4 x i32> %[[INS1]], i32 %[[LD2]], i32 2 + ; CHECK-NEXT: %[[GEP3:[0-9]+]] = getelementptr inbounds i32, i32* %[[BASE]], i32 3 + ; CHECK-NEXT: %[[LD3:[0-9]+]] = load i32, i32* %[[GEP3]], align 1 + ; CHECK-NEXT: %[[INS3:[0-9]+]] = insertelement <4 x i32> %[[INS2]], i32 %[[LD3]], i32 3 + ; CHECK-NEXT: ret <4 x i32> %[[INS3]] + %loaded = load <4 x i32>, <4 x i32>* %loc, align 1 + ret <4 x i32> %loaded +} + +define <4 x i32> @test_load_4xi32_align2(<4 x i32>* %loc) { + ; CHECK-LABEL: test_load_4xi32_align2 + ; CHECK-NEXT: %[[BASE:[0-9]+]] = bitcast <4 x i32>* %loc to i32* + ; CHECK-NEXT: %[[GEP0:[0-9]+]] = getelementptr inbounds i32, i32* %[[BASE]], i32 0 + ; CHECK-NEXT: %[[LD0:[0-9]+]] = load i32, i32* %[[GEP0]], align 2 + ; CHECK-NEXT: %[[INS0:[0-9]+]] = insertelement <4 x i32> undef, i32 %[[LD0]], i32 0 + ; CHECK-NEXT: %[[GEP1:[0-9]+]] = getelementptr inbounds i32, i32* %[[BASE]], i32 1 + ; CHECK-NEXT: %[[LD1:[0-9]+]] = load i32, i32* %[[GEP1]], align 2 + ; CHECK-NEXT: %[[INS1:[0-9]+]] = insertelement <4 x i32> %[[INS0]], i32 %[[LD1]], i32 1 + ; CHECK-NEXT: %[[GEP2:[0-9]+]] = getelementptr inbounds i32, i32* %[[BASE]], i32 2 + ; CHECK-NEXT: %[[LD2:[0-9]+]] = load i32, i32* %[[GEP2]], align 2 + ; CHECK-NEXT: %[[INS2:[0-9]+]] = insertelement <4 x i32> 
%[[INS1]], i32 %[[LD2]], i32 2 + ; CHECK-NEXT: %[[GEP3:[0-9]+]] = getelementptr inbounds i32, i32* %[[BASE]], i32 3 + ; CHECK-NEXT: %[[LD3:[0-9]+]] = load i32, i32* %[[GEP3]], align 2 + ; CHECK-NEXT: %[[INS3:[0-9]+]] = insertelement <4 x i32> %[[INS2]], i32 %[[LD3]], i32 3 + ; CHECK-NEXT: ret <4 x i32> %[[INS3]] + %loaded = load <4 x i32>, <4 x i32>* %loc, align 2 + ret <4 x i32> %loaded +} + +define <4 x i32> @test_load_4xi32_align4(<4 x i32>* %loc) { + ; CHECK-LABEL: test_load_4xi32_align4 + ; CHECK-NEXT: %loaded = load <4 x i32>, <4 x i32>* %loc, align 4 + ; CHECK-NEXT: ret <4 x i32> %loaded + %loaded = load <4 x i32>, <4 x i32>* %loc, align 4 + ret <4 x i32> %loaded +} + +define <4 x i32> @test_load_4xi32_align8(<4 x i32>* %loc) { + ; CHECK-LABEL: test_load_4xi32_align8 + ; CHECK-NEXT: %loaded = load <4 x i32>, <4 x i32>* %loc, align 4 + ; CHECK-NEXT: ret <4 x i32> %loaded + %loaded = load <4 x i32>, <4 x i32>* %loc, align 8 + ret <4 x i32> %loaded +} + +define <4 x i32> @test_load_4xi32_align16(<4 x i32>* %loc) { + ; CHECK-LABEL: test_load_4xi32_align16 + ; CHECK-NEXT: %loaded = load <4 x i32>, <4 x i32>* %loc, align 4 + ; CHECK-NEXT: ret <4 x i32> %loaded + %loaded = load <4 x i32>, <4 x i32>* %loc, align 16 + ret <4 x i32> %loaded +} + +define <4 x i32> @test_load_4xi32_align32(<4 x i32>* %loc) { + ; CHECK-LABEL: test_load_4xi32_align32 + ; CHECK-NEXT: %loaded = load <4 x i32>, <4 x i32>* %loc, align 4 + ; CHECK-NEXT: ret <4 x i32> %loaded + %loaded = load <4 x i32>, <4 x i32>* %loc, align 32 + ret <4 x i32> %loaded +} + +define <4 x float> @test_load_4xfloat_align0(<4 x float>* %loc) { + ; CHECK-LABEL: test_load_4xfloat_align0 + ; CHECK-NEXT: %loaded = load <4 x float>, <4 x float>* %loc, align 4 + ; CHECK-NEXT: ret <4 x float> %loaded + %loaded = load <4 x float>, <4 x float>* %loc + ret <4 x float> %loaded +} + +define <4 x float> @test_load_4xfloat_align2(<4 x float>* %loc) { + ; CHECK-LABEL: test_load_4xfloat_align2 + ; CHECK-NEXT: %[[BASE:[0-9]+]] = bitcast <4 x float>* %loc to float* + ; CHECK-NEXT: %[[GEP0:[0-9]+]] = getelementptr inbounds float, float* %[[BASE]], i32 0 + ; CHECK-NEXT: %[[LD0:[0-9]+]] = load float, float* %[[GEP0]], align 2 + ; CHECK-NEXT: %[[INS0:[0-9]+]] = insertelement <4 x float> undef, float %[[LD0]], i32 0 + ; CHECK-NEXT: %[[GEP1:[0-9]+]] = getelementptr inbounds float, float* %[[BASE]], i32 1 + ; CHECK-NEXT: %[[LD1:[0-9]+]] = load float, float* %[[GEP1]], align 2 + ; CHECK-NEXT: %[[INS1:[0-9]+]] = insertelement <4 x float> %[[INS0]], float %[[LD1]], i32 1 + ; CHECK-NEXT: %[[GEP2:[0-9]+]] = getelementptr inbounds float, float* %[[BASE]], i32 2 + ; CHECK-NEXT: %[[LD2:[0-9]+]] = load float, float* %[[GEP2]], align 2 + ; CHECK-NEXT: %[[INS2:[0-9]+]] = insertelement <4 x float> %[[INS1]], float %[[LD2]], i32 2 + ; CHECK-NEXT: %[[GEP3:[0-9]+]] = getelementptr inbounds float, float* %[[BASE]], i32 3 + ; CHECK-NEXT: %[[LD3:[0-9]+]] = load float, float* %[[GEP3]], align 2 + ; CHECK-NEXT: %[[INS3:[0-9]+]] = insertelement <4 x float> %[[INS2]], float %[[LD3]], i32 3 + ; CHECK-NEXT: ret <4 x float> %[[INS3]] + %loaded = load <4 x float>, <4 x float>* %loc, align 2 + ret <4 x float> %loaded +} + +define <4 x float> @test_load_4xfloat_align4(<4 x float>* %loc) { + ; CHECK-LABEL: test_load_4xfloat_align4 + ; CHECK-NEXT: %loaded = load <4 x float>, <4 x float>* %loc, align 4 + ; CHECK-NEXT: ret <4 x float> %loaded + %loaded = load <4 x float>, <4 x float>* %loc, align 4 + ret <4 x float> %loaded +} + +define <8 x i16> @test_load_8xi16_align0(<8 x i16>* %loc) { + ; 
CHECK-LABEL: test_load_8xi16_align0 + ; CHECK-NEXT: %loaded = load <8 x i16>, <8 x i16>* %loc, align 2 + ; CHECK-NEXT: ret <8 x i16> %loaded + %loaded = load <8 x i16>, <8 x i16>* %loc + ret <8 x i16> %loaded +} + +define <8 x i16> @test_load_8xi16_align1(<8 x i16>* %loc) { + ; CHECK-LABEL: test_load_8xi16_align1 + ; CHECK-NEXT: %[[BASE:[0-9]+]] = bitcast <8 x i16>* %loc to i16* + ; CHECK-NEXT: %[[GEP0:[0-9]+]] = getelementptr inbounds i16, i16* %[[BASE]], i32 0 + ; CHECK-NEXT: %[[LD0:[0-9]+]] = load i16, i16* %[[GEP0]], align 1 + ; CHECK-NEXT: %[[INS0:[0-9]+]] = insertelement <8 x i16> undef, i16 %[[LD0]], i32 0 + ; CHECK-NEXT: %[[GEP1:[0-9]+]] = getelementptr inbounds i16, i16* %[[BASE]], i32 1 + ; CHECK-NEXT: %[[LD1:[0-9]+]] = load i16, i16* %[[GEP1]], align 1 + ; CHECK-NEXT: %[[INS1:[0-9]+]] = insertelement <8 x i16> %[[INS0]], i16 %[[LD1]], i32 1 + ; CHECK-NEXT: %[[GEP2:[0-9]+]] = getelementptr inbounds i16, i16* %[[BASE]], i32 2 + ; CHECK-NEXT: %[[LD2:[0-9]+]] = load i16, i16* %[[GEP2]], align 1 + ; CHECK-NEXT: %[[INS2:[0-9]+]] = insertelement <8 x i16> %[[INS1]], i16 %[[LD2]], i32 2 + ; CHECK-NEXT: %[[GEP3:[0-9]+]] = getelementptr inbounds i16, i16* %[[BASE]], i32 3 + ; CHECK-NEXT: %[[LD3:[0-9]+]] = load i16, i16* %[[GEP3]], align 1 + ; CHECK-NEXT: %[[INS3:[0-9]+]] = insertelement <8 x i16> %[[INS2]], i16 %[[LD3]], i32 3 + ; CHECK-NEXT: %[[GEP4:[0-9]+]] = getelementptr inbounds i16, i16* %[[BASE]], i32 4 + ; CHECK-NEXT: %[[LD4:[0-9]+]] = load i16, i16* %[[GEP4]], align 1 + ; CHECK-NEXT: %[[INS4:[0-9]+]] = insertelement <8 x i16> %[[INS3]], i16 %[[LD4]], i32 4 + ; CHECK-NEXT: %[[GEP5:[0-9]+]] = getelementptr inbounds i16, i16* %[[BASE]], i32 5 + ; CHECK-NEXT: %[[LD5:[0-9]+]] = load i16, i16* %[[GEP5]], align 1 + ; CHECK-NEXT: %[[INS5:[0-9]+]] = insertelement <8 x i16> %[[INS4]], i16 %[[LD5]], i32 5 + ; CHECK-NEXT: %[[GEP6:[0-9]+]] = getelementptr inbounds i16, i16* %[[BASE]], i32 6 + ; CHECK-NEXT: %[[LD6:[0-9]+]] = load i16, i16* %[[GEP6]], align 1 + ; CHECK-NEXT: %[[INS6:[0-9]+]] = insertelement <8 x i16> %[[INS5]], i16 %[[LD6]], i32 6 + ; CHECK-NEXT: %[[GEP7:[0-9]+]] = getelementptr inbounds i16, i16* %[[BASE]], i32 7 + ; CHECK-NEXT: %[[LD7:[0-9]+]] = load i16, i16* %[[GEP7]], align 1 + ; CHECK-NEXT: %[[INS7:[0-9]+]] = insertelement <8 x i16> %[[INS6]], i16 %[[LD7]], i32 7 + ; CHECK-NEXT: ret <8 x i16> %[[INS7]] + %loaded = load <8 x i16>, <8 x i16>* %loc, align 1 + ret <8 x i16> %loaded +} + +define <8 x i16> @test_load_8xi16_align2(<8 x i16>* %loc) { + ; CHECK-LABEL: test_load_8xi16_align2 + ; CHECK-NEXT: %loaded = load <8 x i16>, <8 x i16>* %loc, align 2 + ; CHECK-NEXT: ret <8 x i16> %loaded + %loaded = load <8 x i16>, <8 x i16>* %loc, align 2 + ret <8 x i16> %loaded +} + +define <16 x i8> @test_load_16xi8_align0(<16 x i8>* %loc) { + ; CHECK-LABEL: test_load_16xi8_align0 + ; CHECK-NEXT: %loaded = load <16 x i8>, <16 x i8>* %loc, align 1 + ; CHECK-NEXT: ret <16 x i8> %loaded + %loaded = load <16 x i8>, <16 x i8>* %loc + ret <16 x i8> %loaded +} + +define <16 x i8> @test_load_16xi8_align1(<16 x i8>* %loc) { + ; CHECK-LABEL: test_load_16xi8_align1 + ; CHECK-NEXT: %loaded = load <16 x i8>, <16 x i8>* %loc, align 1 + ; CHECK-NEXT: ret <16 x i8> %loaded + %loaded = load <16 x i8>, <16 x i8>* %loc, align 1 + ret <16 x i8> %loaded +} + + +; Store ======================================================================== + +define void @test_store_4xi1(<4 x i1> %val, <4 x i1>* %loc) { + ; CHECK-LABEL: test_store_4xi1 + ; CHECK-NEXT: %[[BASE:[0-9]+]] = bitcast <4 x i1>* %loc to i1* + ; 
CHECK-NEXT: %[[GEP0:[0-9]+]] = getelementptr inbounds i1, i1* %[[BASE]], i32 0 + ; CHECK-NEXT: %[[EXT0:[0-9]+]] = extractelement <4 x i1> %val, i32 0 + ; CHECK-NEXT: store i1 %[[EXT0]], i1* %[[GEP0]], align 4 + ; CHECK-NEXT: %[[GEP1:[0-9]+]] = getelementptr inbounds i1, i1* %[[BASE]], i32 1 + ; CHECK-NEXT: %[[EXT1:[0-9]+]] = extractelement <4 x i1> %val, i32 1 + ; CHECK-NEXT: store i1 %[[EXT1]], i1* %[[GEP1]], align 1 + ; CHECK-NEXT: %[[GEP2:[0-9]+]] = getelementptr inbounds i1, i1* %[[BASE]], i32 2 + ; CHECK-NEXT: %[[EXT2:[0-9]+]] = extractelement <4 x i1> %val, i32 2 + ; CHECK-NEXT: store i1 %[[EXT2]], i1* %[[GEP2]], align 2 + ; CHECK-NEXT: %[[GEP3:[0-9]+]] = getelementptr inbounds i1, i1* %[[BASE]], i32 3 + ; CHECK-NEXT: %[[EXT3:[0-9]+]] = extractelement <4 x i1> %val, i32 3 + ; CHECK-NEXT: store i1 %[[EXT3]], i1* %[[GEP3]], align 1 + ; CHECK-NEXT: ret void + store <4 x i1> %val, <4 x i1>* %loc + ret void +} + +define void @test_store_4xi32_align0(<4 x i32> %val, <4 x i32>* %loc) { + ; CHECK-LABEL: test_store_4xi32_align0 + ; CHECK-NEXT: store <4 x i32> %val, <4 x i32>* %loc, align 4 + ; CHECK-NEXT: ret void + store <4 x i32> %val, <4 x i32>* %loc + ret void +} + +define void @test_store_4xi32_align1(<4 x i32> %val, <4 x i32>* %loc) { + ; CHECK-LABEL: test_store_4xi32_align1 + ; CHECK-NEXT: %[[BASE:[0-9]+]] = bitcast <4 x i32>* %loc to i32* + ; CHECK-NEXT: %[[GEP0:[0-9]+]] = getelementptr inbounds i32, i32* %[[BASE]], i32 0 + ; CHECK-NEXT: %[[EXT0:[0-9]+]] = extractelement <4 x i32> %val, i32 0 + ; CHECK-NEXT: store i32 %[[EXT0]], i32* %[[GEP0]], align 1 + ; CHECK-NEXT: %[[GEP1:[0-9]+]] = getelementptr inbounds i32, i32* %[[BASE]], i32 1 + ; CHECK-NEXT: %[[EXT1:[0-9]+]] = extractelement <4 x i32> %val, i32 1 + ; CHECK-NEXT: store i32 %[[EXT1]], i32* %[[GEP1]], align 1 + ; CHECK-NEXT: %[[GEP2:[0-9]+]] = getelementptr inbounds i32, i32* %[[BASE]], i32 2 + ; CHECK-NEXT: %[[EXT2:[0-9]+]] = extractelement <4 x i32> %val, i32 2 + ; CHECK-NEXT: store i32 %[[EXT2]], i32* %[[GEP2]], align 1 + ; CHECK-NEXT: %[[GEP3:[0-9]+]] = getelementptr inbounds i32, i32* %[[BASE]], i32 3 + ; CHECK-NEXT: %[[EXT3:[0-9]+]] = extractelement <4 x i32> %val, i32 3 + ; CHECK-NEXT: store i32 %[[EXT3]], i32* %[[GEP3]], align 1 + ; CHECK-NEXT: ret void + store <4 x i32> %val, <4 x i32>* %loc, align 1 + ret void +} + +define void @test_store_4xi32_align2(<4 x i32> %val, <4 x i32>* %loc) { + ; CHECK-LABEL: test_store_4xi32_align2 + ; CHECK-NEXT: %[[BASE:[0-9]+]] = bitcast <4 x i32>* %loc to i32* + ; CHECK-NEXT: %[[GEP0:[0-9]+]] = getelementptr inbounds i32, i32* %[[BASE]], i32 0 + ; CHECK-NEXT: %[[EXT0:[0-9]+]] = extractelement <4 x i32> %val, i32 0 + ; CHECK-NEXT: store i32 %[[EXT0]], i32* %[[GEP0]], align 2 + ; CHECK-NEXT: %[[GEP1:[0-9]+]] = getelementptr inbounds i32, i32* %[[BASE]], i32 1 + ; CHECK-NEXT: %[[EXT1:[0-9]+]] = extractelement <4 x i32> %val, i32 1 + ; CHECK-NEXT: store i32 %[[EXT1]], i32* %[[GEP1]], align 2 + ; CHECK-NEXT: %[[GEP2:[0-9]+]] = getelementptr inbounds i32, i32* %[[BASE]], i32 2 + ; CHECK-NEXT: %[[EXT2:[0-9]+]] = extractelement <4 x i32> %val, i32 2 + ; CHECK-NEXT: store i32 %[[EXT2]], i32* %[[GEP2]], align 2 + ; CHECK-NEXT: %[[GEP3:[0-9]+]] = getelementptr inbounds i32, i32* %[[BASE]], i32 3 + ; CHECK-NEXT: %[[EXT3:[0-9]+]] = extractelement <4 x i32> %val, i32 3 + ; CHECK-NEXT: store i32 %[[EXT3]], i32* %[[GEP3]], align 2 + ; CHECK-NEXT: ret void + store <4 x i32> %val, <4 x i32>* %loc, align 2 + ret void +} + +define void @test_store_4xi32_align4(<4 x i32> %val, <4 x i32>* %loc) { + ; 
CHECK-LABEL: test_store_4xi32_align4 + ; CHECK-NEXT: store <4 x i32> %val, <4 x i32>* %loc, align 4 + ; CHECK-NEXT: ret void + store <4 x i32> %val, <4 x i32>* %loc, align 4 + ret void +} + +define void @test_store_4xi32_align8(<4 x i32> %val, <4 x i32>* %loc) { + ; CHECK-LABEL: test_store_4xi32_align8 + ; CHECK-NEXT: store <4 x i32> %val, <4 x i32>* %loc, align 4 + ; CHECK-NEXT: ret void + store <4 x i32> %val, <4 x i32>* %loc, align 8 + ret void +} + +define void @test_store_4xi32_align16(<4 x i32> %val, <4 x i32>* %loc) { + ; CHECK-LABEL: test_store_4xi32_align16 + ; CHECK-NEXT: store <4 x i32> %val, <4 x i32>* %loc, align 4 + ; CHECK-NEXT: ret void + store <4 x i32> %val, <4 x i32>* %loc, align 16 + ret void +} + +define void @test_store_4xi32_align32(<4 x i32> %val, <4 x i32>* %loc) { + ; CHECK-LABEL: test_store_4xi32_align32 + ; CHECK-NEXT: store <4 x i32> %val, <4 x i32>* %loc, align 4 + ; CHECK-NEXT: ret void + store <4 x i32> %val, <4 x i32>* %loc, align 32 + ret void +} + +define void @test_store_4xfloat_align0(<4 x float> %val, <4 x float>* %loc) { + ; CHECK-LABEL: test_store_4xfloat_align0 + ; CHECK-NEXT: store <4 x float> %val, <4 x float>* %loc, align 4 + ; CHECK-NEXT: ret void + store <4 x float> %val, <4 x float>* %loc + ret void +} + + +; Volatile ===================================================================== + +define <4 x i32> @test_volatile_load_4xi32_align0(<4 x i32>* %loc) { + ; CHECK-LABEL: test_volatile_load_4xi32_align0 + ; CHECK-NEXT: %loaded = load volatile <4 x i32>, <4 x i32>* %loc, align 4 + ; CHECK-NEXT: ret <4 x i32> %loaded + %loaded = load volatile <4 x i32>, <4 x i32>* %loc + ret <4 x i32> %loaded +} + +define <4 x i32> @test_volatile_load_4xi32_align4(<4 x i32>* %loc) { + ; CHECK-LABEL: test_volatile_load_4xi32_align4 + ; CHECK-NEXT: %loaded = load volatile <4 x i32>, <4 x i32>* %loc, align 4 + ; CHECK-NEXT: ret <4 x i32> %loaded + %loaded = load volatile <4 x i32>, <4 x i32>* %loc, align 4 + ret <4 x i32> %loaded +} + +define void @test_volatile_store_4xi32_align0(<4 x i32> %val, <4 x i32>* %loc) { + ; CHECK-LABEL: test_volatile_store_4xi32_align0 + ; CHECK-NEXT: store volatile <4 x i32> %val, <4 x i32>* %loc, align 4 + ; CHECK-NEXT: ret void + store volatile <4 x i32> %val, <4 x i32>* %loc + ret void +} diff --git a/test/Transforms/NaCl/flatten-globals.ll b/test/Transforms/NaCl/flatten-globals.ll new file mode 100644 index 000000000000..fa62104ae299 --- /dev/null +++ b/test/Transforms/NaCl/flatten-globals.ll @@ -0,0 +1,209 @@ +; RUN: opt -flatten-globals %s -S | FileCheck %s +; RUN: opt -flatten-globals %s -S | FileCheck %s -check-prefix=CLEANED + +target datalayout = "p:32:32:32" + + +; Check simple cases + +@var_i32 = global i32 258 +; CHECK: @var_i32 = global [4 x i8] c"\02\01\00\00" +; CLEANED-NOT: global i32 258 + +@external_var = external global i32 +; CHECK: @external_var = external global [4 x i8] + +@zero_init = global i32 0 +; CHECK: @zero_init = global [4 x i8] zeroinitializer + +@big_zero_init = global [2000 x i8] zeroinitializer +; CHECK: @big_zero_init = global [2000 x i8] zeroinitializer + +@null_ptr = global i32* null +; CHECK: @null_ptr = global [4 x i8] zeroinitializer + +@undef_value = global i32 undef +; CHECK: @undef_value = global [4 x i8] zeroinitializer + +%opaque = type opaque +@opaque_extern = external global %opaque +; CHECK: @opaque_extern = external global [0 x i8] + + +; Check various data types + +@var_i1 = global i8 1 +; CHECK: @var_i1 = global [1 x i8] c"\01" + +@var_i8 = global i8 65 +; CHECK: @var_i8 = global 
[1 x i8] c"A" + +@var_i16 = global i16 258 +; CHECK: @var_i16 = global [2 x i8] c"\02\01" + +@var_i64 = global i64 72623859790382856 +; CHECK: @var_i64 = global [8 x i8] c"\08\07\06\05\04\03\02\01" + +@var_i128 = global i128 1339673755198158349044581307228491536 +; CHECK: @var_i128 = global [16 x i8] c"\10\0F\0E\0D\0C\0B\0A\09\08\07\06\05\04\03\02\01" + +; Check that padding bits come out as zero. +@var_i121 = global i121 1339673755198158349044581307228491536 +; CHECK: @var_i121 = global [16 x i8] c"\10\0F\0E\0D\0C\0B\0A\09\08\07\06\05\04\03\02\01" + +@var_double = global double 123.456 +; CHECK: @var_double = global [8 x i8] c"w\BE\9F\1A/\DD^@" + +@var_float = global float 123.0 +; CHECK: @var_float = global [4 x i8] c"\00\00\F6B" + + +; Check aggregates + +@padded_struct = global { i8, i8, i32 } { i8 65, i8 66, i32 258 } +; CHECK: @padded_struct = global [8 x i8] c"AB\00\00\02\01\00\00" + +@packed_struct = global <{ i8, i8, i32 }> <{ i8 67, i8 68, i32 258 }> +; CHECK: @packed_struct = global [6 x i8] c"CD\02\01\00\00" + +@i8_array = global [6 x i8] c"Hello\00" +; CHECK: @i8_array = global [6 x i8] c"Hello\00" + +@i16_array = global [3 x i16] [ i16 1, i16 2, i16 3 ] +; CHECK: @i16_array = global [6 x i8] c"\01\00\02\00\03\00" + +%s = type { i8, i8 } +@struct_array = global [2 x %s] [%s { i8 1, i8 2 }, %s { i8 3, i8 4 }] +; CHECK: @struct_array = global [4 x i8] c"\01\02\03\04" + +@vector = global <2 x i32> +; CHECK: @vector = global [8 x i8] c"\03\01\00\00\08\02\00\00" + + +; Check that various attributes are preserved + +@constant_var = constant i32 259 +; CHECK: @constant_var = constant [4 x i8] c"\03\01\00\00" + +@weak_external_var = extern_weak global i32 +; CHECK: @weak_external_var = extern_weak global [4 x i8] + +@tls_var = external thread_local global i32 +; CHECK: @tls_var = external thread_local global [4 x i8] + +@aligned_var = global i32 260, align 8 +; CHECK: @aligned_var = global [4 x i8] c"\04\01\00\00", align 8 + + +; Check alignment handling + +@implicit_alignment_i32 = global i32 zeroinitializer +; CHECK: @implicit_alignment_i32 = global [4 x i8] zeroinitializer, align 4 + +@implicit_alignment_double = global double zeroinitializer +; CHECK: @implicit_alignment_double = global [8 x i8] zeroinitializer, align 8 + +@implicit_alignment_vector = global <16 x i8> zeroinitializer +; CHECK: @implicit_alignment_vector = global [16 x i8] zeroinitializer, align 16 + +; FlattenGlobals is not allowed to increase the alignment of the +; variable when an explicit section is specified (although PNaCl does +; not support this attribute). +@lower_alignment_section = global i32 0, section "mysection", align 1 +; CHECK: @lower_alignment_section = global [4 x i8] zeroinitializer, section "mysection", align 1 + +; FlattenGlobals could increase the alignment when no section is +; specified, but it does not. 
+@lower_alignment = global i32 0, align 1 +; CHECK: @lower_alignment = global [4 x i8] zeroinitializer, align 1 + + +; Check handling of global references + +@var1 = external global i32 +@var2 = external global i8 + +%ptrs1 = type { i32*, i8*, i32 } +@ptrs1 = global %ptrs1 { i32* @var1, i8* null, i32 259 } +; CHECK: @ptrs1 = global <{ i32, [8 x i8] }> <{ i32 ptrtoint ([4 x i8]* @var1 to i32), [8 x i8] c"\00\00\00\00\03\01\00\00" }> + +%ptrs2 = type { i32, i32*, i8* } +@ptrs2 = global %ptrs2 { i32 259, i32* @var1, i8* @var2 } +; CHECK: @ptrs2 = global <{ [4 x i8], i32, i32 }> <{ [4 x i8] c"\03\01\00\00", i32 ptrtoint ([4 x i8]* @var1 to i32), i32 ptrtoint ([1 x i8]* @var2 to i32) }> + +%ptrs3 = type { i32*, [3 x i8], i8* } +@ptrs3 = global %ptrs3 { i32* @var1, [3 x i8] c"foo", i8* @var2 } +; CHECK: @ptrs3 = global <{ i32, [4 x i8], i32 }> <{ i32 ptrtoint ([4 x i8]* @var1 to i32), [4 x i8] c"foo\00", i32 ptrtoint ([1 x i8]* @var2 to i32) }> + +@ptr = global i32* @var1 +; CHECK: @ptr = global i32 ptrtoint ([4 x i8]* @var1 to i32) + +@func_ptr = global i32* ()* @get_address +; CHECK: @func_ptr = global i32 ptrtoint (i32* ()* @get_address to i32) + +@block_addr = global i8* blockaddress(@func_with_block, %label) +; CHECK: @block_addr = global i32 ptrtoint (i8* blockaddress(@func_with_block, %label) to i32) + +@vector_reloc = global <2 x i32*> <i32* @var1, i32* @var1> +; CHECK: global <{ i32, i32 }> <{ i32 ptrtoint ([4 x i8]* @var1 to i32), i32 ptrtoint ([4 x i8]* @var1 to i32) }> + + +; Global references with addends + +@reloc_addend = global i32* getelementptr (%ptrs1, %ptrs1* @ptrs1, i32 0, i32 2) +; CHECK: @reloc_addend = global i32 add (i32 ptrtoint (<{ i32, [8 x i8] }>* @ptrs1 to i32), i32 8) + +@negative_addend = global %ptrs1* getelementptr (%ptrs1, %ptrs1* @ptrs1, i32 -1) +; CHECK: @negative_addend = global i32 add (i32 ptrtoint (<{ i32, [8 x i8] }>* @ptrs1 to i32), i32 -12) + +@const_ptr = global i32* getelementptr (%ptrs1, %ptrs1* null, i32 0, i32 2) +; CHECK: @const_ptr = global [4 x i8] c"\08\00\00\00" + +@int_to_ptr = global i32* inttoptr (i16 260 to i32*) +; CHECK: @int_to_ptr = global [4 x i8] c"\04\01\00\00" + +; Clang allows "(uintptr_t) &var" as a global initializer, so we +; handle this case. +@ptr_to_int = global i32 ptrtoint (i8* @var2 to i32) +; CHECK: @ptr_to_int = global i32 ptrtoint ([1 x i8]* @var2 to i32) + +; This is handled via Constant folding. The getelementptr is +; converted to an undef when it is created, so the pass does not see a +; getelementptr here. +@undef_gep = global i32* getelementptr (%ptrs1, %ptrs1* undef, i32 0, i32 2) +; CHECK: @undef_gep = global [4 x i8] zeroinitializer + +; Adding an offset to a function address isn't useful, but check that +; the pass handles it anyway. +@func_addend = global i8* getelementptr ( + i8, + i8* bitcast (void ()* @func_with_block to i8*), i32 123) +; CHECK: @func_addend = global i32 add (i32 ptrtoint (void ()* @func_with_block to i32), i32 123) + +; Similarly, adding an offset to a label address isn't useful, but +; check it anyway. +@block_addend = global i8* getelementptr ( + i8, + i8* blockaddress(@func_with_block, %label), i32 100) +; CHECK: @block_addend = global i32 add (i32 ptrtoint (i8* blockaddress(@func_with_block, %label) to i32), i32 100) + + +; Special cases + +; Leave vars with "appending" linkage alone.
+@appending = appending global [1 x i32*] [i32* @var1] +; CHECK: @appending = appending global [1 x i32*] [i32* bitcast ([4 x i8]* @var1 to i32*)] + + +define i32* @get_address() { + ret i32* @var_i32 +} +; CHECK: define i32* @get_address() { +; CHECK-NEXT: ret i32* bitcast ([4 x i8]* @var_i32 to i32*) + + +define void @func_with_block() { + br label %label +label: + ret void +} diff --git a/test/Transforms/NaCl/globalcleanup.ll b/test/Transforms/NaCl/globalcleanup.ll new file mode 100644 index 000000000000..57c814d8522e --- /dev/null +++ b/test/Transforms/NaCl/globalcleanup.ll @@ -0,0 +1,59 @@ +; RUN: opt < %s -nacl-global-cleanup -S | FileCheck %s +; RUN: opt < %s -nacl-global-cleanup -S | FileCheck -check-prefix=GV %s + +@a = global i8 42 + +@llvm.compiler.used = appending global [1 x i8*] [i8* @a], section "llvm.metadata" +; GV-NOT: llvm.compiler.used + +@llvm.used = appending global [1 x i8*] [i8* @a], section "llvm.metadata" +; The used list remains unchanged. +; CHECK: llvm.used + +@extern_weak_const = extern_weak constant i32 +@extern_weak_gv = extern_weak global i32 + +; GV-NOT: @extern_weak_const +; GV-NOT: @extern_weak_gv + +; CHECK: @weak_gv = internal global +@weak_gv = weak global i32 0 + +; CHECK: define void @_start +define void @_start() { + ret void +} + +define i32* @ewgv() { +; CHECK: %bc = getelementptr i8, i8* null, i32 0 + %bc = getelementptr i8, i8* bitcast (i32* @extern_weak_gv to i8*), i32 0 +; CHECK: ret i32* null + ret i32* @extern_weak_gv +} + +define i32* @ewc() { +; CHECK: %bc = getelementptr i8, i8* null, i32 0 + %bc = getelementptr i8, i8* bitcast (i32* @extern_weak_const to i8*), i32 0 +; CHECK: ret i32* null + ret i32* @extern_weak_gv +} + +; Make sure @weak_gv is actually used. +define i32* @wgv() { +; CHECK: ret i32* @weak_gv + ret i32* @weak_gv +} + +; GV-NOT: @extern_weak_func +declare extern_weak i32 @extern_weak_func() +; CHECK: @ewf +define i32 @ewf() { +; CHECK: %ret = call i32 null() + %ret = call i32 @extern_weak_func() + ret i32 %ret +} + +; CHECK: define internal void @weak_func +define weak void @weak_func() { + ret void +} diff --git a/test/Transforms/NaCl/globalize-constant-vectors.ll b/test/Transforms/NaCl/globalize-constant-vectors.ll new file mode 100644 index 000000000000..a77fb7c2214d --- /dev/null +++ b/test/Transforms/NaCl/globalize-constant-vectors.ll @@ -0,0 +1,204 @@ +; RUN: opt -globalize-constant-vectors %s -S | FileCheck %s +; RUN: opt -globalize-constant-vectors %s -S | FileCheck -check-prefix=C4xi1 %s +; RUN: opt -globalize-constant-vectors %s -S | FileCheck -check-prefix=C8xi1 %s +; RUN: opt -globalize-constant-vectors %s -S | FileCheck -check-prefix=C16xi1 %s +; RUN: opt -globalize-constant-vectors %s -S | FileCheck -check-prefix=C16xi8 %s +; RUN: opt -globalize-constant-vectors %s -S | FileCheck -check-prefix=C8xi16 %s +; RUN: opt -globalize-constant-vectors %s -S | FileCheck -check-prefix=C4xi32 %s +; RUN: opt -globalize-constant-vectors %s -S | FileCheck -check-prefix=C4xfloat %s +; RUN: opt -globalize-constant-vectors %s -S | FileCheck -check-prefix=Cbranch %s +; RUN: opt -globalize-constant-vectors %s -S | FileCheck -check-prefix=Cduplicate %s +; RUN: opt -globalize-constant-vectors %s -S | FileCheck -check-prefix=Czeroinitializer %s +; RUN: opt -expand-constant-expr -globalize-constant-vectors %s -S | FileCheck -check-prefix=Cnestedconst %s + +; Run the test once per function so that each check can look at its +; globals as well as its function. + +; The datalayout is needed to determine the alignment of the globals. 
+target datalayout = "e-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-p:32:32:32-v128:32:32" + +; Globals shouldn't get globalized. +; CHECK: @global_should_stay_untouched = internal constant <4 x i32> +@global_should_stay_untouched = internal constant <4 x i32> + +; Also test a global initializer with nested const-exprs. +; NOTE: Have the global share the same const-expr as an instruction below. +; CHECK: @global_with_nesting = internal global <{ <4 x i32>, <8 x i16> }> <{ <4 x i32> , <8 x i16> }> +@global_with_nesting = internal global <{ <4 x i32>, <8 x i16> }> <{ <4 x i32> , <8 x i16> }> + +; 4xi1 vectors should get globalized. +define void @test4xi1(<4 x i1> %in) { + %ft0 = and <4 x i1> %in, + %ft1 = and <4 x i1> , %in + ret void +} +; C4xi1: @[[C1:[_a-z0-9]+]] = internal unnamed_addr constant <4 x i1> , align 4 +; C4xi1: @[[C2:[_a-z0-9]+]] = internal unnamed_addr constant <4 x i1> , align 4 +; C4xi1: define void @test4xi1(<4 x i1> %in) { +; C4xi1-NEXT: %[[M1:[_a-z0-9]+]] = load <4 x i1>, <4 x i1>* @[[C1]], align 4 +; C4xi1-NEXT: %[[M2:[_a-z0-9]+]] = load <4 x i1>, <4 x i1>* @[[C2]], align 4 +; C4xi1-NEXT: %ft0 = and <4 x i1> %in, %[[M1]] +; C4xi1-NEXT: %ft1 = and <4 x i1> %[[M2]], %in +; C4xi1-NEXT: ret void + +; 8xi1 vectors should get globalized. +define void @test8xi1(<8 x i1> %in) { + %ft0 = and <8 x i1> %in, + %ft1 = and <8 x i1> , %in + ret void +} +; C8xi1: @[[C1:[_a-z0-9]+]] = internal unnamed_addr constant <8 x i1> , align 8 +; C8xi1: @[[C2:[_a-z0-9]+]] = internal unnamed_addr constant <8 x i1> , align 8 +; C8xi1: define void @test8xi1(<8 x i1> %in) { +; C8xi1-NEXT: %[[M1:[_a-z0-9]+]] = load <8 x i1>, <8 x i1>* @[[C1]], align 8 +; C8xi1-NEXT: %[[M2:[_a-z0-9]+]] = load <8 x i1>, <8 x i1>* @[[C2]], align 8 +; C8xi1-NEXT: %ft0 = and <8 x i1> %in, %[[M1]] +; C8xi1-NEXT: %ft1 = and <8 x i1> %[[M2]], %in +; C8xi1-NEXT: ret void + +; 16xi1 vectors should get globalized. +define void @test16xi1(<16 x i1> %in) { + %ft0 = and <16 x i1> %in, + %ft1 = and <16 x i1> , %in + ret void +} +; C16xi1: @[[C1:[_a-z0-9]+]] = internal unnamed_addr constant <16 x i1> , align 16 +; C16xi1: @[[C2:[_a-z0-9]+]] = internal unnamed_addr constant <16 x i1> , align 16 +; C16xi1: define void @test16xi1(<16 x i1> %in) { +; C16xi1-NEXT: %[[M1:[_a-z0-9]+]] = load <16 x i1>, <16 x i1>* @[[C1]], align 16 +; C16xi1-NEXT: %[[M2:[_a-z0-9]+]] = load <16 x i1>, <16 x i1>* @[[C2]], align 16 +; C16xi1-NEXT: %ft0 = and <16 x i1> %in, %[[M1]] +; C16xi1-NEXT: %ft1 = and <16 x i1> %[[M2]], %in +; C16xi1-NEXT: ret void + +; 16xi8 vectors should get globalized. +define void @test16xi8(<16 x i8> %in) { + %nonsquares = add <16 x i8> %in, + %sort = add <16 x i8> , %in + ret void +} +; C16xi8: @[[C1:[_a-z0-9]+]] = internal unnamed_addr constant <16 x i8> , align 4 +; C16xi8: @[[C2:[_a-z0-9]+]] = internal unnamed_addr constant <16 x i8> , align 4 +; C16xi8: define void @test16xi8(<16 x i8> %in) { +; C16xi8-NEXT: %[[M1:[_a-z0-9]+]] = load <16 x i8>, <16 x i8>* @[[C1]], align 4 +; C16xi8-NEXT: %[[M2:[_a-z0-9]+]] = load <16 x i8>, <16 x i8>* @[[C2]], align 4 +; C16xi8-NEXT: %nonsquares = add <16 x i8> %in, %[[M1]] +; C16xi8-NEXT: %sort = add <16 x i8> %[[M2]], %in +; C16xi8-NEXT: ret void + +; 8xi16 vectors should get globalized. 
+define void @test8xi16(<8 x i16> %in) { + %fib = add <8 x i16> %in, + %answer = add <8 x i16> , %in + ret void +} +; C8xi16: @[[C1:[_a-z0-9]+]] = internal unnamed_addr constant <8 x i16> , align 4 +; C8xi16: @[[C2:[_a-z0-9]+]] = internal unnamed_addr constant <8 x i16> , align 4 +; C8xi16: define void @test8xi16(<8 x i16> %in) { +; C8xi16-NEXT: %[[M1:[_a-z0-9]+]] = load <8 x i16>, <8 x i16>* @[[C1]], align 4 +; C8xi16-NEXT: %[[M2:[_a-z0-9]+]] = load <8 x i16>, <8 x i16>* @[[C2]], align 4 +; C8xi16-NEXT: %fib = add <8 x i16> %in, %[[M1]] +; C8xi16-NEXT: %answer = add <8 x i16> %[[M2]], %in +; C8xi16-NEXT: ret void + +; 4xi32 vectors should get globalized. +define void @test4xi32(<4 x i32> %in) { + %tetrahedral = add <4 x i32> %in, + %serauqs = add <4 x i32> , %in + ret void +} +; C4xi32: @[[C1:[_a-z0-9]+]] = internal unnamed_addr constant <4 x i32> , align 4 +; C4xi32: @[[C2:[_a-z0-9]+]] = internal unnamed_addr constant <4 x i32> , align 4 +; C4xi32: define void @test4xi32(<4 x i32> %in) { +; C4xi32-NEXT: %[[M1:[_a-z0-9]+]] = load <4 x i32>, <4 x i32>* @[[C1]], align 4 +; C4xi32-NEXT: %[[M2:[_a-z0-9]+]] = load <4 x i32>, <4 x i32>* @[[C2]], align 4 +; C4xi32-NEXT: %tetrahedral = add <4 x i32> %in, %[[M1]] +; C4xi32-NEXT: %serauqs = add <4 x i32> %[[M2]], %in +; C4xi32-NEXT: ret void + +; 4xfloat vectors should get globalized. +define void @test4xfloat(<4 x float> %in) { + %polyhex = fadd <4 x float> %in, + %poset = fadd <4 x float> , %in + ret void +} +; C4xfloat: @[[C1:[_a-z0-9]+]] = internal unnamed_addr constant <4 x float> , align 4 +; C4xfloat: @[[C2:[_a-z0-9]+]] = internal unnamed_addr constant <4 x float> , align 4 +; C4xfloat: define void @test4xfloat(<4 x float> %in) { +; C4xfloat-NEXT: %[[M1:[_a-z0-9]+]] = load <4 x float>, <4 x float>* @[[C1]], align 4 +; C4xfloat-NEXT: %[[M2:[_a-z0-9]+]] = load <4 x float>, <4 x float>* @[[C2]], align 4 +; C4xfloat-NEXT: %polyhex = fadd <4 x float> %in, %[[M1]] +; C4xfloat-NEXT: %poset = fadd <4 x float> %[[M2]], %in +; C4xfloat-NEXT: ret void + +; Globalized constant loads have to dominate their use. +define void @testbranch(i1 %cond, <4 x i32> %in) { + br i1 %cond, label %lhs, label %rhs +lhs: + %from_lhs = add <4 x i32> %in, + br label %done +rhs: + %from_rhs = add <4 x i32> , %in + br label %done +done: + %merged = phi <4 x i32> [ %from_lhs, %lhs ], [ %from_rhs, %rhs ] + ret void +} +; Cbranch: @[[C1:[_a-z0-9]+]] = internal unnamed_addr constant <4 x i32> , align 4 +; Cbranch: @[[C2:[_a-z0-9]+]] = internal unnamed_addr constant <4 x i32> , align 4 +; Cbranch: define void @testbranch(i1 %cond, <4 x i32> %in) { +; Cbranch-NEXT: %[[M1:[_a-z0-9]+]] = load <4 x i32>, <4 x i32>* @[[C1]], align 4 +; Cbranch-NEXT: %[[M2:[_a-z0-9]+]] = load <4 x i32>, <4 x i32>* @[[C2]], align 4 +; Cbranch-NEXT: br i1 %cond, label %lhs, label %rhs +; Cbranch: lhs: +; Cbranch-NEXT: %from_lhs = add <4 x i32> %in, %[[M1]] +; Cbranch-NEXT: br label %done +; Cbranch: rhs: +; Cbranch-NEXT: %from_rhs = add <4 x i32> %[[M2]], %in +; Cbranch-NEXT: br label %done +; Cbranch: done: +; Cbranch-NEXT: %merged = phi <4 x i32> [ %from_lhs, %lhs ], [ %from_rhs, %rhs ] +; Cbranch-NEXT: ret void + +; Globalizing redundant constants between functions should materialize +; them in each function, but there should only be a single global. 
+define void @testduplicate1() { + %foo = add <4 x i32> , undef + ret void +} +define void @testduplicate2() { + %foo = add <4 x i32> , undef + ret void +} +; Cduplicate: @[[C1:[_a-z0-9]+]] = internal unnamed_addr constant <4 x i32> , align 4 +; Cduplicate: define void @testduplicate1() { +; Cduplicate-NEXT: %[[M1:[_a-z0-9]+]] = load <4 x i32>, <4 x i32>* @[[C1]], align 4 +; Cduplicate-NEXT: %foo = add <4 x i32> %[[M1]], undef +; Cduplicate-NEXT: ret void +; Cduplicate: define void @testduplicate2() { +; Cduplicate-NEXT: %[[M1:[_a-z0-9]+]] = load <4 x i32>, <4 x i32>* @[[C1]], align 4 +; Cduplicate-NEXT: %foo = add <4 x i32> %[[M1]], undef +; Cduplicate-NEXT: ret void + +; zeroinitializer vectors should get globalized. +define void @testzeroinitializer(<4 x float> %in) { + %id = fadd <4 x float> %in, + ret void +} +; Czeroinitializer: @[[C1:[_a-z0-9]+]] = internal unnamed_addr constant <4 x float> zeroinitializer, align 4 +; Czeroinitializer: define void @testzeroinitializer(<4 x float> %in) { +; Czeroinitializer-NEXT: %[[M1:[_a-z0-9]+]] = load <4 x float>, <4 x float>* @[[C1]], align 4 +; Czeroinitializer-NEXT: %id = fadd <4 x float> %in, %[[M1]] +; Czeroinitializer-NEXT: ret void + +; Nested constant exprs are handled by running -expand-constant-expr first. +define i64 @test_nested_const(i64 %x) { + %foo = add i64 bitcast (<8 x i8> to i64), %x + ret i64 %foo +} +; Cnestedconst: @[[C1:[_a-z0-9]+]] = internal unnamed_addr constant <8 x i8> , align 8 +; Cnestedconst: define i64 @test_nested_const(i64 %x) { +; Cnestedconst-NEXT: %[[M1:[_a-z0-9]+]] = load <8 x i8>, <8 x i8>* @[[C1]], align 8 +; Cnestedconst-NEXT: %[[X1:[_a-z0-9]+]] = bitcast <8 x i8> %[[M1]] to i64 +; Cnestedconst-NEXT: add i64 %[[X1]], %x +; Cnestedconst-NEXT: ret i64 %foo diff --git a/test/Transforms/NaCl/internalize-used-globals.ll b/test/Transforms/NaCl/internalize-used-globals.ll new file mode 100644 index 000000000000..f25bb6cbd0cf --- /dev/null +++ b/test/Transforms/NaCl/internalize-used-globals.ll @@ -0,0 +1,34 @@ +; RUN: opt %s -internalize-used-globals -S | FileCheck %s + +target datalayout = "e-p:32:32-i64:64" +target triple = "le32-unknown-nacl" + +@llvm.used = appending global [1 x i8*] [i8* bitcast (void ()* @foo to i8*)], section "llvm.metadata" +; The used list remains unchanged. +; CHECK: @llvm.used = appending global [1 x i8*] [i8* bitcast (void ()* @foo to i8*)], section "llvm.metadata" + + +define hidden void @foo() #0 { + ret void +} +; Although in the used list, foo becomes internal. +; CHECK-LABEL: define internal void @foo + + +define i32 @_start() { +entry: + ret i32 0 +} +; @_start is left non-internal. +; CHECK-LABEL: define i32 @_start + +define internal void @my_internal() { + ret void +} + +; Internals are left as-is. +; CHECK-LABEL: define internal void @my_internal() + +!llvm.ident = !{!0} +!0 = !{!"clang version 3.5.0 "} + diff --git a/test/Transforms/NaCl/life.ll b/test/Transforms/NaCl/life.ll new file mode 100644 index 000000000000..dcaf02ff2e17 --- /dev/null +++ b/test/Transforms/NaCl/life.ll @@ -0,0 +1,66 @@ +; RUN: opt -pnacl-abi-simplify-postopt %s -S | \ +; RUN: opt -backend-canonicalize -S | FileCheck %s + +; Test that the SIMD game of life example from the NaCl SDK has an inner loop +; that contains the expected shufflevector instructions. First run the ABI +; simplifications on the code, then run the translator's peepholes. +; +; The stable PNaCl bitcode ABI doesn't have shufflevector nor constant vectors, +; it instead has insertelement, extractelement and load from globals. 
Note that +; `undef` becomes `0` in the constants. + +; The datalayout is needed to determine the alignment of the globals. +target datalayout = "e-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-p:32:32:32-v128:32:32" + +define <16 x i8> @InnerLoop(<16 x i8>* %pixel_line, <16 x i8> %src00, <16 x i8> %src01, <16 x i8> %src10, <16 x i8> %src11, <16 x i8> %src20, <16 x i8> %src21) { + ; CHECK-LABEL: InnerLoop + ; CHECK-NEXT: shufflevector <16 x i8> %src00, <16 x i8> %src01, <16 x i32> + ; CHECK-NEXT: shufflevector <16 x i8> %src00, <16 x i8> %src01, <16 x i32> + ; CHECK-NEXT: shufflevector <16 x i8> %src10, <16 x i8> %src11, <16 x i32> + ; CHECK-NEXT: shufflevector <16 x i8> %src10, <16 x i8> %src11, <16 x i32> + ; CHECK-NEXT: shufflevector <16 x i8> %src20, <16 x i8> %src21, <16 x i32> + ; CHECK-NEXT: shufflevector <16 x i8> %src20, <16 x i8> %src21, <16 x i32> + ; CHECK-NOT: load + ; CHECK-NOT: insertelement + ; CHECK-NOT: extractelement + %shuffle = shufflevector <16 x i8> %src00, <16 x i8> %src01, <16 x i32> + %shuffle3 = shufflevector <16 x i8> %src00, <16 x i8> %src01, <16 x i32> + %shuffle4 = shufflevector <16 x i8> %src10, <16 x i8> %src11, <16 x i32> + %shuffle5 = shufflevector <16 x i8> %src10, <16 x i8> %src11, <16 x i32> + %shuffle6 = shufflevector <16 x i8> %src20, <16 x i8> %src21, <16 x i32> + %shuffle7 = shufflevector <16 x i8> %src20, <16 x i8> %src21, <16 x i32> + %add = add <16 x i8> %shuffle, %src00 + %add8 = add <16 x i8> %add, %shuffle3 + %add9 = add <16 x i8> %add8, %src10 + %add10 = add <16 x i8> %add9, %shuffle5 + %add11 = add <16 x i8> %add10, %src20 + %add12 = add <16 x i8> %add11, %shuffle6 + %add13 = add <16 x i8> %add12, %shuffle7 + %add14 = shl <16 x i8> %add13, + %add15 = add <16 x i8> %add14, %shuffle4 + %cmp = icmp ugt <16 x i8> %add15, + %sext = sext <16 x i1> %cmp to <16 x i8> + %cmp16 = icmp ult <16 x i8> %add15, + ; CHECK: select + %and = select <16 x i1> %cmp16, <16 x i8> %sext, <16 x i8> zeroinitializer + ; CHECK-NEXT: shufflevector <16 x i8> %and, <16 x i8> , <16 x i32> + ; CHECK-NEXT: shufflevector <16 x i8> %and, <16 x i8> , <16 x i32> + ; CHECK-NEXT: shufflevector <16 x i8> %and, <16 x i8> , <16 x i32> + ; CHECK-NEXT: shufflevector <16 x i8> %and, <16 x i8> , <16 x i32> + ; CHECK-NOT: load + ; CHECK-NOT: insertelement + ; CHECK-NOT: extractelement + %shuffle18 = shufflevector <16 x i8> %and, <16 x i8> , <16 x i32> + %shuffle19 = shufflevector <16 x i8> %and, <16 x i8> , <16 x i32> + %shuffle20 = shufflevector <16 x i8> %and, <16 x i8> , <16 x i32> + %shuffle21 = shufflevector <16 x i8> %and, <16 x i8> , <16 x i32> + store <16 x i8> %shuffle18, <16 x i8>* %pixel_line, align 16 + %add.ptr22 = getelementptr inbounds <16 x i8>, <16 x i8>* %pixel_line, i32 1 + store <16 x i8> %shuffle19, <16 x i8>* %add.ptr22, align 16 + %add.ptr23 = getelementptr inbounds <16 x i8>, <16 x i8>* %pixel_line, i32 2 + store <16 x i8> %shuffle20, <16 x i8>* %add.ptr23, align 16 + %add.ptr24 = getelementptr inbounds <16 x i8>, <16 x i8>* %pixel_line, i32 3 + store <16 x i8> %shuffle21, <16 x i8>* %add.ptr24, align 16 + %and25 = and <16 x i8> %and, + ret <16 x i8> %and25 +} diff --git a/test/Transforms/NaCl/lit.local.cfg b/test/Transforms/NaCl/lit.local.cfg new file mode 100644 index 000000000000..a43fd3ebdd5a --- /dev/null +++ b/test/Transforms/NaCl/lit.local.cfg @@ -0,0 +1,3 @@ +# -*- Python -*- + +config.suffixes = ['.ll'] diff --git a/test/Transforms/NaCl/normalize-alignment.ll b/test/Transforms/NaCl/normalize-alignment.ll new file mode 100644 index 
000000000000..75cead528d66 --- /dev/null +++ b/test/Transforms/NaCl/normalize-alignment.ll @@ -0,0 +1,73 @@ +; RUN: opt -S -normalize-alignment %s 2>&1 | FileCheck %s + +target datalayout = "e-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-p:32:32:32-v128:32:32" + +; Implicit default alignments are changed to explicit alignments. +define void @default_alignment_attrs(float %f, double %d) { + load i8, i8* null + load i32, i32* null + load float, float* null + load double, double* null + + store i8 100, i8* null + store i32 100, i32* null + store float %f, float* null + store double %d, double* null + ret void +} +; CHECK-LABEL: @default_alignment_attrs +; CHECK-NEXT: load i8, i8* null, align 1 +; CHECK-NEXT: load i32, i32* null, align 1 +; CHECK-NEXT: load float, float* null, align 4 +; CHECK-NEXT: load double, double* null, align 8 +; CHECK-NEXT: store i8 100, i8* null, align 1 +; CHECK-NEXT: store i32 100, i32* null, align 1 +; CHECK-NEXT: store float %f, float* null, align 4 +; CHECK-NEXT: store double %d, double* null, align 8 + +define void @reduce_alignment_assumptions() { + load i32, i32* null, align 4 + load float, float* null, align 2 + load float, float* null, align 4 + load float, float* null, align 8 + load double, double* null, align 2 + load double, double* null, align 8 + load double, double* null, align 16 + + ; Higher alignment assumptions must be retained for atomics. + load atomic i32, i32* null seq_cst, align 4 + load atomic i32, i32* null seq_cst, align 8 + store atomic i32 100, i32* null seq_cst, align 4 + store atomic i32 100, i32* null seq_cst, align 8 + ret void +} +; CHECK-LABEL: @reduce_alignment_assumptions +; CHECK-NEXT: load i32, i32* null, align 1 +; CHECK-NEXT: load float, float* null, align 1 +; CHECK-NEXT: load float, float* null, align 4 +; CHECK-NEXT: load float, float* null, align 4 +; CHECK-NEXT: load double, double* null, align 1 +; CHECK-NEXT: load double, double* null, align 8 +; CHECK-NEXT: load double, double* null, align 8 +; CHECK-NEXT: load atomic i32, i32* null seq_cst, align 4 +; CHECK-NEXT: load atomic i32, i32* null seq_cst, align 4 +; CHECK-NEXT: store atomic i32 100, i32* null seq_cst, align 4 +; CHECK-NEXT: store atomic i32 100, i32* null seq_cst, align 4 + +declare void @llvm.memcpy.p0i8.p0i8.i32(i8*, i8*, i32, i32, i1) +declare void @llvm.memmove.p0i8.p0i8.i32(i8*, i8*, i32, i32, i1) +declare void @llvm.memset.p0i8.i32(i8*, i8, i32, i32, i1) + +define void @reduce_memcpy_alignment_assumptions(i8* %ptr) { + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %ptr, i8* %ptr, + i32 20, i32 4, i1 false) + call void @llvm.memmove.p0i8.p0i8.i32(i8* %ptr, i8* %ptr, + i32 20, i32 4, i1 false) + call void @llvm.memset.p0i8.i32(i8* %ptr, i8 99, + i32 20, i32 4, i1 false) + ret void +} +; CHECK-LABEL: @reduce_memcpy_alignment_assumptions +; CHECK-NEXT: call void @llvm.memcpy.{{.*}} i32 20, i32 1, i1 false) +; CHECK-NEXT: call void @llvm.memmove.{{.*}} i32 20, i32 1, i1 false) +; CHECK-NEXT: call void @llvm.memset.{{.*}} i32 20, i32 1, i1 false) diff --git a/test/Transforms/NaCl/pnacl-abi-internalize-symbols-pso.ll b/test/Transforms/NaCl/pnacl-abi-internalize-symbols-pso.ll new file mode 100644 index 000000000000..1331e50a957f --- /dev/null +++ b/test/Transforms/NaCl/pnacl-abi-internalize-symbols-pso.ll @@ -0,0 +1,22 @@ +; RUN: opt %s -pnacl-abi-simplify-preopt -S | FileCheck %s + +; Checks that PNaCl ABI pre-opt simplification correctly internalizes +; symbols except __pnacl_pso_root. 
+ + +@__pnacl_pso_root = global i32 123 +; CHECK: @__pnacl_pso_root = global i32 123 + +@global_var = global [4 x i8] c"abcd" +; CHECK: @global_var = internal global [4 x i8] c"abcd" + + +define void @main() { +; CHECK: define internal void @main + ret void +} + +define external void @foobarbaz() { +; CHECK: define internal void @foobarbaz + ret void +} diff --git a/test/Transforms/NaCl/pnacl-abi-internalize-symbols.ll b/test/Transforms/NaCl/pnacl-abi-internalize-symbols.ll new file mode 100644 index 000000000000..cd15439c2735 --- /dev/null +++ b/test/Transforms/NaCl/pnacl-abi-internalize-symbols.ll @@ -0,0 +1,25 @@ +; RUN: opt %s -pnacl-abi-simplify-preopt -S | FileCheck %s + +; Checks that PNaCl ABI pre-opt simplification correctly internalizes +; symbols except _start. + + +@global_var = global [4 x i8] c"abcd" +; CHECK: @global_var = internal global [4 x i8] c"abcd" + + +define void @main() { +; CHECK: define internal void @main + ret void +} + +define external void @foobarbaz() { +; CHECK: define internal void @foobarbaz + ret void +} + +define void @_start() { +; CHECK: define void @_start + ret void +} + diff --git a/test/Transforms/NaCl/pnacl-abi-simplify-postopt.ll b/test/Transforms/NaCl/pnacl-abi-simplify-postopt.ll new file mode 100644 index 000000000000..76561d8a2f18 --- /dev/null +++ b/test/Transforms/NaCl/pnacl-abi-simplify-postopt.ll @@ -0,0 +1,23 @@ +; RUN: opt %s -pnacl-abi-simplify-postopt -S | FileCheck %s +; RUN: opt %s -pnacl-abi-simplify-postopt -S \ +; RUN: | FileCheck %s -check-prefix=CLEANUP + +; "-pnacl-abi-simplify-postopt" runs various passes which are tested +; thoroughly in other *.ll files. This file is a smoke test to check +; that the passes work together OK. + +target datalayout = "p:32:32:32" + +@var = global i32 256 +; CHECK: @var = global [4 x i8] + +define i16 @read_var() { + %val = load i16, i16* bitcast (i32* @var to i16*) + ret i16 %val +} +; CHECK: = bitcast [4 x i8]* @var +; CHECK-NEXT: load i16, i16* + +; Check that dead prototypes are successfully removed. +declare void @unused_prototype(i8*) +; CLEANUP-NOT: unused_prototype diff --git a/test/Transforms/NaCl/pnacl-abi-simplify-preopt.ll b/test/Transforms/NaCl/pnacl-abi-simplify-preopt.ll new file mode 100644 index 000000000000..e34c46d80c8a --- /dev/null +++ b/test/Transforms/NaCl/pnacl-abi-simplify-preopt.ll @@ -0,0 +1,50 @@ +; RUN: opt %s -pnacl-abi-simplify-preopt -S | FileCheck %s + +; "-pnacl-abi-simplify-preopt" runs various passes which are tested +; thoroughly in other *.ll files. This file is a smoke test to check +; that "-pnacl-abi-simplify-preopt" runs what it's supposed to run. + +declare void @ext_func() + + +define void @invoke_func() { + invoke void @ext_func() to label %cont unwind label %lpad +cont: + ret void +lpad: + %lp = landingpad { i8*, i32 } personality i8* null cleanup + ret void +} +; CHECK-NOT: invoke void @ext_func() +; CHECK-NOT: landingpad + + +define void @varargs_func(...) { + ret void +} +; CHECK-NOT: @varargs_func(...) + + +%MyStruct = type { i32, i32 } + +; Checks that ExpandVarArgs and ExpandStructRegs are applied in the +; right order. 
+define void @get_struct_from_varargs(i8* %va_list, %MyStruct* %dest) { + %val = va_arg i8* %va_list, %MyStruct + store %MyStruct %val, %MyStruct* %dest + ret void +} +; CHECK-NOT: va_arg + + +@llvm.global_ctors = appending global [0 x { i32, void ()* }] zeroinitializer +; CHECK-NOT: @llvm.global_ctors + +@tls_var = thread_local global i32 0 +; CHECK-NOT: thread_local + +@alias = alias i32* @tls_var +; CHECK-NOT: @alias + +@weak_ref = extern_weak global i8* +; CHECK-NOT: extern_weak diff --git a/test/Transforms/NaCl/pnacl-abi-simplify.ll b/test/Transforms/NaCl/pnacl-abi-simplify.ll new file mode 100644 index 000000000000..453990c0e8b0 --- /dev/null +++ b/test/Transforms/NaCl/pnacl-abi-simplify.ll @@ -0,0 +1,54 @@ +; RUN: opt %s -pnacl-abi-simplify-preopt -pnacl-abi-simplify-postopt -S \ +; RUN: | FileCheck %s +; RUN: opt %s -enable-pnacl-sjlj-eh -pnacl-abi-simplify-preopt \ +; RUN: -pnacl-abi-simplify-postopt -S | FileCheck %s + +target datalayout = "p:32:32:32" + +; Check that the "tail" attribute is preserved on calls. +define void @tail_call() { + tail call void @tail_call() + ret void +} +; CHECK: tail call void @tail_call() + +; Check that unreachable blocks are pruned out, whether or not SJLJ-EH is used. +; Unreachable blocks can have instructions with strange properties like +; self references. Normally, self-references are disallowed. +define i32 @unreachable_block_self_ref() { +entry: + br label %bb1 + +bb0: ; preds = %bb0 + %x = add i32 %x, 0 + br i1 undef, label %bb1, label %bb0 + +bb1: ; preds = %bb0, %entry + %phi = phi i32 [ 321, %entry ], [ %x, %bb0 ] + ret i32 %phi +} +; CHECK-LABEL: unreachable_block_self_ref() { +; CHECK-NEXT: entry: +; CHECK-NEXT: ret i32 321 +; CHECK-NEXT: } + +declare void @my_exit(i32) + +; Another check for unreachable block pruning: in this case, the unreachable +; block can have instructions that confuse liveness analysis. +define i32 @unreachable_block_bad_liveness() { +entry: + %ret_val = add i32 undef, undef + call void @my_exit(i32 %ret_val) + unreachable +label: + ; ret_val has no reaching definitions, causing an inconsistency in + ; liveness analysis. + ret i32 %ret_val +} +; CHECK-LABEL: unreachable_block_bad_liveness() { +; CHECK-NEXT: entry: +; CHECK-NEXT: %ret_val = add i32 undef, undef +; CHECK-NEXT: call void @my_exit +; CHECK-NEXT: unreachable +; CHECK-NEXT: } diff --git a/test/Transforms/NaCl/pnacl-eh-exception-info.ll b/test/Transforms/NaCl/pnacl-eh-exception-info.ll new file mode 100644 index 000000000000..979ccf0449fc --- /dev/null +++ b/test/Transforms/NaCl/pnacl-eh-exception-info.ll @@ -0,0 +1,127 @@ +; RUN: opt %s -pnacl-sjlj-eh -S | FileCheck %s + +; Example std::type_info objects. +@exc_typeid1 = external global i8 +@exc_typeid2 = external global i8 +@exc_typeid3 = external global i8 + +; This must be declared for "-pnacl-sjlj-eh" to work. 
+@__pnacl_eh_stack = external thread_local global i8* + +declare i32 @llvm.eh.typeid.for(i8*) + +declare void @external_func() + + +@__pnacl_eh_type_table = external global i8* +@__pnacl_eh_action_table = external global i8* +@__pnacl_eh_filter_table = external global i8* + +; CHECK: %action_table_entry = type { i32, i32 } + +; CHECK: @__pnacl_eh_type_table = internal constant [4 x i8*] [i8* @exc_typeid1, i8* @exc_typeid2, i8* @exc_typeid3, i8* null] + +; CHECK: @__pnacl_eh_action_table = internal constant [7 x %action_table_entry] [%action_table_entry { i32 3, i32 0 }, %action_table_entry { i32 2, i32 1 }, %action_table_entry { i32 1, i32 2 }, %action_table_entry { i32 -1, i32 0 }, %action_table_entry { i32 -2, i32 0 }, %action_table_entry { i32 4, i32 0 }, %action_table_entry zeroinitializer] + +; CHECK: @__pnacl_eh_filter_table = internal constant [5 x i32] [i32 0, i32 2, i32 3, i32 1, i32 0] + + +; Exception type pointers are allocated IDs which specify the index +; into __pnacl_eh_type_table where the type may be found. +define void @test_eh_typeid(i32 %arg) { + %id1 = call i32 @llvm.eh.typeid.for(i8* @exc_typeid1) + %id2 = call i32 @llvm.eh.typeid.for(i8* @exc_typeid2) + %id3 = call i32 @llvm.eh.typeid.for(i8* @exc_typeid3) + %cmp1 = icmp eq i32 %arg, %id1 + %cmp2 = icmp eq i32 %arg, %id2 + %cmp3 = icmp eq i32 %arg, %id3 + ret void +} +; CHECK: define void @test_eh_typeid +; CHECK-NEXT: %cmp1 = icmp eq i32 %arg, 1 +; CHECK-NEXT: %cmp2 = icmp eq i32 %arg, 2 +; CHECK-NEXT: %cmp3 = icmp eq i32 %arg, 3 +; CHECK-NEXT: ret void + + +define void @test_single_catch_clause() { + invoke void @external_func() to label %cont unwind label %lpad +cont: + ret void +lpad: + landingpad i32 personality i8* null + catch i8* @exc_typeid3 + ret void +} +; CHECK: define void @test_single_catch_clause +; CHECK: store i32 1, i32* %exc_info_ptr + + +define void @test_multiple_catch_clauses() { + invoke void @external_func() to label %cont unwind label %lpad +cont: + ret void +lpad: + landingpad i32 personality i8* null + catch i8* @exc_typeid1 + catch i8* @exc_typeid2 + catch i8* @exc_typeid3 + ret void +} +; CHECK: define void @test_multiple_catch_clauses +; CHECK: store i32 3, i32* %exc_info_ptr + + +define void @test_empty_filter_clause() { + invoke void @external_func() to label %cont unwind label %lpad +cont: + ret void +lpad: + landingpad i32 personality i8* null + filter [0 x i8*] zeroinitializer + ret void +} +; CHECK: define void @test_empty_filter_clause +; CHECK: store i32 4, i32* %exc_info_ptr + + +define void @test_filter_clause() { + invoke void @external_func() to label %cont unwind label %lpad +cont: + ret void +lpad: + landingpad i32 personality i8* null + filter [3 x i8*] [i8* @exc_typeid2, + i8* @exc_typeid3, + i8* @exc_typeid1] + ret void +} +; CHECK: define void @test_filter_clause +; CHECK: store i32 5, i32* %exc_info_ptr + + +; "catch i8* null" means that any C++ exception matches. 
+define void @test_catch_all_clause() { + invoke void @external_func() to label %cont unwind label %lpad +cont: + ret void +lpad: + landingpad i32 personality i8* null + catch i8* null + ret void +} +; CHECK: define void @test_catch_all_clause +; CHECK: store i32 6, i32* %exc_info_ptr + + +define void @test_cleanup_clause() { + invoke void @external_func() to label %cont unwind label %lpad +cont: + ret void +lpad: + landingpad i32 personality i8* null + cleanup + ret void +} +; CHECK: define void @test_cleanup_clause +; CHECK: store i32 7, i32* %exc_info_ptr diff --git a/test/Transforms/NaCl/pnacl-sjlj-eh-bug.ll b/test/Transforms/NaCl/pnacl-sjlj-eh-bug.ll new file mode 100644 index 000000000000..2338e5bfe331 --- /dev/null +++ b/test/Transforms/NaCl/pnacl-sjlj-eh-bug.ll @@ -0,0 +1,81 @@ +; RUN: opt %s -pnacl-sjlj-eh -O2 -S | FileCheck %s + +; datalayout must be specified for GVN to work. +target datalayout = "p:32:32:32" + +; This must be declared for expanding "invoke" and "landingpad" instructions. +@__pnacl_eh_stack = external thread_local global i8* + +declare i1 @might_be_setjmp() +declare void @external_func(i32* %ptr) +declare void @var_is_nonzero() + + +; Test for a bug in which PNaClSjLjEH would transform +; @invoke_optimize_test() such that the call to @var_is_nonzero() +; could get optimized away by a later optimization pass. This +; happened because PNaClSjLjEH generated code similar to +; @branch_optimize_test() below. + +define void @invoke_optimize_test() { + %var = alloca i32 + store i32 0, i32* %var + + invoke void @external_func(i32* %var) + to label %exit unwind label %lpad + +lpad: + landingpad i32 personality i8* null + catch i8* null + %value = load i32, i32* %var + %is_zero = icmp eq i32 %value, 0 + br i1 %is_zero, label %exit, label %do_call + +do_call: + call void @var_is_nonzero() + ret void + +exit: + ret void +} +; CHECK: define void @invoke_optimize_test() +; CHECK: @var_is_nonzero() + + +; In @branch_optimize_test(), the optimizer can optimize away the call +; to @var_is_nonzero(), because it can assume that %var always +; contains 0 on the "iffalse" branch. +; +; The passes "-gvn -instcombine" are enough to do this. +; +; The optimizer can do this regardless of whether @might_be_setjmp() +; is setjmp() or a normal function. It doesn't need to know that +; @might_be_setjmp() might return twice, because storing to %var +; between setjmp() and longjmp() leaves %var pointing to an undefined +; value. + +define void @branch_optimize_test() { + %var = alloca i32 + store i32 0, i32* %var + + %cond = call i1 @might_be_setjmp() returns_twice + br i1 %cond, label %iftrue, label %iffalse + +iftrue: + call void @external_func(i32* %var) + ret void + +iffalse: + %value = load i32, i32* %var + %is_zero = icmp eq i32 %value, 0 + br i1 %is_zero, label %exit, label %do_call + +do_call: + call void @var_is_nonzero() + ret void + +exit: + ret void +} +; CHECK: define void @branch_optimize_test() +; CHECK-NOT: @var_is_nonzero diff --git a/test/Transforms/NaCl/pnacl-sjlj-eh.ll b/test/Transforms/NaCl/pnacl-sjlj-eh.ll new file mode 100644 index 000000000000..6e524a5b775a --- /dev/null +++ b/test/Transforms/NaCl/pnacl-sjlj-eh.ll @@ -0,0 +1,173 @@ +; RUN: opt %s -pnacl-sjlj-eh -S | FileCheck %s + +; This must be declared for expanding "invoke" and "landingpad" instructions. +@__pnacl_eh_stack = external thread_local global i8* + +; This must be declared for expanding "resume" instructions. 
+declare void @__pnacl_eh_resume(i32* %exception) + +declare i32 @external_func(i64 %arg) +declare void @external_func_void() +declare i32 @my_setjmp() + + +; CHECK: %ExceptionFrame = type { [1024 x i8], %ExceptionFrame*, i32 } + +define i32 @invoke_test(i64 %arg) { + %result = invoke i32 @external_func(i64 %arg) + to label %cont unwind label %lpad +cont: + ret i32 %result +lpad: + %lp = landingpad { i8*, i32 } personality i8* null cleanup + ret i32 999 +} +; CHECK: define i32 @invoke_test +; CHECK-NEXT: %invoke_result_ptr = alloca i32 +; CHECK-NEXT: %invoke_frame = alloca %ExceptionFrame, align 8 +; CHECK-NEXT: %exc_info_ptr = getelementptr %ExceptionFrame, %ExceptionFrame* %invoke_frame, i32 0, i32 2 +; CHECK-NEXT: %invoke_next = getelementptr %ExceptionFrame, %ExceptionFrame* %invoke_frame, i32 0, i32 1 +; CHECK-NEXT: %invoke_jmp_buf = getelementptr %ExceptionFrame, %ExceptionFrame* %invoke_frame, i32 0, i32 0, i32 0 +; CHECK-NEXT: %pnacl_eh_stack = bitcast i8** @__pnacl_eh_stack to %ExceptionFrame** +; CHECK-NEXT: %old_eh_stack = load %ExceptionFrame*, %ExceptionFrame** %pnacl_eh_stack +; CHECK-NEXT: store %ExceptionFrame* %old_eh_stack, %ExceptionFrame** %invoke_next +; CHECK-NEXT: store i32 {{[0-9]+}}, i32* %exc_info_ptr +; CHECK-NEXT: store %ExceptionFrame* %invoke_frame, %ExceptionFrame** %pnacl_eh_stack +; CHECK-NEXT: %invoke_is_exc = call i32 @invoke_test_setjmp_caller(i64 %arg, i32 (i64)* @external_func, i8* %invoke_jmp_buf, i32* %invoke_result_ptr) +; CHECK-NEXT: %result = load i32, i32* %invoke_result_ptr +; CHECK-NEXT: store %ExceptionFrame* %old_eh_stack, %ExceptionFrame** %pnacl_eh_stack +; CHECK-NEXT: %invoke_sj_is_zero = icmp eq i32 %invoke_is_exc, 0 +; CHECK-NEXT: br i1 %invoke_sj_is_zero, label %cont, label %lpad +; CHECK: cont: +; CHECK-NEXT: ret i32 %result +; CHECK: lpad: +; CHECK-NEXT: %landingpad_ptr = bitcast i8* %invoke_jmp_buf to { i8*, i32 }* +; CHECK-NEXT: %lp = load { i8*, i32 }, { i8*, i32 }* %landingpad_ptr +; CHECK-NEXT: ret i32 999 + +; Check definition of helper function: +; CHECK: define internal i32 @invoke_test_setjmp_caller(i64 %arg, i32 (i64)* %func_ptr, i8* %jmp_buf, i32* %result_ptr) { +; CHECK-NEXT: %invoke_sj = call i32 @llvm.nacl.setjmp(i8* %jmp_buf) [[RETURNS_TWICE:#[0-9]+]] +; CHECK-NEXT: %invoke_sj_is_zero = icmp eq i32 %invoke_sj, 0 +; CHECK-NEXT: br i1 %invoke_sj_is_zero, label %normal, label %exception +; CHECK: normal: +; CHECK-NEXT: %result = call i32 %func_ptr(i64 %arg) +; CHECK-NEXT: store i32 %result, i32* %result_ptr +; CHECK-NEXT: ret i32 0 +; CHECK: exception: +; CHECK-NEXT: ret i32 1 + + +; A landingpad block may be used by multiple "invoke" instructions. +define i32 @shared_landingpad(i64 %arg) { + %result1 = invoke i32 @external_func(i64 %arg) + to label %cont1 unwind label %lpad +cont1: + %result2 = invoke i32 @external_func(i64 %arg) + to label %cont2 unwind label %lpad +cont2: + ret i32 %result2 +lpad: + %lp = landingpad { i8*, i32 } personality i8* null cleanup + ret i32 999 +} +; CHECK: define i32 @shared_landingpad +; CHECK: br i1 %invoke_sj_is_zero{{[0-9]*}}, label %cont1, label %lpad +; CHECK: br i1 %invoke_sj_is_zero{{[0-9]*}}, label %cont2, label %lpad + + +; Check that the pass can handle a landingpad appearing before an invoke. +define i32 @landingpad_before_invoke() { + ret i32 123 + +dead_block: + %lp = landingpad i32 personality i8* null cleanup + ret i32 %lp +} +; CHECK: define i32 @landingpad_before_invoke +; CHECK: %lp = load i32, i32* %landingpad_ptr + + +; Test the expansion of the "resume" instruction. 
+define void @test_resume({ i8*, i32 } %arg) { + resume { i8*, i32 } %arg +} +; CHECK: define void @test_resume +; CHECK-NEXT: %resume_exc = extractvalue { i8*, i32 } %arg, 0 +; CHECK-NEXT: %resume_cast = bitcast i8* %resume_exc to i32* +; CHECK-NEXT: call void @__pnacl_eh_resume(i32* %resume_cast) +; CHECK-NEXT: unreachable + + +; Check that call attributes are preserved. +define i32 @call_attrs(i64 %arg) { + %result = invoke fastcc i32 @external_func(i64 inreg %arg) noreturn + to label %cont unwind label %lpad +cont: + ret i32 %result +lpad: + %lp = landingpad { i8*, i32 } personality i8* null cleanup + ret i32 999 +} +; CHECK: define i32 @call_attrs +; CHECK: %result = call fastcc i32 %func_ptr(i64 inreg %arg) [[NORETURN:#[0-9]+]] + + +; If the PNaClSjLjEH pass needs to insert any instructions into the +; non-exceptional path, check that PHI nodes are updated correctly. +; (An earlier version needed to do this, but the current version +; doesn't.) +define i32 @invoke_with_phi_nodes(i64 %arg) { +entry: + %result = invoke i32 @external_func(i64 %arg) + to label %cont unwind label %lpad +cont: + %cont_phi = phi i32 [ 100, %entry ] + ret i32 %cont_phi +lpad: + %lpad_phi = phi i32 [ 200, %entry ] + %lp = landingpad { i8*, i32 } personality i8* null cleanup + ret i32 %lpad_phi +} +; CHECK: define i32 @invoke_with_phi_nodes +; CHECK: cont: +; CHECK-NEXT: %cont_phi = phi i32 [ 100, %entry ] +; CHECK-NEXT: ret i32 %cont_phi +; CHECK: lpad: +; CHECK-NEXT: %lpad_phi = phi i32 [ 200, %entry ] +; CHECK: ret i32 %lpad_phi + + +; Test "void" result type from "invoke". This requires special +; handling because void* is not a valid type. +define void @invoke_void_result() { + invoke void @external_func_void() to label %cont unwind label %lpad +cont: + ret void +lpad: + landingpad i32 personality i8* null cleanup + ret void +} +; CHECK: define void @invoke_void_result() +; "%result_ptr" argument is omitted from the helper function: +; CHECK: define internal i32 @invoke_void_result_setjmp_caller(void ()* %func_ptr, i8* %jmp_buf) + + +; A call to setjmp() cannot be moved into a helper function, so test +; that it isn't moved. +define void @invoke_setjmp() { + %x = invoke i32 @my_setjmp() returns_twice to label %cont unwind label %lpad +cont: + ret void +lpad: + landingpad i32 personality i8* null cleanup + ret void +} +; CHECK: define void @invoke_setjmp() +; CHECK-NOT: call +; CHECK: %x = call i32 @my_setjmp() [[RETURNS_TWICE]] +; CHECK-NEXT: br label %cont + + +; CHECK: attributes [[RETURNS_TWICE]] = { returns_twice } +; CHECK: attributes [[NORETURN]] = { noreturn } diff --git a/test/Transforms/NaCl/promote-i1-ops.ll b/test/Transforms/NaCl/promote-i1-ops.ll new file mode 100644 index 000000000000..10d9d77c621d --- /dev/null +++ b/test/Transforms/NaCl/promote-i1-ops.ll @@ -0,0 +1,143 @@ +; RUN: opt %s -nacl-promote-i1-ops -S | FileCheck %s + +; Test that the PromoteI1Ops pass expands out i1 loads/stores and i1 +; comparison and arithmetic operations, with the exception of "and", +; "or" and "xor". + + +; i1 loads and stores are converted to i8 load and stores with +; explicit casts. 
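+; (A note on the casts used: the load below truncates the loaded i8 back
+; to i1 and the store zero-extends the i1 to i8, as the CHECK lines show;
+; the signed comparison test further down sign-extends instead, since an
+; i1 true interpreted as a signed value is -1.)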
+ +define i1 @load(i1* %ptr) { + %val = load i1, i1* %ptr + ret i1 %val +} +; CHECK: define i1 @load +; CHECK-NEXT: %ptr.i8ptr = bitcast i1* %ptr to i8* +; CHECK-NEXT: %val.pre_trunc = load i8, i8* %ptr.i8ptr +; CHECK-NEXT: %val = trunc i8 %val.pre_trunc to i1 + +define void @store(i1 %val, i1* %ptr) { + store i1 %val, i1* %ptr + ret void +} +; CHECK: define void @store +; CHECK-NEXT: %ptr.i8ptr = bitcast i1* %ptr to i8* +; CHECK-NEXT: %val.expand_i1_val = zext i1 %val to i8 +; CHECK-NEXT: store i8 %val.expand_i1_val, i8* %ptr.i8ptr + + +; i1 arithmetic and comparisons are converted to their i8 equivalents +; with explicit casts. + +define i1 @add(i1 %x, i1 %y) { + %result = add i1 %x, %y + ret i1 %result +} +; CHECK: define i1 @add +; CHECK-NEXT: %x.expand_i1_val = zext i1 %x to i8 +; CHECK-NEXT: %y.expand_i1_val = zext i1 %y to i8 +; CHECK-NEXT: %result.pre_trunc = add i8 %x.expand_i1_val, %y.expand_i1_val +; CHECK-NEXT: %result = trunc i8 %result.pre_trunc to i1 + +define i1 @compare(i1 %x, i1 %y) { + %result = icmp slt i1 %x, %y + ret i1 %result +} +; CHECK: define i1 @compare +; CHECK-NEXT: %x.expand_i1_val = sext i1 %x to i8 +; CHECK-NEXT: %y.expand_i1_val = sext i1 %y to i8 +; CHECK-NEXT: %result = icmp slt i8 %x.expand_i1_val, %y.expand_i1_val + + +; Non-shift bitwise operations should not be modified. +define void @bitwise_ops(i1 %x, i1 %y) { + %and = and i1 %x, %y + %or = or i1 %x, %y + %xor = xor i1 %x, %y + ret void +} +; CHECK: define void @bitwise_ops +; CHECK-NEXT: %and = and i1 %x, %y +; CHECK-NEXT: %or = or i1 %x, %y +; CHECK-NEXT: %xor = xor i1 %x, %y + + +define void @unchanged_cases(i32 %x, i32 %y, i32* %ptr) { + %add = add i32 %x, %y + %cmp = icmp slt i32 %x, %y + %val = load i32, i32* %ptr + store i32 %x, i32* %ptr + ret void +} +; CHECK: define void @unchanged_cases +; CHECK-NEXT: %add = add i32 %x, %y +; CHECK-NEXT: %cmp = icmp slt i32 %x, %y +; CHECK-NEXT: %val = load i32, i32* %ptr +; CHECK-NEXT: store i32 %x, i32* %ptr + +define void @i1_switch(i1 %a) { +entry: + switch i1 %a, label %impossible [ + i1 true, label %truedest + i1 false, label %falsedest + ] + +impossible: + %phi = phi i32 [ 123, %entry ] + unreachable + +truedest: + unreachable + +falsedest: + unreachable +} +; CHECK-LABEL: define void @i1_switch +; CHECK-LABEL: entry: +; CHECK-NEXT: br i1 %a, label %truedest, label %falsedest +; CHECK-LABEL: impossible: +; CHECK-NEXT: unreachable +; CHECK-LABEL: truedest: +; CHECK-NEXT: unreachable +; CHECK-LABEL: falsedest: +; CHECK-NEXT: unreachable + +define void @i1_switch_default_true(i1 %a) { +entry: + switch i1 %a, label %truedest [ + i1 false, label %falsedest + ] + +truedest: + unreachable +falsedest: + unreachable +} +; CHECK-LABEL: define void @i1_switch_default_true(i1 %a) +; CHECK-LABEL: entry: +; CHECK-NEXT: br i1 %a, label %truedest, label %falsedest +; CHECK-LABEL: truedest: +; CHECK-NEXT: unreachable +; CHECK-LABEL: falsedest: +; CHECK-NEXT: unreachable + +define void @i1_switch_default_false(i1 %a) { +entry: + switch i1 %a, label %falsedest [ + i1 true, label %truedest + ] + +truedest: + unreachable +falsedest: + unreachable +} +; CHECK-LABEL: define void @i1_switch_default_false(i1 %a) +; CHECK-LABEL: entry: +; CHECK-NEXT: br i1 %a, label %truedest, label %falsedest +; CHECK-LABEL: truedest: +; CHECK-NEXT: unreachable +; CHECK-LABEL: falsedest: +; CHECK-NEXT: unreachable + diff --git a/test/Transforms/NaCl/promote-integer-signatures.ll b/test/Transforms/NaCl/promote-integer-signatures.ll new file mode 100644 index 000000000000..83bc38307407 --- 
/dev/null +++ b/test/Transforms/NaCl/promote-integer-signatures.ll @@ -0,0 +1,63 @@ +; RUN: opt %s -nacl-promote-ints -S | FileCheck %s + +target datalayout = "e-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-p:32:32:32-v128:32:32" + +%struct.S0 = type { i24, i32 } +declare i32 @__gxx_personality_v0(...) + +declare i13 @ext_fct(i16, i24, i32) +; CHECK-LABEL: declare i16 @ext_fct(i16, i32, i32) + +define internal i16 @func(i32 %x, i24 %y, i32 %z) { + %lo = lshr i24 %y, 8 + %lo.tk = trunc i24 %lo to i16 + ret i16 %lo.tk +} +; CHECK-LABEL: define internal i16 @func(i32 %x, i32 %y, i32 %z) +; CHECK-NEXT: %y.clear = and i32 %y, 16777215 +; CHECK-NEXT: %lo = lshr i32 %y.clear, 8 +; CHECK-NEXT: %lo.tk = trunc i32 %lo to i16 +; CHECK-NEXT: ret i16 %lo.tk + + +define void @invoke_example(i16 %x, i24 %y, i32 %z) { +entry: + %tmp2 = invoke i13 @ext_fct(i16 %x, i24 %y, i32 %z) + to label %Cont unwind label %Cleanup +Cont: + ret void +Cleanup: + %exn = landingpad i13 personality i32 (...)* @__gxx_personality_v0 + cleanup + resume i13 %exn +} +; CHECK-LABEL: define void @invoke_example(i16 %x, i32 %y, i32 %z) +; CHECK-DAG: %tmp2 = invoke i16 @ext_fct(i16 %x, i32 %y, i32 %z) +; CHECK-DAG: %exn = landingpad i16 personality i32 (...)* @__gxx_personality_v0 +; CHECK-DAG: resume i16 %exn + +define i9 @a_func(i32 %x, i9* %y, i9 %z) { + ret i9 %z +} +; CHECK-LABEL: define i16 @a_func(i32 %x, i9* %y, i16 %z) +; CHECK-NEXT: ret i16 %z + +define i9 @applying_fct(i9* %x, i9 %y) { + %ret = call i9 @applicator(i9 (i32, i9*, i9)* @a_func, i9* %x, i9 %y) + ret i9 %ret +} +; CHECK-LABEL: define i16 @applying_fct(i9* %x, i16 %y) +; CHECK-NEXT: call i16 @applicator(i16 (i32, i9*, i16)* @a_func, i9* %x, i16 %y) +; CHECK-NEXT: ret i16 + +define i9 @applicator(i9 (i32, i9*, i9)* %fct, i9* %ptr, i9 %val) { + %ret = call i9 %fct(i32 0, i9* %ptr, i9 %val) +; CHECK: call i16 %fct(i32 0, i9* %ptr, i16 %val) + ret i9 %ret +} + +define i9 @plain_call(i9* %ptr, i9 %val) { + %ret = call i9 @applying_fct(i9* %ptr, i9 %val) +; CHECK: call i16 @applying_fct(i9* %ptr, i16 %val) + ret i9 %ret +} \ No newline at end of file diff --git a/test/Transforms/NaCl/promote-integers.ll b/test/Transforms/NaCl/promote-integers.ll new file mode 100644 index 000000000000..f700be815de0 --- /dev/null +++ b/test/Transforms/NaCl/promote-integers.ll @@ -0,0 +1,568 @@ +; RUN: opt < %s -nacl-promote-ints -S | FileCheck %s + +target datalayout = "e-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-p:32:32:32-v128:32:32" + +declare void @consume_i16(i16 %a) + +; CHECK-LABEL: @sext_to_illegal( +; CHECK-NEXT: %a40 = sext i32 %a to i64 +; (0xFFFFFFFFFF) +define void @sext_to_illegal(i32 %a) { + %a40 = sext i32 %a to i40 + ret void +} + +; CHECK-LABEL: @sext_from_illegal( +define void @sext_from_illegal(i8 %a) { +; CHECK: call void @consume_i16(i16 -2) + %c12 = sext i12 -2 to i16 + call void @consume_i16(i16 %c12) +; CHECK: %a12 = sext i8 %a to i16 + %a12 = sext i8 %a to i12 +; CHECK: %a12.getsign = shl i16 %a12, 4 +; CHECK-NEXT: %a16 = ashr i16 %a12.getsign, 4 + %a16 = sext i12 %a12 to i16 +; CHECK: %a12.getsign1 = shl i16 %a12, 4 +; CHECK-NEXT: %a14 = ashr i16 %a12.getsign1, 4 +; (0x3FFF) + %a14 = sext i12 %a12 to i14 +; CHECK-NEXT: %a12.getsign2 = shl i16 %a12, 4 +; CHECK-NEXT: %a12.signed = ashr i16 %a12.getsign2, 4 +; CHECK-NEXT: %a24 = sext i16 %a12.signed to i32 +; (0xFFFFFF) + %a24 = sext i12 %a12 to i24 + + %a37 = zext i8 %a to i37 +; CHECK: %a37.getsign = shl i64 %a37, 27 +; CHECK-NEXT: %a64 = ashr i64 %a37.getsign, 27 + 
%a64 = sext i37 %a37 to i64 + ret void +} + +; CHECK-LABEL: @sext_from_undef( +define void @sext_from_undef(i8 %a) { +; CHECK-NEXT: %a12 = sext i8 undef to i16 + %a12 = sext i8 undef to i12 + ret void +} + +; CHECK-LABEL: @zext_to_illegal( +define void @zext_to_illegal(i32 %a) { +; CHECK: zext i32 %a to i64 +; CHECK-NOT: and + %a40 = zext i32 %a to i40 + ret void +} + +; CHECK-LABEL: @zext_from_illegal( +define void @zext_from_illegal(i8 %a) { +; get some illegal values to start with + %a24 = zext i8 %a to i24 + %a40 = zext i8 %a to i40 + %a18 = zext i8 %a to i18 + +; CHECK: %a32 = and i32 %a24, 16777215 +; (0xFFFFFF) + %a32 = zext i24 %a24 to i32 + +; CHECK: %b24 = and i32 %a18, 262143 +; (0x3FFFF) + %b24 = zext i18 %a18 to i24 + +; CHECK: %a24.clear = and i32 %a24, 16777215 +; CHECK: %b40 = zext i32 %a24.clear to i64 + %b40 = zext i24 %a24 to i40 + +; CHECK: call void @consume_i16(i16 4094) + %c16 = zext i12 -2 to i16 + call void @consume_i16(i16 %c16) +; CHECK: call void @consume_i16(i16 4094) + %c14 = zext i12 -2 to i14 + %c16.2 = zext i14 %c14 to i16 + call void @consume_i16(i16 %c16.2) + ret void +} + +; CHECK-LABEL: @trunc_from_illegal( +define void @trunc_from_illegal(i8 %a) { + %a24 = zext i8 %a to i24 +; CHECK: %a16 = trunc i32 %a24 to i16 + %a16 = trunc i24 %a24 to i16 + ret void +} + +; CHECK-LABEL: @trunc_to_illegal( +define void @trunc_to_illegal(i8 %a8) { + %a = zext i8 %a8 to i32 +; CHECK-NOT: trunc i32 %a +; CHECK-NOT: and + %a24 = trunc i32 %a to i24 + +; CHECK: %a12 = trunc i32 %a24 to i16 +; CHECK-NOT: and + %a12 = trunc i24 %a24 to i12 + ret void +} + +; CHECK-LABEL: @icmpsigned( +define void @icmpsigned(i32 %a) { + %shl = trunc i32 %a to i24 +; CHECK: %shl.getsign = shl i32 %shl, 8 +; CHECK-NEXT: %shl.signed = ashr i32 %shl.getsign, 8 +; CHECK-NEXT: %cmp = icmp slt i32 %shl.signed, -2 + %cmp = icmp slt i24 %shl, -2 + ret void +} + +; Bitcasts are left unchanged. 
+%struct.ints = type { i32, i32 } +; CHECK-LABEL: @bc1( +; CHECK-NEXT: %bc1 = bitcast i32* %a to i40* +; CHECK-NEXT: %bc2 = bitcast i40* %bc1 to i32* +; CHECK-NEXT: %bc3 = bitcast %struct.ints* null to i40* +; CHECK-NEXT: %bc4 = bitcast i40* %bc1 to %struct.ints* +define i32* @bc1(i32* %a) { + %bc1 = bitcast i32* %a to i40* + %bc2 = bitcast i40* %bc1 to i32* + %bc3 = bitcast %struct.ints* null to i40* + %bc4 = bitcast i40* %bc1 to %struct.ints* + ret i32* %bc2 +} + +; CHECK: zext i32 %a to i64 +; CHECK: and i64 %a40, 255 +define void @and1(i32 %a) { + %a40 = zext i32 %a to i40 + %and = and i40 %a40, 255 + ret void +} + +; CHECK-LABEL: @andi3( +define void @andi3(i8 %a) { + %a3 = trunc i8 %a to i3 +; CHECK: and i8 %a3, 2 + %and = and i3 %a3, 2 + ret void +} + +; CHECK-LABEL: @ori7( +define void @ori7(i8 %a, i8 %b) { + %a7 = trunc i8 %a to i7 + %b7 = trunc i8 %b to i7 +; CHECK: %or = or i8 %a7, %b7 + %or = or i7 %a7, %b7 + ret void +} + +; CHECK-LABEL: @add1( +define void @add1(i16 %a) { +; CHECK-NEXT: %a24 = sext i16 %a to i32 + %a24 = sext i16 %a to i24 +; CHECK-NEXT: %sum = add i32 %a24, 16777214 + %sum = add i24 %a24, -2 +; CHECK-NEXT: %sumnsw = add nsw i32 %a24, 16777214 + %sumnsw = add nsw i24 %a24, -2 +; CHECK-NEXT: %sumnuw = add nuw i32 %a24, 16777214 + %sumnuw = add nuw i24 %a24, -2 +; CHECK-NEXT: %sumnw = add nuw nsw i32 %a24, 16777214 + %sumnw = add nuw nsw i24 %a24, -2 + ret void +} + +; CHECK-LABEL: @mul1( +define void @mul1(i32 %a, i32 %b) { +; CHECK-NEXT: %a33 = sext i32 %a to i64 + %a33 = sext i32 %a to i33 +; CHECK-NEXT: %b33 = sext i32 %b to i64 + %b33 = sext i32 %b to i33 +; CHECK-NEXT: %product = mul i64 %a33, %b33 + %product = mul i33 %a33, %b33 +; CHECK-NEXT: %prodnw = mul nuw nsw i64 %a33, %b33 + %prodnw = mul nuw nsw i33 %a33, %b33 + ret void +} + +; CHECK-LABEL: @shl1( +define void @shl1(i16 %a) { + %a24 = zext i16 %a to i24 +; CHECK: %ashl = shl i32 %a24, 5 + %ashl = shl i24 %a24, 5 + +; CHECK-NEXT: %ashl2 = shl i32 %a24, 1 + %ashl2 = shl i24 %a24, 4278190081 ;0xFF000001 + + %b24 = zext i16 %a to i24 +; CHECK: %b24.clear = and i32 %b24, 16777215 +; CHECK-NEXT: %bshl = shl i32 %a24, %b24.clear + %bshl = shl i24 %a24, %b24 + ret void +} + +; CHECK-LABEL: @shlnuw( +define void @shlnuw(i16 %a) { + %a12 = trunc i16 %a to i12 +; CHECK: %ashl = shl nuw i16 %a12, 5 + %ashl = shl nuw i12 %a12, 5 + ret void +} + +; CHECK-LABEL: @lshr1( +define void @lshr1(i16 %a) { + %a24 = zext i16 %a to i24 +; CHECK: %a24.clear = and i32 %a24, 16777215 +; CHECK-NEXT: %b = lshr i32 %a24.clear, 20 + %b = lshr i24 %a24, 20 +; CHECK-NEXT: %a24.clear1 = and i32 %a24, 16777215 +; CHECK-NEXT: %c = lshr i32 %a24.clear1, 5 + %c = lshr i24 %a24, 5 + + %b24 = zext i16 %a to i24 + %d = lshr i24 %a24, %b24 +; CHECK: %a24.clear2 = and i32 %a24, 16777215 +; CHECK-NEXT: %b24.clear = and i32 %b24, 16777215 +; CHECK-NEXT: %d = lshr i32 %a24.clear2, %b24.clear + ret void +} + +; CHECK-LABEL: @ashr1( +define void @ashr1(i16 %a) { + %a24 = sext i16 %a to i24 +; CHECK: %a24.getsign = shl i32 %a24, 8 +; CHECK-NEXT: %b24 = ashr i32 %a24.getsign, 19 + %b24 = ashr i24 %a24, 11 +; CHECK-NEXT: %a24.getsign1 = shl i32 %a24, 8 +; CHECK-NEXT: %b24.clear = and i32 %b24, 16777215 +; CHECK-NEXT: %a24.shamt = add i32 %b24.clear, 8 +; CHECK-NEXT: %c = ashr i32 %a24.getsign1, %a24.shamt + %c = ashr i24 %a24, %b24 + ret void +} + +; CHECK-LABEL: @udiv1( +define void @udiv1(i32 %a, i32 %b) { +; CHECK-NEXT: %a33 = zext i32 %a to i64 + %a33 = zext i32 %a to i33 +; CHECK-NEXT: %b33 = zext i32 %b to i64 + %b33 = zext i32 %b to i33 
+; CHECK-NEXT: %a33.clear = and i64 %a33, 8589934591 +; CHECK-NEXT: %b33.clear = and i64 %b33, 8589934591 +; CHECK-NEXT: %result = udiv i64 %a33.clear, %b33.clear + %result = udiv i33 %a33, %b33 + ret void +} + +; CHECK-LABEL: @sdiv1( +define void @sdiv1(i32 %a, i32 %b) { +; CHECK-NEXT: %a33 = sext i32 %a to i64 + %a33 = sext i32 %a to i33 +; CHECK-NEXT: %b33 = sext i32 %b to i64 +; CHECK-NEXT: %a33.getsign = shl i64 %a33, 31 +; CHECK-NEXT: %a33.signed = ashr i64 %a33.getsign, 31 +; CHECK-NEXT: %b33.getsign = shl i64 %b33, 31 +; CHECK-NEXT: %b33.signed = ashr i64 %b33.getsign, 31 + %b33 = sext i32 %b to i33 +; CHECK-NEXT: %result = sdiv i64 %a33.signed, %b33.signed + %result = sdiv i33 %a33, %b33 + ret void +} + +; CHECK-LABEL: @urem1( +define void @urem1(i32 %a, i32 %b) { +; CHECK-NEXT: %a33 = zext i32 %a to i64 + %a33 = zext i32 %a to i33 +; CHECK-NEXT: %b33 = zext i32 %b to i64 +; CHECK-NEXT: %a33.clear = and i64 %a33, 8589934591 +; CHECK-NEXT: %b33.clear = and i64 %b33, 8589934591 + %b33 = zext i32 %b to i33 +; CHECK-NEXT: %result = urem i64 %a33.clear, %b33.clear + %result = urem i33 %a33, %b33 + ret void +} + +; CHECK-LABEL: @srem1( +define void @srem1(i32 %a, i32 %b) { +; CHECK-NEXT: %a33 = sext i32 %a to i64 + %a33 = sext i32 %a to i33 +; CHECK-NEXT: %b33 = sext i32 %b to i64 +; CHECK-NEXT: %a33.getsign = shl i64 %a33, 31 +; CHECK-NEXT: %a33.signed = ashr i64 %a33.getsign, 31 +; CHECK-NEXT: %b33.getsign = shl i64 %b33, 31 +; CHECK-NEXT: %b33.signed = ashr i64 %b33.getsign, 31 + %b33 = sext i32 %b to i33 +; CHECK-NEXT: %result = srem i64 %a33.signed, %b33.signed + %result = srem i33 %a33, %b33 + ret void +} + +; CHECK-LABEL: @phi_icmp( +define void @phi_icmp(i32 %a) { +entry: + br label %loop +loop: +; CHECK: %phi40 = phi i64 [ 1099511627774, %entry ], [ %phi40, %loop ] + %phi40 = phi i40 [ -2, %entry ], [ %phi40, %loop ] +; CHECK-NEXT: %phi40.clear = and i64 %phi40, 1099511627775 +; CHECK-NEXT: %b = icmp eq i64 %phi40.clear, 1099511627775 + %b = icmp eq i40 %phi40, -1 +; CHECK-NEXT: br i1 %b, label %loop, label %end + br i1 %b, label %loop, label %end +end: + ret void +} + +; CHECK-LABEL: @icmp_ult( +define void @icmp_ult(i32 %a) { + %a40 = zext i32 %a to i40 +; CHECK: %a40.clear = and i64 %a40, 1099511627775 +; CHECK-NEXT: %b = icmp ult i64 %a40.clear, 1099511627774 + %b = icmp ult i40 %a40, -2 + +; CHECK: %a40.clear1 = and i64 %a40, 1099511627775 +; CHECK-NEXT: %b40.clear = and i64 %b40, 1099511627775 +; CHECK-NEXT: %c = icmp ult i64 %a40.clear1, %b40.clear + %b40 = zext i32 %a to i40 + %c = icmp ult i40 %a40, %b40 + ret void +} + +; CHECK-LABEL: @select1( +define void @select1(i32 %a) { + %a40 = zext i32 %a to i40 +; CHECK: %s40 = select i1 true, i64 %a40, i64 1099511627775 + %s40 = select i1 true, i40 %a40, i40 -1 + ret void +} + +; Allocas are left unchanged. 
+; CHECK-LABEL: @alloca40( +; CHECK: %a = alloca i40, align 8 +define void @alloca40() { + %a = alloca i40, align 8 + %b = bitcast i40* %a to i8* + %c = load i8, i8* %b + ret void +} + +; CHECK-LABEL: @load24( +; CHECK: %bc.loty = bitcast i8* %a to i16* +; CHECK-NEXT: %load.lo = load i16, i16* %bc.loty, align 8 +; CHECK-NEXT: %load.lo.ext = zext i16 %load.lo to i32 +; CHECK-NEXT: %bc.hi = getelementptr i16, i16* %bc.loty, i32 1 +; CHECK-NEXT: %bc.hity = bitcast i16* %bc.hi to i8* +; CHECK-NEXT: %load.hi = load i8, i8* %bc.hity, align 2 +; CHECK-NEXT: %load.hi.ext = zext i8 %load.hi to i32 +; CHECK-NEXT: %load.hi.ext.sh = shl i32 %load.hi.ext, 16 +; CHECK-NEXT: %load = or i32 %load.lo.ext, %load.hi.ext.sh +define void @load24(i8* %a) { + %bc = bitcast i8* %a to i24* + %load = load i24, i24* %bc, align 8 + ret void +} + +; CHECK-LABEL: @load24_overaligned( +; CHECK: %load.lo = load i16, i16* %bc.loty, align 32 +; CHECK: %load.hi = load i8, i8* %bc.hity, align 2 +define void @load24_overaligned(i8* %a) { + %bc = bitcast i8* %a to i24* + %load = load i24, i24* %bc, align 32 + ret void +} + +; CHECK-LABEL: @load48( +; CHECK: %load.lo = load i32, i32* %a, align 8 +; CHECK-NEXT: %load.lo.ext = zext i32 %load.lo to i64 +; CHECK-NEXT: %bc.hi = getelementptr i32, i32* %a, i32 1 +; CHECK-NEXT: %bc.hity = bitcast i32* %bc.hi to i16* +; CHECK-NEXT: %load.hi = load i16, i16* %bc.hity, align 4 +; CHECK-NEXT: %load.hi.ext = zext i16 %load.hi to i64 +; CHECK-NEXT: %load.hi.ext.sh = shl i64 %load.hi.ext, 32 +; CHECK-NEXT: %load = or i64 %load.lo.ext, %load.hi.ext.sh +define void @load48(i32* %a) { + %bc = bitcast i32* %a to i48* + %load = load i48, i48* %bc, align 8 + ret void +} + +; CHECK-LABEL: @load56( +; CHECK: %bc = bitcast i32* %a to i56* +; CHECK-NEXT: %load.lo = load i32, i32* %a, align 8 +; CHECK-NEXT: %load.lo.ext = zext i32 %load.lo to i64 +; CHECK-NEXT: %bc.hi = getelementptr i32, i32* %a, i32 1 +; CHECK-NEXT: %bc.hity = bitcast i32* %bc.hi to i24* +; CHECK-NEXT: %bc.hity.loty = bitcast i32* %bc.hi to i16* +; CHECK-NEXT: %load.hi.lo = load i16, i16* %bc.hity.loty, align 4 +; CHECK-NEXT: %load.hi.lo.ext = zext i16 %load.hi.lo to i32 +; CHECK-NEXT: %bc.hity.hi = getelementptr i16, i16* %bc.hity.loty, i32 1 +; CHECK-NEXT: %bc.hity.hity = bitcast i16* %bc.hity.hi to i8* +; CHECK-NEXT: %load.hi.hi = load i8, i8* %bc.hity.hity, align 2 +; CHECK-NEXT: %load.hi.hi.ext = zext i8 %load.hi.hi to i32 +; CHECK-NEXT: %load.hi.hi.ext.sh = shl i32 %load.hi.hi.ext, 16 +; CHECK-NEXT: %load.hi = or i32 %load.hi.lo.ext, %load.hi.hi.ext.sh +; CHECK-NEXT: %load.hi.ext = zext i32 %load.hi to i64 +; CHECK-NEXT: %load.hi.ext.sh = shl i64 %load.hi.ext, 32 +; CHECK-NEXT: %load = or i64 %load.lo.ext, %load.hi.ext.sh +define void @load56(i32* %a) { + %bc = bitcast i32* %a to i56* + %load = load i56, i56* %bc + ret void +} + +; Ensure that types just above and just below large powers of 2 can be compiled. 
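+; (i2056 is 2048 + 8 bits and i4088 is 4096 - 8 bits, i.e. just above and
+; just below the powers of two 2^11 and 2^12; the CHECK-LABEL lines only
+; verify that such widths survive the pass.)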
+; CHECK-LABEL: @load_large( +define void @load_large(i32* %a) { + %bc1 = bitcast i32* %a to i2056* + %load1 = load i2056, i2056* %bc1 + %bc2 = bitcast i32* %a to i4088* + %load2 = load i4088, i4088* %bc2 + ret void +} + +; CHECK-LABEL: @store24( +; CHECK: %b24 = zext i8 %b to i32 +; CHECK-NEXT: %bc.loty = bitcast i8* %a to i16* +; CHECK-NEXT: %b24.lo = trunc i32 %b24 to i16 +; CHECK-NEXT: store i16 %b24.lo, i16* %bc.loty, align 4 +; CHECK-NEXT: %b24.hi.sh = lshr i32 %b24, 16 +; CHECK-NEXT: %bc.hi = getelementptr i16, i16* %bc.loty, i32 1 +; CHECK-NEXT: %b24.hi = trunc i32 %b24.hi.sh to i8 +; CHECK-NEXT: %bc.hity = bitcast i16* %bc.hi to i8* +; CHECK-NEXT: store i8 %b24.hi, i8* %bc.hity, align 2 +define void @store24(i8* %a, i8 %b) { + %bc = bitcast i8* %a to i24* + %b24 = zext i8 %b to i24 + store i24 %b24, i24* %bc + ret void +} + +; CHECK-LABEL: @store24_overaligned( +; CHECK: store i16 %b24.lo, i16* %bc.loty, align 32 +; CHECK: store i8 %b24.hi, i8* %bc.hity, align 2 +define void @store24_overaligned(i8* %a, i8 %b) { + %bc = bitcast i8* %a to i24* + %b24 = zext i8 %b to i24 + store i24 %b24, i24* %bc, align 32 + ret void +} + +; CHECK-LABEL: @store56( +; CHECK: %b56 = zext i8 %b to i64 +; CHECK-NEXT: %bc.loty = bitcast i8* %a to i32* +; CHECK-NEXT: %b56.lo = trunc i64 %b56 to i32 +; CHECK-NEXT: store i32 %b56.lo, i32* %bc.loty, align 8 +; CHECK-NEXT: %b56.hi.sh = lshr i64 %b56, 32 +; CHECK-NEXT: %bc.hi = getelementptr i32, i32* %bc.loty, i32 1 +; CHECK-NEXT: %bc.hity = bitcast i32* %bc.hi to i24* +; CHECK-NEXT: %bc.hity.loty = bitcast i32* %bc.hi to i16* +; CHECK-NEXT: %b56.hi.sh.lo = trunc i64 %b56.hi.sh to i16 +; CHECK-NEXT: store i16 %b56.hi.sh.lo, i16* %bc.hity.loty, align 4 +; CHECK-NEXT: %b56.hi.sh.hi.sh = lshr i64 %b56.hi.sh, 16 +; CHECK-NEXT: %bc.hity.hi = getelementptr i16, i16* %bc.hity.loty, i32 1 +; CHECK-NEXT: %b56.hi.sh.hi = trunc i64 %b56.hi.sh.hi.sh to i8 +; CHECK-NEXT: %bc.hity.hity = bitcast i16* %bc.hity.hi to i8* +; CHECK-NEXT: store i8 %b56.hi.sh.hi, i8* %bc.hity.hity, align 2 +define void @store56(i8* %a, i8 %b) { + %bc = bitcast i8* %a to i56* + %b56 = zext i8 %b to i56 + store i56 %b56, i56* %bc + ret void +} + +; Ensure that types just above and just below large powers of 2 can be compiled. +; CHECK-LABEL: @store_large( +define void @store_large(i32* %a, i8 %b) { + %bc1 = bitcast i32* %a to i2056* + %b2056 = zext i8 %b to i2056 + store i2056 %b2056, i2056* %bc1 + %bc2 = bitcast i32* %a to i4088* + %b4088 = zext i8 %b to i4088 + store i4088 %b4088, i4088* %bc2 + ret void +} + +; Undef can be converted to anything that's convenient. 
+; CHECK-LABEL: @undefoperand( +; CHECK-NEXT: %a40 = zext i32 %a to i64 +; CHECK-NEXT: %au = and i64 %a40, {{.*}} +define void @undefoperand(i32 %a) { + %a40 = zext i32 %a to i40 + %au = and i40 %a40, undef + ret void +} + +; CHECK-LABEL: @constoperand( +; CHECK-NEXT: %a40 = zext i32 %a to i64 +; CHECK-NEXT: %au = and i64 %a40, 1099494850815 +define void @constoperand(i32 %a) { + %a40 = zext i32 %a to i40 + %au = and i40 %a40, 1099494850815 ; 0xffff0000ff + ret void +} + +; CHECK-LABEL: @switch( +; CHECK-NEXT: %a24 = zext i16 %a to i32 +; CHECK-NEXT: %a24.clear = and i32 %a24, 16777215 +; CHECK-NEXT: switch i32 %a24.clear, label %end [ +; CHECK-NEXT: i32 0, label %if1 +; CHECK-NEXT: i32 1, label %if2 +define void @switch(i16 %a) { + %a24 = zext i16 %a to i24 + switch i24 %a24, label %end [ + i24 0, label %if1 + i24 1, label %if2 + ] +if1: + ret void +if2: + ret void +end: + ret void +} + + +; The getelementptr here should be handled unchanged. +; CHECK-LABEL: @pointer_to_array( +; CHECK: %element_ptr = getelementptr [2 x i40], [2 x i40]* %ptr, i32 0, i32 0 +define void @pointer_to_array([2 x i40]* %ptr) { + %element_ptr = getelementptr [2 x i40], [2 x i40]* %ptr, i32 0, i32 0 + load i40, i40* %element_ptr + ret void +} + +; Store 0x1222277777777 and make sure it's split up into 3 stores of each part. +; CHECK-LABEL: @constants( +; CHECK: store i32 2004318071, i32* %{{.*}}, align 4 +; CHECK: store i16 8738, i16* %{{.*}} +; CHECK: store i8 1, i8* %{{.*}} +define void @constants(i56* %ptr) { + store i56 319006405261175, i56* %ptr, align 4 + ret void +} + +@from = external global [300 x i8], align 4 +@to = external global [300 x i8], align 4 + +; CHECK-LABEL: @load_bc_to_i80( +; CHECK-NEXT: %expanded = bitcast [300 x i8]* @from to i64* +; CHECK-NEXT: %loaded.short.lo = load i64, i64* %expanded, align 4 +; CHECK-NEXT: %loaded.short.lo.ext = zext i64 %loaded.short.lo to i128 +; CHECK-NEXT: %expanded5 = bitcast [300 x i8]* @from to i64* +; CHECK-NEXT: %expanded4 = getelementptr i64, i64* %expanded5, i32 1 +; CHECK-NEXT: %expanded3 = bitcast i64* %expanded4 to i16* +; CHECK-NEXT: %loaded.short.hi = load i16, i16* %expanded3, align 4 +; CHECK-NEXT: %loaded.short.hi.ext = zext i16 %loaded.short.hi to i128 +; CHECK-NEXT: %loaded.short.hi.ext.sh = shl i128 %loaded.short.hi.ext, 64 +; CHECK-NEXT: %loaded.short = or i128 %loaded.short.lo.ext, %loaded.short.hi.ext.sh +; CHECK-NEXT: %loaded.short.lo1 = trunc i128 %loaded.short to i64 +; CHECK-NEXT: %expanded6 = bitcast [300 x i8]* @to to i64* +; CHECK-NEXT: store i64 %loaded.short.lo1, i64* %expanded6, align 4 +; CHECK-NEXT: %loaded.short.hi.sh = lshr i128 %loaded.short, 64 +; CHECK-NEXT: %loaded.short.hi2 = trunc i128 %loaded.short.hi.sh to i16 +; CHECK-NEXT: %expanded9 = bitcast [300 x i8]* @to to i64* +; CHECK-NEXT: %expanded8 = getelementptr i64, i64* %expanded9, i32 1 +; CHECK-NEXT: %expanded7 = bitcast i64* %expanded8 to i16* +; CHECK-NEXT: store i16 %loaded.short.hi2, i16* %expanded7, align 4 +define void @load_bc_to_i80() { + %loaded.short = load i80, i80* bitcast ([300 x i8]* @from to i80*), align 4 + store i80 %loaded.short, i80* bitcast ([300 x i8]* @to to i80*), align 4 + ret void +} diff --git a/test/Transforms/NaCl/remove-asm-memory.ll b/test/Transforms/NaCl/remove-asm-memory.ll new file mode 100644 index 000000000000..cd3f99c83e41 --- /dev/null +++ b/test/Transforms/NaCl/remove-asm-memory.ll @@ -0,0 +1,88 @@ +; RUN: opt < %s -nacl-rewrite-atomics -remove-asm-memory -S | \ +; RUN: FileCheck %s +; RUN: opt < %s -O3 -nacl-rewrite-atomics 
-remove-asm-memory -S | \ +; RUN: FileCheck %s +; RUN: opt < %s -O3 -nacl-rewrite-atomics -remove-asm-memory -S | \ +; RUN: FileCheck %s -check-prefix=ELIM +; RUN: opt < %s -nacl-rewrite-atomics -remove-asm-memory -S | \ +; RUN: FileCheck %s -check-prefix=CLEANED + +; ``asm("":::"memory")`` is used as a compiler barrier and the GCC-style +; builtin ``__sync_synchronize`` is intended as a barrier for all memory +; that could be observed by external threads. They both get rewritten +; for NaCl by Clang to a sequentially-consistent fence surrounded by +; ``call void asm sideeffect "", "~{memory}"``. +; +; The test is also run at O3 to make sure that non-volatile and +; non-atomic loads and stores to escaping objects (i.e. loads and stores +; which could be observed by other threads) don't get unexpectedly +; eliminated. + +; CLEANED-NOT: asm + +target datalayout = "p:32:32:32" + +@a = external global i32 +@b = external global i32 + +; Different triples encode ``asm("":::"memory")``'s "touch everything" +; constraints differently. They should get detected and removed. +define void @memory_assembly_encoding_test() { +; CHECK: @memory_assembly_encoding_test() + call void asm sideeffect "", "~{memory}"() + call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() + call void asm sideeffect "", "~{foo},~{memory},~{bar}"() + + ret void + ; CHECK-NEXT: ret void +} + +define void @memory_assembly_ordering_test() { +; CHECK: @memory_assembly_ordering_test() + %1 = load i32, i32* @a, align 4 + store i32 %1, i32* @b, align 4 + call void asm sideeffect "", "~{memory}"() + fence seq_cst + call void asm sideeffect "", "~{memory}"() + ; CHECK-NEXT: %1 = load i32, i32* @a, align 4 + ; CHECK-NEXT: store i32 %1, i32* @b, align 4 + ; CHECK-NEXT: call void @llvm.nacl.atomic.fence.all() + + ; Redundant load from the previous location, and store to the same + ; location (making the previous one dead). Shouldn't get eliminated + ; because of the fence. + %2 = load i32, i32* @a, align 4 + store i32 %2, i32* @b, align 4 + call void asm sideeffect "", "~{memory}"() + fence seq_cst + call void asm sideeffect "", "~{memory}"() + ; CHECK-NEXT: %2 = load i32, i32* @a, align 4 + ; CHECK-NEXT: store i32 %2, i32* @b, align 4 + ; CHECK-NEXT: call void @llvm.nacl.atomic.fence.all() + + ; Same here. + %3 = load i32, i32* @a, align 4 + store i32 %3, i32* @b, align 4 + ; CHECK-NEXT: %3 = load i32, i32* @a, align 4 + ; CHECK-NEXT: store i32 %3, i32* @b, align 4 + + ret void + ; CHECK-NEXT: ret void +} + +; Same function as above, but without the barriers. At O3 some loads and +; stores should get eliminated. 
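+; (Roughly: the second and third loads of @a are redundant with the first,
+; and the earlier stores to @b become dead once no barrier orders them, so
+; the ELIM checks below expect only a single load/store pair to remain.)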
+define void @memory_ordering_test() { +; ELIM: @memory_ordering_test() + %1 = load i32, i32* @a, align 4 + store i32 %1, i32* @b, align 4 + %2 = load i32, i32* @a, align 4 + store i32 %2, i32* @b, align 4 + %3 = load i32, i32* @a, align 4 + store i32 %3, i32* @b, align 4 + ; ELIM-NEXT: %1 = load i32, i32* @a, align 4 + ; ELIM-NEXT: store i32 %1, i32* @b, align 4 + + ret void + ; ELIM-NEXT: ret void +} diff --git a/test/Transforms/NaCl/replace-ptrs-with-ints.ll b/test/Transforms/NaCl/replace-ptrs-with-ints.ll new file mode 100644 index 000000000000..8a9d3de22179 --- /dev/null +++ b/test/Transforms/NaCl/replace-ptrs-with-ints.ll @@ -0,0 +1,656 @@ +; RUN: opt %s -replace-ptrs-with-ints -S | FileCheck %s + +target datalayout = "p:32:32:32" + + +%struct = type { i32, i32 } + +declare %struct* @addr_taken_func(%struct*) + +@addr_of_func = global %struct* (%struct*)* @addr_taken_func +; CHECK: @addr_of_func = global %struct* (%struct*)* bitcast (i32 (i32)* @addr_taken_func to %struct* (%struct*)*) + +@blockaddr = global i8* blockaddress(@indirectbr, %l1) +; CHECK: @blockaddr = global i8* blockaddress(@indirectbr, %l1) + + +define i8* @pointer_arg(i8* %ptr, i64 %non_ptr) { + ret i8* %ptr +} +; CHECK: define i32 @pointer_arg(i32 %ptr, i64 %non_ptr) { +; CHECK-NEXT: ret i32 %ptr +; CHECK-NEXT: } + + +declare i8* @declared_func(i8*, i64) +; CHECK: declare i32 @declared_func(i32, i64) + + +define void @self_reference_phi(i8* %ptr) { +entry: + br label %loop +loop: + %x = phi i8* [ %x, %loop ], [ %ptr, %entry ] + br label %loop +} +; CHECK: define void @self_reference_phi(i32 %ptr) { +; CHECK: %x = phi i32 [ %x, %loop ], [ %ptr, %entry ] + +; Self-referencing bitcasts are possible in unreachable basic blocks. +; It is not very likely that we will encounter this, but we handle it +; for completeness. +define void @self_reference_bitcast(i8** %dest) { + ret void +unreachable_loop: + store i8* %self_ref, i8** %dest + %self_ref = bitcast i8* %self_ref to i8* + store i8* %self_ref, i8** %dest + br label %unreachable_loop +} +; CHECK: define void @self_reference_bitcast(i32 %dest) { +; CHECK: store i32 undef, i32* %dest.asptr +; CHECK: store i32 undef, i32* %dest.asptr + +define void @circular_reference_bitcasts(i8** %dest) { + ret void +unreachable_loop: + store i8* %cycle1, i8** %dest + %cycle1 = bitcast i8* %cycle2 to i8* + %cycle2 = bitcast i8* %cycle1 to i8* + br label %unreachable_loop +} +; CHECK: define void @circular_reference_bitcasts(i32 %dest) { +; CHECK: store i32 undef, i32* %dest.asptr + +define void @circular_reference_inttoptr(i8** %dest) { + ret void +unreachable_loop: + %ptr = inttoptr i32 %int to i8* + %int = ptrtoint i8* %ptr to i32 + store i8* %ptr, i8** %dest + br label %unreachable_loop +} +; CHECK: define void @circular_reference_inttoptr(i32 %dest) { +; CHECK: store i32 undef, i32* %dest.asptr + +define i8* @forwards_reference(%struct** %ptr) { + br label %block1 +block2: + ; Forwards reference to %val. + %cast = bitcast %struct* %val to i8* + br label %block3 +block1: + %val = load %struct*, %struct** %ptr + br label %block2 +block3: + ; Backwards reference to a forwards reference that has already been + ; resolved. 
+ ret i8* %cast +} +; CHECK: define i32 @forwards_reference(i32 %ptr) { +; CHECK-NEXT: br label %block1 +; CHECK: block2: +; CHECK-NEXT: br label %block3 +; CHECK: block1: +; CHECK-NEXT: %ptr.asptr = inttoptr i32 %ptr to i32* +; CHECK-NEXT: %val = load i32, i32* %ptr.asptr +; CHECK-NEXT: br label %block2 +; CHECK: block3: +; CHECK-NEXT: ret i32 %val + + +define i8* @phi_multiple_entry(i1 %arg, i8* %ptr) { +entry: + br i1 %arg, label %done, label %done +done: + %result = phi i8* [ %ptr, %entry ], [ %ptr, %entry ] + ret i8* %result +} +; CHECK: define i32 @phi_multiple_entry(i1 %arg, i32 %ptr) { +; CHECK: %result = phi i32 [ %ptr, %entry ], [ %ptr, %entry ] + + +define i8* @select(i1 %cond, i8* %val1, i8* %val2) { + %r = select i1 %cond, i8* %val1, i8* %val2 + ret i8* %r +} +; CHECK: define i32 @select(i1 %cond, i32 %val1, i32 %val2) { +; CHECK-NEXT: %r = select i1 %cond, i32 %val1, i32 %val2 + + +define i32* @ptrtoint_same_size(i32* %ptr) { + %a = ptrtoint i32* %ptr to i32 + %b = add i32 %a, 4 + %c = inttoptr i32 %b to i32* + ret i32* %c +} +; CHECK: define i32 @ptrtoint_same_size(i32 %ptr) { +; CHECK-NEXT: %b = add i32 %ptr, 4 +; CHECK-NEXT: ret i32 %b + + +define i32* @ptrtoint_different_size(i32* %ptr) { + %a = ptrtoint i32* %ptr to i64 + %b = add i64 %a, 4 + %c = inttoptr i64 %b to i32* + ret i32* %c +} +; CHECK: define i32 @ptrtoint_different_size(i32 %ptr) { +; CHECK-NEXT: %a = zext i32 %ptr to i64 +; CHECK-NEXT: %b = add i64 %a, 4 +; CHECK-NEXT: %c = trunc i64 %b to i32 +; CHECK-NEXT: ret i32 %c + +define i8 @ptrtoint_truncates_var(i32* %ptr) { + %a = ptrtoint i32* %ptr to i8 + ret i8 %a +} +; CHECK: define i8 @ptrtoint_truncates_var(i32 %ptr) { +; CHECK-NEXT: %a = trunc i32 %ptr to i8 + +define i8 @ptrtoint_truncates_global() { + %a = ptrtoint i32* @var to i8 + ret i8 %a +} +; CHECK: define i8 @ptrtoint_truncates_global() { +; CHECK-NEXT: %expanded = ptrtoint i32* @var to i32 +; CHECK-NEXT: %a = trunc i32 %expanded to i8 + + +define i32* @pointer_bitcast(i64* %ptr) { + %cast = bitcast i64* %ptr to i32* + ret i32* %cast +} +; CHECK: define i32 @pointer_bitcast(i32 %ptr) { +; CHECK-NEXT: ret i32 %ptr + +; Same-type non-pointer bitcasts happen to be left alone by this pass. 
+define i32 @no_op_bitcast(i32 %val) { + %val2 = bitcast i32 %val to i32 + ret i32 %val2 +} +; CHECK: define i32 @no_op_bitcast(i32 %val) { +; CHECK-NEXT: %val2 = bitcast i32 %val to i32 + +define i64 @kept_bitcast(double %d) { + %i = bitcast double %d to i64 + ret i64 %i +} +; CHECK: define i64 @kept_bitcast(double %d) { +; CHECK-NEXT: %i = bitcast double %d to i64 + + +define i32 @constant_pointer_null() { + %val = ptrtoint i32* null to i32 + ret i32 %val +} +; CHECK: define i32 @constant_pointer_null() { +; CHECK-NEXT: ret i32 0 + +define i32 @constant_pointer_undef() { + %val = ptrtoint i32* undef to i32 + ret i32 %val +} +; CHECK: define i32 @constant_pointer_undef() { +; CHECK-NEXT: ret i32 undef + +define i16* @constant_pointer_null_load() { + %val = load i16*, i16** null + ret i16* %val +} +; CHECK: define i32 @constant_pointer_null_load() { +; CHECK-NEXT: %.asptr = inttoptr i32 0 to i32* +; CHECK-NEXT: %val = load i32, i32* %.asptr + +define i16* @constant_pointer_undef_load() { + %val = load i16*, i16** undef + ret i16* %val +} +; CHECK: define i32 @constant_pointer_undef_load() { +; CHECK-NEXT: %.asptr = inttoptr i32 undef to i32* +; CHECK-NEXT: %val = load i32, i32* %.asptr + + +define i8 @load(i8* %ptr) { + %x = load i8, i8* %ptr + ret i8 %x +} +; CHECK: define i8 @load(i32 %ptr) { +; CHECK-NEXT: %ptr.asptr = inttoptr i32 %ptr to i8* +; CHECK-NEXT: %x = load i8, i8* %ptr.asptr + +define void @store(i8* %ptr, i8 %val) { + store i8 %val, i8* %ptr + ret void +} +; CHECK: define void @store(i32 %ptr, i8 %val) { +; CHECK-NEXT: %ptr.asptr = inttoptr i32 %ptr to i8* +; CHECK-NEXT: store i8 %val, i8* %ptr.asptr + + +define i8* @load_ptr(i8** %ptr) { + %x = load i8*, i8** %ptr + ret i8* %x +} +; CHECK: define i32 @load_ptr(i32 %ptr) { +; CHECK-NEXT: %ptr.asptr = inttoptr i32 %ptr to i32* +; CHECK-NEXT: %x = load i32, i32* %ptr.asptr + +define void @store_ptr(i8** %ptr, i8* %val) { + store i8* %val, i8** %ptr + ret void +} +; CHECK: define void @store_ptr(i32 %ptr, i32 %val) { +; CHECK-NEXT: %ptr.asptr = inttoptr i32 %ptr to i32* +; CHECK-NEXT: store i32 %val, i32* %ptr.asptr + + +define i8 @load_attrs(i8* %ptr) { + %x = load atomic volatile i8, i8* %ptr seq_cst, align 128 + ret i8 %x +} +; CHECK: define i8 @load_attrs(i32 %ptr) { +; CHECK-NEXT: %ptr.asptr = inttoptr i32 %ptr to i8* +; CHECK-NEXT: %x = load atomic volatile i8, i8* %ptr.asptr seq_cst, align 128 + +define void @store_attrs(i8* %ptr, i8 %val) { + store atomic volatile i8 %val, i8* %ptr singlethread release, align 256 + ret void +} +; CHECK: define void @store_attrs(i32 %ptr, i8 %val) { +; CHECK-NEXT: %ptr.asptr = inttoptr i32 %ptr to i8* +; CHECK-NEXT: store atomic volatile i8 %val, i8* %ptr.asptr singlethread release, align 256 + + +define i32 @cmpxchg(i32* %ptr, i32 %a, i32 %b) { + %r = cmpxchg i32* %ptr, i32 %a, i32 %b seq_cst seq_cst + %res = extractvalue { i32, i1 } %r, 0 + ret i32 %res +} +; CHECK: define i32 @cmpxchg(i32 %ptr, i32 %a, i32 %b) { +; CHECK-NEXT: %ptr.asptr = inttoptr i32 %ptr to i32* +; CHECK-NEXT: %r = cmpxchg i32* %ptr.asptr, i32 %a, i32 %b seq_cst seq_cst + +define i32 @atomicrmw(i32* %ptr, i32 %x) { + %r = atomicrmw add i32* %ptr, i32 %x seq_cst + ret i32 %r +} +; CHECK: define i32 @atomicrmw(i32 %ptr, i32 %x) { +; CHECK-NEXT: %ptr.asptr = inttoptr i32 %ptr to i32* +; CHECK-NEXT: %r = atomicrmw add i32* %ptr.asptr, i32 %x seq_cst + + +define i8* @indirect_call(i8* (i8*)* %func, i8* %arg) { + %result = call i8* %func(i8* %arg) + ret i8* %result +} +; CHECK: define i32 @indirect_call(i32 %func, i32 
%arg) { +; CHECK-NEXT: %func.asptr = inttoptr i32 %func to i32 (i32)* +; CHECK-NEXT: %result = call i32 %func.asptr(i32 %arg) +; CHECK-NEXT: ret i32 %result + + +; Test forwards reference +define i8* @direct_call1(i8* %arg) { + %result = call i8* @direct_call2(i8* %arg) + ret i8* %result +} +; CHECK: define i32 @direct_call1(i32 %arg) { +; CHECK-NEXT: %result = call i32 @direct_call2(i32 %arg) +; CHECK-NEXT: ret i32 %result + +; Test backwards reference +define i8* @direct_call2(i8* %arg) { + %result = call i8* @direct_call1(i8* %arg) + ret i8* %result +} +; CHECK: define i32 @direct_call2(i32 %arg) { +; CHECK-NEXT: %result = call i32 @direct_call1(i32 %arg) +; CHECK-NEXT: ret i32 %result + + +@var = global i32 0 + +define i32* @get_addr_of_global() { + ret i32* @var +} +; CHECK: define i32 @get_addr_of_global() { +; CHECK-NEXT: %expanded = ptrtoint i32* @var to i32 +; CHECK-NEXT: ret i32 %expanded + +define %struct* (%struct*)* @get_addr_of_func() { + ret %struct* (%struct*)* @addr_taken_func +} +; CHECK: define i32 @get_addr_of_func() { +; CHECK-NEXT: %expanded = ptrtoint i32 (i32)* @addr_taken_func to i32 +; CEHCK-NEXT: ret i32 %expanded + + +define i32 @load_global() { + %val = load i32, i32* @var + ret i32 %val +} +; CHECK: define i32 @load_global() { +; CHECK-NEXT: %val = load i32, i32* @var +; CHECK-NEXT: ret i32 %val + +define i16 @load_global_bitcast() { + %ptr = bitcast i32* @var to i16* + %val = load i16, i16* %ptr + ret i16 %val +} +; CHECK: define i16 @load_global_bitcast() { +; CHECK-NEXT: %var.bc = bitcast i32* @var to i16* +; CHECK-NEXT: %val = load i16, i16* %var.bc +; CHECK-NEXT: ret i16 %val + + +; Check that unsimplified allocas are properly handled: +declare void @receive_alloca(%struct* %ptr) + +define void @unsimplified_alloca() { + %a = alloca %struct + call void @receive_alloca(%struct* %a) + unreachable +} +; CHECK-LABEL: define void @unsimplified_alloca() +; CHECK-NEXT: %a = alloca %struct +; CHECK-NEXT: %a.asint = ptrtoint %struct* %a to i32 +; CHECK-NEXT: call void @receive_alloca(i32 %a.asint) +; CHECK-NEXT: unreachable + + +define i1 @compare(i8* %ptr1, i8* %ptr2) { + %cmp = icmp ult i8* %ptr1, %ptr2 + ret i1 %cmp +} +; CHECK: define i1 @compare(i32 %ptr1, i32 %ptr2) { +; CHECK-NEXT: %cmp = icmp ult i32 %ptr1, %ptr2 + + +declare i8* @llvm.some.intrinsic(i8* %ptr) + +define i8* @preserve_intrinsic_type(i8* %ptr) { + %result = call i8* @llvm.some.intrinsic(i8* %ptr) + ret i8* %result +} +; CHECK: define i32 @preserve_intrinsic_type(i32 %ptr) { +; CHECK-NEXT: %ptr.asptr = inttoptr i32 %ptr to i8* +; CHECK-NEXT: %result = call i8* @llvm.some.intrinsic(i8* %ptr.asptr) +; CHECK-NEXT: %result.asint = ptrtoint i8* %result to i32 +; CHECK-NEXT: ret i32 %result.asint + + +; Just check that the pass does not crash on inline asm. +define i16* @inline_asm1(i8* %ptr) { + %val = call i16* asm "foo", "=r,r"(i8* %ptr) + ret i16* %val +} + +define i16** @inline_asm2(i8** %ptr) { + %val = call i16** asm "foo", "=r,r"(i8** %ptr) + ret i16** %val +} + + +declare void @llvm.dbg.declare(metadata, metadata, metadata) +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) + +define void @debug_declare(i32 %val) { + ; We normally expect llvm.dbg.declare to be used on an alloca. 
+ %var = alloca i32 + call void @llvm.dbg.declare(metadata i32* %var, metadata !11, metadata !12), !dbg !13 + call void @llvm.dbg.declare(metadata i32 %val, metadata !14, metadata !12), !dbg !13 + ret void +} +; CHECK: define void @debug_declare(i32 %val) { +; CHECK-NEXT: %var = alloca i32 +; CHECK-NEXT: call void @llvm.dbg.declare(metadata i32* %var, metadata !11, metadata !12), !dbg !13 +; This case is currently not converted. +; CHECK-NEXT: call void @llvm.dbg.declare(metadata !2, metadata !14, metadata !12) +; CHECK-NEXT: ret void + +; For now, debugging info for values is lost. replaceAllUsesWith() +; does not work for metadata references -- it converts them to nulls. +; This makes dbg.value too tricky to handle for now. +define void @debug_value(i32 %val, i8* %ptr) { + tail call void @llvm.dbg.value(metadata i32 %val, i64 1, metadata !11, metadata !12), !dbg !18 + tail call void @llvm.dbg.value(metadata i8* %ptr, i64 2, metadata !14, metadata !12), !dbg !18 + +; check that we don't crash when encountering odd things: + tail call void @llvm.dbg.value(metadata i8* null, i64 3, metadata !11, metadata !12), !dbg !18 + tail call void @llvm.dbg.value(metadata i8* undef, i64 4, metadata !11, metadata !12), !dbg !18 + tail call void @llvm.dbg.value(metadata !{}, i64 5, metadata !11, metadata !12), !dbg !18 + ret void +} +; CHECK: define void @debug_value(i32 %val, i32 %ptr) { +; CHECK-NEXT: call void @llvm.dbg.value(metadata !2, i64 1, metadata !11, metadata !12) +; CHECK-NEXT: call void @llvm.dbg.value(metadata !2, i64 2, metadata !14, metadata !12) +; CHECK-NEXT: call void @llvm.dbg.value(metadata i8* null, i64 3, metadata !11, metadata !12) +; CHECK-NEXT: call void @llvm.dbg.value(metadata i8* undef, i64 4, metadata !11, metadata !12) +; CHECK-NEXT: call void @llvm.dbg.value(metadata !2, i64 5, metadata !11, metadata !12) +; CHECK-NEXT: ret void + + +declare void @llvm.lifetime.start(i64 %size, i8* %ptr) +declare {}* @llvm.invariant.start(i64 %size, i8* %ptr) +declare void @llvm.invariant.end({}* %start, i64 %size, i8* %ptr) + +; GVN can introduce the following horrible corner case of a lifetime +; marker referencing a PHI node. But we convert the phi to i32 type, +; and lifetime.start doesn't work on an inttoptr converting an i32 phi +; to a pointer. Because of this, we just strip out all lifetime +; markers. 
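+; (Sketch of the problem: after this pass the phi below becomes an i32 phi,
+; so keeping the marker would require something like
+;   %phi.asptr = inttoptr i32 %phi to i8*
+;   call void @llvm.lifetime.start(i64 -1, i8* %phi.asptr)
+; which no longer refers directly to the alloca, hence the markers are
+; simply dropped.)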
+ +define void @alloca_lifetime_via_phi() { +entry: + %buf = alloca i8 + br label %block +block: + %phi = phi i8* [ %buf, %entry ] + call void @llvm.lifetime.start(i64 -1, i8* %phi) + ret void +} +; CHECK: define void @alloca_lifetime_via_phi() { +; CHECK: %phi = phi i32 [ %buf.asint, %entry ] +; CHECK-NEXT: ret void + +define void @alloca_lifetime() { + %buf = alloca i8 + call void @llvm.lifetime.start(i64 -1, i8* %buf) + ret void +} +; CHECK: define void @alloca_lifetime() { +; CHECK-NEXT: %buf = alloca i8 +; CHECK-NEXT: ret void + +define void @alloca_lifetime_via_bitcast() { + %buf = alloca i32 + %buf_cast = bitcast i32* %buf to i8* + call void @llvm.lifetime.start(i64 -1, i8* %buf_cast) + ret void +} +; CHECK: define void @alloca_lifetime_via_bitcast() { +; CHECK-NEXT: %buf = alloca i32 +; CHECK-NEXT: ret void + + +define void @strip_invariant_markers() { + %buf = alloca i8 + %start = call {}* @llvm.invariant.start(i64 1, i8* %buf) + call void @llvm.invariant.end({}* %start, i64 1, i8* %buf) + ret void +} +; CHECK: define void @strip_invariant_markers() { +; CHECK-NEXT: %buf = alloca i8 +; CHECK-NEXT: ret void + + +; "nocapture" and "noalias" only apply to pointers, so must be stripped. +define void @nocapture_attr(i8* nocapture noalias %ptr) { + ret void +} +; CHECK: define void @nocapture_attr(i32 %ptr) { + + +define void @readonly_readnone(i8* readonly dereferenceable_or_null(4)) { + ret void +} +; CHECK-LABEL: define void @readonly_readnone(i32) + +define nonnull i8* @nonnull_ptr(i8* nonnull) { + ret i8* undef +} +; CHECK-LABEL: define i32 @nonnull_ptr(i32) + +define dereferenceable(16) i8* @dereferenceable_ptr(i8* dereferenceable(8)) { + ret i8* undef +} +; CHECK-LABEL: define i32 @dereferenceable_ptr(i32) + +; "nounwind" should be preserved. +define void @nounwind_func_attr() nounwind { + ret void +} +; CHECK: define void @nounwind_func_attr() [[NOUNWIND:#[0-9]+]] { + +define void @nounwind_call_attr() { + call void @nounwind_func_attr() nounwind + ret void +} +; CHECK: define void @nounwind_call_attr() { +; CHECK: call void @nounwind_func_attr() {{.*}}[[NOUNWIND]] + +define fastcc void @fastcc_func() { + ret void +} +; CHECK: define fastcc void @fastcc_func() { + +define void @fastcc_call() { + call fastcc void @fastcc_func() + ret void +} +; CHECK: define void @fastcc_call() { +; CHECK-NEXT: call fastcc void @fastcc_func() + +define void @tail_call() { + tail call void @tail_call() + ret void +} +; CHECK: define void @tail_call() +; CHECK-NEXT: tail call void @tail_call() + + +; Just check that the pass does not crash on getelementptr. (The pass +; should not depend unnecessarily on ExpandGetElementPtr having been +; run.) +define i8* @getelementptr(i8, i8* %ptr) { + %gep = getelementptr i8, i8* %ptr, i32 10 + ret i8* %gep +} + +; Just check that the pass does not crash on va_arg. 
+define i32* @va_arg(i8* %valist) { + %r = va_arg i8* %valist, i32* + ret i32* %r +} + + +define void @indirectbr(i8* %addr) { + indirectbr i8* %addr, [ label %l1, label %l2 ] +l1: + ret void +l2: + ret void +} +; CHECK: define void @indirectbr(i32 %addr) { +; CHECK-NEXT: %addr.asptr = inttoptr i32 %addr to i8* +; CHECK-NEXT: indirectbr i8* %addr.asptr, [label %l1, label %l2] + + +define i8* @invoke(i8* %val) { + %result = invoke i8* @direct_call1(i8* %val) + to label %cont unwind label %lpad +cont: + ret i8* %result +lpad: + %lp = landingpad { i8*, i32 } personality void (i8*)* @personality_func cleanup + %p = extractvalue { i8*, i32 } %lp, 0 + %s = insertvalue { i8*, i32 } %lp, i8* %val, 0 + ret i8* %p +} +; CHECK: define i32 @invoke(i32 %val) { +; CHECK-NEXT: %result = invoke i32 @direct_call1(i32 %val) +; CHECK-NEXT: to label %cont unwind label %lpad +; CHECK: %lp = landingpad { i8*, i32 } personality void (i8*)* bitcast (void (i32)* @personality_func to void (i8*)*) +; CHECK: %p = extractvalue { i8*, i32 } %lp, 0 +; CHECK-NEXT: %p.asint = ptrtoint i8* %p to i32 +; CHECK-NEXT: %val.asptr = inttoptr i32 %val to i8* +; CHECK-NEXT: %s = insertvalue { i8*, i32 } %lp, i8* %val.asptr, 0 +; CHECK-NEXT: ret i32 %p.asint + +define void @personality_func(i8* %arg) { + ret void +} + + +declare i32 @llvm.eh.typeid.for(i8*) + +@typeid = global i32 0 + +; The argument here must be left as a bitcast, otherwise the backend +; rejects it. +define void @typeid_for() { + %bc = bitcast i32* @typeid to i8* + call i32 @llvm.eh.typeid.for(i8* %bc) + ret void +} +; CHECK: define void @typeid_for() { +; CHECK-NEXT: %typeid.bc = bitcast i32* @typeid to i8* +; CHECK-NEXT: call i32 @llvm.eh.typeid.for(i8* %typeid.bc) + + +; Subprogram debug metadata may refer to a function. +; Make sure those are updated too. 
+; Regenerate the debug info from the following C program: +; void nop(void *ptr) { +; } + +define void @nop(i8* %ptr) { + tail call void @llvm.dbg.value(metadata i8* %ptr, i64 0, metadata !11, metadata !12), !dbg !19 + ret void, !dbg !19 +} +; CHECK: define void @nop(i32 %ptr) { +; CHECK-NEXT: call void @llvm.dbg.value{{.*}} +; CHECK-NEXT: ret void + + +; CHECK: attributes {{.*}}[[NOUNWIND]] = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!8, !9} +!llvm.ident = !{!10} + +; CHECK: !4 = !MDSubprogram(name: "debug_declare", scope: !1, file: !1, line: 1, type: !5, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, function: void (i32)* @debug_declare, variables: !2) + +!0 = !MDCompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.7.0 (trunk 235150) (llvm/trunk 235152)", isOptimized: false, runtimeVersion: 0, emissionKind: 1, enums: !2, retainedTypes: !2, subprograms: !3, globals: !2, imports: !2) +!1 = !MDFile(filename: "foo.c", directory: "/s/llvm/cmakebuild") +!2 = !{} +!3 = !{!4} +!4 = !MDSubprogram(name: "debug_declare", scope: !1, file: !1, line: 1, type: !5, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, function: void (i32)* @debug_declare, variables: !2) +!5 = !MDSubroutineType(types: !6) +!6 = !{null, !7} +!7 = !MDBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) +!8 = !{i32 2, !"Dwarf Version", i32 4} +!9 = !{i32 2, !"Debug Info Version", i32 3} +!10 = !{!"clang version 3.7.0 (trunk 235150) (llvm/trunk 235152)"} +!11 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "val", arg: 1, scope: !4, file: !1, line: 1, type: !7) +!12 = !MDExpression() +!13 = !MDLocation(line: 1, column: 24, scope: !4) + +!14 = !MDLocalVariable(tag: DW_TAG_auto_variable, name: "var", scope: !4, file: !1, line: 2, type: !15) +!15 = !MDCompositeType(tag: DW_TAG_array_type, baseType: !7, align: 32, elements: !16) +!16 = !{!17} +!17 = !MDSubrange(count: -1) +!18 = !MDLocation(line: 2, column: 11, scope: !4) +!19 = !MDLocation(line: 2, column: 3, scope: !4) diff --git a/test/Transforms/NaCl/resolve-aliases.ll b/test/Transforms/NaCl/resolve-aliases.ll new file mode 100644 index 000000000000..82ad54d74e95 --- /dev/null +++ b/test/Transforms/NaCl/resolve-aliases.ll @@ -0,0 +1,36 @@ +; RUN: opt < %s -resolve-aliases -S | FileCheck %s + +; CHECK-NOT: @alias + +@r1 = internal global i32 zeroinitializer +@a1 = alias i32* @r1 +define i32* @usea1() { +; CHECK: ret i32* @r1 + ret i32* @a1 +} + +@funcalias = alias i32* ()* @usea1 +; CHECK: @usefuncalias +define void @usefuncalias() { +; CHECK: call i32* @usea1 + %1 = call i32* @funcalias() + ret void +} + +@bc1 = global i8* bitcast (i32* @r1 to i8*) +@bcalias = alias i8* bitcast (i32* @r1 to i8*) + +; CHECK: @usebcalias +define i8* @usebcalias() { +; CHECK: ret i8* bitcast (i32* @r1 to i8*) + ret i8* @bcalias +} + + +@fa2 = alias i32* ()* @funcalias +; CHECK: @usefa2 +define void @usefa2() { +; CHECK: call i32* @usea1 + call i32* @fa2() + ret void +} diff --git a/test/Transforms/NaCl/resolve-pnacl-intrinsics-lock-free.ll b/test/Transforms/NaCl/resolve-pnacl-intrinsics-lock-free.ll new file mode 100644 index 000000000000..ba4f6e6a1c7f --- /dev/null +++ b/test/Transforms/NaCl/resolve-pnacl-intrinsics-lock-free.ll @@ -0,0 +1,99 @@ +; RUN: opt < %s -resolve-pnacl-intrinsics -mtriple=x86_64 -S | FileCheck %s -check-prefix=CLEANED +; 'CLEANED' only needs to check a single architecture. 
+; RUN: opt < %s -resolve-pnacl-intrinsics -mtriple=x86_64 -S | FileCheck %s -check-prefix=X8664 +; RUN: opt < %s -resolve-pnacl-intrinsics -mtriple=i386 -S | FileCheck %s -check-prefix=X8632 +; RUN: opt < %s -resolve-pnacl-intrinsics -mtriple=arm -S | FileCheck %s -check-prefix=ARM32 +; RUN: opt < %s -resolve-pnacl-intrinsics -mtriple=mipsel -S | FileCheck %s -check-prefix=MIPS32 +; RUN: opt < %s -resolve-pnacl-intrinsics -mtriple=asmjs -S | FileCheck %s -check-prefix=ASMJS + +; CLEANED-NOT: call {{.*}} @llvm.nacl.atomic + +declare i32 @llvm.nacl.setjmp(i8*) +declare void @llvm.nacl.longjmp(i8*, i32) +declare i1 @llvm.nacl.atomic.is.lock.free(i32, i8*) + +; These declarations must be here because the function pass expects +; to find them. In real life they're inserted by the translator +; before the function pass runs. +declare i32 @setjmp(i8*) +declare void @longjmp(i8*, i32) + + +; X8664-LABEL: @test_is_lock_free_1( +; X8632-LABEL: @test_is_lock_free_1( +; ARM32-LABEL: @test_is_lock_free_1( +; MIPS32-LABEL: @test_is_lock_free_1( +; ASMJS-LABEL: @test_is_lock_free_1( +; X8664: ret i1 true +; X8632: ret i1 true +; ARM32: ret i1 true +; MIPS32: ret i1 true +; ASMJS: ret i1 true +define i1 @test_is_lock_free_1(i8* %ptr) { + %res = call i1 @llvm.nacl.atomic.is.lock.free(i32 1, i8* %ptr) + ret i1 %res +} + +; X8664-LABEL: @test_is_lock_free_2( +; X8632-LABEL: @test_is_lock_free_2( +; ARM32-LABEL: @test_is_lock_free_2( +; MIPS32-LABEL: @test_is_lock_free_2( +; ASMJS-LABEL: @test_is_lock_free_2( +; X8664: ret i1 true +; X8632: ret i1 true +; ARM32: ret i1 true +; MIPS32: ret i1 true +; ASMJS: ret i1 true +define i1 @test_is_lock_free_2(i16* %ptr) { + %ptr2 = bitcast i16* %ptr to i8* + %res = call i1 @llvm.nacl.atomic.is.lock.free(i32 2, i8* %ptr2) + ret i1 %res +} + +; X8664-LABEL: @test_is_lock_free_4( +; X8632-LABEL: @test_is_lock_free_4( +; ARM32-LABEL: @test_is_lock_free_4( +; MIPS32-LABEL: @test_is_lock_free_4( +; ASMJS-LABEL: @test_is_lock_free_4( +; X8664: ret i1 true +; X8632: ret i1 true +; ARM32: ret i1 true +; MIPS32: ret i1 true +; ASMJS: ret i1 true +define i1 @test_is_lock_free_4(i32* %ptr) { + %ptr2 = bitcast i32* %ptr to i8* + %res = call i1 @llvm.nacl.atomic.is.lock.free(i32 4, i8* %ptr2) + ret i1 %res +} + +; X8664-LABEL: @test_is_lock_free_8( +; X8632-LABEL: @test_is_lock_free_8( +; ARM32-LABEL: @test_is_lock_free_8( +; MIPS32-LABEL: @test_is_lock_free_8( +; ASMJS-LABEL: @test_is_lock_free_8( +; X8664: ret i1 true +; X8632: ret i1 true +; ARM32: ret i1 true +; MIPS32: ret i1 false +; ASMJS: ret i1 false +define i1 @test_is_lock_free_8(i64* %ptr) { + %ptr2 = bitcast i64* %ptr to i8* + %res = call i1 @llvm.nacl.atomic.is.lock.free(i32 8, i8* %ptr2) + ret i1 %res +} + +; X8664-LABEL: @test_is_lock_free_16( +; X8632-LABEL: @test_is_lock_free_16( +; ARM32-LABEL: @test_is_lock_free_16( +; MIPS32-LABEL: @test_is_lock_free_16( +; ASMJS-LABEL: @test_is_lock_free_16( +; X8664: ret i1 false +; X8632: ret i1 false +; ARM32: ret i1 false +; MIPS32: ret i1 false +; ASMJS: ret i1 false +define i1 @test_is_lock_free_16(i128* %ptr) { + %ptr2 = bitcast i128* %ptr to i8* + %res = call i1 @llvm.nacl.atomic.is.lock.free(i32 16, i8* %ptr2) + ret i1 %res +} diff --git a/test/Transforms/NaCl/resolve-pnacl-intrinsics.ll b/test/Transforms/NaCl/resolve-pnacl-intrinsics.ll new file mode 100644 index 000000000000..8e2bbb66df4b --- /dev/null +++ b/test/Transforms/NaCl/resolve-pnacl-intrinsics.ll @@ -0,0 +1,293 @@ +; RUN: opt < %s -resolve-pnacl-intrinsics -S | FileCheck %s \ +; RUN: -check-prefix=CLEANED 
+; RUN: opt < %s -resolve-pnacl-intrinsics -S | FileCheck %s + +; CLEANED-NOT: call i32 @llvm.nacl.setjmp +; CLEANED-NOT: call void @llvm.nacl.longjmp +; CLEANED-NOT: call {{.*}} @llvm.nacl.atomic + +declare i32 @llvm.nacl.setjmp(i8*) +declare void @llvm.nacl.longjmp(i8*, i32) + +; Intrinsic name mangling is based on overloaded parameters only, +; including return type. Note that all pointers parameters are +; overloaded on type-pointed-to in Intrinsics.td, and are therefore +; mangled on the type-pointed-to only. +declare i8 @llvm.nacl.atomic.load.i8(i8*, i32) +declare i16 @llvm.nacl.atomic.load.i16(i16*, i32) +declare i32 @llvm.nacl.atomic.load.i32(i32*, i32) +declare i64 @llvm.nacl.atomic.load.i64(i64*, i32) +declare void @llvm.nacl.atomic.store.i8(i8, i8*, i32) +declare void @llvm.nacl.atomic.store.i16(i16, i16*, i32) +declare void @llvm.nacl.atomic.store.i32(i32, i32*, i32) +declare void @llvm.nacl.atomic.store.i64(i64, i64*, i32) +declare i8 @llvm.nacl.atomic.rmw.i8(i32, i8*, i8, i32) +declare i16 @llvm.nacl.atomic.rmw.i16(i32, i16*, i16, i32) +declare i32 @llvm.nacl.atomic.rmw.i32(i32, i32*, i32, i32) +declare i64 @llvm.nacl.atomic.rmw.i64(i32, i64*, i64, i32) +declare i8 @llvm.nacl.atomic.cmpxchg.i8(i8*, i8, i8, i32, i32) +declare i16 @llvm.nacl.atomic.cmpxchg.i16(i16*, i16, i16, i32, i32) +declare i32 @llvm.nacl.atomic.cmpxchg.i32(i32*, i32, i32, i32, i32) +declare i64 @llvm.nacl.atomic.cmpxchg.i64(i64*, i64, i64, i32, i32) +declare void @llvm.nacl.atomic.fence(i32) +declare void @llvm.nacl.atomic.fence.all() + +; These declarations must be here because the function pass expects +; to find them. In real life they're inserted by the translator +; before the function pass runs. +declare i32 @setjmp(i8*) +declare void @longjmp(i8*, i32) + +; For correctness, the resulting call must get the "returns_twice" attribute. +define i32 @call_setjmp(i8* %arg) { + %val = call i32 @llvm.nacl.setjmp(i8* %arg) +; CHECK: %val = call i32 @setjmp(i8* %arg) [[RETURNS_TWICE:#[0-9]+]] + ret i32 %val +} + +define void @call_longjmp(i8* %arg, i32 %num) { + call void @llvm.nacl.longjmp(i8* %arg, i32 %num) +; CHECK: call void @longjmp(i8* %arg, i32 %num){{$}} + ret void +} + +; atomics. 
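+; (In the calls below, the final i32 argument(s) encode the memory order:
+; 3, 4, 5 and 6 map to acquire, release, acq_rel and seq_cst respectively
+; in the CHECK lines.)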
+ +; CHECK-LABEL: @test_atomic_acquire +define i32 @test_atomic_acquire(i32* %ptr) { + ; CHECK: %1 = load atomic i32, i32* %ptr acquire, align 4 + %1 = call i32 @llvm.nacl.atomic.load.i32(i32* %ptr, i32 3) + ret i32 %1 +} + +; CHECK-LABEL: @test_atomic_release +define void @test_atomic_release(i32* %ptr, i32 %value) { + ; CHECK: store atomic i32 %value, i32* %ptr release, align 4 + call void @llvm.nacl.atomic.store.i32(i32 %value, i32* %ptr, i32 4) + ret void +} + +; CHECK-LABEL: @test_atomic_acquire_release +define i32 @test_atomic_acquire_release(i32* %ptr, i32 %value) { + ; CHECK: %1 = atomicrmw add i32* %ptr, i32 %value acq_rel + %1 = call i32 @llvm.nacl.atomic.rmw.i32(i32 1, i32* %ptr, i32 %value, i32 5) + ret i32 %1 +} + +; CHECK-LABEL: @test_fetch_and_add_i32 +define i32 @test_fetch_and_add_i32(i32* %ptr, i32 %value) { + ; CHECK: %1 = atomicrmw add i32* %ptr, i32 %value seq_cst + %1 = call i32 @llvm.nacl.atomic.rmw.i32(i32 1, i32* %ptr, i32 %value, i32 6) + ret i32 %1 +} + +; CHECK-LABEL: @test_fetch_and_sub_i32 +define i32 @test_fetch_and_sub_i32(i32* %ptr, i32 %value) { + ; CHECK: %1 = atomicrmw sub i32* %ptr, i32 %value seq_cst + %1 = call i32 @llvm.nacl.atomic.rmw.i32(i32 2, i32* %ptr, i32 %value, i32 6) + ret i32 %1 +} + +; CHECK-LABEL: @test_fetch_and_or_i32 +define i32 @test_fetch_and_or_i32(i32* %ptr, i32 %value) { + ; CHECK: %1 = atomicrmw or i32* %ptr, i32 %value seq_cst + %1 = call i32 @llvm.nacl.atomic.rmw.i32(i32 3, i32* %ptr, i32 %value, i32 6) + ret i32 %1 +} + +; CHECK-LABEL: @test_fetch_and_and_i32 +define i32 @test_fetch_and_and_i32(i32* %ptr, i32 %value) { + ; CHECK: %1 = atomicrmw and i32* %ptr, i32 %value seq_cst + %1 = call i32 @llvm.nacl.atomic.rmw.i32(i32 4, i32* %ptr, i32 %value, i32 6) + ret i32 %1 +} + +; CHECK-LABEL: @test_fetch_and_xor_i32 +define i32 @test_fetch_and_xor_i32(i32* %ptr, i32 %value) { + ; CHECK: %1 = atomicrmw xor i32* %ptr, i32 %value seq_cst + %1 = call i32 @llvm.nacl.atomic.rmw.i32(i32 5, i32* %ptr, i32 %value, i32 6) + ret i32 %1 +} + +; Test different compare-and-swap patterns that commonly occur and are a bit +; tricky because the PNaCl intrinsic only returns the value whereas the LLVM +; intrinsic also returns the success flag (equivalent to comparing the oldval +; with what was just loaded). 
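+;
+; Roughly, a value-returning PNaCl cmpxchg followed by an explicit comparison
+; (illustrative names):
+;   %old = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* %p, i32 %exp, i32 %new, i32 6, i32 6)
+;   %ok = icmp eq i32 %old, %exp
+; should collapse into a single native cmpxchg whose { i32, i1 } result
+; supplies both the loaded value and the success flag, as the CHECK lines
+; below verify.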
+ +; CHECK-LABEL: @test_val_compare_and_swap_i32 +define i32 @test_val_compare_and_swap_i32(i32* %ptr, i32 %oldval, i32 %newval) { + ; CHECK: %1 = cmpxchg i32* %ptr, i32 %oldval, i32 %newval seq_cst seq_cst + ; CHECK-NEXT: %2 = extractvalue { i32, i1 } %1, 0 + ; CHECK-NEXT: ret i32 %2 + %1 = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* %ptr, i32 %oldval, i32 %newval, i32 6, i32 6) + ret i32 %1 +} + +; CHECK-LABEL: @test_val_compare_and_swap_i32_new +define i32 @test_val_compare_and_swap_i32_new(i32* %ptr, i32 %oldval, i32 %newval) { + ; CHECK: %1 = cmpxchg i32* %ptr, i32 %oldval, i32 %newval seq_cst seq_cst + ; CHECK-NEXT: %res2 = extractvalue { i32, i1 } %1, 0 + ; CHECK-NEXT: ret i32 %res2 + %res = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* %ptr, i32 %oldval, i32 %newval, i32 6, i32 6) + %success = icmp eq i32 %res, %oldval + %res.insert.value = insertvalue { i32, i1 } undef, i32 %res, 0 + %res.insert.success = insertvalue { i32, i1 } %res.insert.value, i1 %success, 1 + %val = extractvalue { i32, i1 } %res.insert.success, 0 + ret i32 %val +} + +; CHECK-LABEL: @test_bool_compare_and_swap_i32 +define i1 @test_bool_compare_and_swap_i32(i32* %ptr, i32 %oldval, i32 %newval) { + ; CHECK: %1 = cmpxchg i32* %ptr, i32 %oldval, i32 %newval seq_cst seq_cst + ; CHECK-NEXT: %success = extractvalue { i32, i1 } %1, 1 + ; CHECK-NEXT: ret i1 %success + %1 = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* %ptr, i32 %oldval, i32 %newval, i32 6, i32 6) + %2 = icmp eq i32 %1, %oldval + ret i1 %2 +} + +; CHECK-LABEL: @test_bool_compare_and_swap_i32_new +define i1 @test_bool_compare_and_swap_i32_new(i32* %ptr, i32 %oldval, i32 %newval) { + ; CHECK: %1 = cmpxchg i32* %ptr, i32 %oldval, i32 %newval seq_cst seq_cst + ; CHECK-NEXT: %suc = extractvalue { i32, i1 } %1, 1 + ; CHECK-NEXT: ret i1 %suc + %res = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* %ptr, i32 %oldval, i32 %newval, i32 6, i32 6) + %success = icmp eq i32 %res, %oldval + %res.insert.value = insertvalue { i32, i1 } undef, i32 %res, 0 + %res.insert.success = insertvalue { i32, i1 } %res.insert.value, i1 %success, 1 + %suc = extractvalue { i32, i1 } %res.insert.success, 1 + ret i1 %suc +} + +; CHECK-LABEL: @test_bool_compare_and_swap_i32_reordered +define i1 @test_bool_compare_and_swap_i32_reordered(i32* %ptr, i32 %oldval, i32 %newval) { + ; CHECK: %1 = cmpxchg i32* %ptr, i32 %oldval, i32 %newval seq_cst seq_cst + ; CHECK-NEXT: %success = extractvalue { i32, i1 } %1, 1 + ; CHECK-NEXT: ret i1 %success + %1 = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* %ptr, i32 %oldval, i32 %newval, i32 6, i32 6) + %2 = icmp eq i32 %oldval, %1 ; Note operands are swapped from above. + ret i1 %2 +} + +; CHECK-LABEL: @test_struct_compare_and_swap_i32 +define { i32, i1 } @test_struct_compare_and_swap_i32(i32* %ptr, i32 %oldval, i32 %newval) { + ; CHECK: %1 = cmpxchg i32* %ptr, i32 %oldval, i32 %newval seq_cst seq_cst + ; CHECK-NEXT: ret { i32, i1 } %1 + %1 = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* %ptr, i32 %oldval, i32 %newval, i32 6, i32 6) + %2 = icmp eq i32 %1, %oldval + %3 = insertvalue { i32, i1 } undef, i32 %1, 0 + %4 = insertvalue { i32, i1 } %3, i1 %2, 1 + ret { i32, i1 } %4 +} + +; Test all allowed cmpxchg success/failure memory orderings. 
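+;
+; The two trailing i32 arguments are the (success, failure) orderings, encoded
+; as above: (6,6) = seq_cst/seq_cst, (6,3) = seq_cst/acquire, and
+; (3,3) = acquire/acquire. LLVM's cmpxchg requires that the failure ordering
+; not be release or acq_rel and (in this LLVM version) not be stronger than
+; the success ordering, which is presumably why only these combinations are
+; exercised.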
+ +; CHECK-LABEL: @test_cmpxchg_seqcst_seqcst +define i32 @test_cmpxchg_seqcst_seqcst(i32* %ptr, i32 %oldval, i32 %newval) { + ; CHECK: %1 = cmpxchg i32* %ptr, i32 %oldval, i32 %newval seq_cst seq_cst + %1 = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* %ptr, i32 %oldval, i32 %newval, i32 6, i32 6) + ret i32 %1 +} + +; CHECK-LABEL: @test_cmpxchg_seqcst_acquire +define i32 @test_cmpxchg_seqcst_acquire(i32* %ptr, i32 %oldval, i32 %newval) { + ; CHECK: %1 = cmpxchg i32* %ptr, i32 %oldval, i32 %newval seq_cst acquire + %1 = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* %ptr, i32 %oldval, i32 %newval, i32 6, i32 3) + ret i32 %1 +} + +; CHECK-LABEL: @test_cmpxchg_acquire_acquire +define i32 @test_cmpxchg_acquire_acquire(i32* %ptr, i32 %oldval, i32 %newval) { + ; CHECK: %1 = cmpxchg i32* %ptr, i32 %oldval, i32 %newval acquire acquire + %1 = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* %ptr, i32 %oldval, i32 %newval, i32 3, i32 3) + ret i32 %1 +} + +; CHECK-LABEL: @test_c11_fence +define void @test_c11_fence() { + ; CHECK: fence seq_cst + call void @llvm.nacl.atomic.fence(i32 6) + ret void +} + +; CHECK-LABEL: @test_synchronize +define void @test_synchronize() { + ; CHECK: call void asm sideeffect "", "~{memory}"() + ; CHECK: fence seq_cst + ; CHECK: call void asm sideeffect "", "~{memory}"() + call void @llvm.nacl.atomic.fence.all() + ret void +} + +; CHECK-LABEL: @test_lock_test_and_set_i32 +define i32 @test_lock_test_and_set_i32(i32* %ptr, i32 %value) { + ; CHECK: %1 = atomicrmw xchg i32* %ptr, i32 %value seq_cst + %1 = call i32 @llvm.nacl.atomic.rmw.i32(i32 6, i32* %ptr, i32 %value, i32 6) + ret i32 %1 +} + +; CHECK-LABEL: @test_lock_release_i32 +define void @test_lock_release_i32(i32* %ptr) { + ; Note that the 'release' was changed to a 'seq_cst'. + ; CHECK: store atomic i32 0, i32* %ptr seq_cst, align 4 + call void @llvm.nacl.atomic.store.i32(i32 0, i32* %ptr, i32 6) + ret void +} + +; CHECK-LABEL: @test_atomic_load_i8 +define zeroext i8 @test_atomic_load_i8(i8* %ptr) { + ; CHECK: %1 = load atomic i8, i8* %ptr seq_cst, align 1 + %1 = call i8 @llvm.nacl.atomic.load.i8(i8* %ptr, i32 6) + ret i8 %1 +} + +; CHECK-LABEL: @test_atomic_store_i8 +define void @test_atomic_store_i8(i8* %ptr, i8 zeroext %value) { + ; CHECK: store atomic i8 %value, i8* %ptr seq_cst, align 1 + call void @llvm.nacl.atomic.store.i8(i8 %value, i8* %ptr, i32 6) + ret void +} + +; CHECK-LABEL: @test_atomic_load_i16 +define zeroext i16 @test_atomic_load_i16(i16* %ptr) { + ; CHECK: %1 = load atomic i16, i16* %ptr seq_cst, align 2 + %1 = call i16 @llvm.nacl.atomic.load.i16(i16* %ptr, i32 6) + ret i16 %1 +} + +; CHECK-LABEL: @test_atomic_store_i16 +define void @test_atomic_store_i16(i16* %ptr, i16 zeroext %value) { + ; CHECK: store atomic i16 %value, i16* %ptr seq_cst, align 2 + call void @llvm.nacl.atomic.store.i16(i16 %value, i16* %ptr, i32 6) + ret void +} + +; CHECK-LABEL: @test_atomic_load_i32 +define i32 @test_atomic_load_i32(i32* %ptr) { + ; CHECK: %1 = load atomic i32, i32* %ptr seq_cst, align 4 + %1 = call i32 @llvm.nacl.atomic.load.i32(i32* %ptr, i32 6) + ret i32 %1 +} + +; CHECK-LABEL: @test_atomic_store_i32 +define void @test_atomic_store_i32(i32* %ptr, i32 %value) { + ; CHECK: store atomic i32 %value, i32* %ptr seq_cst, align 4 + call void @llvm.nacl.atomic.store.i32(i32 %value, i32* %ptr, i32 6) + ret void +} + +; CHECK-LABEL: @test_atomic_load_i64 +define i64 @test_atomic_load_i64(i64* %ptr) { + ; CHECK: %1 = load atomic i64, i64* %ptr seq_cst, align 8 + %1 = call i64 @llvm.nacl.atomic.load.i64(i64* %ptr, i32 6) + ret 
i64 %1 +} + +; CHECK-LABEL: @test_atomic_store_i64 +define void @test_atomic_store_i64(i64* %ptr, i64 %value) { + ; CHECK: store atomic i64 %value, i64* %ptr seq_cst, align 8 + call void @llvm.nacl.atomic.store.i64(i64 %value, i64* %ptr, i32 6) + ret void +} + +; CHECK: attributes [[RETURNS_TWICE]] = { returns_twice } diff --git a/test/Transforms/NaCl/rewrite-assume.ll b/test/Transforms/NaCl/rewrite-assume.ll new file mode 100644 index 000000000000..50e5d2bb6ff3 --- /dev/null +++ b/test/Transforms/NaCl/rewrite-assume.ll @@ -0,0 +1,35 @@ +; RUN: opt < %s -rewrite-llvm-intrinsic-calls -S | FileCheck %s +; RUN: opt < %s -rewrite-llvm-intrinsic-calls -S | FileCheck %s -check-prefix=CLEANED +; Test the @llvm.assume part of the RewriteLLVMIntrinsics pass + +declare void @llvm.assume(i1) + +; No declaration or definition of llvm.assume() should remain. +; CLEANED-NOT: @llvm.assume + +define void @call_assume(i1 %val) { +; CHECK: call_assume +; CHECK-NEXT: ret void + call void @llvm.assume(i1 %val) + ret void +} + +; A more complex example with a number of calls in several BBs. +define void @multiple_calls(i1 %val) { +; CHECK: multiple_calls +entryblock: +; CHECK: entryblock +; CHECK-NEXT: br + call void @llvm.assume(i1 %val) + br i1 %val, label %exitblock, label %never +never: +; CHECK: never: +; CHECK-NEXT: br + call void @llvm.assume(i1 %val) + br label %exitblock +exitblock: +; CHECK: exitblock: +; CHECK-NEXT: ret void + call void @llvm.assume(i1 %val) + ret void +} diff --git a/test/Transforms/NaCl/rewrite-call-with-libfunc-argument.ll b/test/Transforms/NaCl/rewrite-call-with-libfunc-argument.ll new file mode 100644 index 000000000000..56ee2d2c078e --- /dev/null +++ b/test/Transforms/NaCl/rewrite-call-with-libfunc-argument.ll @@ -0,0 +1,18 @@ +; RUN: opt < %s -rewrite-pnacl-library-calls -S | FileCheck %s + +; See https://code.google.com/p/nativeclient/issues/detail?id=3706 +; Make sure that when @longjmp is used as an argument in a call instruction, +; the rewrite pass does the right thing and doesn't get confused. + +; CHECK: define internal void @longjmp(i64* %env, i32 %val) { + +declare void @longjmp(i64*, i32) + +declare void @somefunc(i32, void (i64*, i32)*, i32) + +define void @foo() { +entry: + call void @somefunc(i32 1, void (i64*, i32)* @longjmp, i32 2) +; CHECK: call void @somefunc(i32 1, void (i64*, i32)* @longjmp, i32 2) + ret void +} diff --git a/test/Transforms/NaCl/rewrite-flt-rounds.ll b/test/Transforms/NaCl/rewrite-flt-rounds.ll new file mode 100644 index 000000000000..cb1a7e4a9924 --- /dev/null +++ b/test/Transforms/NaCl/rewrite-flt-rounds.ll @@ -0,0 +1,38 @@ +; RUN: opt < %s -rewrite-llvm-intrinsic-calls -S | FileCheck %s +; RUN: opt < %s -rewrite-llvm-intrinsic-calls -S | FileCheck %s -check-prefix=CLEANED +; Test the @llvm.flt.rounds part of the RewriteLLVMIntrinsics pass + +declare i32 @llvm.flt.rounds() + +; No declaration or definition of llvm.flt.rounds() should remain. +; CLEANED-NOT: @llvm.flt.rounds + +define i32 @call_flt_rounds() { +; CHECK: call_flt_rounds +; CHECK-NEXT: ret i32 1 + %val = call i32 @llvm.flt.rounds() + ret i32 %val +} + +; A more complex example with a number of calls in several BBs. 
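+; Each @llvm.flt.rounds call is expected to fold to the constant 1, so the
+; CHECK lines below match adds whose rewritten operands are 1.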
+define i32 @multiple_calls(i64* %arg, i32 %num) { +; CHECK: multiple_calls +entryblock: +; CHECK: entryblock + %v1 = call i32 @llvm.flt.rounds() + br label %block1 +block1: +; CHECK: block1: +; CHECK-NEXT: %v3 = add i32 1, 1 + %v2 = call i32 @llvm.flt.rounds() + %v3 = add i32 %v2, %v1 + br label %exitblock +exitblock: +; CHECK: exitblock: +; CHECK-NEXT: %v4 = add i32 1, %v3 +; CHECK-NEXT: %v6 = add i32 1, %v4 + %v4 = add i32 %v2, %v3 + %v5 = call i32 @llvm.flt.rounds() + %v6 = add i32 %v5, %v4 + ret i32 %v6 +} diff --git a/test/Transforms/NaCl/rewrite-libcalls-wrong-signature.ll b/test/Transforms/NaCl/rewrite-libcalls-wrong-signature.ll new file mode 100644 index 000000000000..3ab64d9dd26e --- /dev/null +++ b/test/Transforms/NaCl/rewrite-libcalls-wrong-signature.ll @@ -0,0 +1,38 @@ +; RUN: opt < %s -rewrite-pnacl-library-calls -S | FileCheck %s +; Check how the pass behaves in the presence of library functions with wrong +; signatures. + +declare i8 @longjmp(i64) + +@flongjmp = global i8 (i64)* @longjmp +; CHECK: @flongjmp = global i8 (i64)* bitcast (void (i64*, i32)* @longjmp to i8 (i64)*) + +; CHECK: define internal void @longjmp(i64* %env, i32 %val) + +declare i8* @memcpy(i32) + +define i8* @call_bad_memcpy(i32 %arg) { + %result = call i8* @memcpy(i32 %arg) + ret i8* %result +} + +; CHECK: define i8* @call_bad_memcpy(i32 %arg) { +; CHECK: %result = call i8* bitcast (i8* (i8*, i8*, i32)* @memcpy to i8* (i32)*)(i32 %arg) + +declare i8 @setjmp() + +; This simulates a case where the original C file had a correct setjmp +; call but due to linking order a wrong declaration made it into the +; IR. In this case, the correct call is bitcasted to the correct type. +; The pass should treat this properly by creating a direct intrinsic +; call instead of going through the wrapper. +define i32 @call_valid_setjmp(i64* %buf) { + %result = call i32 bitcast (i8 ()* @setjmp to i32 (i64*)*)(i64* %buf) + ret i32 %result +} + +; CHECK: define i32 @call_valid_setjmp(i64* %buf) { +; CHECK-NEXT: %jmp_buf_i8 = bitcast i64* %buf to i8* +; CHECK-NEXT: %result = call i32 @llvm.nacl.setjmp(i8* %jmp_buf_i8) +; CHECK-NEXT: ret i32 %result +; CHECK-NEXT: } diff --git a/test/Transforms/NaCl/rewrite-longjmp-no-store.ll b/test/Transforms/NaCl/rewrite-longjmp-no-store.ll new file mode 100644 index 000000000000..134593ad3971 --- /dev/null +++ b/test/Transforms/NaCl/rewrite-longjmp-no-store.ll @@ -0,0 +1,16 @@ +; RUN: opt < %s -rewrite-pnacl-library-calls -S | FileCheck %s +; RUN: opt < %s -rewrite-pnacl-library-calls -S | FileCheck %s -check-prefix=CLEANED +; Test that when there are no uses other than calls to longjmp, +; no function body is generated. + +declare void @longjmp(i64*, i32) + +; No declaration or definition of longjmp() should remain. +; CLEANED-NOT: @longjmp + +define void @call_longjmp(i64* %arg, i32 %num) { + call void @longjmp(i64* %arg, i32 %num) +; CHECK: call void @llvm.nacl.longjmp(i8* %jmp_buf_i8, i32 %num) + ret void +} + diff --git a/test/Transforms/NaCl/rewrite-longjmp-noncall-uses.ll b/test/Transforms/NaCl/rewrite-longjmp-noncall-uses.ll new file mode 100644 index 000000000000..ed7818ec9688 --- /dev/null +++ b/test/Transforms/NaCl/rewrite-longjmp-noncall-uses.ll @@ -0,0 +1,21 @@ +; RUN: opt < %s -rewrite-pnacl-library-calls -S | FileCheck %s +; Check that the rewrite pass behaves correctly in the presence +; of various uses of longjmp that are not calls. 
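+; Because @longjmp is referenced here other than by direct calls, the pass
+; cannot simply rewrite call sites; the CHECK below verifies that it emits a
+; wrapper body ("define internal void @longjmp") instead.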
+ +@fp = global void (i64*, i32)* @longjmp, align 8 +; CHECK: @fp = global void (i64*, i32)* @longjmp, align 8 +@arrfp = global [3 x void (i64*, i32)*] [void (i64*, i32)* null, void (i64*, i32)* @longjmp, void (i64*, i32)* null], align 16 +; CHECK: @arrfp = global [3 x void (i64*, i32)*] [void (i64*, i32)* null, void (i64*, i32)* @longjmp, void (i64*, i32)* null], align 16 + +; CHECK: define internal void @longjmp(i64* %env, i32 %val) { + +declare void @longjmp(i64*, i32) + +declare void @somefunc(i8*) + +define void @foo() { +entry: + call void @somefunc(i8* bitcast (void (i64*, i32)* @longjmp to i8*)) +; CHECK: call void @somefunc(i8* bitcast (void (i64*, i32)* @longjmp to i8*)) + ret void +} diff --git a/test/Transforms/NaCl/rewrite-memfuncs-no-store.ll b/test/Transforms/NaCl/rewrite-memfuncs-no-store.ll new file mode 100644 index 000000000000..e661fae83af5 --- /dev/null +++ b/test/Transforms/NaCl/rewrite-memfuncs-no-store.ll @@ -0,0 +1,34 @@ +; RUN: opt < %s -rewrite-pnacl-library-calls -S | FileCheck %s +; RUN: opt < %s -rewrite-pnacl-library-calls -S | FileCheck %s -check-prefix=CLEANED + +declare i8* @memcpy(i8*, i8*, i32) +declare i8* @memmove(i8*, i8*, i32) +declare i8* @memset(i8*, i32, i32) + +; No declaration or definition of the library functions should remain, since +; the only uses of mem* functions are calls. +; CLEANED-NOT: @memcpy +; CLEANED-NOT: @memmove +; CLEANED-NOT: @memset + +define i8* @call_memcpy(i8* %dest, i8* %src, i32 %len) { + %result = call i8* @memcpy(i8* %dest, i8* %src, i32 %len) + ret i8* %result +; CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %src, i32 %len, i32 1, i1 false) +; CHECK: ret i8* %dest +} + +define i8* @call_memmove(i8* %dest, i8* %src, i32 %len) { + %result = call i8* @memmove(i8* %dest, i8* %src, i32 %len) + ret i8* %result +; CHECK: call void @llvm.memmove.p0i8.p0i8.i32(i8* %dest, i8* %src, i32 %len, i32 1, i1 false) +; CHECK: ret i8* %dest +} + +define i8* @call_memset(i8* %dest, i32 %c, i32 %len) { + %result = call i8* @memset(i8* %dest, i32 %c, i32 %len) + ret i8* %result +; CHECK: %trunc_byte = trunc i32 %c to i8 +; CHECK: call void @llvm.memset.p0i8.i32(i8* %dest, i8 %trunc_byte, i32 %len, i32 1, i1 false) +; CHECK: ret i8* %dest +} diff --git a/test/Transforms/NaCl/rewrite-memfuncs-noncall-uses.ll b/test/Transforms/NaCl/rewrite-memfuncs-noncall-uses.ll new file mode 100644 index 000000000000..5c6bdfdcb596 --- /dev/null +++ b/test/Transforms/NaCl/rewrite-memfuncs-noncall-uses.ll @@ -0,0 +1,30 @@ +; RUN: opt < %s -rewrite-pnacl-library-calls -S | FileCheck %s +; Check that the rewrite pass behaves correctly in the presence +; of various uses of mem* that are not calls. 
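+; As with longjmp above, taking the address of the mem* functions forces the
+; pass to emit wrapper bodies; the CHECKs below verify that those bodies
+; forward to the corresponding @llvm.mem* intrinsics.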
+ +@fpcpy = global i8* (i8*, i8*, i32)* @memcpy +; CHECK: @fpcpy = global i8* (i8*, i8*, i32)* @memcpy +@fpmove = global i8* (i8*, i8*, i32)* @memmove +; CHECK: @fpmove = global i8* (i8*, i8*, i32)* @memmove +@fpset = global i8* (i8*, i32, i32)* @memset +; CHECK: @fpset = global i8* (i8*, i32, i32)* @memset + +; CHECK: define internal i8* @memcpy(i8* %dest, i8* %src, i32 %len) { +; CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %src, i32 %len, i32 1, i1 false) +; CHECK: ret i8* %dest +; CHECK: } + +; CHECK: define internal i8* @memmove(i8* %dest, i8* %src, i32 %len) { +; CHECK: call void @llvm.memmove.p0i8.p0i8.i32(i8* %dest, i8* %src, i32 %len, i32 1, i1 false) +; CHECK: ret i8* %dest +; CHECK: } + +; CHECK: define internal i8* @memset(i8* %dest, i32 %val, i32 %len) { +; CHECK: %trunc_byte = trunc i32 %val to i8 +; CHECK: call void @llvm.memset.p0i8.i32(i8* %dest, i8 %trunc_byte, i32 %len, i32 1, i1 false) +; CHECK: ret i8* %dest +; CHECK: } + +declare i8* @memcpy(i8*, i8*, i32) +declare i8* @memmove(i8*, i8*, i32) +declare i8* @memset(i8*, i32, i32) diff --git a/test/Transforms/NaCl/rewrite-prefetch.ll b/test/Transforms/NaCl/rewrite-prefetch.ll new file mode 100644 index 000000000000..0826bd32c988 --- /dev/null +++ b/test/Transforms/NaCl/rewrite-prefetch.ll @@ -0,0 +1,35 @@ +; RUN: opt < %s -rewrite-llvm-intrinsic-calls -S | FileCheck %s +; RUN: opt < %s -rewrite-llvm-intrinsic-calls -S | FileCheck %s -check-prefix=CLEANED +; Test the @llvm.prefetch part of the RewriteLLVMIntrinsics pass + +declare void @llvm.prefetch(i8 *%ptr, i32 %rw, i32 %locality, i32 %cache_type) + +; No declaration or definition of llvm.prefetch() should remain. +; CLEANED-NOT: @llvm.prefetch + +define void @call_prefetch(i8 *%ptr) { +; CHECK: call_prefetch +; CHECK-NEXT: ret void + call void @llvm.prefetch(i8 *%ptr, i32 0, i32 0, i32 0) + ret void +} + +; A more complex example with a number of calls in several BBs. +define void @multiple_calls(i8 *%ptr) { +; CHECK: multiple_calls +entryblock: +; CHECK: entryblock +; CHECK-NEXT: br + call void @llvm.prefetch(i8 *%ptr, i32 1, i32 2, i32 1) + br label %block1 +block1: +; CHECK: block1: +; CHECK-NEXT: br + call void @llvm.prefetch(i8 *%ptr, i32 0, i32 1, i32 0) + br label %exitblock +exitblock: +; CHECK: exitblock: +; CHECK-NEXT: ret void + call void @llvm.prefetch(i8 *%ptr, i32 1, i32 3, i32 1) + ret void +} diff --git a/test/Transforms/NaCl/rewrite-setjmp-store-error.ll b/test/Transforms/NaCl/rewrite-setjmp-store-error.ll new file mode 100644 index 000000000000..05d7dedefba3 --- /dev/null +++ b/test/Transforms/NaCl/rewrite-setjmp-store-error.ll @@ -0,0 +1,13 @@ +; RUN: not opt < %s -rewrite-pnacl-library-calls -S 2>&1 | FileCheck %s +; Test that the pass enforces not being able to store the address +; of setjmp. 
+ +declare i32 @setjmp(i64*) + +define i32 @takeaddr_setjmp(i64* %arg) { + %fp = alloca i32 (i64*)*, align 8 +; CHECK: Taking the address of setjmp is invalid + store i32 (i64*)* @setjmp, i32 (i64*)** %fp, align 8 + ret i32 7 +} + diff --git a/test/Transforms/NaCl/rewrite-setlongjmp-calls.ll b/test/Transforms/NaCl/rewrite-setlongjmp-calls.ll new file mode 100644 index 000000000000..f34f004d7f39 --- /dev/null +++ b/test/Transforms/NaCl/rewrite-setlongjmp-calls.ll @@ -0,0 +1,76 @@ +; RUN: opt < %s -rewrite-pnacl-library-calls -S | FileCheck %s +; RUN: opt < %s -rewrite-pnacl-library-calls -S | FileCheck %s -check-prefix=CLEANED +; Test the RewritePNaClLibraryCalls pass + +declare i32 @setjmp(i64*) +declare void @longjmp(i64*, i32) + +; No declaration or definition of setjmp() should remain. +; CLEANED-NOT: @setjmp + +; Since the address of longjmp is being taken here, a body is generated +; for it, which does a cast and calls an intrinsic + +; CHECK: define internal void @longjmp(i64* %env, i32 %val) { +; CHECK: entry: +; CHECK: %jmp_buf_i8 = bitcast i64* %env to i8* +; CHECK: call void @llvm.nacl.longjmp(i8* %jmp_buf_i8, i32 %val) +; CHECK: unreachable +; CHECK: } + +define i32 @call_setjmp(i64* %arg) { +; CHECK-NOT: call i32 @setjmp +; CHECK: %jmp_buf_i8 = bitcast i64* %arg to i8* +; CHECK-NEXT: %val = call i32 @llvm.nacl.setjmp(i8* %jmp_buf_i8) + %val = call i32 @setjmp(i64* %arg) + ret i32 %val +} + +define void @call_longjmp(i64* %arg, i32 %num) { +; CHECK-NOT: call void @longjmp +; CHECK: %jmp_buf_i8 = bitcast i64* %arg to i8* +; CHECK-NEXT: call void @llvm.nacl.longjmp(i8* %jmp_buf_i8, i32 %num) + call void @longjmp(i64* %arg, i32 %num) + ret void +} + +define i32 @takeaddr_longjmp(i64* %arg, i32 %num) { + %fp = alloca void (i64*, i32)*, align 8 +; CHECK: store void (i64*, i32)* @longjmp, void (i64*, i32)** %fp + store void (i64*, i32)* @longjmp, void (i64*, i32)** %fp, align 8 + ret i32 7 +} + +; A more complex example with a number of calls in several BBs +define void @multiple_calls(i64* %arg, i32 %num) { +entryblock: +; CHECK: entryblock +; CHECK: bitcast +; CHECK-NEXT: call void @llvm.nacl.longjmp + call void @longjmp(i64* %arg, i32 %num) + br label %block1 +block1: +; CHECK: block1 +; CHECK: bitcast +; CHECK-NEXT: call void @llvm.nacl.longjmp + call void @longjmp(i64* %arg, i32 %num) +; CHECK: call i32 @llvm.nacl.setjmp + %val = call i32 @setjmp(i64* %arg) + %num2 = add i32 %val, %num +; CHECK: bitcast +; CHECK-NEXT: call void @llvm.nacl.longjmp + call void @longjmp(i64* %arg, i32 %num2) + br label %exitblock +exitblock: + %num3 = add i32 %num, %num + call void @longjmp(i64* %arg, i32 %num3) + %num4 = add i32 %num, %num3 +; CHECK: bitcast +; CHECK-NEXT: call void @llvm.nacl.longjmp + call void @longjmp(i64* %arg, i32 %num4) + ret void +} + +; CHECK: declare i32 @llvm.nacl.setjmp(i8*) +; CHECK: declare void @llvm.nacl.longjmp(i8*, i32) + diff --git a/test/Transforms/NaCl/simplify-allocas.ll b/test/Transforms/NaCl/simplify-allocas.ll new file mode 100644 index 000000000000..fab28064c9c3 --- /dev/null +++ b/test/Transforms/NaCl/simplify-allocas.ll @@ -0,0 +1,207 @@ +; RUN: opt < %s -simplify-allocas -S | FileCheck %s + +target datalayout = "p:32:32:32" + +%struct = type { i32, i32 } + +declare void @receive_alloca(%struct* %ptr) +declare void @receive_vector_alloca(<4 x i32>* %ptr) + +define void @alloca_fixed() { + %buf = alloca %struct, align 128 + call void @receive_alloca(%struct* %buf) + ret void +} +; CHECK-LABEL: define void @alloca_fixed() { +; CHECK-NEXT: %buf = alloca i8, 
i32 8, align 128 +; CHECK-NEXT: %buf.bc = bitcast i8* %buf to %struct* +; CHECK-NEXT: call void @receive_alloca(%struct* %buf.bc) + +; When the size passed to alloca is a constant, it should be a +; constant in the output too. +define void @alloca_fixed_array() { + %buf = alloca %struct, i32 100 + call void @receive_alloca(%struct* %buf) + ret void +} +; CHECK-LABEL: define void @alloca_fixed_array() { +; CHECK-NEXT: %buf = alloca i8, i32 800, align 8 +; CHECK-NEXT: %buf.bc = bitcast i8* %buf to %struct* +; CHECK-NEXT: call void @receive_alloca(%struct* %buf.bc) + +define void @alloca_fixed_vector() { + %buf = alloca <4 x i32>, align 128 + call void @receive_vector_alloca(<4 x i32>* %buf) + ret void +} +; CHECK-LABEL: define void @alloca_fixed_vector() { +; CHECK-NEXT: %buf = alloca i8, i32 16, align 128 +; CHECK-NEXT: %buf.bc = bitcast i8* %buf to <4 x i32>* +; CHECK-NEXT: call void @receive_vector_alloca(<4 x i32>* %buf.bc) + +define void @alloca_variable(i32 %size) { + %buf = alloca %struct, i32 %size + call void @receive_alloca(%struct* %buf) + ret void +} +; CHECK-LABEL: define void @alloca_variable(i32 %size) { +; CHECK-NEXT: %buf.alloca_mul = mul i32 8, %size +; CHECK-NEXT: %buf = alloca i8, i32 %buf.alloca_mul +; CHECK-NEXT: %buf.bc = bitcast i8* %buf to %struct* +; CHECK-NEXT: call void @receive_alloca(%struct* %buf.bc) + +define void @alloca_alignment_i32() { + %buf = alloca i32 + ret void +} +; CHECK-LABEL: void @alloca_alignment_i32() { +; CHECK-NEXT: alloca i8, i32 4, align 4 + +define void @alloca_alignment_double() { + %buf = alloca double + ret void +} +; CHECK-LABEL: void @alloca_alignment_double() { +; CHECK-NEXT: alloca i8, i32 8, align 8 + +define void @alloca_lower_alignment() { + %buf = alloca i32, align 1 + ret void +} +; CHECK-LABEL: void @alloca_lower_alignment() { +; CHECK-NEXT: alloca i8, i32 4, align 1 + +define void @alloca_array_trunc() { + %a = alloca i32, i64 1024 + unreachable +} +; CHECK-LABEL: define void @alloca_array_trunc() +; CHECK-NEXT: %a = alloca i8, i32 4096 + +define void @alloca_array_zext() { + %a = alloca i32, i8 128 + unreachable +} +; CHECK-LABEL: define void @alloca_array_zext() +; CHECK-NEXT: %a = alloca i8, i32 512 + +define void @dyn_alloca_array_trunc(i64 %a) { + %b = alloca i32, i64 %a + unreachable +} +; CHECK-LABEL: define void @dyn_alloca_array_trunc(i64 %a) +; CHECK-NEXT: trunc i64 %a to i32 +; CHECK-NEXT: mul i32 4, +; CHECK-NEXT: alloca i8, i32 + +define void @dyn_alloca_array_zext(i8 %a) { + %b = alloca i32, i8 %a + unreachable +} +; CHECK-LABEL: define void @dyn_alloca_array_zext(i8 %a) +; CHECK-NEXT: zext i8 %a to i32 +; CHECK-NEXT: mul i32 4, +; CHECK-NEXT: alloca i8, i32 + +define void @dyn_inst_alloca_array(i32 %a) { + %b = add i32 1, %a + %c = alloca i32, i32 %b + unreachable +} +; CHECK-LABEL: define void @dyn_inst_alloca_array(i32 %a) +; CHECK-NEXT: %b = add i32 1, %a +; CHECK-NEXT: mul i32 4, %b +; CHECK-NEXT: %c = alloca i8, i32 + +define void @dyn_inst_alloca_array_trunc(i64 %a) { + %b = add i64 1, %a + %c = alloca i32, i64 %b + unreachable +} +; CHECK-LABEL: define void @dyn_inst_alloca_array_trunc(i64 %a) +; CHECK-NEXT: %b = add i64 1, %a +; CHECK-NEXT: trunc i64 %b to i32 +; CHECK-NEXT: mul i32 4, +; CHECK-NEXT: %c = alloca i8, i32 + +define void @dyn_inst_alloca_array_zext(i8 %a) { + %b = add i8 1, %a + %c = alloca i32, i8 %b + unreachable +} +; CHECK-LABEL: define void @dyn_inst_alloca_array_zext(i8 %a) +; CHECK-NEXT: %b = add i8 1, %a +; CHECK-NEXT: zext i8 %b to i32 +; CHECK-NEXT: mul i32 4, +; CHECK-NEXT: %c = 
alloca i8, i32 + +declare void @llvm.dbg.declare(metadata, metadata, metadata) +define void @debug_declare() { + %var = alloca i32 + call void @llvm.dbg.declare(metadata i32* %var, metadata !12, metadata !13), !dbg !14 + unreachable +} +; Ensure that the first arg to dbg.declare points to the alloca, not the bitcast +; CHECK-LABEL: define void @debug_declare +; CHECK-NEXT: %var = alloca i8, i32 4 +; CHECK: call void @llvm.dbg.declare(metadata i8* %var, metadata !12, metadata !13), !dbg !14 + +define void @debug_declare_morecasts() { + %var = alloca i32, i32 2, align 8 + %other_bc = bitcast i32* %var to i64* + %other_bc2 = bitcast i64* %other_bc to i16* + call void @llvm.dbg.declare(metadata i16* %other_bc2, metadata !15, metadata !13), !dbg !16 + unreachable +} +; Ensure that the first arg to dbg.declare points to the alloca, not bitcasts +; CHECK-LABEL: define void @debug_declare_morecasts +; CHECK-NEXT: %var = alloca i8, i32 8, align 8 +; CHECK: call void @llvm.dbg.declare(metadata i8* %var, metadata !15, metadata !13), !dbg !16 + +define void @debug_declare_inttoptr() { + %var = alloca i32, i32 2, align 8 + %i = ptrtoint i32* %var to i32 + %p = inttoptr i32 %i to i8* + call void @llvm.dbg.declare(metadata i8* %p, metadata !15, metadata !13), !dbg !16 + unreachable +} +; Ensure that we can look through ptrtoint/inttoptr +; CHECK-LABEL: define void @debug_declare_inttoptr +; CHECK-NEXT: alloca i8, i32 8, align 8 +; CHECK: call void @llvm.dbg.declare(metadata i8* %var, metadata !15, metadata !13), !dbg !16 + +declare i8* @foo() +define void @debug_declare_noalloca() { + %call = tail call i8* @foo() + %config_.i.i = getelementptr inbounds i8, i8* %call, i32 104, !dbg !16 + %bc = bitcast i8* %config_.i.i to i16*, !dbg !16 + tail call void @llvm.dbg.declare(metadata i16* %bc, metadata !15, metadata !13), !dbg !16 + unreachable +} +; Don't modify dbg.declares which don't ultimately point to an alloca. 
+; CHECK-LABEL: define void @debug_declare_noalloca() +; CHECK: call void @llvm.dbg.declare(metadata i16* %bc, metadata !15, metadata !13), !dbg !16 + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!9, !10} +!llvm.ident = !{!11} + +; CHECK: !4 = !MDSubprogram(name: "debug_declare", scope: !1, file: !1, line: 1, type: !5, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, function: void ()* @debug_declare, variables: !2) + +!0 = !MDCompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.7.0 (trunk 235150) (llvm/trunk 235152)", isOptimized: false, runtimeVersion: 0, emissionKind: 1, enums: !2, retainedTypes: !2, subprograms: !3, globals: !2, imports: !2) +!1 = !MDFile(filename: "foo.c", directory: "/s/llvm/cmakebuild") +!2 = !{} +!3 = !{!4, !8} +!4 = !MDSubprogram(name: "debug_declare", scope: !1, file: !1, line: 1, type: !5, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, function: void ()* @debug_declare, variables: !2) +!5 = !MDSubroutineType(types: !6) +!6 = !{null, !7} +!7 = !MDBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) +!8 = !MDSubprogram(name: "debug_declare_morecasts", scope: !1, file: !1, line: 8, type: !5, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: false, function: void ()* @debug_declare_morecasts, variables: !2) +!9 = !{i32 2, !"Dwarf Version", i32 4} +!10 = !{i32 2, !"Debug Info Version", i32 3} +!11 = !{!"clang version 3.7.0 (trunk 235150) (llvm/trunk 235152)"} +!12 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "val", arg: 1, scope: !4, file: !1, line: 1, type: !7) +!13 = !MDExpression() +!14 = !MDLocation(line: 1, column: 24, scope: !4) +!15 = !MDLocalVariable(tag: DW_TAG_arg_variable, name: "var", arg: 1, scope: !8, file: !1, line: 9, type: !7) +!16 = !MDLocation(line: 9, column: 24, scope: !8) diff --git a/test/Transforms/NaCl/simplify-struct-reg-pad-crash.ll b/test/Transforms/NaCl/simplify-struct-reg-pad-crash.ll new file mode 100644 index 000000000000..2a8541353281 --- /dev/null +++ b/test/Transforms/NaCl/simplify-struct-reg-pad-crash.ll @@ -0,0 +1,21 @@ +; RUN: not opt < %s -simplify-struct-reg-signatures -S + +%struct = type { i32, i32 } + +declare i32 @__hypothetical_personality_1(%struct) + +declare void @something_to_invoke() + +; landingpad with struct +define void @landingpad_is_struct() { + invoke void @something_to_invoke() + to label %OK unwind label %Err + +OK: + ret void + +Err: + %exn = landingpad i32 personality i32(%struct)* @__hypothetical_personality_1 + cleanup + resume i32 %exn +} \ No newline at end of file diff --git a/test/Transforms/NaCl/simplify-struct-reg-resume-crash.ll b/test/Transforms/NaCl/simplify-struct-reg-resume-crash.ll new file mode 100644 index 000000000000..0f7e519a8793 --- /dev/null +++ b/test/Transforms/NaCl/simplify-struct-reg-resume-crash.ll @@ -0,0 +1,20 @@ +; RUN: not opt < %s -simplify-struct-reg-signatures -S + +%struct = type { i8*, void(%struct)* } + +declare i32 @__gxx_personality_v0(...) 
+declare void @something_to_invoke() + +; landingpad with struct +define void @landingpad_is_struct(%struct %str) { + invoke void @something_to_invoke() + to label %OK unwind label %Err + +OK: + ret void + +Err: + %exn = landingpad {i8*, i32} personality i32 (...)* @__gxx_personality_v0 + cleanup + resume %struct %str +} \ No newline at end of file diff --git a/test/Transforms/NaCl/simplify-struct-reg-signatures.ll b/test/Transforms/NaCl/simplify-struct-reg-signatures.ll new file mode 100644 index 000000000000..7f89cf91307e --- /dev/null +++ b/test/Transforms/NaCl/simplify-struct-reg-signatures.ll @@ -0,0 +1,276 @@ +; RUN: opt %s -simplify-struct-reg-signatures -S | FileCheck %s + +declare i32 @__gxx_personality_v0(...) + +%struct = type { i32, i32 } + +%rec_struct = type {%rec_struct*} +%rec_problem_struct = type{void (%rec_problem_struct)*} +%rec_pair_1 = type {%rec_pair_2*} +%rec_pair_2 = type {%rec_pair_1*} +%rec_returning = type { %rec_returning (%rec_returning)* } +%direct_def = type { void(%struct)*, %struct } + +; new type declarations: +; CHECK: %struct = type { i32, i32 } +; CHECK-NEXT: %rec_struct = type { %rec_struct* } +; CHECK-NEXT: %rec_problem_struct.simplified = type { void (%rec_problem_struct.simplified*)* } +; CHECK-NEXT: %rec_pair_1 = type { %rec_pair_2* } +; CHECK-NEXT: %rec_pair_2 = type { %rec_pair_1* } +; CHECK-NEXT: %rec_returning.simplified = type { void (%rec_returning.simplified*, %rec_returning.simplified*)* } +; CHECK-NEXT: %direct_def.simplified = type { void (%struct*)*, %struct } + +; Leave intrinsics alone: +; CHECK: { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) +declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) + +; CHECK-LABEL: define void @call_intrinsic() +define void @call_intrinsic() { + %a = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 5, i32 5) +; CHECK-NEXT: %a = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 5, i32 5) + ret void +} + +; externs +declare void @extern_func(%struct) +declare %struct @struct_returning_extern(i32, %struct) + +; verify that parameters are mapped correctly: single param, two, and combo +; with non-struct regs +; CHECK-NOT: declare void @extern_func(%struct) +; CHECK-NOT: declare %struct @struct_returning_extern(i32, %struct) +; CHECK-LABEL: declare void @extern_func(%struct* byval) +; CHECK-LABEL: declare void @struct_returning_extern(%struct* sret, i32, %struct* byval) + +define void @main(%struct* byval %ptr) { + %val = load %struct, %struct* %ptr + call void @extern_func(%struct %val) + ret void +} + +define void @two_param_func(%struct %val1, %struct %val2) { + call void @extern_func(%struct %val1) + call void @extern_func(%struct %val2) + ret void +} + +; CHECK-LABEL: define void @two_param_func(%struct* byval %val1.ptr, %struct* byval %val2.ptr) +; CHECK-NOT: define void @two_param_func(%struct %val1, %struct %val2) + +define i32 @another_func(i32 %a, %struct %str, i64 %b) { + call void @two_param_func(%struct %str, %struct %str) + call void @extern_func(%struct %str) + ret i32 0 +} + +; CHECK-LABEL: define i32 @another_func(i32 %a, %struct* byval %str.ptr, i64 %b) +; CHECK: call void @two_param_func(%struct* byval %str.sreg.ptr, %struct* byval %str.sreg.ptr1) + +define %struct @returns_struct(i32 %an_int, %struct %val) { + %tmp = call %struct @struct_returning_extern(i32 %an_int, %struct %val) + %tmp2 = invoke %struct @struct_returning_extern(i32 1, %struct %tmp) + to label %Cont unwind label %Cleanup + +Cont: + ret %struct %tmp2 +Cleanup: + %exn = landingpad {i8*, i32} personality i32 
(...)* @__gxx_personality_v0 + cleanup + resume {i8*, i32} %exn +} + +; verify return value and codegen +; CHECK-LABEL: define void @returns_struct(%struct* sret %retVal, i32 %an_int, %struct* byval %val.ptr) +; CHECK-NEXT: %tmp2 = alloca %struct +; CHECK-NEXT: %tmp.sreg.ptr = alloca %struct +; CHECK-NEXT: %tmp = alloca %struct +; CHECK-NEXT: %val.sreg.ptr = alloca %struct +; CHECK-NEXT: %val.sreg = load %struct, %struct* %val.ptr +; CHECK-NEXT: store %struct %val.sreg, %struct* %val.sreg.ptr +; CHECK-NEXT: call void @struct_returning_extern(%struct* sret %tmp, i32 %an_int, %struct* byval %val.sreg.ptr) +; CHECK-NEXT: %tmp.sreg = load %struct, %struct* %tmp +; CHECK-NEXT: store %struct %tmp.sreg, %struct* %tmp.sreg.ptr +; CHECK-NEXT: invoke void @struct_returning_extern(%struct* sret %tmp2, i32 1, %struct* byval %tmp.sreg.ptr) +; CHECK-NEXT: to label %Cont unwind label %Cleanup +; CHECK-DAG: Cont: +; CHECK-NEXT: %tmp2.sreg = load %struct, %struct* %tmp2 +; CHECK-NEXT: store %struct %tmp2.sreg, %struct* %retVal +; CHECK-NEXT: ret void +; CHECK-DAG: Cleanup: +; CHECK-NEXT: %exn = landingpad { i8*, i32 } personality i32 (...)* @__gxx_personality_v0 +; CHECK-NEXT: cleanup +; CHECK-NEXT: resume { i8*, i32 } %exn + +define i32 @lots_of_call_attrs() { + %tmp.0 = insertvalue %struct undef, i32 1, 0 + %tmp.1 = insertvalue %struct %tmp.0, i32 2, 1 + %ret = tail call zeroext i32 @another_func(i32 1, %struct %tmp.1, i64 2) readonly + ret i32 %ret +} + +; verify attributes are copied +; CHECK_LABEL: @lots_of_call_attrs +; CHECK: %ret = tail call zeroext i32 @another_func(i32 1, %struct* byval %tmp.1.ptr, i64 2) #1 +; CHECK-NEXT: ret i32 %ret + +declare void @rec_struct_ok(%rec_struct*) +declare void @rec_struct_mod(%rec_struct) + +; compliant recursive structs are kept as-is +; CHECK-LABEL: declare void @rec_struct_ok(%rec_struct*) +; CHECK-LABEL: declare void @rec_struct_mod(%rec_struct* byval) + +define void @rec_call_sreg(%rec_problem_struct %r) { + %tmp = extractvalue %rec_problem_struct %r, 0 + call void %tmp(%rec_problem_struct %r) + ret void +} + +; non-compliant structs are correctly mapped and calls are changed +; CHECK-LABEL: define void @rec_call_sreg(%rec_problem_struct.simplified* byval %r.ptr) +; CHECK: call void %tmp(%rec_problem_struct.simplified* byval %r.sreg.ptr) + +declare void @pairs(%rec_pair_1) + +define %rec_returning @rec_returning_fun(%rec_returning %str) { + %tmp = extractvalue %rec_returning %str, 0 + %ret = call %rec_returning %tmp(%rec_returning %str) + ret %rec_returning %ret +} + +; pair structs +; CHECK-LABEL: declare void @pairs(%rec_pair_1* byval) +; CHECK-LABEL: define void @rec_returning_fun(%rec_returning.simplified* sret %retVal, %rec_returning.simplified* byval %str.ptr) +; CHECK-NEXT: %ret = alloca %rec_returning.simplified +; CHECK-NEXT: %str.sreg.ptr = alloca %rec_returning.simplified +; CHECK-NEXT: %str.sreg = load %rec_returning.simplified, %rec_returning.simplified* %str.ptr +; CHECK-NEXT: %tmp = extractvalue %rec_returning.simplified %str.sreg, 0 +; CHECK-NEXT: store %rec_returning.simplified %str.sreg, %rec_returning.simplified* %str.sreg.ptr +; CHECK-NEXT: call void %tmp(%rec_returning.simplified* sret %ret, %rec_returning.simplified* byval %str.sreg.ptr) +; CHECK-NEXT: %ret.sreg = load %rec_returning.simplified, %rec_returning.simplified* %ret +; CHECK-NEXT: store %rec_returning.simplified %ret.sreg, %rec_returning.simplified* %retVal +; CHECK-NEXT: ret void + +define void @direct_caller(%direct_def %def) { + %func = extractvalue %direct_def %def, 0 + 
%param = extractvalue %direct_def %def, 1 + call void %func(%struct %param) + ret void +} + +; CHECK-LABEL: define void @direct_caller(%direct_def.simplified* byval %def.ptr) +; CHECK-NEXT: %param.ptr = alloca %struct +; CHECK-NEXT: %def.sreg = load %direct_def.simplified, %direct_def.simplified* %def.ptr +; CHECK-NEXT: %func = extractvalue %direct_def.simplified %def.sreg, 0 +; CHECK-NEXT: %param = extractvalue %direct_def.simplified %def.sreg, 1 +; CHECK-NEXT: store %struct %param, %struct* %param.ptr +; CHECK-NEXT: call void %func(%struct* byval %param.ptr) +; CHECK-NEXT: ret void + +; vararg functions are converted correctly +declare void @vararg_ok(i32, ...) +; CHECK-LABEL: declare void @vararg_ok(i32, ...) + +define void @vararg_problem(%rec_problem_struct %arg1, ...) { + ; CHECK-LABEL: define void @vararg_problem(%rec_problem_struct.simplified* byval %arg1.ptr, ...) + ret void +} + +%vararg_fp_struct = type { i32, void (i32, ...)* } +declare void @vararg_fp_fct(%vararg_fp_struct %arg) +;CHECK-LABEL: declare void @vararg_fp_fct(%vararg_fp_struct* byval) + +define void @call_vararg(%vararg_fp_struct %param1, ...) { + %fptr = extractvalue %vararg_fp_struct %param1, 1 + call void (i32, ...) %fptr(i32 0, i32 1) + ret void +} + +; CHECK-LABEL: define void @call_vararg(%vararg_fp_struct* byval %param1.ptr, ...) +; CHECK-NEXT: %param1.sreg = load %vararg_fp_struct, %vararg_fp_struct* %param1.ptr +; CHECK-NEXT: %fptr = extractvalue %vararg_fp_struct %param1.sreg, 1 +; CHECK-NEXT: call void (i32, ...) %fptr(i32 0, i32 1) +; CHECK-NEXT: ret void + +%vararg_fp_problem_struct = type { void(%vararg_fp_problem_struct)* } +define void @vararg_fp_problem_call(%vararg_fp_problem_struct* byval %param) { + %fct_ptr = getelementptr %vararg_fp_problem_struct, %vararg_fp_problem_struct* %param, i32 0, i32 0 + %fct = load void(%vararg_fp_problem_struct)*, void(%vararg_fp_problem_struct)** %fct_ptr + %param_for_call = load %vararg_fp_problem_struct, %vararg_fp_problem_struct* %param + call void %fct(%vararg_fp_problem_struct %param_for_call) + ret void +} + +; CHECK-LABEL: define void @vararg_fp_problem_call(%vararg_fp_problem_struct.simplified* byval %param) +; CHECK-NEXT: %param_for_call.ptr = alloca %vararg_fp_problem_struct.simplified +; CHECK-NEXT: %fct_ptr = getelementptr %vararg_fp_problem_struct.simplified, %vararg_fp_problem_struct.simplified* %param, i32 0, i32 0 +; CHECK-NEXT: %fct = load void (%vararg_fp_problem_struct.simplified*)*, void (%vararg_fp_problem_struct.simplified*)** %fct_ptr +; CHECK-NEXT: %param_for_call = load %vararg_fp_problem_struct.simplified, %vararg_fp_problem_struct.simplified* %param +; CHECK-NEXT: store %vararg_fp_problem_struct.simplified %param_for_call, %vararg_fp_problem_struct.simplified* %param_for_call.ptr +; CHECK-NEXT: call void %fct(%vararg_fp_problem_struct.simplified* byval %param_for_call.ptr) +; CHECK-NEXT: ret void + +define void @call_with_array([4 x void(%struct)*] %fptrs, %struct %str) { + %fptr = extractvalue [4 x void(%struct)*] %fptrs, 2 + call void %fptr(%struct %str) + ret void +} + +; CHECK-LABEL: define void @call_with_array([4 x void (%struct*)*]* byval %fptrs.ptr, %struct* byval %str.ptr) +; CHECK-NEXT: %str.sreg.ptr = alloca %struct +; CHECK-NEXT: %fptrs.sreg = load [4 x void (%struct*)*], [4 x void (%struct*)*]* %fptrs.ptr +; CHECK-NEXT: %str.sreg = load %struct, %struct* %str.ptr +; CHECK-NEXT: %fptr = extractvalue [4 x void (%struct*)*] %fptrs.sreg, 2 +; CHECK-NEXT: store %struct %str.sreg, %struct* %str.sreg.ptr +; CHECK-NEXT: call void 
%fptr(%struct* byval %str.sreg.ptr) +; CHECK-NEXT: ret void + +define void @call_with_array_ptr([4 x void(%struct)*]* %fptrs, %struct %str) { + %fptr_ptr = getelementptr [4 x void(%struct)*], [4 x void(%struct)*]* %fptrs, i32 0, i32 2 + %fptr = load void(%struct)*, void(%struct)** %fptr_ptr + call void %fptr(%struct %str) + ret void +} + +; CHECK-LABEL: define void @call_with_array_ptr([4 x void (%struct*)*]* %fptrs, %struct* byval %str.ptr) +; CHECK-NEXT: %str.sreg.ptr = alloca %struct +; CHECK-NEXT: %str.sreg = load %struct, %struct* %str.ptr +; CHECK-NEXT: %fptr_ptr = getelementptr [4 x void (%struct*)*], [4 x void (%struct*)*]* %fptrs, i32 0, i32 2 +; CHECK-NEXT: %fptr = load void (%struct*)*, void (%struct*)** %fptr_ptr +; CHECK-NEXT: store %struct %str.sreg, %struct* %str.sreg.ptr +; CHECK-NEXT: call void %fptr(%struct* byval %str.sreg.ptr) +; CHECK-NEXT: ret void + +define void @call_with_vector(<4 x void (%struct)*> %fptrs, %struct %str) { + %fptr = extractelement <4 x void (%struct)*> %fptrs, i32 2 + call void %fptr(%struct %str) + ret void +} + +; CHECK-LABEL: define void @call_with_vector(<4 x void (%struct*)*> %fptrs, %struct* byval %str.ptr) +; CHECK-NEXT: %str.sreg.ptr = alloca %struct +; CHECK-NEXT: %str.sreg = load %struct, %struct* %str.ptr +; CHECK-NEXT: %fptr = extractelement <4 x void (%struct*)*> %fptrs, i32 2 +; CHECK-NEXT: store %struct %str.sreg, %struct* %str.sreg.ptr +; CHECK-NEXT: call void %fptr(%struct* byval %str.sreg.ptr) +; CHECK-NEXT: ret void + +define void @call_with_array_vect([4 x <2 x void(%struct)*>] %fptrs, %struct %str) { + %vect = extractvalue [4 x <2 x void(%struct)*>] %fptrs, 2 + %fptr = extractelement <2 x void (%struct)*> %vect, i32 1 + call void %fptr(%struct %str) + ret void +} + +; CHECK-LABEL: define void @call_with_array_vect([4 x <2 x void (%struct*)*>]* byval %fptrs.ptr, %struct* byval %str.ptr) +; CHECK-NEXT: %str.sreg.ptr = alloca %struct +; CHECK-NEXT: %fptrs.sreg = load [4 x <2 x void (%struct*)*>], [4 x <2 x void (%struct*)*>]* %fptrs.ptr +; CHECK-NEXT: %str.sreg = load %struct, %struct* %str.ptr +; CHECK-NEXT: %vect = extractvalue [4 x <2 x void (%struct*)*>] %fptrs.sreg, 2 +; CHECK-NEXT: %fptr = extractelement <2 x void (%struct*)*> %vect, i32 1 +; CHECK-NEXT: store %struct %str.sreg, %struct* %str.sreg.ptr +; CHECK-NEXT: call void %fptr(%struct* byval %str.sreg.ptr) +; CHECK-NEXT: ret void + +; this is at the end, corresponds to the call marked as readonly +; CHECK: attributes #1 = { readonly } \ No newline at end of file diff --git a/test/Transforms/NaCl/simplify-struct-reg-vararg-crash.ll b/test/Transforms/NaCl/simplify-struct-reg-vararg-crash.ll new file mode 100644 index 000000000000..2b0e59fe8334 --- /dev/null +++ b/test/Transforms/NaCl/simplify-struct-reg-vararg-crash.ll @@ -0,0 +1,10 @@ +; RUN: not opt < %s -simplify-struct-reg-signatures -S + +%struct = type { i32, i32 } + +declare void @vararg_fct(...) + +define void @vararg_caller_with_agg(%struct %str) { + call void(...)* @vararg_fct(%struct %str) + ret void +} \ No newline at end of file diff --git a/test/Transforms/NaCl/strip-attributes.ll b/test/Transforms/NaCl/strip-attributes.ll new file mode 100644 index 000000000000..9923e1cfcf78 --- /dev/null +++ b/test/Transforms/NaCl/strip-attributes.ll @@ -0,0 +1,66 @@ +; RUN: opt -S -nacl-strip-attributes %s 2>&1 | FileCheck %s + + +; Check that we emit a warning for some special meaning sections: +; CHECK: Warning: func_init_array will have its section (.init_array) stripped. 
+; CHECK-NOT: Warning: __rustc_debug_gdb_scripts_section__ will have its section + +@var = unnamed_addr global i32 0 +; CHECK: @var = global i32 0 + +@__rustc_debug_gdb_scripts_section__ = internal unnamed_addr constant [34 x i8] c"\01gdb_load_rust_pretty_printers.py\00", section ".debug_gdb_scripts", align 1 +; CHECK: @__rustc_debug_gdb_scripts_section__ = internal constant [34 x i8] c"\01gdb_load_rust_pretty_printers.py\00", align 1 + +define void @func_section() section ".some_section" { + ret void +} +; CHECK-LABEL: define void @func_section() { + +define void @func_init_array() section ".init_array" { + ret void +} +; CHECK-LABEL: define void @func_init_array() { + + +define fastcc void @func_attrs(i32 inreg, i32 zeroext) + unnamed_addr noreturn nounwind readonly align 8 { + ret void +} +; CHECK-LABEL: define void @func_attrs(i32, i32) { + +define hidden void @hidden_visibility() { + ret void +} +; CHECK-LABEL: define void @hidden_visibility() { + +define protected void @protected_visibility() { + ret void +} +; CHECK-LABEL: define void @protected_visibility() { + + +define void @call_attrs() { + call fastcc void @func_attrs(i32 inreg 10, i32 zeroext 20) noreturn nounwind readonly + ret void +} +; CHECK-LABEL: define void @call_attrs() +; CHECK: call void @func_attrs(i32 10, i32 20){{$}} + + +; We currently don't attempt to strip attributes from intrinsic +; declarations because the reader automatically inserts attributes +; based on built-in knowledge of intrinsics, so it is difficult to get +; rid of them here. +declare i8* @llvm.nacl.read.tp() +; CHECK: declare i8* @llvm.nacl.read.tp() #{{[0-9]+}} + +define void @arithmetic_attrs() { + %add = add nsw i32 1, 2 + %shl = shl nuw i32 3, 4 + %lshr = lshr exact i32 2, 1 + ret void +} +; CHECK-LABEL: define void @arithmetic_attrs() { +; CHECK-NEXT: %add = add i32 1, 2 +; CHECK-NEXT: %shl = shl i32 3, 4 +; CHECK-NEXT: %lshr = lshr i32 2, 1 diff --git a/test/Transforms/NaCl/strip-branchweight-metadata.ll b/test/Transforms/NaCl/strip-branchweight-metadata.ll new file mode 100644 index 000000000000..a51f1852a7a6 --- /dev/null +++ b/test/Transforms/NaCl/strip-branchweight-metadata.ll @@ -0,0 +1,29 @@ +; RUN: opt -S -strip-metadata %s | FileCheck %s + +; Test that !prof metadata is removed from branches +; CHECK: @foo +; CHECK-NOT: !prof +define i32 @foo(i32 %c) { + switch i32 %c, label %3 [ + i32 5, label %4 + i32 0, label %1 + i32 4, label %2 + ], !prof !0 + +;