From aab74908dc6b027fc4edd134bfeeaa3af9f141d2 Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Mon, 25 Nov 2024 10:18:33 -0500 Subject: [PATCH 1/2] codegen: add a pass for late conversion of known modify ops to call atomicrmw The ExpandAtomicModify pass can recognize our pseudo-intrinsic julia.atomicmodify and convert it into one of the known atomicrmw expressions, or simplify it with more inlining, as applicable. This ensures that our `@atomic` modify is now as fast as `Threads.Atomic` for the cases we currently implement. --- src/Makefile | 8 +- src/aotcompile.cpp | 199 +++++++++---- src/cgutils.cpp | 82 ++++-- src/codegen.cpp | 193 ++++++++++-- src/jitlayers.cpp | 454 +++++++++++----------------- src/jitlayers.h | 17 +- src/julia.expmap.in | 1 - src/llvm-expand-atomic-modify.cpp | 473 ++++++++++++++++++++++++++++++ src/llvm-julia-passes.inc | 1 + src/passes.h | 5 + src/pipeline.cpp | 4 +- test/llvmpasses/atomic-modify.ll | 288 ++++++++++++++++++ 12 files changed, 1346 insertions(+), 379 deletions(-) create mode 100644 src/llvm-expand-atomic-modify.cpp create mode 100644 test/llvmpasses/atomic-modify.ll diff --git a/src/Makefile b/src/Makefile index c605d6c70573b..6a6f604f3c5fc 100644 --- a/src/Makefile +++ b/src/Makefile @@ -79,7 +79,8 @@ endif CODEGEN_SRCS := codegen jitlayers aotcompile debuginfo disasm llvm-simdloop \ llvm-pass-helpers llvm-ptls llvm-propagate-addrspaces null_sysimage \ llvm-multiversioning llvm-alloc-opt llvm-alloc-helpers cgmemmgr llvm-remove-addrspaces \ - llvm-remove-ni llvm-julia-licm llvm-demote-float16 llvm-cpufeatures pipeline llvm_api \ + llvm-remove-ni llvm-julia-licm llvm-demote-float16 llvm-cpufeatures llvm-expand-atomic-modify \ + pipeline llvm_api \ $(GC_CODEGEN_SRCS) FLAGS += -I$(shell $(LLVM_CONFIG_HOST) --includedir) CG_LLVM_LIBS := all @@ -338,7 +339,7 @@ $(BUILDDIR)/julia_flisp.boot: $(addprefix $(SRCDIR)/,jlfrontend.scm flisp/aliase $(call cygpath_w,$(SRCDIR)/mk_julia_flisp_boot.scm) $(call cygpath_w,$(dir $<)) $(notdir $<) $(call cygpath_w,$@)) # additional dependency links -$(BUILDDIR)/codegen-stubs.o $(BUILDDIR)/codegen-stubs.dbg.obj: $(SRCDIR)/intrinsics.h +$(BUILDDIR)/codegen-stubs.o $(BUILDDIR)/codegen-stubs.dbg.obj: $(addprefix $(SRCDIR)/,intrinsics.h llvm-julia-passes.inc) $(BUILDDIR)/aotcompile.o $(BUILDDIR)/aotcompile.dbg.obj: $(SRCDIR)/jitlayers.h $(SRCDIR)/llvm-codegen-shared.h $(SRCDIR)/processor.h $(BUILDDIR)/ast.o $(BUILDDIR)/ast.dbg.obj: $(BUILDDIR)/julia_flisp.boot.inc $(SRCDIR)/flisp/*.h $(BUILDDIR)/builtins.o $(BUILDDIR)/builtins.dbg.obj: $(SRCDIR)/iddict.c $(SRCDIR)/idset.c $(SRCDIR)/builtin_proto.h @@ -378,7 +379,8 @@ $(BUILDDIR)/signal-handling.o $(BUILDDIR)/signal-handling.dbg.obj: $(addprefix $ $(BUILDDIR)/staticdata.o $(BUILDDIR)/staticdata.dbg.obj: $(SRCDIR)/staticdata_utils.c $(SRCDIR)/precompile_utils.c $(SRCDIR)/processor.h $(SRCDIR)/builtin_proto.h $(BUILDDIR)/toplevel.o $(BUILDDIR)/toplevel.dbg.obj: $(SRCDIR)/builtin_proto.h $(BUILDDIR)/ircode.o $(BUILDDIR)/ircode.dbg.obj: $(SRCDIR)/serialize.h $(SRCDIR)/common_symbols1.inc $(SRCDIR)/common_symbols2.inc -$(BUILDDIR)/pipeline.o $(BUILDDIR)/pipeline.dbg.obj: $(SRCDIR)/passes.h $(SRCDIR)/jitlayers.h +$(BUILDDIR)/pipeline.o $(BUILDDIR)/pipeline.dbg.obj: $(addprefix $(SRCDIR)/,passes.h jitlayers.h llvm-julia-passes.inc) +$(BUILDDIR)/llvm_api.o $(BUILDDIR)/llvm_api.dbg.obj: $(SRCDIR)/llvm-julia-passes.inc $(addprefix $(BUILDDIR)/,threading.o threading.dbg.obj gc-common.o gc-stock.o gc.dbg.obj init.c init.dbg.obj task.o task.dbg.obj): $(addprefix $(SRCDIR)/,threading.h) $(addprefix
$(BUILDDIR)/,APInt-C.o APInt-C.dbg.obj runtime_intrinsics.o runtime_intrinsics.dbg.obj): $(SRCDIR)/APInt-C.h diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp index d687f44808409..7c9d8aec1a1c9 100644 --- a/src/aotcompile.cpp +++ b/src/aotcompile.cpp @@ -330,7 +330,11 @@ class egal_set { }; } using ::egal_set; -typedef DenseMap> jl_compiled_functions_t; +struct jl_compiled_function_t { + orc::ThreadSafeModule TSM; + jl_llvm_functions_t decls; +}; +typedef DenseMap jl_compiled_functions_t; static void record_method_roots(egal_set &method_roots, jl_method_instance_t *mi) { @@ -376,7 +380,7 @@ static void aot_optimize_roots(jl_codegen_params_t ¶ms, egal_set &method_roo std::string OldName(GV->getName()); StringRef NewName(mref->second->getName()); for (auto &def : compiled_functions) { - orc::ThreadSafeModule &TSM = std::get<0>(def.second); + orc::ThreadSafeModule &TSM = def.second.TSM; Module &M = *TSM.getModuleUnlocked(); if (GlobalValue *GV2 = M.getNamedValue(OldName)) { if (GV2 == GV) @@ -402,7 +406,7 @@ static void aot_optimize_roots(jl_codegen_params_t ¶ms, egal_set &method_roo static void resolve_workqueue(jl_codegen_params_t ¶ms, egal_set &method_roots, jl_compiled_functions_t &compiled_functions) { - decltype(params.workqueue) workqueue; + jl_workqueue_t workqueue; std::swap(params.workqueue, workqueue); jl_code_instance_t *codeinst = NULL; JL_GC_PUSH1(&codeinst); @@ -418,7 +422,7 @@ static void resolve_workqueue(jl_codegen_params_t ¶ms, egal_set &method_root { auto it = compiled_functions.find(codeinst); if (it != compiled_functions.end()) { - auto &decls = it->second.second; + auto &decls = it->second.decls; invokeName = decls.functionObject; if (decls.functionObject == "jl_fptr_args") { preal_decl = decls.specFunctionObject; @@ -442,8 +446,11 @@ static void resolve_workqueue(jl_codegen_params_t ¶ms, egal_set &method_root } if (preal_decl.empty()) { pinvoke = emit_tojlinvoke(codeinst, invokeName, mod, params); - if (!proto.specsig) + if (!proto.specsig) { proto.decl->replaceAllUsesWith(pinvoke); + proto.decl->eraseFromParent(); + proto.decl = pinvoke; + } } if (proto.specsig && !preal_specsig) { // get or build an fptr1 that can invoke codeinst @@ -462,9 +469,12 @@ static void resolve_workqueue(jl_codegen_params_t ¶ms, egal_set &method_root } if (!preal_decl.empty()) { // merge and/or rename this prototype to the real function - if (Value *specfun = mod->getNamedValue(preal_decl)) { - if (proto.decl != specfun) + if (Function *specfun = cast_or_null(mod->getNamedValue(preal_decl))) { + if (proto.decl != specfun) { proto.decl->replaceAllUsesWith(specfun); + proto.decl->eraseFromParent(); + proto.decl = specfun; + } } else { proto.decl->setName(preal_decl); @@ -482,9 +492,12 @@ static void resolve_workqueue(jl_codegen_params_t ¶ms, egal_set &method_root assert(ocinvokeDecl != "jl_fptr_const_return"); assert(ocinvokeDecl != "jl_fptr_sparam"); // merge and/or rename this prototype to the real function - if (Value *specfun = mod->getNamedValue(ocinvokeDecl)) { - if (proto.oc != specfun) + if (Function *specfun = cast_or_null(mod->getNamedValue(ocinvokeDecl))) { + if (proto.oc != specfun) { proto.oc->replaceAllUsesWith(specfun); + proto.oc->eraseFromParent(); + proto.oc = specfun; + } } else { proto.oc->setName(ocinvokeDecl); @@ -496,6 +509,7 @@ static void resolve_workqueue(jl_codegen_params_t ¶ms, egal_set &method_root JL_GC_POP(); } + /// Link the function in the source module into the destination module if /// needed, setting up mapping information. 
/// Similar to orc::cloneFunctionDecl, but more complete for greater correctness @@ -577,8 +591,8 @@ static void generate_cfunc_thunks(jl_codegen_params_t ¶ms, jl_compiled_funct codeinst = it->second; JL_GC_PROMISE_ROOTED(codeinst); auto defs = compiled_functions.find(codeinst); - defM = std::get<0>(defs->second).getModuleUnlocked(); - const jl_llvm_functions_t &decls = std::get<1>(defs->second); + defM = defs->second.TSM.getModuleUnlocked(); + const jl_llvm_functions_t &decls = defs->second.decls; func = decls.functionObject; StringRef specfunc = decls.specFunctionObject; jl_value_t *astrt = codeinst->rettype; @@ -624,6 +638,25 @@ static void generate_cfunc_thunks(jl_codegen_params_t ¶ms, jl_compiled_funct } } +// destructively move the contents of src into dest +// this assumes that the targets of the two modules are the same +// including the DataLayout and ModuleFlags (for example) +// and that there is no module-level assembly +// Comdat is also removed, since this needs to be re-added later +static void jl_merge_module(Linker &L, orc::ThreadSafeModule srcTSM) JL_NOTSAFEPOINT +{ + srcTSM.consumingModuleDo([&L](std::unique_ptr src) JL_NOTSAFEPOINT { + bool error = L.linkInModule(std::move(src)); + assert(!error && "linking llvmcall modules failed"); + (void)error; + }); +} + +static bool canPartition(const Function &F) +{ + return !F.hasFnAttribute(Attribute::AlwaysInline) && + !F.hasFnAttribute(Attribute::InlineHint); +} // takes the running content that has collected in the shadow module and dump it to disk // this builds the object file portion of the sysimage files for fast startup @@ -743,7 +776,7 @@ void *jl_emit_native_impl(jl_array_t *codeinfos, LLVMOrcThreadSafeModuleRef llvm orc::ThreadSafeModule backing; if (!llvmmod) { ctx = jl_ExecutionEngine->makeContext(); - backing = jl_create_ts_module("text", ctx); + backing = jl_create_ts_module("text", ctx, jl_ExecutionEngine->getDataLayout(), jl_ExecutionEngine->getTargetTriple()); } orc::ThreadSafeModule &clone = llvmmod ? *unwrap(llvmmod) : backing; auto ctxt = clone.getContext(); @@ -760,6 +793,7 @@ void *jl_emit_native_impl(jl_array_t *codeinfos, LLVMOrcThreadSafeModuleRef llvm assert(params.imaging_mode); // `_imaging_mode` controls if broken features like code-coverage are disabled params.external_linkage = external_linkage; params.temporary_roots = jl_alloc_array_1d(jl_array_any_type, 0); + bool safepoint_on_entry = params.safepoint_on_entry; JL_GC_PUSH3(¶ms.temporary_roots, &method_roots.list, &method_roots.keyset); jl_compiled_functions_t compiled_functions; size_t i, l; @@ -774,17 +808,8 @@ void *jl_emit_native_impl(jl_array_t *codeinfos, LLVMOrcThreadSafeModuleRef llvm assert(jl_is_code_info(src)); if (compiled_functions.count(codeinst)) continue; // skip any duplicates that accidentally made there way in here (or make this an error?) 
- if (external_linkage) { - uint8_t specsigflags; - jl_callptr_t invoke; - void *fptr; - jl_read_codeinst_invoke(codeinst, &specsigflags, &invoke, &fptr, 0); - if (invoke != NULL && (specsigflags & 0b100)) { - // this codeinst is already available externally - // TODO: for performance, avoid generating the src code when we know it would reach here anyways - continue; - } - } + if (jl_ir_inlining_cost((jl_value_t*)src) < UINT16_MAX) + params.safepoint_on_entry = false; // ensure we don't block ExpandAtomicModifyPass from inlining this code if applicable orc::ThreadSafeModule result_m = jl_create_ts_module(name_from_method_instance(jl_get_ci_mi(codeinst)), params.tsctx, clone.getModuleUnlocked()->getDataLayout(), Triple(clone.getModuleUnlocked()->getTargetTriple())); @@ -793,6 +818,7 @@ void *jl_emit_native_impl(jl_array_t *codeinfos, LLVMOrcThreadSafeModuleRef llvm decls.functionObject = "jl_fptr_const_return"; else decls = jl_emit_codeinst(result_m, codeinst, src, params); + params.safepoint_on_entry = safepoint_on_entry; record_method_roots(method_roots, jl_get_ci_mi(codeinst)); if (result_m) compiled_functions[codeinst] = {std::move(result_m), std::move(decls)}; @@ -823,7 +849,6 @@ void *jl_emit_native_impl(jl_array_t *codeinfos, LLVMOrcThreadSafeModuleRef llvm size_t idx = 0; for (auto &global : params.global_targets) { gvars[idx] = global.second->getName().str(); - global.second->setInitializer(literal_static_pointer_val(global.first, global.second->getValueType())); assert(gvars_set.insert(global.second).second && "Duplicate gvar in params!"); assert(gvars_names.insert(gvars[idx]).second && "Duplicate gvar name in params!"); data->jl_value_to_llvm[idx] = global.first; @@ -854,11 +879,27 @@ void *jl_emit_native_impl(jl_array_t *codeinfos, LLVMOrcThreadSafeModuleRef llvm { Linker L(*clone.getModuleUnlocked()); for (auto &def : compiled_functions) { - jl_merge_module(clone, std::move(std::get<0>(def.second))); jl_code_instance_t *this_code = def.first; - jl_llvm_functions_t decls = std::get<1>(def.second); + JL_GC_PROMISE_ROOTED(this_code); + jl_llvm_functions_t &decls = def.second.decls; StringRef func = decls.functionObject; StringRef cfunc = decls.specFunctionObject; + orc::ThreadSafeModule &M = def.second.TSM; + if (external_linkage) { + uint8_t specsigflags; + jl_callptr_t invoke; + void *fptr; + jl_read_codeinst_invoke(this_code, &specsigflags, &invoke, &fptr, 0); + if (invoke != NULL && (specsigflags & 0b100)) { + // this codeinst is already available externally: keep it only if canPartition demands it for local use + // TODO: for performance, avoid generating the src code when we know it would reach here anyways? 
+ if (M.withModuleDo([&](Module &M) { return !canPartition(*cast(M.getNamedValue(cfunc))); })) { + jl_merge_module(L, std::move(M)); + } + continue; + } + } + jl_merge_module(L, std::move(M)); uint32_t func_id = 0; uint32_t cfunc_id = 0; if (func == "jl_fptr_args") { @@ -885,6 +926,52 @@ void *jl_emit_native_impl(jl_array_t *codeinfos, LLVMOrcThreadSafeModuleRef llvm } data->jl_fvar_map[this_code] = std::make_tuple(func_id, cfunc_id); } + bool Changed = true; + while (Changed) { + Changed = false; + // make sure everything referenced got included though, since some functions aren't + // correctly implemented by staticdata for external use, and so codegen won't emit + // an external reference but expects a private copy here instead + for (auto &def : compiled_functions) { + orc::ThreadSafeModule &M = def.second.TSM; + if (!M) + continue; + jl_llvm_functions_t &decls = def.second.decls; + StringRef func = decls.functionObject; + StringRef cfunc = decls.specFunctionObject; + if (func != "jl_fptr_args" && + func != "jl_fptr_sparam" && + func != "jl_f_opaque_closure_call" && + clone.getModuleUnlocked()->getNamedValue(func)) { + jl_merge_module(L, std::move(M)); + Changed = true; + continue; + } + if (!cfunc.empty() && clone.getModuleUnlocked()->getNamedValue(cfunc)) { + Changed = true; + jl_merge_module(L, std::move(M)); + } + } + } +#ifndef NDEBUG + // make sure we didn't forget anything that we promised to include in here + for (auto &def : compiled_functions) { + jl_llvm_functions_t &decls = def.second.decls; + StringRef func = decls.functionObject; + StringRef cfunc = decls.specFunctionObject; + if (func != "jl_fptr_args" && + func != "jl_fptr_sparam" && + func != "jl_f_opaque_closure_call") { + GlobalValue *F = clone.getModuleUnlocked()->getNamedValue(func); + assert(!F || !F->isDeclaration()); + } + if (!cfunc.empty()) { + GlobalValue *F = clone.getModuleUnlocked()->getNamedValue(cfunc); + assert(!F || !F->isDeclaration()); + } + } +#endif + compiled_functions.clear(); if (params._shared_module) { bool error = L.linkInModule(std::move(params._shared_module)); assert(!error && "Error linking in shared module"); @@ -894,15 +981,35 @@ void *jl_emit_native_impl(jl_array_t *codeinfos, LLVMOrcThreadSafeModuleRef llvm // now get references to the globals in the merged module // and set them to be internalized and initialized at startup + // filter out any gvars that got optimized away + idx = 0; + size_t newoffset = 0; + size_t newidx = 0; for (auto &global : gvars) { //Safe b/c context is locked by params - GlobalVariable *G = cast(clone.getModuleUnlocked()->getNamedValue(global)); - assert(G->hasInitializer()); - G->setLinkage(GlobalValue::InternalLinkage); - G->setDSOLocal(true); - data->jl_sysimg_gvars.push_back(G); + GlobalVariable *G = cast_or_null(clone.getModuleUnlocked()->getNamedValue(global)); + if (G != nullptr) { + assert(!G->hasInitializer()); + G->setInitializer(Constant::getNullValue(G->getValueType())); + G->setLinkage(GlobalValue::InternalLinkage); + G->setDSOLocal(true); + assert(newidx == data->jl_sysimg_gvars.size()); + if (idx < offset) { + data->jl_value_to_llvm[newidx] = data->jl_value_to_llvm[idx]; + newoffset = newidx + 1; + } + else { + data->jl_external_to_llvm[newidx - newoffset] = data->jl_external_to_llvm[idx - offset]; + } + data->jl_sysimg_gvars.push_back(G); + newidx++; + } + idx++; } - CreateNativeGlobals += gvars.size(); + data->jl_value_to_llvm.resize(newoffset); + data->jl_external_to_llvm.resize(newidx - newoffset); + gvars.clear(); + CreateNativeGlobals += 
idx; data->M = std::move(clone); return (void*)data; @@ -1126,11 +1233,6 @@ struct Partition { size_t weight; }; -static bool canPartition(const Function &F) -{ - return !F.hasFnAttribute(Attribute::AlwaysInline); -} - static inline bool verify_partitioning(const SmallVectorImpl &partitions, const Module &M, DenseMap &fvars, DenseMap &gvars) { bool bad = false; #ifndef JL_NDEBUG @@ -1583,7 +1685,8 @@ static void materializePreserved(Module &M, Partition &partition) { // This just avoids a hashtable lookup. GV->setLinkage(GlobalValue::InternalLinkage); assert(GV->hasDefaultVisibility()); - } else { + } + else { Preserve.insert(GV); } } @@ -2094,11 +2197,6 @@ void jl_dump_native_impl(void *native_code, addComdat(&GA, TheTriple); } - // Wipe the global initializers, we'll reset them at load time - for (auto gv : data->jl_sysimg_gvars) { - cast(gv)->setInitializer(Constant::getNullValue(gv->getValueType())); - } - // add metadata information if (imaging_mode) { multiversioning_preannotate(dataM); @@ -2359,17 +2457,16 @@ void jl_get_llvmf_defn_impl(jl_llvmf_dump_t *dump, jl_method_instance_t *mi, jl_ dump->TSM = nullptr; if (src && jl_is_code_info(src)) { auto ctx = jl_ExecutionEngine->makeContext(); - orc::ThreadSafeModule m = jl_create_ts_module(name_from_method_instance(mi), ctx); + const auto &DL = jl_ExecutionEngine->getDataLayout(); + const auto &TT = jl_ExecutionEngine->getTargetTriple(); + orc::ThreadSafeModule m = jl_create_ts_module(name_from_method_instance(mi), ctx, DL, TT); Function *F = nullptr; { uint64_t compiler_start_time = 0; uint8_t measure_compile_time_enabled = jl_atomic_load_relaxed(&jl_measure_compile_time_enabled); if (measure_compile_time_enabled) compiler_start_time = jl_hrtime(); - auto target_info = m.withModuleDo([&](Module &M) { - return std::make_pair(M.getDataLayout(), Triple(M.getTargetTriple())); - }); - jl_codegen_params_t output(ctx, std::move(target_info.first), std::move(target_info.second)); + jl_codegen_params_t output(ctx, DL, TT); output.params = ¶ms; output.imaging_mode = jl_options.image_codegen; output.temporary_roots = jl_alloc_array_1d(jl_array_any_type, 0); @@ -2389,7 +2486,7 @@ void jl_get_llvmf_defn_impl(jl_llvmf_dump_t *dump, jl_method_instance_t *mi, jl_ jl_code_instance_t *codeinst = jl_type_infer(mi, latestworld, SOURCE_MODE_NOT_REQUIRED); if (codeinst == nullptr || compiled_functions.count(codeinst)) continue; - orc::ThreadSafeModule decl_m = jl_create_ts_module("extern", ctx); + orc::ThreadSafeModule decl_m = jl_create_ts_module("extern", ctx, DL, TT); jl_llvm_functions_t decls; if (jl_atomic_load_relaxed(&codeinst->invoke) == jl_fptr_const_return_addr) decls.functionObject = "jl_fptr_const_return"; @@ -2398,6 +2495,8 @@ void jl_get_llvmf_defn_impl(jl_llvmf_dump_t *dump, jl_method_instance_t *mi, jl_ compiled_functions[codeinst] = {std::move(decl_m), std::move(decls)}; } generate_cfunc_thunks(output, compiled_functions); + emit_always_inline(m, output); + output.workqueue.clear(); compiled_functions.clear(); output.temporary_roots = nullptr; JL_GC_POP(); // GC the global_targets array contents now since reflection doesn't need it @@ -2412,7 +2511,7 @@ void jl_get_llvmf_defn_impl(jl_llvmf_dump_t *dump, jl_method_instance_t *mi, jl_ } else { auto p = literal_static_pointer_val(global.first, global.second->getValueType()); - Type *elty = PointerType::get(output.getContext(), 0); + Type *elty = PointerType::get(p->getContext(), 0); // For pretty printing, when LLVM inlines the global initializer into its loads auto alias = 
GlobalAlias::create(elty, 0, GlobalValue::PrivateLinkage, global.second->getName() + ".jit", p, global.second->getParent()); global.second->setInitializer(ConstantExpr::getBitCast(alias, global.second->getValueType())); diff --git a/src/cgutils.cpp b/src/cgutils.cpp index d9b7b98e40ef4..64d6f6eb54de8 100644 --- a/src/cgutils.cpp +++ b/src/cgutils.cpp @@ -2265,8 +2265,10 @@ static jl_cgval_t typed_load(jl_codectx_t &ctx, Value *ptr, Value *idx_0based, j return mark_julia_slot(intcast, jltype, NULL, ctx.tbaa().tbaa_stack); } +static Function *emit_modifyhelper(jl_codectx_t &ctx2, const jl_cgval_t &op, const jl_cgval_t &modifyop, jl_value_t *jltype, Type *elty, jl_cgval_t rhs, const Twine &fname, bool gcstack_arg); + static jl_cgval_t typed_store(jl_codectx_t &ctx, - Value *ptr, jl_cgval_t rhs, jl_cgval_t cmp, + Value *ptr, jl_cgval_t rhs, jl_cgval_t cmpop, jl_value_t *jltype, MDNode *tbaa, MDNode *aliasscope, Value *parent, // for the write barrier, NULL if no barrier needed bool isboxed, AtomicOrdering Order, AtomicOrdering FailOrder, unsigned alignment, @@ -2275,10 +2277,10 @@ static jl_cgval_t typed_store(jl_codectx_t &ctx, jl_module_t *mod, jl_sym_t *var) { auto newval = [&](const jl_cgval_t &lhs) { - const jl_cgval_t argv[3] = { cmp, lhs, rhs }; + const jl_cgval_t argv[3] = { cmpop, lhs, rhs }; jl_cgval_t ret; if (modifyop) { - ret = emit_invoke(ctx, *modifyop, argv, 3, (jl_value_t*)jl_any_type); + ret = emit_invoke(ctx, *modifyop, argv, 3, (jl_value_t*)jl_any_type, true); } else { Value *callval = emit_jlcall(ctx, jlapplygeneric_func, nullptr, argv, 3, julia_call); @@ -2302,7 +2304,7 @@ static jl_cgval_t typed_store(jl_codectx_t &ctx, return rhs; } else if (isreplacefield) { - Value *Success = emit_f_is(ctx, cmp, ghostValue(ctx, jltype)); + Value *Success = emit_f_is(ctx, cmpop, ghostValue(ctx, jltype)); Success = ctx.builder.CreateZExt(Success, getInt8Ty(ctx.builder.getContext())); const jl_cgval_t argv[2] = {ghostValue(ctx, jltype), mark_julia_type(ctx, Success, false, jl_bool_type)}; jl_datatype_t *rettyp = jl_apply_cmpswap_type(jltype); @@ -2403,6 +2405,46 @@ static jl_cgval_t typed_store(jl_codectx_t &ctx, ai.decorateInst(store); instr = store; } + else if (ismodifyfield && modifyop && !needlock && Order != AtomicOrdering::NotAtomic && !isboxed && realelty == elty && !intcast && elty->isIntegerTy() && !jl_type_hasptr(jltype)) { + // emit this only if we have a possibility of optimizing it + if (Order == AtomicOrdering::Unordered) + Order = AtomicOrdering::Monotonic; + if (jl_is_pointerfree(rhs.typ) && !rhs.isghost && (rhs.constant || rhs.isboxed || rhs.ispointer())) { + // if this value can be loaded from memory, do that now so that it is sequenced before the atomicmodify + // and the IR is less dependent on what was emitted before now to create this rhs. + // Inlining should do okay to clean this up later if there are parts we don't need. 
+ rhs = jl_cgval_t(emit_unbox(ctx, julia_type_to_llvm(ctx, rhs.typ), rhs, rhs.typ), rhs.typ, NULL); + } + bool gcstack_arg = JL_FEAT_TEST(ctx,gcstack_arg); + Function *op = emit_modifyhelper(ctx, cmpop, *modifyop, jltype, elty, rhs, fname, gcstack_arg); + std::string intr_name = "julia.atomicmodify.i"; + intr_name += utostr(cast(elty)->getBitWidth()); + intr_name += ".p"; + intr_name += utostr(ptr->getType()->getPointerAddressSpace()); + FunctionCallee intr = jl_Module->getOrInsertFunction(intr_name, + FunctionType::get(StructType::get(elty, elty), {ptr->getType(), ctx.builder.getPtrTy(), ctx.builder.getInt8Ty(), ctx.builder.getInt8Ty()}, true), + AttributeList::get(elty->getContext(), + Attributes(elty->getContext(), {Attribute::NoMerge}), // prevent llvm from merging calls to different functions + AttributeSet(), + None)); + SmallVector Args = {ptr, op, ctx.builder.getInt8((unsigned)Order), ctx.builder.getInt8(SyncScope::System)}; + if (rhs.V) + Args.push_back(rhs.V); + if (rhs.Vboxed) + Args.push_back(rhs.Vboxed); + if (rhs.TIndex) + Args.push_back(rhs.TIndex); + Args.append(rhs.inline_roots); + if (gcstack_arg) + Args.push_back(ctx.pgcstack); + auto oldnew = ctx.builder.CreateCall(intr, Args); + oldnew->addParamAttr(0, Attribute::getWithAlignment(oldnew->getContext(), Align(alignment))); + //jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, tbaa); + //ai.noalias = MDNode::concatenate(aliasscope, ai.noalias); + //ai.decorateInst(oldnew); + oldval = mark_julia_type(ctx, ctx.builder.CreateExtractValue(oldnew, 0), isboxed, jltype); + rhs = mark_julia_type(ctx, ctx.builder.CreateExtractValue(oldnew, 1), isboxed, jltype); + } else { // replacefield, modifyfield, swapfield, setfieldonce (isboxed && atomic) DoneBB = BasicBlock::Create(ctx.builder.getContext(), "done_xchg", ctx.f); @@ -2416,7 +2458,7 @@ static jl_cgval_t typed_store(jl_codectx_t &ctx, assert(jl_is_concrete_type(jltype)); needloop = ((jl_datatype_t*)jltype)->layout->flags.haspadding || !((jl_datatype_t*)jltype)->layout->flags.isbitsegal; - Value *SameType = emit_isa(ctx, cmp, jltype, Twine()).first; + Value *SameType = emit_isa(ctx, cmpop, jltype, Twine()).first; if (SameType != ConstantInt::getTrue(ctx.builder.getContext())) { BasicBlock *SkipBB = BasicBlock::Create(ctx.builder.getContext(), "skip_xchg", ctx.f); BasicBlock *BB = BasicBlock::Create(ctx.builder.getContext(), "ok_xchg", ctx.f); @@ -2436,22 +2478,22 @@ static jl_cgval_t typed_store(jl_codectx_t &ctx, Current->addIncoming(instr, SkipBB); ctx.builder.SetInsertPoint(BB); } - cmp = update_julia_type(ctx, cmp, jltype); + cmpop = update_julia_type(ctx, cmpop, jltype); if (intcast) { - emit_unbox_store(ctx, cmp, intcast, ctx.tbaa().tbaa_stack, MaybeAlign(), intcast->getAlign()); + emit_unbox_store(ctx, cmpop, intcast, ctx.tbaa().tbaa_stack, MaybeAlign(), intcast->getAlign()); Compare = ctx.builder.CreateLoad(realelty, intcast); } else { - Compare = emit_unbox(ctx, realelty, cmp, jltype); + Compare = emit_unbox(ctx, realelty, cmpop, jltype); } if (realelty != elty) Compare = ctx.builder.CreateZExt(Compare, elty); } - else if (cmp.isboxed || cmp.constant || jl_pointer_egal(jltype)) { - Compare = boxed(ctx, cmp); - needloop = !jl_pointer_egal(jltype) && !jl_pointer_egal(cmp.typ); - if (needloop && !cmp.isboxed) // try to use the same box in the compare now and later - cmp = mark_julia_type(ctx, Compare, true, cmp.typ); + else if (cmpop.isboxed || cmpop.constant || jl_pointer_egal(jltype)) { + Compare = boxed(ctx, cmpop); + needloop = !jl_pointer_egal(jltype) && 
!jl_pointer_egal(cmpop.typ); + if (needloop && !cmpop.isboxed) // try to use the same box in the compare now and later + cmpop = mark_julia_type(ctx, Compare, true, cmpop.typ); } else { Compare = Constant::getNullValue(ctx.types().T_prjlvalue); // TODO: does this need to be an invalid bit pattern? @@ -2485,7 +2527,7 @@ static jl_cgval_t typed_store(jl_codectx_t &ctx, } if (ismodifyfield) { if (needlock) - emit_lockstate_value(ctx, needlock, false); + emit_lockstate_value(ctx, needlock, false); // unlock Value *realCompare = Compare; if (realelty != elty) realCompare = ctx.builder.CreateTrunc(realCompare, realelty); @@ -2520,8 +2562,8 @@ static jl_cgval_t typed_store(jl_codectx_t &ctx, if (realelty != elty) r = ctx.builder.CreateZExt(r, elty); if (needlock) - emit_lockstate_value(ctx, needlock, true); - cmp = oldval; + emit_lockstate_value(ctx, needlock, true); // relock + cmpop = oldval; } Value *Done; if (Order == AtomicOrdering::NotAtomic) { @@ -2541,7 +2583,7 @@ static jl_cgval_t typed_store(jl_codectx_t &ctx, if (issetfieldonce) Success = ctx.builder.CreateIsNull(first_ptr); else - Success = emit_f_is(ctx, oldval, cmp, first_ptr, nullptr); + Success = emit_f_is(ctx, oldval, cmpop, first_ptr, nullptr); if (needloop && ismodifyfield) CmpPhi->addIncoming(load, ctx.builder.GetInsertBlock()); assert(Succ == nullptr); @@ -2599,12 +2641,12 @@ static jl_cgval_t typed_store(jl_codectx_t &ctx, Done = ctx.builder.CreateIsNotNull(first_ptr); } else { - // Done = !(!Success && (first_ptr != NULL && oldval == cmp)) + // Done = !(!Success && (first_ptr != NULL && oldval == cmpop)) Done = emit_guarded_test(ctx, ctx.builder.CreateNot(Success), false, [&] { Value *first_ptr = nullptr; if (maybe_null_if_boxed) first_ptr = isboxed ? realinstr : extract_first_ptr(ctx, realinstr); - return emit_f_is(ctx, oldval, cmp, first_ptr, nullptr); + return emit_f_is(ctx, oldval, cmpop, first_ptr, nullptr); }); Done = ctx.builder.CreateNot(Done); } @@ -4024,7 +4066,7 @@ static jl_cgval_t union_store(jl_codectx_t &ctx, emit_lockstate_value(ctx, needlock, false); const jl_cgval_t argv[3] = { cmp, oldval, rhs }; if (modifyop) { - rhs = emit_invoke(ctx, *modifyop, argv, 3, (jl_value_t*)jl_any_type); + rhs = emit_invoke(ctx, *modifyop, argv, 3, (jl_value_t*)jl_any_type, true); } else { Value *callval = emit_jlcall(ctx, jlapplygeneric_func, nullptr, argv, 3, julia_call); diff --git a/src/codegen.cpp b/src/codegen.cpp index af86b568c2b0f..473b0709e9f93 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -2076,7 +2076,7 @@ static CallInst *emit_jlcall(jl_codectx_t &ctx, JuliaFunction<> *theFptr, Value static Value *emit_f_is(jl_codectx_t &ctx, const jl_cgval_t &arg1, const jl_cgval_t &arg2, Value *nullcheck1 = nullptr, Value *nullcheck2 = nullptr); static jl_cgval_t emit_new_struct(jl_codectx_t &ctx, jl_value_t *ty, size_t nargs, ArrayRef argv, bool is_promotable=false); -static jl_cgval_t emit_invoke(jl_codectx_t &ctx, const jl_cgval_t &lival, ArrayRef argv, size_t nargs, jl_value_t *rt); +static jl_cgval_t emit_invoke(jl_codectx_t &ctx, const jl_cgval_t &lival, ArrayRef argv, size_t nargs, jl_value_t *rt, bool always_inline); static Value *literal_pointer_val(jl_codectx_t &ctx, jl_value_t *p); static unsigned julia_alignment(jl_value_t *jt); @@ -5124,10 +5124,7 @@ static jl_cgval_t emit_call_specfun_other(jl_codectx_t &ctx, bool is_opaque_clos namep += cast(TheCallee)->getName(); GlobalVariable *GV = cast_or_null(jl_Module->getNamedValue(namep)); if (GV == nullptr) { - GV = new GlobalVariable(*jl_Module, 
TheCallee->getType(), false, - GlobalVariable::ExternalLinkage, - Constant::getNullValue(TheCallee->getType()), - namep); + GV = new GlobalVariable(*jl_Module, TheCallee->getType(), false, GlobalVariable::ExternalLinkage, nullptr, namep); ctx.emission_context.external_fns[std::make_tuple(fromexternal, true)] = GV; } jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_const); @@ -5174,10 +5171,7 @@ static jl_cgval_t emit_call_specfun_boxed(jl_codectx_t &ctx, jl_value_t *jlretty GlobalVariable *GV = cast_or_null(jl_Module->getNamedValue(namep)); Type *pfunc = PointerType::getUnqual(ctx.builder.getContext()); if (GV == nullptr) { - GV = new GlobalVariable(*jl_Module, pfunc, false, - GlobalVariable::ExternalLinkage, - Constant::getNullValue(pfunc), - namep); + GV = new GlobalVariable(*jl_Module, pfunc, false, GlobalVariable::ExternalLinkage, nullptr, namep); ctx.emission_context.external_fns[std::make_tuple(fromexternal, false)] = GV; } jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_const); @@ -5206,10 +5200,10 @@ static jl_cgval_t emit_invoke(jl_codectx_t &ctx, jl_expr_t *ex, jl_value_t *rt) if (argv[i].typ == jl_bottom_type) return jl_cgval_t(); } - return emit_invoke(ctx, lival, argv, nargs, rt); + return emit_invoke(ctx, lival, argv, nargs, rt, false); } -static jl_cgval_t emit_invoke(jl_codectx_t &ctx, const jl_cgval_t &lival, ArrayRef argv, size_t nargs, jl_value_t *rt) +static jl_cgval_t emit_invoke(jl_codectx_t &ctx, const jl_cgval_t &lival, ArrayRef argv, size_t nargs, jl_value_t *rt, bool always_inline) { ++EmittedInvokes; bool handled = false; @@ -5265,44 +5259,52 @@ static jl_cgval_t emit_invoke(jl_codectx_t &ctx, const jl_cgval_t &lival, ArrayR std::string name; StringRef protoname; bool need_to_emit = true; - bool cache_valid = ctx.use_cache || ctx.external_linkage; + bool cache_valid = (ctx.use_cache || ctx.external_linkage); bool external = false; // Check if we already queued this up auto it = ctx.call_targets.find(codeinst); - if (need_to_emit && it != ctx.call_targets.end()) { + if (it != ctx.call_targets.end()) { assert(it->second.specsig == specsig); protoname = it->second.decl->getName(); - need_to_emit = cache_valid = false; + if (always_inline) + it->second.private_linkage = true; + else + it->second.external_linkage = true; } - - // Check if it is already compiled (either JIT or externally) - if (need_to_emit && cache_valid) { - // optimization: emit the correct name immediately, if we know it + // Check if it is already compiled (either JIT or externally), and if so, re-use that name if possible + // This is just an optimization to emit the correct name immediately, if we know it, since the JIT and AOT code will be able to do this later also + if (cache_valid) { // TODO: use `emitted` map here too to try to consolidate names? uint8_t specsigflags; jl_callptr_t invoke; void *fptr; jl_read_codeinst_invoke(codeinst, &specsigflags, &invoke, &fptr, 0); if (specsig ? 
specsigflags & 0b1 : invoke == jl_fptr_args_addr) { - protoname = jl_ExecutionEngine->getFunctionAtAddress((uintptr_t)fptr, invoke, codeinst); if (ctx.external_linkage) { // TODO: Add !specsig support to aotcompile.cpp // Check that the codeinst is containing native code if (specsig && (specsigflags & 0b100)) { - external = true; + external = !always_inline; need_to_emit = false; } } else { // ctx.use_cache need_to_emit = false; } + if (!need_to_emit && protoname.empty()) + protoname = jl_ExecutionEngine->getFunctionAtAddress((uintptr_t)fptr, invoke, codeinst); } } - if (need_to_emit) { + if (it != ctx.call_targets.end()) + need_to_emit = false; + else if (always_inline) + need_to_emit = true; + if (protoname.empty()) { raw_string_ostream(name) << (specsig ? "j_" : "j1_") << name_from_method_instance(mi) << "_" << jl_atomic_fetch_add_relaxed(&globalUniqueGeneratedNames, 1); protoname = StringRef(name); } + jl_returninfo_t::CallingConv cc = jl_returninfo_t::CallingConv::Boxed; unsigned return_roots = 0; if (specsig) @@ -5311,7 +5313,7 @@ static jl_cgval_t emit_invoke(jl_codectx_t &ctx, const jl_cgval_t &lival, ArrayR result = emit_call_specfun_boxed(ctx, codeinst->rettype, protoname, external ? codeinst : nullptr, argv, nargs, rt); if (need_to_emit) { Function *trampoline_decl = cast(jl_Module->getNamedValue(protoname)); - ctx.call_targets[codeinst] = {cc, return_roots, trampoline_decl, nullptr, specsig}; + ctx.call_targets[codeinst] = {cc, return_roots, trampoline_decl, nullptr, specsig, !always_inline, always_inline}; } } } @@ -6257,7 +6259,7 @@ static std::pair get_oc_function(jl_codectx_t &ctx, jl_met } if (need_to_emit) { - ctx.call_targets[ci] = {cc, return_roots, specsig ? specF : F, specsig ? F : nullptr, specsig}; + ctx.call_targets[ci] = {cc, return_roots, specsig ? specF : F, specsig ? 
F : nullptr, specsig, true, false}; } JL_GC_POP(); @@ -6780,6 +6782,71 @@ Function *get_or_emit_fptr1(StringRef preal_decl, Module *M) return cast(M->getOrInsertFunction(preal_decl, get_func_sig(M->getContext()), get_func_attrs(M->getContext())).getCallee()); } +static Function *emit_modifyhelper(jl_codectx_t &ctx2, const jl_cgval_t &op, const jl_cgval_t &modifyop, jl_value_t *jltype, Type *elty, jl_cgval_t rhs, const Twine &fname, bool gcstack_arg) +{ + Module *M = ctx2.f->getParent(); + jl_codectx_t ctx(M->getContext(), ctx2.emission_context, ctx2.min_world, ctx2.max_world); + SmallVector ArgTy; + ArgTy.push_back(elty); + if (rhs.V) + ArgTy.push_back(rhs.V->getType()); + if (rhs.Vboxed) + ArgTy.push_back(rhs.Vboxed->getType()); + if (rhs.TIndex) + ArgTy.push_back(rhs.TIndex->getType()); + for (auto &root : rhs.inline_roots) + ArgTy.push_back(root->getType()); + if (gcstack_arg) + ArgTy.push_back(ctx.builder.getPtrTy()); + FunctionType *FT = FunctionType::get(elty, ArgTy, false); + Function *w = Function::Create(FT, GlobalVariable::PrivateLinkage, "", M); + jl_init_function(w, ctx.emission_context.TargetTriple); + w->addFnAttr(Attribute::AlwaysInline); + w->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + Function::arg_iterator AI = w->arg_begin(); + Argument *A = &*AI++; + // rebuild a copy of rhs from the arguments + if (rhs.V) + rhs.V = &*AI++; + if (rhs.Vboxed) + rhs.Vboxed = &*AI++; + if (rhs.TIndex) + rhs.TIndex = &*AI++; + for (size_t i = 0; i < rhs.inline_roots.size(); i++) + rhs.inline_roots[i] = &*AI++; + rhs.promotion_point = nullptr; + rhs.promotion_ssa = -1; + if (gcstack_arg) { + w->setCallingConv(CallingConv::Swift); + AttrBuilder param(ctx.builder.getContext()); + param.addAttribute(Attribute::SwiftSelf); + param.addAttribute(Attribute::NonNull); + Argument *gcstackarg = &*AI++; + gcstackarg->addAttrs(param); + gcstackarg->setName("pgcstack_arg"); + ctx.pgcstack = gcstackarg; + } + assert(AI == w->arg_end()); + ctx.f = w; + ctx.rettype = jltype; + BasicBlock *b0 = BasicBlock::Create(ctx.builder.getContext(), "top", w); + ctx.builder.SetInsertPoint(b0); + DebugLoc noDbg; + ctx.builder.SetCurrentDebugLocation(noDbg); + allocate_gc_frame(ctx, b0); + const jl_cgval_t argv[3] = { op, mark_julia_type(ctx, A, false, jltype), rhs }; + jl_cgval_t ret = emit_invoke(ctx, modifyop, argv, 3, (jl_value_t*)jl_any_type, true); + emit_typecheck(ctx, ret, jltype, fname); + ret = update_julia_type(ctx, ret, jltype); + ctx.builder.CreateRet(emit_unbox(ctx, elty, ret, jltype)); + if (ctx.topalloca->use_empty()) { + ctx.topalloca->eraseFromParent(); + ctx.topalloca = nullptr; + } + return w; +} + + Function *emit_tojlinvoke(jl_code_instance_t *codeinst, Value *theFunc, Module *M, jl_codegen_params_t ¶ms) JL_NOTSAFEPOINT { ++EmittedToJLInvokes; @@ -8998,7 +9065,7 @@ static jl_llvm_functions_t Instruction &prologue_end = ctx.builder.GetInsertBlock()->back(); // step 11a. Emit the entry safepoint - if (JL_FEAT_TEST(ctx, safepoint_on_entry)) + if (params.safepoint_on_entry && JL_FEAT_TEST(ctx, safepoint_on_entry)) emit_gc_safepoint(ctx.builder, ctx.types().T_size, get_current_ptls(ctx), ctx.tbaa().tbaa_const); // step 11b. Do codegen in control flow order @@ -9822,6 +9889,84 @@ jl_llvm_functions_t jl_emit_codeinst( return decls; } +/// Stolen from IRMover.cpp, since it is needlessly private there +void linkFunctionBody(Function &Dst, Function &Src) +{ + assert(Dst.isDeclaration() && !Src.isDeclaration()); + + // Link in the operands without remapping. 
+ if (Src.hasPrefixData()) + Dst.setPrefixData(Src.getPrefixData()); + if (Src.hasPrologueData()) + Dst.setPrologueData(Src.getPrologueData()); + if (Src.hasPersonalityFn()) + Dst.setPersonalityFn(Src.getPersonalityFn()); + if (Src.hasPersonalityFn()) + Dst.setPersonalityFn(Src.getPersonalityFn()); + assert(Src.IsNewDbgInfoFormat == Dst.IsNewDbgInfoFormat); + + // Copy over the metadata attachments without remapping. + Dst.copyMetadata(&Src, 0); + + // Steal arguments and splice the body of Src into Dst. + Dst.stealArgumentListFrom(Src); + Dst.splice(Dst.end(), &Src); +} + +void emit_always_inline(orc::ThreadSafeModule &result_m, jl_codegen_params_t ¶ms) JL_NOTSAFEPOINT_LEAVE JL_NOTSAFEPOINT_ENTER +{ + jl_workqueue_t &edges = params.workqueue; + bool always_inline = false; + for (auto &it : edges) { + if (it.second.private_linkage) + always_inline = true; + } + if (!always_inline) + return; + jl_task_t *ct = jl_current_task; + int8_t gc_state = jl_gc_unsafe_enter(ct->ptls); // codegen may contain safepoints (such as jl_subtype calls) + jl_code_info_t *src = nullptr; + params.safepoint_on_entry = false; + params.temporary_roots = jl_alloc_array_1d(jl_array_any_type, 0); + JL_GC_PUSH2(¶ms.temporary_roots, &src); + for (auto &it : edges) { + jl_code_instance_t *codeinst = it.first; + auto &proto = it.second; + if (!proto.private_linkage) + continue; + if (proto.decl->isDeclaration()) { + src = (jl_code_info_t*)jl_atomic_load_relaxed(&codeinst->inferred); + jl_method_instance_t *mi = jl_get_ci_mi(codeinst); + jl_method_t *def = mi->def.method; + if (src && (jl_value_t*)src != jl_nothing && jl_is_method(def) && jl_ir_inlining_cost((jl_value_t*)src) < UINT16_MAX) + src = jl_uncompress_ir(def, codeinst, (jl_value_t*)src); + if (src && jl_is_code_info(src) && jl_ir_inlining_cost((jl_value_t*)src) < UINT16_MAX) { + jl_llvm_functions_t decls = jl_emit_codeinst(result_m, codeinst, src, params); // contains safepoints + if (!result_m) + break; + // TODO: jl_optimize_roots(params, mi, *result_m.getModuleUnlocked()); // contains safepoints + Module &M = *result_m.getModuleUnlocked(); + if (decls.functionObject != "jl_fptr_args" && + decls.functionObject != "jl_fptr_sparam" && + decls.functionObject != "jl_f_opaque_closure_call") { + Function *F = M.getFunction(decls.functionObject); + F->eraseFromParent(); + } + if (!decls.specFunctionObject.empty()) { + Function *specF = M.getFunction(decls.specFunctionObject); + linkFunctionBody(*proto.decl, *specF); + proto.decl->addFnAttr(Attribute::InlineHint); + proto.decl->setLinkage(proto.external_linkage ? 
GlobalValue::AvailableExternallyLinkage : GlobalValue::PrivateLinkage); + specF->eraseFromParent(); + } + } + } + } + params.temporary_roots = nullptr; + JL_GC_POP(); + jl_gc_unsafe_leave(ct->ptls, gc_state); +} + // --- initialization --- static auto gv_for_global = new SmallVector, 0>(); static void global_jlvalue_to_llvm(JuliaVariable *var, jl_value_t **addr) diff --git a/src/jitlayers.cpp b/src/jitlayers.cpp index bf49b7010b97b..99fb1b8f09bfb 100644 --- a/src/jitlayers.cpp +++ b/src/jitlayers.cpp @@ -71,7 +71,6 @@ STATISTIC(OptO0, "Number of modules optimized at level -O0"); STATISTIC(OptO1, "Number of modules optimized at level -O1"); STATISTIC(OptO2, "Number of modules optimized at level -O2"); STATISTIC(OptO3, "Number of modules optimized at level -O3"); -STATISTIC(ModulesMerged, "Number of modules merged"); STATISTIC(InternedGlobals, "Number of global constants interned in the string pool"); #ifdef _COMPILER_MSAN_ENABLED_ @@ -339,177 +338,196 @@ static DenseMap> incompl static int jl_analyze_workqueue(jl_code_instance_t *callee, jl_codegen_params_t ¶ms, bool forceall=false) JL_NOTSAFEPOINT_LEAVE JL_NOTSAFEPOINT_ENTER { jl_task_t *ct = jl_current_task; - decltype(params.workqueue) edges; + jl_workqueue_t edges; std::swap(params.workqueue, edges); for (auto &it : edges) { jl_code_instance_t *codeinst = it.first; JL_GC_PROMISE_ROOTED(codeinst); auto &proto = it.second; - // try to emit code for this item from the workqueue - StringRef invokeName = ""; - StringRef preal_decl = ""; - bool preal_specsig = false; - jl_callptr_t invoke = nullptr; - bool isedge = false; - assert(params.cache); - // Checking the cache here is merely an optimization and not strictly required - // But it must be consistent with the following invokenames lookup, which is protected by the engine_lock - uint8_t specsigflags; - void *fptr; - void jl_read_codeinst_invoke(jl_code_instance_t *ci, uint8_t *specsigflags, jl_callptr_t *invoke, void **specptr, int waitcompile) JL_NOTSAFEPOINT; // declare it is not a safepoint (or deadlock) in this file due to 0 parameter - jl_read_codeinst_invoke(codeinst, &specsigflags, &invoke, &fptr, 0); - //if (specsig ? 
specsigflags & 0b1 : invoke == jl_fptr_args_addr) - if (invoke == jl_fptr_args_addr) { - preal_decl = jl_ExecutionEngine->getFunctionAtAddress((uintptr_t)fptr, invoke, codeinst); - } - else if (specsigflags & 0b1) { - preal_decl = jl_ExecutionEngine->getFunctionAtAddress((uintptr_t)fptr, invoke, codeinst); - preal_specsig = true; - } - bool force = forceall || invoke != nullptr; - if (preal_decl.empty()) { - auto it = invokenames.find(codeinst); - if (it != invokenames.end()) { - auto &decls = it->second; - invokeName = decls.functionObject; - if (decls.functionObject == "jl_fptr_args") { - preal_decl = decls.specFunctionObject; - isedge = true; - } - else if (decls.functionObject != "jl_fptr_sparam" && decls.functionObject != "jl_f_opaque_closure_call") { - preal_decl = decls.specFunctionObject; - preal_specsig = true; - isedge = true; - } - force = true; + if (proto.external_linkage || proto.decl->isDeclaration()) { // if it is not expected externally and has a definition locally, there is no need to patch this edge up + // try to emit code for this item from the workqueue + StringRef invokeName = ""; + StringRef preal_decl = ""; + bool preal_specsig = false; + jl_callptr_t invoke = nullptr; + bool isedge = false; + assert(params.cache); + // Checking the cache here is merely an optimization and not strictly required + // But it must be consistent with the following invokenames lookup, which is protected by the engine_lock + uint8_t specsigflags; + void *fptr; + void jl_read_codeinst_invoke(jl_code_instance_t *ci, uint8_t *specsigflags, jl_callptr_t *invoke, void **specptr, int waitcompile) JL_NOTSAFEPOINT; // declare it is not a safepoint (or deadlock) in this file due to 0 parameter + jl_read_codeinst_invoke(codeinst, &specsigflags, &invoke, &fptr, 0); + //if (specsig ? specsigflags & 0b1 : invoke == jl_fptr_args_addr) + if (invoke == jl_fptr_args_addr) { + preal_decl = jl_ExecutionEngine->getFunctionAtAddress((uintptr_t)fptr, invoke, codeinst); } - } - if (preal_decl.empty()) { - // there may be an equivalent method already compiled (or at least registered with the JIT to compile), in which case we should be using that instead - jl_code_instance_t *compiled_ci = jl_get_ci_equiv(codeinst, 0); - if (compiled_ci != codeinst) { - codeinst = compiled_ci; - uint8_t specsigflags; - void *fptr; - jl_read_codeinst_invoke(codeinst, &specsigflags, &invoke, &fptr, 0); - //if (specsig ? 
specsigflags & 0b1 : invoke == jl_fptr_args_addr) - if (invoke == jl_fptr_args_addr) { - preal_decl = jl_ExecutionEngine->getFunctionAtAddress((uintptr_t)fptr, invoke, codeinst); - } - else if (specsigflags & 0b1) { - preal_decl = jl_ExecutionEngine->getFunctionAtAddress((uintptr_t)fptr, invoke, codeinst); - preal_specsig = true; - } - if (preal_decl.empty()) { - auto it = invokenames.find(codeinst); - if (it != invokenames.end()) { - auto &decls = it->second; - invokeName = decls.functionObject; - if (decls.functionObject == "jl_fptr_args") { - preal_decl = decls.specFunctionObject; - isedge = true; - } - else if (decls.functionObject != "jl_fptr_sparam" && decls.functionObject != "jl_f_opaque_closure_call") { - preal_decl = decls.specFunctionObject; - preal_specsig = true; - isedge = true; - } - } - } + else if (specsigflags & 0b1) { + preal_decl = jl_ExecutionEngine->getFunctionAtAddress((uintptr_t)fptr, invoke, codeinst); + preal_specsig = true; } - } - if (!preal_decl.empty() || force) { - // if we have a prototype emitted, compare it to what we emitted earlier - Module *mod = proto.decl->getParent(); - assert(proto.decl->isDeclaration()); - Function *pinvoke = nullptr; + bool force = forceall || invoke != nullptr; if (preal_decl.empty()) { - if (invoke != nullptr && invokeName.empty()) { - assert(invoke != jl_fptr_args_addr); - if (invoke == jl_fptr_sparam_addr) - invokeName = "jl_fptr_sparam"; - else if (invoke == jl_f_opaque_closure_call_addr) - invokeName = "jl_f_opaque_closure_call"; - else - invokeName = jl_ExecutionEngine->getFunctionAtAddress((uintptr_t)invoke, invoke, codeinst); + auto it = invokenames.find(codeinst); + if (it != invokenames.end()) { + auto &decls = it->second; + invokeName = decls.functionObject; + if (decls.functionObject == "jl_fptr_args") { + preal_decl = decls.specFunctionObject; + isedge = true; + } + else if (decls.functionObject != "jl_fptr_sparam" && decls.functionObject != "jl_f_opaque_closure_call") { + preal_decl = decls.specFunctionObject; + preal_specsig = true; + isedge = true; + } + force = true; } - pinvoke = emit_tojlinvoke(codeinst, invokeName, mod, params); - if (!proto.specsig) - proto.decl->replaceAllUsesWith(pinvoke); - isedge = false; - } - if (proto.specsig && !preal_specsig) { - // get or build an fptr1 that can invoke codeinst - if (pinvoke == nullptr) - pinvoke = get_or_emit_fptr1(preal_decl, mod); - // emit specsig-to-(jl)invoke conversion - proto.decl->setLinkage(GlobalVariable::InternalLinkage); - //protodecl->setAlwaysInline(); - jl_init_function(proto.decl, params.TargetTriple); - // TODO: maybe this can be cached in codeinst->specfptr? 
- int8_t gc_state = jl_gc_unsafe_enter(ct->ptls); // codegen may contain safepoints (such as jl_subtype calls) - jl_method_instance_t *mi = jl_get_ci_mi(codeinst); - size_t nrealargs = jl_nparams(mi->specTypes); // number of actual arguments being passed - bool is_opaque_closure = jl_is_method(mi->def.value) && mi->def.method->is_for_opaque_closure; - emit_specsig_to_fptr1(proto.decl, proto.cc, proto.return_roots, mi->specTypes, codeinst->rettype, is_opaque_closure, nrealargs, params, pinvoke); - jl_gc_unsafe_leave(ct->ptls, gc_state); - preal_decl = ""; // no need to fixup the name } - if (!preal_decl.empty()) { - // merge and/or rename this prototype to the real function - if (Value *specfun = mod->getNamedValue(preal_decl)) { - if (proto.decl != specfun) - proto.decl->replaceAllUsesWith(specfun); - } - else { - proto.decl->setName(preal_decl); + if (preal_decl.empty()) { + // there may be an equivalent method already compiled (or at least registered with the JIT to compile), in which case we should be using that instead + jl_code_instance_t *compiled_ci = jl_get_ci_equiv(codeinst, 0); + if (compiled_ci != codeinst) { + codeinst = compiled_ci; + uint8_t specsigflags; + void *fptr; + jl_read_codeinst_invoke(codeinst, &specsigflags, &invoke, &fptr, 0); + //if (specsig ? specsigflags & 0b1 : invoke == jl_fptr_args_addr) + if (invoke == jl_fptr_args_addr) { + preal_decl = jl_ExecutionEngine->getFunctionAtAddress((uintptr_t)fptr, invoke, codeinst); + } + else if (specsigflags & 0b1) { + preal_decl = jl_ExecutionEngine->getFunctionAtAddress((uintptr_t)fptr, invoke, codeinst); + preal_specsig = true; + } + if (preal_decl.empty()) { + auto it = invokenames.find(codeinst); + if (it != invokenames.end()) { + auto &decls = it->second; + invokeName = decls.functionObject; + if (decls.functionObject == "jl_fptr_args") { + preal_decl = decls.specFunctionObject; + isedge = true; + } + else if (decls.functionObject != "jl_fptr_sparam" && decls.functionObject != "jl_f_opaque_closure_call") { + preal_decl = decls.specFunctionObject; + preal_specsig = true; + isedge = true; + } + } + } } } - if (proto.oc) { // additionally, if we are dealing with an OC constructor, then we might also need to fix up the fptr1 reference too - assert(proto.specsig); - StringRef ocinvokeDecl = invokeName; - if (invoke != nullptr && ocinvokeDecl.empty()) { - // check for some special tokens used by opaque_closure.c and convert those to their real functions - assert(invoke != jl_fptr_args_addr); - assert(invoke != jl_fptr_sparam_addr); - if (invoke == jl_fptr_interpret_call_addr) - ocinvokeDecl = "jl_fptr_interpret_call"; - else if (invoke == jl_fptr_const_return_addr) - ocinvokeDecl = "jl_fptr_const_return"; - else if (invoke == jl_f_opaque_closure_call_addr) - ocinvokeDecl = "jl_f_opaque_closure_call"; - //else if (invoke == jl_interpret_opaque_closure_addr) - else - ocinvokeDecl = jl_ExecutionEngine->getFunctionAtAddress((uintptr_t)invoke, invoke, codeinst); + if (!preal_decl.empty() || force) { + // if we have a prototype emitted, compare it to what we emitted earlier + Module *mod = proto.decl->getParent(); + Function *pinvoke = nullptr; + if (proto.decl->isDeclaration()) { + if (preal_decl.empty()) { + if (invoke != nullptr && invokeName.empty()) { + assert(invoke != jl_fptr_args_addr); + if (invoke == jl_fptr_sparam_addr) + invokeName = "jl_fptr_sparam"; + else if (invoke == jl_f_opaque_closure_call_addr) + invokeName = "jl_f_opaque_closure_call"; + else + invokeName = 
jl_ExecutionEngine->getFunctionAtAddress((uintptr_t)invoke, invoke, codeinst); + } + pinvoke = emit_tojlinvoke(codeinst, invokeName, mod, params); + if (!proto.specsig) { + proto.decl->replaceAllUsesWith(pinvoke); + proto.decl->eraseFromParent(); + proto.decl = pinvoke; + } + isedge = false; + } + if (proto.specsig && !preal_specsig) { + // get or build an fptr1 that can invoke codeinst + if (pinvoke == nullptr) + pinvoke = get_or_emit_fptr1(preal_decl, mod); + // emit specsig-to-(jl)invoke conversion + proto.decl->setLinkage(GlobalVariable::InternalLinkage); + //protodecl->setAlwaysInline(); + jl_init_function(proto.decl, params.TargetTriple); + // TODO: maybe this can be cached in codeinst->specfptr? + int8_t gc_state = jl_gc_unsafe_enter(ct->ptls); // codegen may contain safepoints (such as jl_subtype calls) + jl_method_instance_t *mi = jl_get_ci_mi(codeinst); + size_t nrealargs = jl_nparams(mi->specTypes); // number of actual arguments being passed + bool is_opaque_closure = jl_is_method(mi->def.value) && mi->def.method->is_for_opaque_closure; + emit_specsig_to_fptr1(proto.decl, proto.cc, proto.return_roots, mi->specTypes, codeinst->rettype, is_opaque_closure, nrealargs, params, pinvoke); + jl_gc_unsafe_leave(ct->ptls, gc_state); + preal_decl = ""; // no need to fixup the name + } } - // if OC expected a specialized specsig dispatch, but we don't have it, use the inner trampoline here too - // XXX: this invoke translation logic is supposed to exactly match new_opaque_closure - if (!preal_specsig || ocinvokeDecl == "jl_f_opaque_closure_call" || ocinvokeDecl == "jl_fptr_interpret_call" || ocinvokeDecl == "jl_fptr_const_return") { - if (pinvoke == nullptr) - ocinvokeDecl = get_or_emit_fptr1(preal_decl, mod)->getName(); - else - ocinvokeDecl = pinvoke->getName(); + else if (proto.specsig && !preal_specsig) { + // privatize our definition, since for some reason we couldn't use the external one but have an internal one + proto.decl->setLinkage(GlobalValue::PrivateLinkage); + preal_decl = ""; // no need to fixup the name } - assert(!ocinvokeDecl.empty()); - assert(ocinvokeDecl != "jl_fptr_args"); - assert(ocinvokeDecl != "jl_fptr_sparam"); - // merge and/or rename this prototype to the real function - if (Value *specfun = mod->getNamedValue(ocinvokeDecl)) { - if (proto.oc != specfun) - proto.oc->replaceAllUsesWith(specfun); + if (!preal_decl.empty()) { + // merge and/or rename this prototype to the real function + if (Function *specfun = cast_or_null(mod->getNamedValue(preal_decl))) { + if (proto.decl != specfun) { + proto.decl->replaceAllUsesWith(specfun); + if (!proto.decl->isDeclaration() && specfun->isDeclaration()) + linkFunctionBody(*specfun, *proto.decl); + proto.decl->eraseFromParent(); + proto.decl = specfun; + } + } + else { + proto.decl->setName(preal_decl); + } } - else { - proto.oc->setName(ocinvokeDecl); + if (proto.oc) { // additionally, if we are dealing with an OC constructor, then we might also need to fix up the fptr1 reference too + assert(proto.specsig); + StringRef ocinvokeDecl = invokeName; + if (invoke != nullptr && ocinvokeDecl.empty()) { + // check for some special tokens used by opaque_closure.c and convert those to their real functions + assert(invoke != jl_fptr_args_addr); + assert(invoke != jl_fptr_sparam_addr); + if (invoke == jl_fptr_interpret_call_addr) + ocinvokeDecl = "jl_fptr_interpret_call"; + else if (invoke == jl_fptr_const_return_addr) + ocinvokeDecl = "jl_fptr_const_return"; + else if (invoke == jl_f_opaque_closure_call_addr) + ocinvokeDecl = 
"jl_f_opaque_closure_call"; + //else if (invoke == jl_interpret_opaque_closure_addr) + else + ocinvokeDecl = jl_ExecutionEngine->getFunctionAtAddress((uintptr_t)invoke, invoke, codeinst); + } + // if OC expected a specialized specsig dispatch, but we don't have it, use the inner trampoline here too + // XXX: this invoke translation logic is supposed to exactly match new_opaque_closure + if (!preal_specsig || ocinvokeDecl == "jl_f_opaque_closure_call" || ocinvokeDecl == "jl_fptr_interpret_call" || ocinvokeDecl == "jl_fptr_const_return") { + if (pinvoke == nullptr) + ocinvokeDecl = get_or_emit_fptr1(preal_decl, mod)->getName(); + else + ocinvokeDecl = pinvoke->getName(); + } + assert(!ocinvokeDecl.empty()); + assert(ocinvokeDecl != "jl_fptr_args"); + assert(ocinvokeDecl != "jl_fptr_sparam"); + // merge and/or rename this prototype to the real function + if (Function *specfun = cast_or_null(mod->getNamedValue(ocinvokeDecl))) { + if (proto.oc != specfun) { + proto.oc->replaceAllUsesWith(specfun); + proto.oc->eraseFromParent(); + proto.oc = specfun; + } + } + else { + proto.oc->setName(ocinvokeDecl); + } } } + else { + isedge = true; + params.workqueue.push_back(it); + incomplete_rgraph[codeinst].push_back(callee); + } + if (isedge) + complete_graph[callee].push_back(codeinst); } - else { - isedge = true; - params.workqueue.push_back(it); - incomplete_rgraph[codeinst].push_back(callee); - } - if (isedge) - complete_graph[callee].push_back(codeinst); } return params.workqueue.size(); } @@ -580,10 +598,11 @@ static void complete_emit(jl_code_instance_t *edge) JL_NOTSAFEPOINT_LEAVE JL_NOT auto ¶ms = std::get<0>(it->second); params.tsctx_lock = params.tsctx.getLock(); assert(callee == it->first); + orc::ThreadSafeModule &M = emittedmodules[callee]; + emit_always_inline(M, params); // may safepoint int waiting = jl_analyze_workqueue(callee, params); // may safepoint assert(!waiting); (void)waiting; - Module *M = emittedmodules[callee].getModuleUnlocked(); - finish_params(M, params, sharedmodules); + finish_params(M.getModuleUnlocked(), params, sharedmodules); incompletemodules.erase(it); } } @@ -796,6 +815,7 @@ void jl_emit_codeinst_to_jit_impl( invokenames[codeinst] = std::move(decls); complete_emit(codeinst); params.tsctx_lock = params.tsctx.getLock(); // re-acquire lock + emit_always_inline(result_m, params); int waiting = jl_analyze_workqueue(codeinst, params); if (waiting) { auto release = std::move(params.tsctx_lock); // unlock again before moving from it @@ -1725,7 +1745,8 @@ struct JuliaOJIT::DLSymOptimizer { Thunk = cast(GV.getInitializer()->stripPointerCasts()); assert(++Thunk->uses().begin() == Thunk->uses().end() && "Thunk should only have one use in PLT initializer!"); assert(Thunk->hasLocalLinkage() && "Thunk should not have non-local linkage!"); - } else { + } + else { GV.setLinkage(GlobalValue::PrivateLinkage); } auto init = ConstantExpr::getIntToPtr(ConstantInt::get(M.getDataLayout().getIntPtrType(M.getContext()), (uintptr_t)addr), GV.getValueType()); @@ -2298,125 +2319,6 @@ void JuliaOJIT::optimizeDLSyms(Module &M) { JuliaOJIT *jl_ExecutionEngine; -// destructively move the contents of src into dest -// this assumes that the targets of the two modules are the same -// including the DataLayout and ModuleFlags (for example) -// and that there is no module-level assembly -// Comdat is also removed, since the JIT doesn't need it -void jl_merge_module(orc::ThreadSafeModule &destTSM, orc::ThreadSafeModule srcTSM) -{ - ++ModulesMerged; - destTSM.withModuleDo([&](Module &dest) 
JL_NOTSAFEPOINT { - srcTSM.withModuleDo([&](Module &src) JL_NOTSAFEPOINT { - assert(&dest != &src && "Cannot merge module with itself!"); - assert(&dest.getContext() == &src.getContext() && "Cannot merge modules with different contexts!"); - assert(dest.getDataLayout() == src.getDataLayout() && "Cannot merge modules with different data layouts!"); - assert(dest.getTargetTriple() == src.getTargetTriple() && "Cannot merge modules with different target triples!"); - - for (auto &SG : make_early_inc_range(src.globals())) { - GlobalVariable *dG = cast_or_null(dest.getNamedValue(SG.getName())); - if (SG.hasLocalLinkage()) { - dG = nullptr; - } - // Replace a declaration with the definition: - if (dG && !dG->hasLocalLinkage()) { - if (SG.isDeclaration()) { - SG.replaceAllUsesWith(dG); - SG.eraseFromParent(); - continue; - } - //// If we start using llvm.used, we need to enable and test this - //else if (!dG->isDeclaration() && dG->hasAppendingLinkage() && SG.hasAppendingLinkage()) { - // auto *dCA = cast(dG->getInitializer()); - // auto *sCA = cast(SG.getInitializer()); - // SmallVector Init; - // for (auto &Op : dCA->operands()) - // Init.push_back(cast_or_null(Op)); - // for (auto &Op : sCA->operands()) - // Init.push_back(cast_or_null(Op)); - // ArrayType *ATy = ArrayType::get(PointerType::get(dest.getContext()), Init.size()); - // GlobalVariable *GV = new GlobalVariable(dest, ATy, dG->isConstant(), - // GlobalValue::AppendingLinkage, ConstantArray::get(ATy, Init), "", - // dG->getThreadLocalMode(), dG->getType()->getAddressSpace()); - // GV->copyAttributesFrom(dG); - // SG.replaceAllUsesWith(GV); - // dG->replaceAllUsesWith(GV); - // GV->takeName(SG); - // SG.eraseFromParent(); - // dG->eraseFromParent(); - // continue; - //} - else { - assert(dG->isDeclaration() || dG->getInitializer() == SG.getInitializer()); - dG->replaceAllUsesWith(&SG); - dG->eraseFromParent(); - } - } - // Reparent the global variable: - SG.removeFromParent(); - dest.insertGlobalVariable(&SG); - // Comdat is owned by the Module - SG.setComdat(nullptr); - } - - for (auto &SG : make_early_inc_range(src)) { - Function *dG = cast_or_null(dest.getNamedValue(SG.getName())); - if (SG.hasLocalLinkage()) { - dG = nullptr; - } - // Replace a declaration with the definition: - if (dG && !dG->hasLocalLinkage()) { - if (SG.isDeclaration()) { - SG.replaceAllUsesWith(dG); - SG.eraseFromParent(); - continue; - } - else { - assert(dG->isDeclaration()); - dG->replaceAllUsesWith(&SG); - dG->eraseFromParent(); - } - } - // Reparent the global variable: - SG.removeFromParent(); - dest.getFunctionList().push_back(&SG); - // Comdat is owned by the Module - SG.setComdat(nullptr); - } - - for (auto &SG : make_early_inc_range(src.aliases())) { - GlobalAlias *dG = cast_or_null(dest.getNamedValue(SG.getName())); - if (SG.hasLocalLinkage()) { - dG = nullptr; - } - if (dG && !dG->hasLocalLinkage()) { - if (!dG->isDeclaration()) { // aliases are always definitions, so this test is reversed from the above two - SG.replaceAllUsesWith(dG); - SG.eraseFromParent(); - continue; - } - else { - dG->replaceAllUsesWith(&SG); - dG->eraseFromParent(); - } - } - SG.removeFromParent(); - dest.insertAlias(&SG); - } - - // metadata nodes need to be explicitly merged not just copied - // so there are special passes here for each known type of metadata - NamedMDNode *sNMD = src.getNamedMetadata("llvm.dbg.cu"); - if (sNMD) { - NamedMDNode *dNMD = dest.getOrInsertNamedMetadata("llvm.dbg.cu"); - for (MDNode *I : sNMD->operands()) { - dNMD->addOperand(I); - } - } - }); - 
}); -} - //TargetMachine pass-through methods std::unique_ptr JuliaOJIT::cloneTargetMachine() const diff --git a/src/jitlayers.h b/src/jitlayers.h index b411febd792b8..619e5f3757642 100644 --- a/src/jitlayers.h +++ b/src/jitlayers.h @@ -72,7 +72,6 @@ DEFINE_SIMPLE_CONVERSION_FUNCTIONS(orc::ThreadSafeContext, LLVMOrcThreadSafeCont DEFINE_SIMPLE_CONVERSION_FUNCTIONS(orc::ThreadSafeModule, LLVMOrcThreadSafeModuleRef) void addTargetPasses(legacy::PassManagerBase *PM, const Triple &triple, TargetIRAnalysis analysis) JL_NOTSAFEPOINT; -void jl_merge_module(orc::ThreadSafeModule &dest, orc::ThreadSafeModule src) JL_NOTSAFEPOINT; GlobalVariable *jl_emit_RTLD_DEFAULT_var(Module *M) JL_NOTSAFEPOINT; DataLayout jl_create_datalayout(TargetMachine &TM) JL_NOTSAFEPOINT; @@ -210,6 +209,12 @@ struct jl_codegen_call_target_t { llvm::Function *decl; llvm::Function *oc; bool specsig; + bool external_linkage; // whether codegen would like this edge to be externally-available + bool private_linkage; // whether codegen would like this edge to be internally-available + // external = ExternalLinkage (similar to "extern") + // private = InternalLinkage (similar to "static") + // external+private = AvailableExternallyLinkage+ExternalLinkage or ExternalLinkage (similar to "static inline") + // neither = unused }; // reification of a call to jl_jit_abi_convert, so that it isn't necessary to parse the Modules to recover this info @@ -231,7 +236,7 @@ struct jl_codegen_params_t { DataLayout DL; Triple TargetTriple; - inline LLVMContext &getContext() { + inline LLVMContext &getContext() JL_NOTSAFEPOINT { return *tsctx.getContext(); } typedef StringMap SymMapGV; @@ -268,6 +273,7 @@ struct jl_codegen_params_t { bool cache = false; bool external_linkage = false; bool imaging_mode; + bool safepoint_on_entry = true; bool use_swiftcc = true; jl_codegen_params_t(orc::ThreadSafeContext ctx, DataLayout DL, Triple triple) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_ENTER : tsctx(std::move(ctx)), @@ -305,6 +311,9 @@ jl_llvm_functions_t jl_emit_codedecls( jl_code_instance_t *codeinst, jl_codegen_params_t ¶ms); +void linkFunctionBody(Function &Dst, Function &Src) JL_NOTSAFEPOINT; +void emit_always_inline(orc::ThreadSafeModule &result_m, jl_codegen_params_t ¶ms) JL_NOTSAFEPOINT_LEAVE JL_NOTSAFEPOINT_ENTER; + enum CompilationPolicy { Default = 0, Extern = 1, @@ -660,8 +669,8 @@ class JuliaOJIT { OptSelLayerT OptSelLayer; }; extern JuliaOJIT *jl_ExecutionEngine; -std::unique_ptr jl_create_llvm_module(StringRef name, LLVMContext &ctx, const DataLayout &DL = jl_ExecutionEngine->getDataLayout(), const Triple &triple = jl_ExecutionEngine->getTargetTriple()) JL_NOTSAFEPOINT; -inline orc::ThreadSafeModule jl_create_ts_module(StringRef name, orc::ThreadSafeContext ctx, const DataLayout &DL = jl_ExecutionEngine->getDataLayout(), const Triple &triple = jl_ExecutionEngine->getTargetTriple()) JL_NOTSAFEPOINT { +std::unique_ptr jl_create_llvm_module(StringRef name, LLVMContext &ctx, const DataLayout &DL, const Triple &triple) JL_NOTSAFEPOINT; +inline orc::ThreadSafeModule jl_create_ts_module(StringRef name, orc::ThreadSafeContext ctx, const DataLayout &DL, const Triple &triple) JL_NOTSAFEPOINT { auto lock = ctx.getLock(); return orc::ThreadSafeModule(jl_create_llvm_module(name, *ctx.getContext(), DL, triple), ctx); } diff --git a/src/julia.expmap.in b/src/julia.expmap.in index b28a714e75f69..5a3fbce0d1a82 100644 --- a/src/julia.expmap.in +++ b/src/julia.expmap.in @@ -30,7 +30,6 @@ _Z22jl_coverage_alloc_lineN4llvm9StringRefEi*; 
_Z22jl_malloc_data_pointerN4llvm9StringRefEi*; _jl_timing_*; - LLVMExtra*; JLJIT*; llvmGetPassPluginInfo*; diff --git a/src/llvm-expand-atomic-modify.cpp b/src/llvm-expand-atomic-modify.cpp new file mode 100644 index 0000000000000..7b7b3c8761c17 --- /dev/null +++ b/src/llvm-expand-atomic-modify.cpp @@ -0,0 +1,473 @@ +// This file is a part of Julia. License is MIT: https://julialang.org/license + +// TODO: move this feature into AtomicExpandImpl + +#include "llvm-version.h" +#include "passes.h" + +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "julia.h" +#include "julia_assert.h" + +#define DEBUG_TYPE "expand-atomic-modify" +#undef DEBUG + +using namespace llvm; + +// This pass takes fake call instructions that look like this which were emitted by the front end: +// (oldval, newval) = call atomicmodify.iN(ptr %op, ptr align(N) %ptr, i8 immarg %SSID, i8 immarg %Ordering, ...) !rmwattributes +// where op is a function with a prototype of `iN (iN arg, ...)` +// Then rewrite that to +// oldval = atomicrmw op ptr, val ordering syncscope +// newval = op oldval, val +// Or to an equivalent RMWCmpXchgLoop if `op` isn't valid for atomicrmw + + +// from AtomicExpandImpl, with modification of failure order and added Attributes +using CreateWeakCmpXchgInstFun = + std::function; + +static void createWeakCmpXchgInstFun(IRBuilderBase &Builder, Value *Addr, + Value *Loaded, Value *NewVal, Align AddrAlign, + AtomicOrdering MemOpOrder, SyncScope::ID SSID, Instruction &Attributes, + Value *&Success, Value *&NewLoaded) { + Type *OrigTy = NewVal->getType(); + + // This code can go away when cmpxchg supports FP types. + assert(!OrigTy->isPointerTy()); + bool NeedBitcast = OrigTy->isFloatingPointTy(); + if (NeedBitcast) { + IntegerType *IntTy = Builder.getIntNTy(OrigTy->getPrimitiveSizeInBits()); + NewVal = Builder.CreateBitCast(NewVal, IntTy); + Loaded = Builder.CreateBitCast(Loaded, IntTy); + } + + AtomicCmpXchgInst *Pair = Builder.CreateAtomicCmpXchg( + Addr, Loaded, NewVal, AddrAlign, MemOpOrder, + AtomicOrdering::Monotonic, // why does LLVM use AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder) here + SSID); + Pair->copyMetadata(Attributes); + Success = Builder.CreateExtractValue(Pair, 1, "success"); + NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded"); + + if (NeedBitcast) + NewLoaded = Builder.CreateBitCast(NewLoaded, OrigTy); +} + +// from AtomicExpandImpl, with modification of values returned +std::pair insertRMWCmpXchgLoop( + IRBuilderBase &Builder, Type *ResultTy, Value *Addr, Align AddrAlign, + AtomicOrdering MemOpOrder, SyncScope::ID SSID, Instruction &Attributes, + const std::function &PerformOp, + const CreateWeakCmpXchgInstFun &CreateWeakCmpXchg) { + LLVMContext &Ctx = Builder.getContext(); + BasicBlock *BB = Builder.GetInsertBlock(); + Function *F = BB->getParent(); + + // Given: atomicrmw some_op iN* %addr, iN %incr ordering + // + // The standard expansion we produce is: + // [...] + // %init_loaded = load atomic iN* %addr + // br label %loop + // loop: + // %loaded = phi iN [ %init_loaded, %entry ], [ %new_loaded, %loop ] + // %new = some_op iN %loaded, %incr + // %pair = cmpxchg iN* %addr, iN %loaded, iN %new + // %new_loaded = extractvalue { iN, i1 } %pair, 0 + // %success = extractvalue { iN, i1 } %pair, 1 + // br i1 %success, label %atomicrmw.end, label %loop + // atomicrmw.end: + // [...] 
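+  // Note (descriptive comment, inferred from the return statement below): unlike
+  // the corresponding AtomicExpandImpl helper, this variant returns both values:
+  // the value loaded before the exchange (%new_loaded above, i.e. the old value)
+  // and the result computed by PerformOp (%new above), so the caller can
+  // materialize either half of the (oldval, newval) pair without re-running op.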
+ BasicBlock *ExitBB = + BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end"); + BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB); + + // The split call above "helpfully" added a branch at the end of BB (to the + // wrong place), but we want a load. It's easiest to just remove + // the branch entirely. + std::prev(BB->end())->eraseFromParent(); + Builder.SetInsertPoint(BB); + LoadInst *InitLoaded = Builder.CreateAlignedLoad(ResultTy, Addr, AddrAlign); + InitLoaded->setOrdering(AtomicOrdering::Unordered); // n.b. the original LLVM pass is missing this call so is actually mildly UB + Builder.CreateBr(LoopBB); + + // Start the main loop block now that we've taken care of the preliminaries. + Builder.SetInsertPoint(LoopBB); + PHINode *Loaded = Builder.CreatePHI(ResultTy, 2, "loaded"); + Loaded->addIncoming(InitLoaded, BB); + + Value *NewVal = PerformOp(Builder, Loaded); + + Value *NewLoaded = nullptr; + Value *Success = nullptr; + + CreateWeakCmpXchg(Builder, Addr, Loaded, NewVal, AddrAlign, + MemOpOrder == AtomicOrdering::Unordered + ? AtomicOrdering::Monotonic + : MemOpOrder, + SSID, Attributes, Success, NewLoaded); + assert(Success && NewLoaded); + + Loaded->addIncoming(NewLoaded, LoopBB); + + Builder.CreateCondBr(Success, ExitBB, LoopBB); + + Builder.SetInsertPoint(ExitBB, ExitBB->begin()); + return {NewLoaded, NewVal}; +} + +// from AtomicExpandImpl +struct ReplacementIRBuilder : IRBuilder { + // Preserves the DebugLoc from I, and preserves still valid metadata. + explicit ReplacementIRBuilder(Instruction *I, const DataLayout &DL) + : IRBuilder(I->getContext(), DL) { + SetInsertPoint(I); + this->CollectMetadataToCopy(I, {LLVMContext::MD_pcsections}); + } +}; + +// Must check that either Target cannot observe or mutate global state +// or that no trailing instructions does so either. +// Depending on the choice, it can also decide whether it is better to move Target after RMW +// or to move RMW before Target (or meet somewhere in the middle). +// Currently conservatively implemented as there being no instruction in the +// function which writes memory (which includes any atomics). +// Excluding the Target itself, unless some other instruction might read memory to observe it. +static bool canReorderWithRMW(Instruction &Target, bool verifyop) +{ + if (!verifyop) + return true; + Function &Op = *Target.getFunction(); + // quick check: if Op is nosync and Target doesn't access any memory, then reordering is trivially valid + bool nosync = Op.hasNoSync(); + if (nosync && !Target.mayReadOrWriteMemory()) + return true; + // otherwise, scan the whole function to see if any function accesses memory + // in a way that would conflict with reordering the atomic read and write + bool mayRead = false; + for (auto &BB : Op) { + for (auto &I : BB) { + if (&I == &Target) + continue; + if (I.mayWriteToMemory()) + return false; + if (!mayRead) { + mayRead = I.mayReadFromMemory(); + if (!nosync && mayRead) + return false; + } + } + } + // if any other instruction read memory, then the ordering of any writes by the target instruction might be observed + return !(mayRead && Target.mayWriteToMemory()); +} + +static std::variant patternMatchAtomicRMWOp(Value *Old, Use **ValOp, Value *RetVal) +{ + bool verifyop = RetVal == nullptr; + assert(verifyop ? isa(Old) : isa(Old)); + Function *Op = verifyop ? 
cast(Old)->getParent() : nullptr; + if (verifyop && (Op->isDeclaration() || Op->isInterposable() || Op->isIntrinsic())) + return false; + // TODO: peek forward from Old through any trivial casts which don't affect the instruction (e.g. i64 to f64 and back) + if (RetVal == nullptr) { + if (Old->use_empty()) { + if (ValOp) *ValOp = nullptr; + return AtomicRMWInst::Xchg; + } + if (!Old->hasOneUse()) + return false; + ReturnInst *Ret = nullptr; + for (auto &BB : *Op) { + if (isa(BB.getTerminator())) { + if (Ret != nullptr) + return false; + Ret = cast(BB.getTerminator()); + } + } + if (Ret == nullptr) + return false; + // Now examine the instruction list + RetVal = Ret->getReturnValue(); + if (!RetVal->hasOneUse()) + return false; + } + if (RetVal == Old) { + // special token indicating to convert to an atomic fence + if (ValOp) *ValOp = nullptr; + return AtomicRMWInst::Or; + } + if (Old->use_empty()) { + if (ValOp) *ValOp = nullptr; + return AtomicRMWInst::Xchg; + } + if (auto BinOp = dyn_cast(RetVal)) { + if ((BinOp->getOperand(0) == Old || (BinOp->isCommutative() && BinOp->getOperand(1) == Old)) && canReorderWithRMW(*BinOp, verifyop)) { + if (ValOp) *ValOp = &BinOp->getOperandUse(BinOp->getOperand(0) == Old ? 1 : 0); + switch (BinOp->getOpcode()) { + case Instruction::Add: + return AtomicRMWInst::Add; + case Instruction::Sub: + return AtomicRMWInst::Sub; + case Instruction::And: + return AtomicRMWInst::And; + case Instruction::Or: + return AtomicRMWInst::Or; + case Instruction::Xor: + return AtomicRMWInst::Xor; + case Instruction::FAdd: + return AtomicRMWInst::FAdd; + case Instruction::FSub: + return AtomicRMWInst::FSub; + default: + break; + } + } + if (BinOp->getOpcode() == Instruction::Xor) { + if (auto CI = dyn_cast(BinOp->getOperand(1))) { + if (CI->isAllOnesValue()) { + BinOp = dyn_cast(BinOp->getOperand(0)); + if (BinOp && BinOp->hasOneUse() && BinOp->getOpcode() == Instruction::And) { + if ((BinOp->getOperand(0) == Old || (BinOp->isCommutative() && BinOp->getOperand(1) == Old)) && canReorderWithRMW(*BinOp, verifyop)) { + if (ValOp) *ValOp = &BinOp->getOperandUse(BinOp->getOperand(0) == Old ? 1 : 0); + return AtomicRMWInst::Nand; + } + } + } + } + } + return false; + } else if (auto Intr = dyn_cast(RetVal)) { + if (Intr->arg_size() == 2) { + if ((Intr->getOperand(0) == Old || (Intr->isCommutative() && Intr->getOperand(1) == Old)) && canReorderWithRMW(*Intr, verifyop)) { + if (ValOp) *ValOp = &Intr->getOperandUse(Intr->getOperand(0) == Old ? 1 : 0); + switch (Intr->getIntrinsicID()) { + case Intrinsic::minnum: + return AtomicRMWInst::FMin; + case Intrinsic::maxnum: + return AtomicRMWInst::FMax; + case Intrinsic::smax: + return AtomicRMWInst::Max; + case Intrinsic::umax: + return AtomicRMWInst::UMax; + case Intrinsic::smin: + return AtomicRMWInst::Min; + case Intrinsic::umin: + return AtomicRMWInst::UMin; +#if JL_LLVM_VERSION >= 200000 + case Intrinsic::usub_sat: + return AtomicRMWInst::USubSat; +#endif + } + } + } + return false; + } + else if (auto Intr = dyn_cast(RetVal)) { + // TODO: decide inlining cost of Op, or check alwaysinline/inlinehint, before this? + for (auto &Arg : Intr->args()) { + if (Arg == Old) { + if (canReorderWithRMW(*Intr, verifyop)) { + if (ValOp) *ValOp = &Arg; + return true; + } + return false; + } + } + } + // TODO: does this need to deal with F->hasFnAttribute(Attribute::StrictFP)? + // TODO: does Fneg and Neg have expansions? 
+ // TODO: be able to ignore some simple bitcasts (particularly f64 to i64) + // TODO: handle longer sequences (UIncWrap, UDecWrap, USubCond, and target-specific ones for CUDA) + return false; +} + +void expandAtomicModifyToCmpXchg(CallInst &Modify, + const CreateWeakCmpXchgInstFun &CreateWeakCmpXchg) { + Value *Ptr = Modify.getOperand(0); + Function *Op = dyn_cast(Modify.getOperand(1)); + if (!Op) { + Modify.getParent()->getParent()->print(errs()); + llvm_unreachable("expected immarg for function argument"); + } + AtomicOrdering Ordering = (AtomicOrdering)cast(Modify.getOperand(2))->getZExtValue(); + SyncScope::ID SSID = (SyncScope::ID)cast(Modify.getOperand(3))->getZExtValue(); + MaybeAlign Alignment = Modify.getParamAlign(0); + unsigned user_arg_start = Modify.getFunctionType()->getNumParams(); + Type *Ty = Modify.getFunctionType()->getReturnType()->getStructElementType(0); + + ReplacementIRBuilder Builder(&Modify, Modify.getModule()->getDataLayout()); + Builder.setIsFPConstrained(Modify.hasFnAttr(Attribute::StrictFP)); + + CallInst *ModifyOp; + { + SmallVector Args(1 + Modify.arg_size() - user_arg_start); + Args[0] = UndefValue::get(Ty); // Undef used as placeholder for Loaded / RMW; + for (size_t argi = 0; argi < Modify.arg_size() - user_arg_start; ++argi) { + Args[argi + 1] = Modify.getArgOperand(argi + user_arg_start); + } + SmallVector Defs; + Modify.getOperandBundlesAsDefs(Defs); + ModifyOp = Builder.CreateCall(Op, Args, Defs); + ModifyOp->setCallingConv(Op->getCallingConv()); + } + Use *LoadedOp = &ModifyOp->getOperandUse(0); + + Value *OldVal = nullptr; + Value *NewVal = nullptr; + auto BinOp = patternMatchAtomicRMWOp(Op->getArg(0), nullptr, nullptr); + if (BinOp != decltype(BinOp)(false)) { + Builder.SetInsertPoint(ModifyOp); + AtomicRMWInst *RMW = Builder.CreateAtomicRMW(AtomicRMWInst::Xchg, Ptr, UndefValue::get(Ty), Alignment, Ordering, SSID); // Undef used as placeholder + RMW->copyMetadata(Modify); + Builder.SetInsertPoint(&Modify); + LoadedOp->set(RMW); + for (int attempts = 0; ; ) { + FreezeInst *TrackReturn = Builder.Insert(new FreezeInst(ModifyOp)); // Create a temporary TrackingVH so we can recover the NewVal after inlining + InlineFunctionInfo IFI; + if (!InlineFunction(*ModifyOp, IFI).isSuccess()) { + // Undo the attempt, since inlining failed + BinOp = false; + TrackReturn->eraseFromParent(); + break; + } + ModifyOp = nullptr; + NewVal = TrackReturn->getOperand(0); + TrackReturn->eraseFromParent(); + // NewVal might have been folded away by inlining so redo patternMatchAtomicRMWOp here + // tracing from RMW to NewVal, in case instsimplify folded something + Use *ValOp; + BinOp = patternMatchAtomicRMWOp(RMW, &ValOp, NewVal); + if (BinOp == decltype(BinOp)(true)) { + ModifyOp = cast(ValOp->getUser()); + LoadedOp = ValOp; + assert(LoadedOp->get() == RMW); + RMW->moveBefore(ModifyOp); // NewValInst is a user of RMW, and RMW has no other dependants (per patternMatchAtomicRMWOp) + BinOp = false; + if (++attempts > 3) + break; + if (auto FOp = ModifyOp->getCalledFunction()) + BinOp = patternMatchAtomicRMWOp(FOp->getArg(LoadedOp->getOperandNo()), nullptr, nullptr); + else + break; + if (BinOp == decltype(BinOp)(false)) + break; + } else { + assert(BinOp != decltype(BinOp)(true)); + auto RMWOp = std::get(BinOp); + assert(RMWOp != AtomicRMWInst::BAD_BINOP); + assert(isa(RMW->getOperand(1))); // RMW was previously being used as the placeholder for Val + Value *Val; + if (ValOp != nullptr) { + RMW->moveBefore(cast(ValOp->getUser())); // ValOp is a user of RMW, and RMW has no 
other dependants (per patternMatchAtomicRMWOp) + Val = ValOp->get(); + } else if (RMWOp == AtomicRMWInst::Xchg) { + Val = NewVal; + } else { + // convert to an atomic fence of the form: atomicrmw or %ptr, 0 + assert(RMWOp == AtomicRMWInst::Or); + Val = ConstantInt::getNullValue(Ty); + } + RMW->setOperation(RMWOp); + RMW->setOperand(1, Val); + OldVal = RMW; + break; + } + } + if (BinOp == decltype(BinOp)(false)) { + LoadedOp->set(UndefValue::get(Ty)); + RMW->eraseFromParent(); + } + } + + if (BinOp == decltype(BinOp)(false)) { + // FIXME: If FP exceptions are observable, we should force them off for the + // loop for the FP atomics. + std::tie(OldVal, NewVal) = insertRMWCmpXchgLoop( + Builder, Ty, Ptr, *Alignment, Ordering, SSID, Modify, + [&](IRBuilderBase &Builder, Value *Loaded) JL_NOTSAFEPOINT { + LoadedOp->set(Loaded); + ModifyOp->moveBefore(*Builder.GetInsertBlock(), Builder.GetInsertPoint()); + return ModifyOp; + }, + CreateWeakCmpXchg); + } + + for (auto user : make_early_inc_range(Modify.users())) { + if (auto EV = dyn_cast(user)) { + if (EV->getNumIndices() == 1) { + if (EV->use_empty()) { + EV->eraseFromParent(); + continue; + } + else if (EV->getIndices()[0] == 0) { + EV->replaceAllUsesWith(OldVal); + EV->eraseFromParent(); + continue; + } else if (EV->getIndices()[0] == 1) { + EV->replaceAllUsesWith(NewVal); + EV->eraseFromParent(); + continue; + } + } + } + } + if (!Modify.use_empty()) { + auto OldNewVal = Builder.CreateInsertValue(UndefValue::get(Modify.getType()), OldVal, 0); + OldNewVal = Builder.CreateInsertValue(OldNewVal, NewVal, 1); + Modify.replaceAllUsesWith(OldNewVal); + } + Modify.eraseFromParent(); +} + +static bool expandAtomicModify(Function &F) { + SmallVector AtomicInsts; + + // Changing control-flow while iterating through it is a bad idea, so gather a + // list of all atomic instructions before we start. 
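+  // Only calls whose callee is one of the julia.atomicmodify.* pseudo-intrinsics
+  // emitted by codegen are collected here; all other instructions are left untouched.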
+ for (Instruction &I : instructions(F)) + if (auto CI = dyn_cast(&I)) { + auto callee = dyn_cast_or_null(CI->getCalledOperand()); + if (callee && callee->getName().starts_with("julia.atomicmodify.")) { + assert(CI->getFunctionType() == callee->getFunctionType()); + AtomicInsts.push_back(CI); + } + } + + bool MadeChange = !AtomicInsts.empty(); + for (auto *I : AtomicInsts) + expandAtomicModifyToCmpXchg(*I, createWeakCmpXchgInstFun); + return MadeChange; +} + +PreservedAnalyses ExpandAtomicModifyPass::run(Function &F, FunctionAnalysisManager &AM) +{ + if (expandAtomicModify(F)) { + return PreservedAnalyses::none(); + } + return PreservedAnalyses::all(); +} diff --git a/src/llvm-julia-passes.inc b/src/llvm-julia-passes.inc index 0cc36f799db00..bd223499f37af 100644 --- a/src/llvm-julia-passes.inc +++ b/src/llvm-julia-passes.inc @@ -16,6 +16,7 @@ FUNCTION_PASS("AllocOpt", AllocOptPass()) FUNCTION_PASS("PropagateJuliaAddrspaces", PropagateJuliaAddrspacesPass()) FUNCTION_PASS("GCInvariantVerifier", GCInvariantVerifierPass()) FUNCTION_PASS("FinalLowerGC", FinalLowerGCPass()) +FUNCTION_PASS("ExpandAtomicModify", ExpandAtomicModifyPass()) #endif //Loop passes diff --git a/src/passes.h b/src/passes.h index 83721525d6f7e..0c5a124ade952 100644 --- a/src/passes.h +++ b/src/passes.h @@ -43,6 +43,11 @@ struct FinalLowerGCPass : PassInfoMixin { static bool isRequired() { return true; } }; +struct ExpandAtomicModifyPass : PassInfoMixin { + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM) JL_NOTSAFEPOINT; +}; + + // Module Passes struct CPUFeaturesPass : PassInfoMixin { PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM) JL_NOTSAFEPOINT; diff --git a/src/pipeline.cpp b/src/pipeline.cpp index eb93943653b34..f91db6fc037d7 100644 --- a/src/pipeline.cpp +++ b/src/pipeline.cpp @@ -574,6 +574,7 @@ static void buildIntrinsicLoweringPipeline(ModulePassManager &MPM, PassBuilder * FunctionPassManager FPM; JULIA_PASS(FPM.addPass(LateLowerGCPass())); JULIA_PASS(FPM.addPass(FinalLowerGCPass())); + JULIA_PASS(FPM.addPass(ExpandAtomicModifyPass())); // after LateLowerGCPass so that all IPO is valid MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); } JULIA_PASS(MPM.addPass(LowerPTLSPass(options.dump_native))); @@ -590,7 +591,8 @@ static void buildIntrinsicLoweringPipeline(ModulePassManager &MPM, PassBuilder * FPM.addPass(SimplifyCFGPass(aggressiveSimplifyCFGOptions())); MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); } - } else if (!options.remove_ni) { + } + else if (!options.remove_ni) { JULIA_PASS(MPM.addPass(RemoveNIPass())); } MPM.addPass(AfterIntrinsicLoweringMarkerPass()); diff --git a/test/llvmpasses/atomic-modify.ll b/test/llvmpasses/atomic-modify.ll new file mode 100644 index 0000000000000..23e1949f3ad0a --- /dev/null +++ b/test/llvmpasses/atomic-modify.ll @@ -0,0 +1,288 @@ +; This file is a part of Julia. License is MIT: https://julialang.org/license + +; RUN: opt --load-pass-plugin=libjulia-codegen%shlibext -passes='ExpandAtomicModify' -S %s | FileCheck %s + +declare {i8, i8} @julia.atomicmodify.i8(ptr, ptr, i8, i8, ...) +declare {double, double} @julia.atomicmodify.f64(ptr, ptr, i8, i8, ...) 
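+; Operand layout, as read by expandAtomicModifyToCmpXchg in this patch:
+;   (%ptr, %op, ordering, syncscope id, op args...)
+; so in the calls below, i8 5 encodes "release" ordering and i8 1 the default
+; ("system") sync scope, matching the atomicrmw instructions in the CHECK lines.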
+declare double @llvm.maxnum.f64(double %Val0, double %Val1) + +define i8 @add.i8(i8 %x, i8 %y) { + %z = add i8 %x, %y + ret i8 %z +} + +define i8 @sub.i8(i8 %x, i8 %y) { + %z = sub i8 %x, %y + ret i8 %z +} + +define i8 @subx.i8(i8 %x, i8 %y) { + %z = sub i8 %y, %x + ret i8 %z +} + +define i8 @add.i8.zext(i8 %x, i1 %y) { + %y8 = zext i1 %y to i8 + %z = add i8 %x, %y8 + ret i8 %z +} + +define i8 @and.i8(i8 %x, i8 %y) { + %z = and i8 %x, %y + ret i8 %z +} + +define i8 @nand.i8(i8 %x, i8 %y) { + %z = and i8 %x, %y + %w = xor i8 %z, -1 + ret i8 %w +} + +define i8 @nand.i8.zext(i8 %x, i1 %y) { + %y8 = zext i1 %y to i8 + %z = and i8 %y8, %x + %w = xor i8 %z, -1 + ret i8 %w +} + +define i8 @xchg.i8(i8 %x, i8 %y) { + ret i8 %y +} + +define double @fadd.f64(double %x, double %y) { + %z = fadd double %y, %x + ret double %z +} + +define double @fmax.f64(double %x, double %y) { + %z = call double @llvm.maxnum.f64(double %y, double %x) + ret double %z +} + +define internal i8 @0(i8 %x, i8 %y) unnamed_addr { + %z = call i8 @add.i8(i8 %x, i8 %y) + ret i8 %z +} + +define internal i8 @1(i8 %x, i8 %y) unnamed_addr { + %z = call i8 @0(i8 %x, i8 %y) + ret i8 %z +} + +define internal i8 @2(i8 %x, i8 %y, ptr %f) unnamed_addr { + %z = call i8 %f(i8 %x, i8 %y) + ret i8 %z +} + +define i8 @mod_i8_add(ptr %a, i8 %b) { +; CHECK-LABEL: @mod_i8_add +; CHECK: %0 = atomicrmw add ptr %a, i8 %b release, align 1 +; CHECK: ret i8 %0 +top: + %oldnew = call {i8, i8} (ptr, ptr, i8, i8, ...) @julia.atomicmodify.i8(ptr align(1) %a, ptr @add.i8, i8 5, i8 1, i8 %b) + %oldval = extractvalue {i8, i8} %oldnew, 0 + ret i8 %oldval +} + +define i8 @mod_i8_add_new(ptr %a, i8 %b) { +; CHECK-LABEL: @mod_i8_add +; CHECK: %0 = atomicrmw add ptr %a, i8 %b release, align 1 +; CHECK-NEXT: [[newval:%.*]] = add i8 %0, %b +; CHECK-NEXT: ret i8 [[newval]] +top: + %oldnew = call {i8, i8} (ptr, ptr, i8, i8, ...) @julia.atomicmodify.i8(ptr align(1) %a, ptr @add.i8, i8 5, i8 1, i8 %b) + %newval = extractvalue {i8, i8} %oldnew, 1 + ret i8 %newval +} + +define i8 @mod_i8_addfence(ptr %a) { +; CHECK-LABEL: @mod_i8_addfence +; CHECK: %0 = atomicrmw or ptr %a, i8 0 release, align 1 +; CHECK-NEXT: ret i8 %0 +top: + %oldnew = call {i8, i8} (ptr, ptr, i8, i8, ...) @julia.atomicmodify.i8(ptr align(1) %a, ptr @add.i8, i8 5, i8 1, i8 0) + %oldval = extractvalue {i8, i8} %oldnew, 0 + ret i8 %oldval +} + +define i8 @mod_i8_add_zext(ptr %a, i1 %b) { +; CHECK-LABEL: @mod_i8_add_zext +; CHECK: [[b8:%.*]] = zext i1 %b to i8 +; CHECK: %0 = atomicrmw add ptr %a, i8 [[b8]] release, align 1 +; CHECK: ret i8 %0 +top: + %oldnew = call {i8, i8} (ptr, ptr, i8, i8, ...) @julia.atomicmodify.i8(ptr align(1) %a, ptr @add.i8.zext, i8 5, i8 1, i1 %b) + %oldval = extractvalue {i8, i8} %oldnew, 0 + ret i8 %oldval +} + +define i8 @mod_i8_add_zext_new(ptr %a, i1 %b) { +; CHECK-LABEL: @mod_i8_add_zext +; CHECK: [[b8:%.*]] = zext i1 %b to i8 +; CHECK-NEXT: %0 = atomicrmw add ptr %a, i8 [[b8]] release, align 1 +; CHECK-NEXT: [[newval:%.*]] = add i8 %0, [[b8]] +; CHECK-NEXT: ret i8 [[newval]] +top: + %oldnew = call {i8, i8} (ptr, ptr, i8, i8, ...) @julia.atomicmodify.i8(ptr align(1) %a, ptr @add.i8.zext, i8 5, i8 1, i1 %b) + %newval = extractvalue {i8, i8} %oldnew, 1 + ret i8 %newval +} + +define i8 @mod_i8_sub(ptr %a, i8 %b) { +; CHECK-LABEL: @mod_i8_sub +; CHECK: %0 = atomicrmw sub ptr %a, i8 %b release, align 1 +; CHECK: ret i8 %0 +top: + %oldnew = call {i8, i8} (ptr, ptr, i8, i8, ...) 
@julia.atomicmodify.i8(ptr align(1) %a, ptr @sub.i8, i8 5, i8 1, i8 %b) + %oldval = extractvalue {i8, i8} %oldnew, 0 + ret i8 %oldval +} + +define i8 @mod_i8_subx(ptr %a, i8 %b) { +; CHECK-LABEL: @mod_i8_subx +; CHECK: [[newval:%.*]] = call i8 @subx.i8(i8 %loaded, i8 %b) +; CHECK: [[success:%.*]] = cmpxchg ptr %a, i8 %loaded, i8 [[newval]] +; CHECK: [[oldval:%.*]] = extractvalue { i8, i1 } [[success:%.*]], 0 +; CHECK: ret i8 [[oldval]] +top: + %oldnew = call {i8, i8} (ptr, ptr, i8, i8, ...) @julia.atomicmodify.i8(ptr align(1) %a, ptr @subx.i8, i8 5, i8 1, i8 %b) + %oldval = extractvalue {i8, i8} %oldnew, 0 + ret i8 %oldval +} + +define i8 @mod_i8_subx_new(ptr %a, i8 %b) { +; CHECK-LABEL: @mod_i8_subx_new +; CHECK: [[newval:%.*]] = call i8 @subx.i8(i8 %loaded, i8 %b) +; CHECK: [[oldval:%.*]] = cmpxchg ptr %a, i8 %loaded, i8 [[newval]] +; CHECK: ret i8 [[newval]] +top: + %oldnew = call {i8, i8} (ptr, ptr, i8, i8, ...) @julia.atomicmodify.i8(ptr align(1) %a, ptr @subx.i8, i8 5, i8 1, i8 %b) + %newval = extractvalue {i8, i8} %oldnew, 1 + ret i8 %newval +} + +define i8 @mod_i8_nand(ptr %a, i8 %b) { +; CHECK-LABEL: @mod_i8_nand +; CHECK: %0 = atomicrmw nand ptr %a, i8 %b release, align 1 +; CHECK: ret i8 %0 +top: + %oldnew = call {i8, i8} (ptr, ptr, i8, i8, ...) @julia.atomicmodify.i8(ptr align(1) %a, ptr @nand.i8, i8 5, i8 1, i8 %b) + %oldval = extractvalue {i8, i8} %oldnew, 0 + ret i8 %oldval +} + +define i8 @mod_i8_nand_new(ptr %a, i1 %b) { +; CHECK-LABEL: @mod_i8_nand_new +; CHECK: [[b8:%.*]] = zext i1 %b to i8 +; CHECK: %0 = atomicrmw nand ptr %a, i8 [[b8]] release, align 1 +; CHECK: [[newand:%.*]] = and i8 [[b8]], %0 +; CHECK: [[newval:%.*]] = xor i8 [[newand:%.*]], -1 +; CHECK: ret i8 [[newval]] +top: + %oldnew = call {i8, i8} (ptr, ptr, i8, i8, ...) @julia.atomicmodify.i8(ptr align(1) %a, ptr @nand.i8.zext, i8 5, i8 1, i1 %b) + %newval = extractvalue {i8, i8} %oldnew, 1 + ret i8 %newval +} + +define i8 @mod_i8_andxchg(ptr %a) { +; CHECK-LABEL: @mod_i8_andxchg +; CHECK: %0 = atomicrmw xchg ptr %a, i8 0 release, align 1 +; CHECK-NEXT: ret i8 %0 +top: + %oldnew = call {i8, i8} (ptr, ptr, i8, i8, ...) @julia.atomicmodify.i8(ptr align(1) %a, ptr @and.i8, i8 5, i8 1, i8 0) + %oldval = extractvalue {i8, i8} %oldnew, 0 + ret i8 %oldval +} + +define i8 @mod_i8_xchg(ptr %a, i8 %b) { +; CHECK-LABEL: @mod_i8_xchg +; CHECK: %0 = atomicrmw xchg ptr %a, i8 %b release, align 1 +; CHECK-NEXT: ret i8 %0 +top: + %oldnew = call {i8, i8} (ptr, ptr, i8, i8, ...) @julia.atomicmodify.i8(ptr align(1) %a, ptr @xchg.i8, i8 5, i8 1, i8 %b) + %oldval = extractvalue {i8, i8} %oldnew, 0 + ret i8 %oldval +} + +define i8 @mod_i8_xchg_new(ptr %a, i8 %b) { +; CHECK-LABEL: @mod_i8_xchg_new +; CHECK: %0 = atomicrmw xchg ptr %a, i8 %b release, align 1 +; CHECK-NEXT: ret i8 %b +top: + %oldnew = call {i8, i8} (ptr, ptr, i8, i8, ...) @julia.atomicmodify.i8(ptr align(1) %a, ptr @xchg.i8, i8 5, i8 1, i8 %b) + %newval = extractvalue {i8, i8} %oldnew, 1 + ret i8 %newval +} + +define double @mod_i8_fadd(ptr %a, double %b) { +; CHECK-LABEL: @mod_i8_fadd +; CHECK: %0 = atomicrmw fadd ptr %a, double %b release, align 8 +; CHECK: ret double %0 +top: + %oldnew = call {double, double} (ptr, ptr, i8, i8, ...) 
@julia.atomicmodify.f64(ptr align(8) %a, ptr @fadd.f64, i8 5, i8 1, double %b) + %oldval = extractvalue {double, double} %oldnew, 0 + ret double %oldval +} + +define double @mod_i8_fmax(ptr %a, double %b) { +; CHECK-LABEL: @mod_i8_fmax +; CHECK: %0 = atomicrmw fmax ptr %a, double %b release, align 8 +; CHECK: ret double %0 +top: + %oldnew = call {double, double} (ptr, ptr, i8, i8, ...) @julia.atomicmodify.f64(ptr align(8) %a, ptr @fmax.f64, i8 5, i8 1, double %b) + %oldval = extractvalue {double, double} %oldnew, 0 + ret double %oldval +} + +define i8 @mod_i8_indirect0(ptr %a, i8 %b) { +; CHECK-LABEL: @mod_i8_indirect0 +; CHECK: %0 = atomicrmw add ptr %a, i8 %b release, align 1 +; CHECK: ret i8 %0 +top: + %oldnew = call {i8, i8} (ptr, ptr, i8, i8, ...) @julia.atomicmodify.i8(ptr align(1) %a, ptr @0, i8 5, i8 1, i8 %b) + %oldval = extractvalue {i8, i8} %oldnew, 0 + ret i8 %oldval +} + +define i8 @mod_i8_indirect1(ptr %a, i8 %b) { +; CHECK-LABEL: @mod_i8_indirect1 +; CHECK: %0 = atomicrmw add ptr %a, i8 %b release, align 1 +; CHECK: ret i8 %0 +top: + %oldnew = call {i8, i8} (ptr, ptr, i8, i8, ...) @julia.atomicmodify.i8(ptr align(1) %a, ptr @1, i8 5, i8 1, i8 %b) + %oldval = extractvalue {i8, i8} %oldnew, 0 + ret i8 %oldval +} + +define i8 @mod_i8_indirect2(ptr %a, i8 %b, ptr %f) { +; CHECK-LABEL: @mod_i8_indirect2 +; CHECK: [[newval:%.*]] = call i8 %f(i8 %loaded, i8 %b) +; CHECK: [[success:%.*]] = cmpxchg ptr %a, i8 %loaded, i8 [[newval]] +; CHECK: [[oldval:%.*]] = extractvalue { i8, i1 } [[success:%.*]], 0 +; CHECK: ret i8 [[oldval]] +top: + %oldnew = call {i8, i8} (ptr, ptr, i8, i8, ...) @julia.atomicmodify.i8(ptr align(1) %a, ptr @2, i8 5, i8 1, i8 %b, ptr %f) + %oldval = extractvalue {i8, i8} %oldnew, 0 + ret i8 %oldval +} + +define i8 @mod_i8_indirect2_new(ptr %a, i8 %b, ptr %f) { +; CHECK-LABEL: @mod_i8_indirect2_new +; CHECK: [[newval:%.*]] = call i8 %f(i8 %loaded, i8 %b) +; CHECK: [[oldval:%.*]] = cmpxchg ptr %a, i8 %loaded, i8 [[newval]] +; CHECK: ret i8 [[newval]] +top: + %oldnew = call {i8, i8} (ptr, ptr, i8, i8, ...) @julia.atomicmodify.i8(ptr align(1) %a, ptr @2, i8 5, i8 1, i8 %b, ptr %f) + %newval = extractvalue {i8, i8} %oldnew, 1 + ret i8 %newval +} + +define i8 @mod_i8_indirect3(ptr %a, i8 %b) { +; CHECK-LABEL: @mod_i8_indirect3 +; CHECK: %0 = atomicrmw add ptr %a, i8 %b release, align 1 +; CHECK: ret i8 %0 +top: + %oldnew = call {i8, i8} (ptr, ptr, i8, i8, ...) @julia.atomicmodify.i8(ptr align(1) %a, ptr @2, i8 5, i8 1, i8 %b, ptr @0) + %oldval = extractvalue {i8, i8} %oldnew, 0 + ret i8 %oldval +} From 5cfdf66d30f40f6a42f6f70ac522ac208d5dfc92 Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Thu, 9 Jan 2025 20:25:52 +0000 Subject: [PATCH 2/2] remove deprecated Threads.Atomics --- base/atomics.jl | 176 +++++-------------------------------------- test/threads_exec.jl | 38 +++------- 2 files changed, 30 insertions(+), 184 deletions(-) diff --git a/base/atomics.jl b/base/atomics.jl index e6f3a5654cbf7..432c9120939ac 100644 --- a/base/atomics.jl +++ b/base/atomics.jl @@ -1,7 +1,5 @@ # This file is a part of Julia. License is MIT: https://julialang.org/license -using Core.Intrinsics: llvmcall - import .Base: setindex!, getindex, unsafe_convert import .Base.Sys: ARCH, WORD_SIZE @@ -13,34 +11,6 @@ export atomic_and!, atomic_nand!, atomic_or!, atomic_xor!, atomic_max!, atomic_min!, atomic_fence -## -# Filter out unsupported atomic types on platforms -# - 128-bit atomics do not exist on AArch32. 
-# - Omitting 128-bit types on 32bit x86 and ppc64 -# - LLVM doesn't currently support atomics on floats for ppc64 -# C++20 is adding limited support for atomics on float, but as of -# now Clang does not support that yet. -if Sys.ARCH === :i686 || startswith(string(Sys.ARCH), "arm") || - Sys.ARCH === :powerpc64le || Sys.ARCH === :ppc64le - const inttypes = (Int8, Int16, Int32, Int64, - UInt8, UInt16, UInt32, UInt64) -else - const inttypes = (Int8, Int16, Int32, Int64, Int128, - UInt8, UInt16, UInt32, UInt64, UInt128) -end -const floattypes = (Float16, Float32, Float64) -const arithmetictypes = (inttypes..., floattypes...) -# TODO: Support Ptr -if Sys.ARCH === :powerpc64le || Sys.ARCH === :ppc64le - const atomictypes = (inttypes..., Bool) -else - const atomictypes = (arithmetictypes..., Bool) -end - -const IntTypes = Union{inttypes...} -const FloatTypes = Union{floattypes...} -const ArithmeticTypes = Union{arithmetictypes...} -const AtomicTypes = Union{atomictypes...} """ Threads.Atomic{T} @@ -48,10 +18,6 @@ const AtomicTypes = Union{atomictypes...} Holds a reference to an object of type `T`, ensuring that it is only accessed atomically, i.e. in a thread-safe manner. -Only certain "simple" types can be used atomically, namely the -primitive boolean, integer, and float-point types. These are `Bool`, -`Int8`...`Int128`, `UInt8`...`UInt128`, and `Float16`...`Float64`. - New atomic objects can be created from a non-atomic values; if none is specified, the atomic object is initialized with zero. @@ -72,10 +38,10 @@ julia> x[] Atomic operations use an `atomic_` prefix, such as [`atomic_add!`](@ref), [`atomic_xchg!`](@ref), etc. """ -mutable struct Atomic{T<:AtomicTypes} - value::T - Atomic{T}() where {T<:AtomicTypes} = new(zero(T)) - Atomic{T}(value) where {T<:AtomicTypes} = new(value) +mutable struct Atomic{T} + @atomic value::T + Atomic{T}() where {T} = new(zero(T)) + Atomic{T}(value) where {T} = new(value) end Atomic() = Atomic{Int}() @@ -332,120 +298,21 @@ julia> x[] """ function atomic_min! end -unsafe_convert(::Type{Ptr{T}}, x::Atomic{T}) where {T} = convert(Ptr{T}, pointer_from_objref(x)) -setindex!(x::Atomic{T}, v) where {T} = setindex!(x, convert(T, v)) - -const llvmtypes = IdDict{Any,String}( - Bool => "i8", # julia represents bools with 8-bits for now. # TODO: is this okay? - Int8 => "i8", UInt8 => "i8", - Int16 => "i16", UInt16 => "i16", - Int32 => "i32", UInt32 => "i32", - Int64 => "i64", UInt64 => "i64", - Int128 => "i128", UInt128 => "i128", - Float16 => "half", - Float32 => "float", - Float64 => "double", -) -inttype(::Type{T}) where {T<:Integer} = T -inttype(::Type{Float16}) = Int16 -inttype(::Type{Float32}) = Int32 -inttype(::Type{Float64}) = Int64 - - -import ..Base.gc_alignment - -# All atomic operations have acquire and/or release semantics, depending on -# whether the load or store values. Most of the time, this is what one wants -# anyway, and it's only moderately expensive on most hardware. 
-for typ in atomictypes - lt = llvmtypes[typ] - ilt = llvmtypes[inttype(typ)] - rt = "$lt, $lt*" - irt = "$ilt, $ilt*" - @eval getindex(x::Atomic{$typ}) = - GC.@preserve x llvmcall($""" - %ptr = bitcast i8* %0 to $lt* - %rv = load atomic $rt %ptr acquire, align $(gc_alignment(typ)) - ret $lt %rv - """, $typ, Tuple{Ptr{$typ}}, unsafe_convert(Ptr{$typ}, x)) - @eval setindex!(x::Atomic{$typ}, v::$typ) = - GC.@preserve x llvmcall($""" - %ptr = bitcast i8* %0 to $lt* - store atomic $lt %1, $lt* %ptr release, align $(gc_alignment(typ)) - ret void - """, Cvoid, Tuple{Ptr{$typ}, $typ}, unsafe_convert(Ptr{$typ}, x), v) - - # Note: atomic_cas! succeeded (i.e. it stored "new") if and only if the result is "cmp" - if typ <: Integer - @eval atomic_cas!(x::Atomic{$typ}, cmp::$typ, new::$typ) = - GC.@preserve x llvmcall($""" - %ptr = bitcast i8* %0 to $lt* - %rs = cmpxchg $lt* %ptr, $lt %1, $lt %2 acq_rel acquire - %rv = extractvalue { $lt, i1 } %rs, 0 - ret $lt %rv - """, $typ, Tuple{Ptr{$typ},$typ,$typ}, - unsafe_convert(Ptr{$typ}, x), cmp, new) - else - @eval atomic_cas!(x::Atomic{$typ}, cmp::$typ, new::$typ) = - GC.@preserve x llvmcall($""" - %iptr = bitcast i8* %0 to $ilt* - %icmp = bitcast $lt %1 to $ilt - %inew = bitcast $lt %2 to $ilt - %irs = cmpxchg $ilt* %iptr, $ilt %icmp, $ilt %inew acq_rel acquire - %irv = extractvalue { $ilt, i1 } %irs, 0 - %rv = bitcast $ilt %irv to $lt - ret $lt %rv - """, $typ, Tuple{Ptr{$typ},$typ,$typ}, - unsafe_convert(Ptr{$typ}, x), cmp, new) - end - - arithmetic_ops = [:add, :sub] - for rmwop in [arithmetic_ops..., :xchg, :and, :nand, :or, :xor, :max, :min] - rmw = string(rmwop) - fn = Symbol("atomic_", rmw, "!") - if (rmw == "max" || rmw == "min") && typ <: Unsigned - # LLVM distinguishes signedness in the operation, not the integer type. - rmw = "u" * rmw - end - if rmwop in arithmetic_ops && !(typ <: ArithmeticTypes) continue end - if typ <: Integer - @eval $fn(x::Atomic{$typ}, v::$typ) = - GC.@preserve x llvmcall($""" - %ptr = bitcast i8* %0 to $lt* - %rv = atomicrmw $rmw $lt* %ptr, $lt %1 acq_rel - ret $lt %rv - """, $typ, Tuple{Ptr{$typ}, $typ}, unsafe_convert(Ptr{$typ}, x), v) - else - rmwop === :xchg || continue - @eval $fn(x::Atomic{$typ}, v::$typ) = - GC.@preserve x llvmcall($""" - %iptr = bitcast i8* %0 to $ilt* - %ival = bitcast $lt %1 to $ilt - %irv = atomicrmw $rmw $ilt* %iptr, $ilt %ival acq_rel - %rv = bitcast $ilt %irv to $lt - ret $lt %rv - """, $typ, Tuple{Ptr{$typ}, $typ}, unsafe_convert(Ptr{$typ}, x), v) - end - end -end - -# Provide atomic floating-point operations via atomic_cas! -const opnames = Dict{Symbol, Symbol}(:+ => :add, :- => :sub) -for op in [:+, :-, :max, :min] - opname = get(opnames, op, op) - @eval function $(Symbol("atomic_", opname, "!"))(var::Atomic{T}, val::T) where T<:FloatTypes - IT = inttype(T) - old = var[] - while true - new = $op(old, val) - cmp = old - old = atomic_cas!(var, cmp, new) - reinterpret(IT, old) == reinterpret(IT, cmp) && return old - # Temporary solution before we have gc transition support in codegen. 
- ccall(:jl_gc_safepoint, Cvoid, ()) - end - end -end +#const nand = (~) ∘ (&) # ComposedFunction generated very poor code quality +nand(x, y) = ~(x & y) + +getindex(x::Atomic) = @atomic :acquire x.value +setindex!(x::Atomic, v) = (@atomic :release x.value = v; x) +atomic_cas!(x::Atomic, cmp, new) = (@atomicreplace :acquire_release :acquire x.value cmp => new).old +atomic_add!(x::Atomic, v) = (@atomic :acquire_release x.value + v).first +atomic_sub!(x::Atomic, v) = (@atomic :acquire_release x.value - v).first +atomic_and!(x::Atomic, v) = (@atomic :acquire_release x.value & v).first +atomic_or!(x::Atomic, v) = (@atomic :acquire_release x.value | v).first +atomic_xor!(x::Atomic, v) = (@atomic :acquire_release x.value ⊻ v).first +atomic_nand!(x::Atomic, v) = (@atomic :acquire_release x.value nand v).first +atomic_xchg!(x::Atomic, v) = (@atomicswap :acquire_release x.value = v) +atomic_min!(x::Atomic, v) = (@atomic :acquire_release x.value min v).first +atomic_max!(x::Atomic, v) = (@atomic :acquire_release x.value max v).first """ Threads.atomic_fence() @@ -462,7 +329,4 @@ fences should not be necessary in most cases. For further details, see LLVM's `fence` instruction. """ -atomic_fence() = llvmcall(""" - fence seq_cst - ret void - """, Cvoid, Tuple{}) +atomic_fence() = Core.Intrinsics.atomic_fence(:sequentially_consistent) diff --git a/test/threads_exec.jl b/test/threads_exec.jl index 629f474f53a38..dc0bc407d2fb5 100644 --- a/test/threads_exec.jl +++ b/test/threads_exec.jl @@ -334,29 +334,12 @@ using Base.Threads end end -# Ensure only LLVM-supported types can be atomic -@test_throws TypeError Atomic{BigInt} -@test_throws TypeError Atomic{ComplexF64} - -if Sys.ARCH === :i686 || startswith(string(Sys.ARCH), "arm") || - Sys.ARCH === :powerpc64le || Sys.ARCH === :ppc64le - - @test_throws TypeError Atomic{Int128}() - @test_throws TypeError Atomic{UInt128}() -end - -if Sys.ARCH === :powerpc64le || Sys.ARCH === :ppc64le - @test_throws TypeError Atomic{Float16}() - @test_throws TypeError Atomic{Float32}() - @test_throws TypeError Atomic{Float64}() -end - function test_atomic_bools() x = Atomic{Bool}(false) - # Arithmetic functions are not defined. 
- @test_throws MethodError atomic_add!(x, true) - @test_throws MethodError atomic_sub!(x, true) - # All the rest are: + # Arithmetic functions such as true+true returns Int + @test_throws TypeError atomic_add!(x, true) + @test_throws TypeError atomic_sub!(x, true) + # All the rest are supported: for v in [true, false] @test x[] == atomic_xchg!(x, v) @test v == atomic_cas!(x, v, !v) @@ -462,10 +445,9 @@ end test_fence() # Test load / store with various types -let atomictypes = intersect((Int8, Int16, Int32, Int64, Int128, - UInt8, UInt16, UInt32, UInt64, UInt128, - Float16, Float32, Float64), - Base.Threads.atomictypes) +let atomictypes = (Int8, Int16, Int32, Int64, Int128, + UInt8, UInt16, UInt32, UInt64, UInt128, + Float16, Float32, Float64) for T in atomictypes var = Atomic{T}() var[] = 42 @@ -493,7 +475,7 @@ function test_atomic_cas!(var::Atomic{T}, range::StepRange{Int,Int}) where T end end end -for T in intersect((Int32, Int64, Float32, Float64), Base.Threads.atomictypes) +for T in (Int32, Int64, Float32, Float64) var = Atomic{T}() nloops = 1000 di = threadpoolsize(:default) @@ -507,7 +489,7 @@ function test_atomic_xchg!(var::Atomic{T}, i::Int, accum::Atomic{Int}) where T old = atomic_xchg!(var, T(i)) atomic_add!(accum, Int(old)) end -for T in intersect((Int32, Int64, Float32, Float64), Base.Threads.atomictypes) +for T in (Int32, Int64, Float32, Float64) accum = Atomic{Int}() var = Atomic{T}() nloops = 1000 @@ -522,7 +504,7 @@ function test_atomic_float(varadd::Atomic{T}, varmax::Atomic{T}, varmin::Atomic{ atomic_max!(varmax, T(i)) atomic_min!(varmin, T(i)) end -for T in intersect((Int32, Int64, Float16, Float32, Float64), Base.Threads.atomictypes) +for T in (Int32, Int64, Float16, Float32, Float64) varadd = Atomic{T}() varmax = Atomic{T}() varmin = Atomic{T}()