From aab74908dc6b027fc4edd134bfeeaa3af9f141d2 Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Mon, 25 Nov 2024 10:18:33 -0500 Subject: [PATCH 1/2] codegen: add a pass for late conversion of known modify ops to call atomicrmw The ExpandAtomicModify pass can recognize our pseudo-intrinsic julia.atomicmodify and convert it into one of the known atomicrmw expressions, or simplify it with more inlining, as applicable. This ensures that our `@atomic` modify is now as fast as `Threads.Atomic` for the cases we currently implement. --- src/Makefile | 8 +- src/aotcompile.cpp | 199 +++++++++---- src/cgutils.cpp | 82 ++++-- src/codegen.cpp | 193 ++++++++++-- src/jitlayers.cpp | 454 +++++++++++----------------- src/jitlayers.h | 17 +- src/julia.expmap.in | 1 - src/llvm-expand-atomic-modify.cpp | 473 ++++++++++++++++++++++++++++++ src/llvm-julia-passes.inc | 1 + src/passes.h | 5 + src/pipeline.cpp | 4 +- test/llvmpasses/atomic-modify.ll | 288 ++++++++++++++++++ 12 files changed, 1346 insertions(+), 379 deletions(-) create mode 100644 src/llvm-expand-atomic-modify.cpp create mode 100644 test/llvmpasses/atomic-modify.ll diff --git a/src/Makefile b/src/Makefile index c605d6c70573b..6a6f604f3c5fc 100644 --- a/src/Makefile +++ b/src/Makefile @@ -79,7 +79,8 @@ endif CODEGEN_SRCS := codegen jitlayers aotcompile debuginfo disasm llvm-simdloop \ llvm-pass-helpers llvm-ptls llvm-propagate-addrspaces null_sysimage \ llvm-multiversioning llvm-alloc-opt llvm-alloc-helpers cgmemmgr llvm-remove-addrspaces \ - llvm-remove-ni llvm-julia-licm llvm-demote-float16 llvm-cpufeatures pipeline llvm_api \ + llvm-remove-ni llvm-julia-licm llvm-demote-float16 llvm-cpufeatures llvm-expand-atomic-modify \ + pipeline llvm_api \ $(GC_CODEGEN_SRCS) FLAGS += -I$(shell $(LLVM_CONFIG_HOST) --includedir) CG_LLVM_LIBS := all @@ -338,7 +339,7 @@ $(BUILDDIR)/julia_flisp.boot: $(addprefix $(SRCDIR)/,jlfrontend.scm flisp/aliase $(call cygpath_w,$(SRCDIR)/mk_julia_flisp_boot.scm) $(call cygpath_w,$(dir $<)) $(notdir $<) $(call cygpath_w,$@)) # additional dependency links -$(BUILDDIR)/codegen-stubs.o $(BUILDDIR)/codegen-stubs.dbg.obj: $(SRCDIR)/intrinsics.h +$(BUILDDIR)/codegen-stubs.o $(BUILDDIR)/codegen-stubs.dbg.obj: $(addprefix $(SRCDIR)/,intrinsics.h llvm-julia-passes.inc) $(BUILDDIR)/aotcompile.o $(BUILDDIR)/aotcompile.dbg.obj: $(SRCDIR)/jitlayers.h $(SRCDIR)/llvm-codegen-shared.h $(SRCDIR)/processor.h $(BUILDDIR)/ast.o $(BUILDDIR)/ast.dbg.obj: $(BUILDDIR)/julia_flisp.boot.inc $(SRCDIR)/flisp/*.h $(BUILDDIR)/builtins.o $(BUILDDIR)/builtins.dbg.obj: $(SRCDIR)/iddict.c $(SRCDIR)/idset.c $(SRCDIR)/builtin_proto.h @@ -378,7 +379,8 @@ $(BUILDDIR)/signal-handling.o $(BUILDDIR)/signal-handling.dbg.obj: $(addprefix $ $(BUILDDIR)/staticdata.o $(BUILDDIR)/staticdata.dbg.obj: $(SRCDIR)/staticdata_utils.c $(SRCDIR)/precompile_utils.c $(SRCDIR)/processor.h $(SRCDIR)/builtin_proto.h $(BUILDDIR)/toplevel.o $(BUILDDIR)/toplevel.dbg.obj: $(SRCDIR)/builtin_proto.h $(BUILDDIR)/ircode.o $(BUILDDIR)/ircode.dbg.obj: $(SRCDIR)/serialize.h $(SRCDIR)/common_symbols1.inc $(SRCDIR)/common_symbols2.inc -$(BUILDDIR)/pipeline.o $(BUILDDIR)/pipeline.dbg.obj: $(SRCDIR)/passes.h $(SRCDIR)/jitlayers.h +$(BUILDDIR)/pipeline.o $(BUILDDIR)/pipeline.dbg.obj: $(addprefix $(SRCDIR)/,passes.h jitlayers.h llvm-julia-passes.inc) +$(BUILDDIR)/llvm_api.o $(BUILDDIR)/llvm_api.dbg.obj: $(SRCDIR)/llvm-julia-passes.inc $(addprefix $(BUILDDIR)/,threading.o threading.dbg.obj gc-common.o gc-stock.o gc.dbg.obj init.c init.dbg.obj task.o task.dbg.obj): $(addprefix $(SRCDIR)/,threading.h) $(addprefix
$(BUILDDIR)/,APInt-C.o APInt-C.dbg.obj runtime_intrinsics.o runtime_intrinsics.dbg.obj): $(SRCDIR)/APInt-C.h diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp index d687f44808409..7c9d8aec1a1c9 100644 --- a/src/aotcompile.cpp +++ b/src/aotcompile.cpp @@ -330,7 +330,11 @@ class egal_set { }; } using ::egal_set; -typedef DenseMap> jl_compiled_functions_t; +struct jl_compiled_function_t { + orc::ThreadSafeModule TSM; + jl_llvm_functions_t decls; +}; +typedef DenseMap jl_compiled_functions_t; static void record_method_roots(egal_set &method_roots, jl_method_instance_t *mi) { @@ -376,7 +380,7 @@ static void aot_optimize_roots(jl_codegen_params_t ¶ms, egal_set &method_roo std::string OldName(GV->getName()); StringRef NewName(mref->second->getName()); for (auto &def : compiled_functions) { - orc::ThreadSafeModule &TSM = std::get<0>(def.second); + orc::ThreadSafeModule &TSM = def.second.TSM; Module &M = *TSM.getModuleUnlocked(); if (GlobalValue *GV2 = M.getNamedValue(OldName)) { if (GV2 == GV) @@ -402,7 +406,7 @@ static void aot_optimize_roots(jl_codegen_params_t ¶ms, egal_set &method_roo static void resolve_workqueue(jl_codegen_params_t ¶ms, egal_set &method_roots, jl_compiled_functions_t &compiled_functions) { - decltype(params.workqueue) workqueue; + jl_workqueue_t workqueue; std::swap(params.workqueue, workqueue); jl_code_instance_t *codeinst = NULL; JL_GC_PUSH1(&codeinst); @@ -418,7 +422,7 @@ static void resolve_workqueue(jl_codegen_params_t ¶ms, egal_set &method_root { auto it = compiled_functions.find(codeinst); if (it != compiled_functions.end()) { - auto &decls = it->second.second; + auto &decls = it->second.decls; invokeName = decls.functionObject; if (decls.functionObject == "jl_fptr_args") { preal_decl = decls.specFunctionObject; @@ -442,8 +446,11 @@ static void resolve_workqueue(jl_codegen_params_t ¶ms, egal_set &method_root } if (preal_decl.empty()) { pinvoke = emit_tojlinvoke(codeinst, invokeName, mod, params); - if (!proto.specsig) + if (!proto.specsig) { proto.decl->replaceAllUsesWith(pinvoke); + proto.decl->eraseFromParent(); + proto.decl = pinvoke; + } } if (proto.specsig && !preal_specsig) { // get or build an fptr1 that can invoke codeinst @@ -462,9 +469,12 @@ static void resolve_workqueue(jl_codegen_params_t ¶ms, egal_set &method_root } if (!preal_decl.empty()) { // merge and/or rename this prototype to the real function - if (Value *specfun = mod->getNamedValue(preal_decl)) { - if (proto.decl != specfun) + if (Function *specfun = cast_or_null(mod->getNamedValue(preal_decl))) { + if (proto.decl != specfun) { proto.decl->replaceAllUsesWith(specfun); + proto.decl->eraseFromParent(); + proto.decl = specfun; + } } else { proto.decl->setName(preal_decl); @@ -482,9 +492,12 @@ static void resolve_workqueue(jl_codegen_params_t ¶ms, egal_set &method_root assert(ocinvokeDecl != "jl_fptr_const_return"); assert(ocinvokeDecl != "jl_fptr_sparam"); // merge and/or rename this prototype to the real function - if (Value *specfun = mod->getNamedValue(ocinvokeDecl)) { - if (proto.oc != specfun) + if (Function *specfun = cast_or_null(mod->getNamedValue(ocinvokeDecl))) { + if (proto.oc != specfun) { proto.oc->replaceAllUsesWith(specfun); + proto.oc->eraseFromParent(); + proto.oc = specfun; + } } else { proto.oc->setName(ocinvokeDecl); @@ -496,6 +509,7 @@ static void resolve_workqueue(jl_codegen_params_t ¶ms, egal_set &method_root JL_GC_POP(); } + /// Link the function in the source module into the destination module if /// needed, setting up mapping information. 
/// Similar to orc::cloneFunctionDecl, but more complete for greater correctness @@ -577,8 +591,8 @@ static void generate_cfunc_thunks(jl_codegen_params_t ¶ms, jl_compiled_funct codeinst = it->second; JL_GC_PROMISE_ROOTED(codeinst); auto defs = compiled_functions.find(codeinst); - defM = std::get<0>(defs->second).getModuleUnlocked(); - const jl_llvm_functions_t &decls = std::get<1>(defs->second); + defM = defs->second.TSM.getModuleUnlocked(); + const jl_llvm_functions_t &decls = defs->second.decls; func = decls.functionObject; StringRef specfunc = decls.specFunctionObject; jl_value_t *astrt = codeinst->rettype; @@ -624,6 +638,25 @@ static void generate_cfunc_thunks(jl_codegen_params_t ¶ms, jl_compiled_funct } } +// destructively move the contents of src into dest +// this assumes that the targets of the two modules are the same +// including the DataLayout and ModuleFlags (for example) +// and that there is no module-level assembly +// Comdat is also removed, since this needs to be re-added later +static void jl_merge_module(Linker &L, orc::ThreadSafeModule srcTSM) JL_NOTSAFEPOINT +{ + srcTSM.consumingModuleDo([&L](std::unique_ptr src) JL_NOTSAFEPOINT { + bool error = L.linkInModule(std::move(src)); + assert(!error && "linking llvmcall modules failed"); + (void)error; + }); +} + +static bool canPartition(const Function &F) +{ + return !F.hasFnAttribute(Attribute::AlwaysInline) && + !F.hasFnAttribute(Attribute::InlineHint); +} // takes the running content that has collected in the shadow module and dump it to disk // this builds the object file portion of the sysimage files for fast startup @@ -743,7 +776,7 @@ void *jl_emit_native_impl(jl_array_t *codeinfos, LLVMOrcThreadSafeModuleRef llvm orc::ThreadSafeModule backing; if (!llvmmod) { ctx = jl_ExecutionEngine->makeContext(); - backing = jl_create_ts_module("text", ctx); + backing = jl_create_ts_module("text", ctx, jl_ExecutionEngine->getDataLayout(), jl_ExecutionEngine->getTargetTriple()); } orc::ThreadSafeModule &clone = llvmmod ? *unwrap(llvmmod) : backing; auto ctxt = clone.getContext(); @@ -760,6 +793,7 @@ void *jl_emit_native_impl(jl_array_t *codeinfos, LLVMOrcThreadSafeModuleRef llvm assert(params.imaging_mode); // `_imaging_mode` controls if broken features like code-coverage are disabled params.external_linkage = external_linkage; params.temporary_roots = jl_alloc_array_1d(jl_array_any_type, 0); + bool safepoint_on_entry = params.safepoint_on_entry; JL_GC_PUSH3(¶ms.temporary_roots, &method_roots.list, &method_roots.keyset); jl_compiled_functions_t compiled_functions; size_t i, l; @@ -774,17 +808,8 @@ void *jl_emit_native_impl(jl_array_t *codeinfos, LLVMOrcThreadSafeModuleRef llvm assert(jl_is_code_info(src)); if (compiled_functions.count(codeinst)) continue; // skip any duplicates that accidentally made there way in here (or make this an error?) 
- if (external_linkage) { - uint8_t specsigflags; - jl_callptr_t invoke; - void *fptr; - jl_read_codeinst_invoke(codeinst, &specsigflags, &invoke, &fptr, 0); - if (invoke != NULL && (specsigflags & 0b100)) { - // this codeinst is already available externally - // TODO: for performance, avoid generating the src code when we know it would reach here anyways - continue; - } - } + if (jl_ir_inlining_cost((jl_value_t*)src) < UINT16_MAX) + params.safepoint_on_entry = false; // ensure we don't block ExpandAtomicModifyPass from inlining this code if applicable orc::ThreadSafeModule result_m = jl_create_ts_module(name_from_method_instance(jl_get_ci_mi(codeinst)), params.tsctx, clone.getModuleUnlocked()->getDataLayout(), Triple(clone.getModuleUnlocked()->getTargetTriple())); @@ -793,6 +818,7 @@ void *jl_emit_native_impl(jl_array_t *codeinfos, LLVMOrcThreadSafeModuleRef llvm decls.functionObject = "jl_fptr_const_return"; else decls = jl_emit_codeinst(result_m, codeinst, src, params); + params.safepoint_on_entry = safepoint_on_entry; record_method_roots(method_roots, jl_get_ci_mi(codeinst)); if (result_m) compiled_functions[codeinst] = {std::move(result_m), std::move(decls)}; @@ -823,7 +849,6 @@ void *jl_emit_native_impl(jl_array_t *codeinfos, LLVMOrcThreadSafeModuleRef llvm size_t idx = 0; for (auto &global : params.global_targets) { gvars[idx] = global.second->getName().str(); - global.second->setInitializer(literal_static_pointer_val(global.first, global.second->getValueType())); assert(gvars_set.insert(global.second).second && "Duplicate gvar in params!"); assert(gvars_names.insert(gvars[idx]).second && "Duplicate gvar name in params!"); data->jl_value_to_llvm[idx] = global.first; @@ -854,11 +879,27 @@ void *jl_emit_native_impl(jl_array_t *codeinfos, LLVMOrcThreadSafeModuleRef llvm { Linker L(*clone.getModuleUnlocked()); for (auto &def : compiled_functions) { - jl_merge_module(clone, std::move(std::get<0>(def.second))); jl_code_instance_t *this_code = def.first; - jl_llvm_functions_t decls = std::get<1>(def.second); + JL_GC_PROMISE_ROOTED(this_code); + jl_llvm_functions_t &decls = def.second.decls; StringRef func = decls.functionObject; StringRef cfunc = decls.specFunctionObject; + orc::ThreadSafeModule &M = def.second.TSM; + if (external_linkage) { + uint8_t specsigflags; + jl_callptr_t invoke; + void *fptr; + jl_read_codeinst_invoke(this_code, &specsigflags, &invoke, &fptr, 0); + if (invoke != NULL && (specsigflags & 0b100)) { + // this codeinst is already available externally: keep it only if canPartition demands it for local use + // TODO: for performance, avoid generating the src code when we know it would reach here anyways? 
+ if (M.withModuleDo([&](Module &M) { return !canPartition(*cast(M.getNamedValue(cfunc))); })) { + jl_merge_module(L, std::move(M)); + } + continue; + } + } + jl_merge_module(L, std::move(M)); uint32_t func_id = 0; uint32_t cfunc_id = 0; if (func == "jl_fptr_args") { @@ -885,6 +926,52 @@ void *jl_emit_native_impl(jl_array_t *codeinfos, LLVMOrcThreadSafeModuleRef llvm } data->jl_fvar_map[this_code] = std::make_tuple(func_id, cfunc_id); } + bool Changed = true; + while (Changed) { + Changed = false; + // make sure everything referenced got included though, since some functions aren't + // correctly implemented by staticdata for external use, and so codegen won't emit + // an external reference but expects a private copy here instead + for (auto &def : compiled_functions) { + orc::ThreadSafeModule &M = def.second.TSM; + if (!M) + continue; + jl_llvm_functions_t &decls = def.second.decls; + StringRef func = decls.functionObject; + StringRef cfunc = decls.specFunctionObject; + if (func != "jl_fptr_args" && + func != "jl_fptr_sparam" && + func != "jl_f_opaque_closure_call" && + clone.getModuleUnlocked()->getNamedValue(func)) { + jl_merge_module(L, std::move(M)); + Changed = true; + continue; + } + if (!cfunc.empty() && clone.getModuleUnlocked()->getNamedValue(cfunc)) { + Changed = true; + jl_merge_module(L, std::move(M)); + } + } + } +#ifndef NDEBUG + // make sure we didn't forget anything that we promised to include in here + for (auto &def : compiled_functions) { + jl_llvm_functions_t &decls = def.second.decls; + StringRef func = decls.functionObject; + StringRef cfunc = decls.specFunctionObject; + if (func != "jl_fptr_args" && + func != "jl_fptr_sparam" && + func != "jl_f_opaque_closure_call") { + GlobalValue *F = clone.getModuleUnlocked()->getNamedValue(func); + assert(!F || !F->isDeclaration()); + } + if (!cfunc.empty()) { + GlobalValue *F = clone.getModuleUnlocked()->getNamedValue(cfunc); + assert(!F || !F->isDeclaration()); + } + } +#endif + compiled_functions.clear(); if (params._shared_module) { bool error = L.linkInModule(std::move(params._shared_module)); assert(!error && "Error linking in shared module"); @@ -894,15 +981,35 @@ void *jl_emit_native_impl(jl_array_t *codeinfos, LLVMOrcThreadSafeModuleRef llvm // now get references to the globals in the merged module // and set them to be internalized and initialized at startup + // filter out any gvars that got optimized away + idx = 0; + size_t newoffset = 0; + size_t newidx = 0; for (auto &global : gvars) { //Safe b/c context is locked by params - GlobalVariable *G = cast(clone.getModuleUnlocked()->getNamedValue(global)); - assert(G->hasInitializer()); - G->setLinkage(GlobalValue::InternalLinkage); - G->setDSOLocal(true); - data->jl_sysimg_gvars.push_back(G); + GlobalVariable *G = cast_or_null(clone.getModuleUnlocked()->getNamedValue(global)); + if (G != nullptr) { + assert(!G->hasInitializer()); + G->setInitializer(Constant::getNullValue(G->getValueType())); + G->setLinkage(GlobalValue::InternalLinkage); + G->setDSOLocal(true); + assert(newidx == data->jl_sysimg_gvars.size()); + if (idx < offset) { + data->jl_value_to_llvm[newidx] = data->jl_value_to_llvm[idx]; + newoffset = newidx + 1; + } + else { + data->jl_external_to_llvm[newidx - newoffset] = data->jl_external_to_llvm[idx - offset]; + } + data->jl_sysimg_gvars.push_back(G); + newidx++; + } + idx++; } - CreateNativeGlobals += gvars.size(); + data->jl_value_to_llvm.resize(newoffset); + data->jl_external_to_llvm.resize(newidx - newoffset); + gvars.clear(); + CreateNativeGlobals += 
idx; data->M = std::move(clone); return (void*)data; @@ -1126,11 +1233,6 @@ struct Partition { size_t weight; }; -static bool canPartition(const Function &F) -{ - return !F.hasFnAttribute(Attribute::AlwaysInline); -} - static inline bool verify_partitioning(const SmallVectorImpl &partitions, const Module &M, DenseMap &fvars, DenseMap &gvars) { bool bad = false; #ifndef JL_NDEBUG @@ -1583,7 +1685,8 @@ static void materializePreserved(Module &M, Partition &partition) { // This just avoids a hashtable lookup. GV->setLinkage(GlobalValue::InternalLinkage); assert(GV->hasDefaultVisibility()); - } else { + } + else { Preserve.insert(GV); } } @@ -2094,11 +2197,6 @@ void jl_dump_native_impl(void *native_code, addComdat(&GA, TheTriple); } - // Wipe the global initializers, we'll reset them at load time - for (auto gv : data->jl_sysimg_gvars) { - cast(gv)->setInitializer(Constant::getNullValue(gv->getValueType())); - } - // add metadata information if (imaging_mode) { multiversioning_preannotate(dataM); @@ -2359,17 +2457,16 @@ void jl_get_llvmf_defn_impl(jl_llvmf_dump_t *dump, jl_method_instance_t *mi, jl_ dump->TSM = nullptr; if (src && jl_is_code_info(src)) { auto ctx = jl_ExecutionEngine->makeContext(); - orc::ThreadSafeModule m = jl_create_ts_module(name_from_method_instance(mi), ctx); + const auto &DL = jl_ExecutionEngine->getDataLayout(); + const auto &TT = jl_ExecutionEngine->getTargetTriple(); + orc::ThreadSafeModule m = jl_create_ts_module(name_from_method_instance(mi), ctx, DL, TT); Function *F = nullptr; { uint64_t compiler_start_time = 0; uint8_t measure_compile_time_enabled = jl_atomic_load_relaxed(&jl_measure_compile_time_enabled); if (measure_compile_time_enabled) compiler_start_time = jl_hrtime(); - auto target_info = m.withModuleDo([&](Module &M) { - return std::make_pair(M.getDataLayout(), Triple(M.getTargetTriple())); - }); - jl_codegen_params_t output(ctx, std::move(target_info.first), std::move(target_info.second)); + jl_codegen_params_t output(ctx, DL, TT); output.params = ¶ms; output.imaging_mode = jl_options.image_codegen; output.temporary_roots = jl_alloc_array_1d(jl_array_any_type, 0); @@ -2389,7 +2486,7 @@ void jl_get_llvmf_defn_impl(jl_llvmf_dump_t *dump, jl_method_instance_t *mi, jl_ jl_code_instance_t *codeinst = jl_type_infer(mi, latestworld, SOURCE_MODE_NOT_REQUIRED); if (codeinst == nullptr || compiled_functions.count(codeinst)) continue; - orc::ThreadSafeModule decl_m = jl_create_ts_module("extern", ctx); + orc::ThreadSafeModule decl_m = jl_create_ts_module("extern", ctx, DL, TT); jl_llvm_functions_t decls; if (jl_atomic_load_relaxed(&codeinst->invoke) == jl_fptr_const_return_addr) decls.functionObject = "jl_fptr_const_return"; @@ -2398,6 +2495,8 @@ void jl_get_llvmf_defn_impl(jl_llvmf_dump_t *dump, jl_method_instance_t *mi, jl_ compiled_functions[codeinst] = {std::move(decl_m), std::move(decls)}; } generate_cfunc_thunks(output, compiled_functions); + emit_always_inline(m, output); + output.workqueue.clear(); compiled_functions.clear(); output.temporary_roots = nullptr; JL_GC_POP(); // GC the global_targets array contents now since reflection doesn't need it @@ -2412,7 +2511,7 @@ void jl_get_llvmf_defn_impl(jl_llvmf_dump_t *dump, jl_method_instance_t *mi, jl_ } else { auto p = literal_static_pointer_val(global.first, global.second->getValueType()); - Type *elty = PointerType::get(output.getContext(), 0); + Type *elty = PointerType::get(p->getContext(), 0); // For pretty printing, when LLVM inlines the global initializer into its loads auto alias = 
GlobalAlias::create(elty, 0, GlobalValue::PrivateLinkage, global.second->getName() + ".jit", p, global.second->getParent()); global.second->setInitializer(ConstantExpr::getBitCast(alias, global.second->getValueType())); diff --git a/src/cgutils.cpp b/src/cgutils.cpp index d9b7b98e40ef4..64d6f6eb54de8 100644 --- a/src/cgutils.cpp +++ b/src/cgutils.cpp @@ -2265,8 +2265,10 @@ static jl_cgval_t typed_load(jl_codectx_t &ctx, Value *ptr, Value *idx_0based, j return mark_julia_slot(intcast, jltype, NULL, ctx.tbaa().tbaa_stack); } +static Function *emit_modifyhelper(jl_codectx_t &ctx2, const jl_cgval_t &op, const jl_cgval_t &modifyop, jl_value_t *jltype, Type *elty, jl_cgval_t rhs, const Twine &fname, bool gcstack_arg); + static jl_cgval_t typed_store(jl_codectx_t &ctx, - Value *ptr, jl_cgval_t rhs, jl_cgval_t cmp, + Value *ptr, jl_cgval_t rhs, jl_cgval_t cmpop, jl_value_t *jltype, MDNode *tbaa, MDNode *aliasscope, Value *parent, // for the write barrier, NULL if no barrier needed bool isboxed, AtomicOrdering Order, AtomicOrdering FailOrder, unsigned alignment, @@ -2275,10 +2277,10 @@ static jl_cgval_t typed_store(jl_codectx_t &ctx, jl_module_t *mod, jl_sym_t *var) { auto newval = [&](const jl_cgval_t &lhs) { - const jl_cgval_t argv[3] = { cmp, lhs, rhs }; + const jl_cgval_t argv[3] = { cmpop, lhs, rhs }; jl_cgval_t ret; if (modifyop) { - ret = emit_invoke(ctx, *modifyop, argv, 3, (jl_value_t*)jl_any_type); + ret = emit_invoke(ctx, *modifyop, argv, 3, (jl_value_t*)jl_any_type, true); } else { Value *callval = emit_jlcall(ctx, jlapplygeneric_func, nullptr, argv, 3, julia_call); @@ -2302,7 +2304,7 @@ static jl_cgval_t typed_store(jl_codectx_t &ctx, return rhs; } else if (isreplacefield) { - Value *Success = emit_f_is(ctx, cmp, ghostValue(ctx, jltype)); + Value *Success = emit_f_is(ctx, cmpop, ghostValue(ctx, jltype)); Success = ctx.builder.CreateZExt(Success, getInt8Ty(ctx.builder.getContext())); const jl_cgval_t argv[2] = {ghostValue(ctx, jltype), mark_julia_type(ctx, Success, false, jl_bool_type)}; jl_datatype_t *rettyp = jl_apply_cmpswap_type(jltype); @@ -2403,6 +2405,46 @@ static jl_cgval_t typed_store(jl_codectx_t &ctx, ai.decorateInst(store); instr = store; } + else if (ismodifyfield && modifyop && !needlock && Order != AtomicOrdering::NotAtomic && !isboxed && realelty == elty && !intcast && elty->isIntegerTy() && !jl_type_hasptr(jltype)) { + // emit this only if we have a possibility of optimizing it + if (Order == AtomicOrdering::Unordered) + Order = AtomicOrdering::Monotonic; + if (jl_is_pointerfree(rhs.typ) && !rhs.isghost && (rhs.constant || rhs.isboxed || rhs.ispointer())) { + // if this value can be loaded from memory, do that now so that it is sequenced before the atomicmodify + // and the IR is less dependent on what was emitted before now to create this rhs. + // Inlining should do okay to clean this up later if there are parts we don't need. 
+ rhs = jl_cgval_t(emit_unbox(ctx, julia_type_to_llvm(ctx, rhs.typ), rhs, rhs.typ), rhs.typ, NULL); + } + bool gcstack_arg = JL_FEAT_TEST(ctx,gcstack_arg); + Function *op = emit_modifyhelper(ctx, cmpop, *modifyop, jltype, elty, rhs, fname, gcstack_arg); + std::string intr_name = "julia.atomicmodify.i"; + intr_name += utostr(cast(elty)->getBitWidth()); + intr_name += ".p"; + intr_name += utostr(ptr->getType()->getPointerAddressSpace()); + FunctionCallee intr = jl_Module->getOrInsertFunction(intr_name, + FunctionType::get(StructType::get(elty, elty), {ptr->getType(), ctx.builder.getPtrTy(), ctx.builder.getInt8Ty(), ctx.builder.getInt8Ty()}, true), + AttributeList::get(elty->getContext(), + Attributes(elty->getContext(), {Attribute::NoMerge}), // prevent llvm from merging calls to different functions + AttributeSet(), + None)); + SmallVector Args = {ptr, op, ctx.builder.getInt8((unsigned)Order), ctx.builder.getInt8(SyncScope::System)}; + if (rhs.V) + Args.push_back(rhs.V); + if (rhs.Vboxed) + Args.push_back(rhs.Vboxed); + if (rhs.TIndex) + Args.push_back(rhs.TIndex); + Args.append(rhs.inline_roots); + if (gcstack_arg) + Args.push_back(ctx.pgcstack); + auto oldnew = ctx.builder.CreateCall(intr, Args); + oldnew->addParamAttr(0, Attribute::getWithAlignment(oldnew->getContext(), Align(alignment))); + //jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, tbaa); + //ai.noalias = MDNode::concatenate(aliasscope, ai.noalias); + //ai.decorateInst(oldnew); + oldval = mark_julia_type(ctx, ctx.builder.CreateExtractValue(oldnew, 0), isboxed, jltype); + rhs = mark_julia_type(ctx, ctx.builder.CreateExtractValue(oldnew, 1), isboxed, jltype); + } else { // replacefield, modifyfield, swapfield, setfieldonce (isboxed && atomic) DoneBB = BasicBlock::Create(ctx.builder.getContext(), "done_xchg", ctx.f); @@ -2416,7 +2458,7 @@ static jl_cgval_t typed_store(jl_codectx_t &ctx, assert(jl_is_concrete_type(jltype)); needloop = ((jl_datatype_t*)jltype)->layout->flags.haspadding || !((jl_datatype_t*)jltype)->layout->flags.isbitsegal; - Value *SameType = emit_isa(ctx, cmp, jltype, Twine()).first; + Value *SameType = emit_isa(ctx, cmpop, jltype, Twine()).first; if (SameType != ConstantInt::getTrue(ctx.builder.getContext())) { BasicBlock *SkipBB = BasicBlock::Create(ctx.builder.getContext(), "skip_xchg", ctx.f); BasicBlock *BB = BasicBlock::Create(ctx.builder.getContext(), "ok_xchg", ctx.f); @@ -2436,22 +2478,22 @@ static jl_cgval_t typed_store(jl_codectx_t &ctx, Current->addIncoming(instr, SkipBB); ctx.builder.SetInsertPoint(BB); } - cmp = update_julia_type(ctx, cmp, jltype); + cmpop = update_julia_type(ctx, cmpop, jltype); if (intcast) { - emit_unbox_store(ctx, cmp, intcast, ctx.tbaa().tbaa_stack, MaybeAlign(), intcast->getAlign()); + emit_unbox_store(ctx, cmpop, intcast, ctx.tbaa().tbaa_stack, MaybeAlign(), intcast->getAlign()); Compare = ctx.builder.CreateLoad(realelty, intcast); } else { - Compare = emit_unbox(ctx, realelty, cmp, jltype); + Compare = emit_unbox(ctx, realelty, cmpop, jltype); } if (realelty != elty) Compare = ctx.builder.CreateZExt(Compare, elty); } - else if (cmp.isboxed || cmp.constant || jl_pointer_egal(jltype)) { - Compare = boxed(ctx, cmp); - needloop = !jl_pointer_egal(jltype) && !jl_pointer_egal(cmp.typ); - if (needloop && !cmp.isboxed) // try to use the same box in the compare now and later - cmp = mark_julia_type(ctx, Compare, true, cmp.typ); + else if (cmpop.isboxed || cmpop.constant || jl_pointer_egal(jltype)) { + Compare = boxed(ctx, cmpop); + needloop = !jl_pointer_egal(jltype) && 
!jl_pointer_egal(cmpop.typ); + if (needloop && !cmpop.isboxed) // try to use the same box in the compare now and later + cmpop = mark_julia_type(ctx, Compare, true, cmpop.typ); } else { Compare = Constant::getNullValue(ctx.types().T_prjlvalue); // TODO: does this need to be an invalid bit pattern? @@ -2485,7 +2527,7 @@ static jl_cgval_t typed_store(jl_codectx_t &ctx, } if (ismodifyfield) { if (needlock) - emit_lockstate_value(ctx, needlock, false); + emit_lockstate_value(ctx, needlock, false); // unlock Value *realCompare = Compare; if (realelty != elty) realCompare = ctx.builder.CreateTrunc(realCompare, realelty); @@ -2520,8 +2562,8 @@ static jl_cgval_t typed_store(jl_codectx_t &ctx, if (realelty != elty) r = ctx.builder.CreateZExt(r, elty); if (needlock) - emit_lockstate_value(ctx, needlock, true); - cmp = oldval; + emit_lockstate_value(ctx, needlock, true); // relock + cmpop = oldval; } Value *Done; if (Order == AtomicOrdering::NotAtomic) { @@ -2541,7 +2583,7 @@ static jl_cgval_t typed_store(jl_codectx_t &ctx, if (issetfieldonce) Success = ctx.builder.CreateIsNull(first_ptr); else - Success = emit_f_is(ctx, oldval, cmp, first_ptr, nullptr); + Success = emit_f_is(ctx, oldval, cmpop, first_ptr, nullptr); if (needloop && ismodifyfield) CmpPhi->addIncoming(load, ctx.builder.GetInsertBlock()); assert(Succ == nullptr); @@ -2599,12 +2641,12 @@ static jl_cgval_t typed_store(jl_codectx_t &ctx, Done = ctx.builder.CreateIsNotNull(first_ptr); } else { - // Done = !(!Success && (first_ptr != NULL && oldval == cmp)) + // Done = !(!Success && (first_ptr != NULL && oldval == cmpop)) Done = emit_guarded_test(ctx, ctx.builder.CreateNot(Success), false, [&] { Value *first_ptr = nullptr; if (maybe_null_if_boxed) first_ptr = isboxed ? realinstr : extract_first_ptr(ctx, realinstr); - return emit_f_is(ctx, oldval, cmp, first_ptr, nullptr); + return emit_f_is(ctx, oldval, cmpop, first_ptr, nullptr); }); Done = ctx.builder.CreateNot(Done); } @@ -4024,7 +4066,7 @@ static jl_cgval_t union_store(jl_codectx_t &ctx, emit_lockstate_value(ctx, needlock, false); const jl_cgval_t argv[3] = { cmp, oldval, rhs }; if (modifyop) { - rhs = emit_invoke(ctx, *modifyop, argv, 3, (jl_value_t*)jl_any_type); + rhs = emit_invoke(ctx, *modifyop, argv, 3, (jl_value_t*)jl_any_type, true); } else { Value *callval = emit_jlcall(ctx, jlapplygeneric_func, nullptr, argv, 3, julia_call); diff --git a/src/codegen.cpp b/src/codegen.cpp index af86b568c2b0f..473b0709e9f93 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -2076,7 +2076,7 @@ static CallInst *emit_jlcall(jl_codectx_t &ctx, JuliaFunction<> *theFptr, Value static Value *emit_f_is(jl_codectx_t &ctx, const jl_cgval_t &arg1, const jl_cgval_t &arg2, Value *nullcheck1 = nullptr, Value *nullcheck2 = nullptr); static jl_cgval_t emit_new_struct(jl_codectx_t &ctx, jl_value_t *ty, size_t nargs, ArrayRef argv, bool is_promotable=false); -static jl_cgval_t emit_invoke(jl_codectx_t &ctx, const jl_cgval_t &lival, ArrayRef argv, size_t nargs, jl_value_t *rt); +static jl_cgval_t emit_invoke(jl_codectx_t &ctx, const jl_cgval_t &lival, ArrayRef argv, size_t nargs, jl_value_t *rt, bool always_inline); static Value *literal_pointer_val(jl_codectx_t &ctx, jl_value_t *p); static unsigned julia_alignment(jl_value_t *jt); @@ -5124,10 +5124,7 @@ static jl_cgval_t emit_call_specfun_other(jl_codectx_t &ctx, bool is_opaque_clos namep += cast(TheCallee)->getName(); GlobalVariable *GV = cast_or_null(jl_Module->getNamedValue(namep)); if (GV == nullptr) { - GV = new GlobalVariable(*jl_Module, 
TheCallee->getType(), false, - GlobalVariable::ExternalLinkage, - Constant::getNullValue(TheCallee->getType()), - namep); + GV = new GlobalVariable(*jl_Module, TheCallee->getType(), false, GlobalVariable::ExternalLinkage, nullptr, namep); ctx.emission_context.external_fns[std::make_tuple(fromexternal, true)] = GV; } jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_const); @@ -5174,10 +5171,7 @@ static jl_cgval_t emit_call_specfun_boxed(jl_codectx_t &ctx, jl_value_t *jlretty GlobalVariable *GV = cast_or_null(jl_Module->getNamedValue(namep)); Type *pfunc = PointerType::getUnqual(ctx.builder.getContext()); if (GV == nullptr) { - GV = new GlobalVariable(*jl_Module, pfunc, false, - GlobalVariable::ExternalLinkage, - Constant::getNullValue(pfunc), - namep); + GV = new GlobalVariable(*jl_Module, pfunc, false, GlobalVariable::ExternalLinkage, nullptr, namep); ctx.emission_context.external_fns[std::make_tuple(fromexternal, false)] = GV; } jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_const); @@ -5206,10 +5200,10 @@ static jl_cgval_t emit_invoke(jl_codectx_t &ctx, jl_expr_t *ex, jl_value_t *rt) if (argv[i].typ == jl_bottom_type) return jl_cgval_t(); } - return emit_invoke(ctx, lival, argv, nargs, rt); + return emit_invoke(ctx, lival, argv, nargs, rt, false); } -static jl_cgval_t emit_invoke(jl_codectx_t &ctx, const jl_cgval_t &lival, ArrayRef argv, size_t nargs, jl_value_t *rt) +static jl_cgval_t emit_invoke(jl_codectx_t &ctx, const jl_cgval_t &lival, ArrayRef argv, size_t nargs, jl_value_t *rt, bool always_inline) { ++EmittedInvokes; bool handled = false; @@ -5265,44 +5259,52 @@ static jl_cgval_t emit_invoke(jl_codectx_t &ctx, const jl_cgval_t &lival, ArrayR std::string name; StringRef protoname; bool need_to_emit = true; - bool cache_valid = ctx.use_cache || ctx.external_linkage; + bool cache_valid = (ctx.use_cache || ctx.external_linkage); bool external = false; // Check if we already queued this up auto it = ctx.call_targets.find(codeinst); - if (need_to_emit && it != ctx.call_targets.end()) { + if (it != ctx.call_targets.end()) { assert(it->second.specsig == specsig); protoname = it->second.decl->getName(); - need_to_emit = cache_valid = false; + if (always_inline) + it->second.private_linkage = true; + else + it->second.external_linkage = true; } - - // Check if it is already compiled (either JIT or externally) - if (need_to_emit && cache_valid) { - // optimization: emit the correct name immediately, if we know it + // Check if it is already compiled (either JIT or externally), and if so, re-use that name if possible + // This is just an optimization to emit the correct name immediately, if we know it, since the JIT and AOT code will be able to do this later also + if (cache_valid) { // TODO: use `emitted` map here too to try to consolidate names? uint8_t specsigflags; jl_callptr_t invoke; void *fptr; jl_read_codeinst_invoke(codeinst, &specsigflags, &invoke, &fptr, 0); if (specsig ? 
specsigflags & 0b1 : invoke == jl_fptr_args_addr) { - protoname = jl_ExecutionEngine->getFunctionAtAddress((uintptr_t)fptr, invoke, codeinst); if (ctx.external_linkage) { // TODO: Add !specsig support to aotcompile.cpp // Check that the codeinst is containing native code if (specsig && (specsigflags & 0b100)) { - external = true; + external = !always_inline; need_to_emit = false; } } else { // ctx.use_cache need_to_emit = false; } + if (!need_to_emit && protoname.empty()) + protoname = jl_ExecutionEngine->getFunctionAtAddress((uintptr_t)fptr, invoke, codeinst); } } - if (need_to_emit) { + if (it != ctx.call_targets.end()) + need_to_emit = false; + else if (always_inline) + need_to_emit = true; + if (protoname.empty()) { raw_string_ostream(name) << (specsig ? "j_" : "j1_") << name_from_method_instance(mi) << "_" << jl_atomic_fetch_add_relaxed(&globalUniqueGeneratedNames, 1); protoname = StringRef(name); } + jl_returninfo_t::CallingConv cc = jl_returninfo_t::CallingConv::Boxed; unsigned return_roots = 0; if (specsig) @@ -5311,7 +5313,7 @@ static jl_cgval_t emit_invoke(jl_codectx_t &ctx, const jl_cgval_t &lival, ArrayR result = emit_call_specfun_boxed(ctx, codeinst->rettype, protoname, external ? codeinst : nullptr, argv, nargs, rt); if (need_to_emit) { Function *trampoline_decl = cast(jl_Module->getNamedValue(protoname)); - ctx.call_targets[codeinst] = {cc, return_roots, trampoline_decl, nullptr, specsig}; + ctx.call_targets[codeinst] = {cc, return_roots, trampoline_decl, nullptr, specsig, !always_inline, always_inline}; } } } @@ -6257,7 +6259,7 @@ static std::pair get_oc_function(jl_codectx_t &ctx, jl_met } if (need_to_emit) { - ctx.call_targets[ci] = {cc, return_roots, specsig ? specF : F, specsig ? F : nullptr, specsig}; + ctx.call_targets[ci] = {cc, return_roots, specsig ? specF : F, specsig ? 
F : nullptr, specsig, true, false}; } JL_GC_POP(); @@ -6780,6 +6782,71 @@ Function *get_or_emit_fptr1(StringRef preal_decl, Module *M) return cast(M->getOrInsertFunction(preal_decl, get_func_sig(M->getContext()), get_func_attrs(M->getContext())).getCallee()); } +static Function *emit_modifyhelper(jl_codectx_t &ctx2, const jl_cgval_t &op, const jl_cgval_t &modifyop, jl_value_t *jltype, Type *elty, jl_cgval_t rhs, const Twine &fname, bool gcstack_arg) +{ + Module *M = ctx2.f->getParent(); + jl_codectx_t ctx(M->getContext(), ctx2.emission_context, ctx2.min_world, ctx2.max_world); + SmallVector ArgTy; + ArgTy.push_back(elty); + if (rhs.V) + ArgTy.push_back(rhs.V->getType()); + if (rhs.Vboxed) + ArgTy.push_back(rhs.Vboxed->getType()); + if (rhs.TIndex) + ArgTy.push_back(rhs.TIndex->getType()); + for (auto &root : rhs.inline_roots) + ArgTy.push_back(root->getType()); + if (gcstack_arg) + ArgTy.push_back(ctx.builder.getPtrTy()); + FunctionType *FT = FunctionType::get(elty, ArgTy, false); + Function *w = Function::Create(FT, GlobalVariable::PrivateLinkage, "", M); + jl_init_function(w, ctx.emission_context.TargetTriple); + w->addFnAttr(Attribute::AlwaysInline); + w->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + Function::arg_iterator AI = w->arg_begin(); + Argument *A = &*AI++; + // rebuild a copy of rhs from the arguments + if (rhs.V) + rhs.V = &*AI++; + if (rhs.Vboxed) + rhs.Vboxed = &*AI++; + if (rhs.TIndex) + rhs.TIndex = &*AI++; + for (size_t i = 0; i < rhs.inline_roots.size(); i++) + rhs.inline_roots[i] = &*AI++; + rhs.promotion_point = nullptr; + rhs.promotion_ssa = -1; + if (gcstack_arg) { + w->setCallingConv(CallingConv::Swift); + AttrBuilder param(ctx.builder.getContext()); + param.addAttribute(Attribute::SwiftSelf); + param.addAttribute(Attribute::NonNull); + Argument *gcstackarg = &*AI++; + gcstackarg->addAttrs(param); + gcstackarg->setName("pgcstack_arg"); + ctx.pgcstack = gcstackarg; + } + assert(AI == w->arg_end()); + ctx.f = w; + ctx.rettype = jltype; + BasicBlock *b0 = BasicBlock::Create(ctx.builder.getContext(), "top", w); + ctx.builder.SetInsertPoint(b0); + DebugLoc noDbg; + ctx.builder.SetCurrentDebugLocation(noDbg); + allocate_gc_frame(ctx, b0); + const jl_cgval_t argv[3] = { op, mark_julia_type(ctx, A, false, jltype), rhs }; + jl_cgval_t ret = emit_invoke(ctx, modifyop, argv, 3, (jl_value_t*)jl_any_type, true); + emit_typecheck(ctx, ret, jltype, fname); + ret = update_julia_type(ctx, ret, jltype); + ctx.builder.CreateRet(emit_unbox(ctx, elty, ret, jltype)); + if (ctx.topalloca->use_empty()) { + ctx.topalloca->eraseFromParent(); + ctx.topalloca = nullptr; + } + return w; +} + + Function *emit_tojlinvoke(jl_code_instance_t *codeinst, Value *theFunc, Module *M, jl_codegen_params_t ¶ms) JL_NOTSAFEPOINT { ++EmittedToJLInvokes; @@ -8998,7 +9065,7 @@ static jl_llvm_functions_t Instruction &prologue_end = ctx.builder.GetInsertBlock()->back(); // step 11a. Emit the entry safepoint - if (JL_FEAT_TEST(ctx, safepoint_on_entry)) + if (params.safepoint_on_entry && JL_FEAT_TEST(ctx, safepoint_on_entry)) emit_gc_safepoint(ctx.builder, ctx.types().T_size, get_current_ptls(ctx), ctx.tbaa().tbaa_const); // step 11b. Do codegen in control flow order @@ -9822,6 +9889,84 @@ jl_llvm_functions_t jl_emit_codeinst( return decls; } +/// Stolen from IRMover.cpp, since it is needlessly private there +void linkFunctionBody(Function &Dst, Function &Src) +{ + assert(Dst.isDeclaration() && !Src.isDeclaration()); + + // Link in the operands without remapping. 
+ if (Src.hasPrefixData()) + Dst.setPrefixData(Src.getPrefixData()); + if (Src.hasPrologueData()) + Dst.setPrologueData(Src.getPrologueData()); + if (Src.hasPersonalityFn()) + Dst.setPersonalityFn(Src.getPersonalityFn()); + if (Src.hasPersonalityFn()) + Dst.setPersonalityFn(Src.getPersonalityFn()); + assert(Src.IsNewDbgInfoFormat == Dst.IsNewDbgInfoFormat); + + // Copy over the metadata attachments without remapping. + Dst.copyMetadata(&Src, 0); + + // Steal arguments and splice the body of Src into Dst. + Dst.stealArgumentListFrom(Src); + Dst.splice(Dst.end(), &Src); +} + +void emit_always_inline(orc::ThreadSafeModule &result_m, jl_codegen_params_t ¶ms) JL_NOTSAFEPOINT_LEAVE JL_NOTSAFEPOINT_ENTER +{ + jl_workqueue_t &edges = params.workqueue; + bool always_inline = false; + for (auto &it : edges) { + if (it.second.private_linkage) + always_inline = true; + } + if (!always_inline) + return; + jl_task_t *ct = jl_current_task; + int8_t gc_state = jl_gc_unsafe_enter(ct->ptls); // codegen may contain safepoints (such as jl_subtype calls) + jl_code_info_t *src = nullptr; + params.safepoint_on_entry = false; + params.temporary_roots = jl_alloc_array_1d(jl_array_any_type, 0); + JL_GC_PUSH2(¶ms.temporary_roots, &src); + for (auto &it : edges) { + jl_code_instance_t *codeinst = it.first; + auto &proto = it.second; + if (!proto.private_linkage) + continue; + if (proto.decl->isDeclaration()) { + src = (jl_code_info_t*)jl_atomic_load_relaxed(&codeinst->inferred); + jl_method_instance_t *mi = jl_get_ci_mi(codeinst); + jl_method_t *def = mi->def.method; + if (src && (jl_value_t*)src != jl_nothing && jl_is_method(def) && jl_ir_inlining_cost((jl_value_t*)src) < UINT16_MAX) + src = jl_uncompress_ir(def, codeinst, (jl_value_t*)src); + if (src && jl_is_code_info(src) && jl_ir_inlining_cost((jl_value_t*)src) < UINT16_MAX) { + jl_llvm_functions_t decls = jl_emit_codeinst(result_m, codeinst, src, params); // contains safepoints + if (!result_m) + break; + // TODO: jl_optimize_roots(params, mi, *result_m.getModuleUnlocked()); // contains safepoints + Module &M = *result_m.getModuleUnlocked(); + if (decls.functionObject != "jl_fptr_args" && + decls.functionObject != "jl_fptr_sparam" && + decls.functionObject != "jl_f_opaque_closure_call") { + Function *F = M.getFunction(decls.functionObject); + F->eraseFromParent(); + } + if (!decls.specFunctionObject.empty()) { + Function *specF = M.getFunction(decls.specFunctionObject); + linkFunctionBody(*proto.decl, *specF); + proto.decl->addFnAttr(Attribute::InlineHint); + proto.decl->setLinkage(proto.external_linkage ? 
GlobalValue::AvailableExternallyLinkage : GlobalValue::PrivateLinkage); + specF->eraseFromParent(); + } + } + } + } + params.temporary_roots = nullptr; + JL_GC_POP(); + jl_gc_unsafe_leave(ct->ptls, gc_state); +} + // --- initialization --- static auto gv_for_global = new SmallVector, 0>(); static void global_jlvalue_to_llvm(JuliaVariable *var, jl_value_t **addr) diff --git a/src/jitlayers.cpp b/src/jitlayers.cpp index bf49b7010b97b..99fb1b8f09bfb 100644 --- a/src/jitlayers.cpp +++ b/src/jitlayers.cpp @@ -71,7 +71,6 @@ STATISTIC(OptO0, "Number of modules optimized at level -O0"); STATISTIC(OptO1, "Number of modules optimized at level -O1"); STATISTIC(OptO2, "Number of modules optimized at level -O2"); STATISTIC(OptO3, "Number of modules optimized at level -O3"); -STATISTIC(ModulesMerged, "Number of modules merged"); STATISTIC(InternedGlobals, "Number of global constants interned in the string pool"); #ifdef _COMPILER_MSAN_ENABLED_ @@ -339,177 +338,196 @@ static DenseMap> incompl static int jl_analyze_workqueue(jl_code_instance_t *callee, jl_codegen_params_t ¶ms, bool forceall=false) JL_NOTSAFEPOINT_LEAVE JL_NOTSAFEPOINT_ENTER { jl_task_t *ct = jl_current_task; - decltype(params.workqueue) edges; + jl_workqueue_t edges; std::swap(params.workqueue, edges); for (auto &it : edges) { jl_code_instance_t *codeinst = it.first; JL_GC_PROMISE_ROOTED(codeinst); auto &proto = it.second; - // try to emit code for this item from the workqueue - StringRef invokeName = ""; - StringRef preal_decl = ""; - bool preal_specsig = false; - jl_callptr_t invoke = nullptr; - bool isedge = false; - assert(params.cache); - // Checking the cache here is merely an optimization and not strictly required - // But it must be consistent with the following invokenames lookup, which is protected by the engine_lock - uint8_t specsigflags; - void *fptr; - void jl_read_codeinst_invoke(jl_code_instance_t *ci, uint8_t *specsigflags, jl_callptr_t *invoke, void **specptr, int waitcompile) JL_NOTSAFEPOINT; // declare it is not a safepoint (or deadlock) in this file due to 0 parameter - jl_read_codeinst_invoke(codeinst, &specsigflags, &invoke, &fptr, 0); - //if (specsig ? 
specsigflags & 0b1 : invoke == jl_fptr_args_addr) - if (invoke == jl_fptr_args_addr) { - preal_decl = jl_ExecutionEngine->getFunctionAtAddress((uintptr_t)fptr, invoke, codeinst); - } - else if (specsigflags & 0b1) { - preal_decl = jl_ExecutionEngine->getFunctionAtAddress((uintptr_t)fptr, invoke, codeinst); - preal_specsig = true; - } - bool force = forceall || invoke != nullptr; - if (preal_decl.empty()) { - auto it = invokenames.find(codeinst); - if (it != invokenames.end()) { - auto &decls = it->second; - invokeName = decls.functionObject; - if (decls.functionObject == "jl_fptr_args") { - preal_decl = decls.specFunctionObject; - isedge = true; - } - else if (decls.functionObject != "jl_fptr_sparam" && decls.functionObject != "jl_f_opaque_closure_call") { - preal_decl = decls.specFunctionObject; - preal_specsig = true; - isedge = true; - } - force = true; + if (proto.external_linkage || proto.decl->isDeclaration()) { // if it is not expected externally and has a definition locally, there is no need to patch this edge up + // try to emit code for this item from the workqueue + StringRef invokeName = ""; + StringRef preal_decl = ""; + bool preal_specsig = false; + jl_callptr_t invoke = nullptr; + bool isedge = false; + assert(params.cache); + // Checking the cache here is merely an optimization and not strictly required + // But it must be consistent with the following invokenames lookup, which is protected by the engine_lock + uint8_t specsigflags; + void *fptr; + void jl_read_codeinst_invoke(jl_code_instance_t *ci, uint8_t *specsigflags, jl_callptr_t *invoke, void **specptr, int waitcompile) JL_NOTSAFEPOINT; // declare it is not a safepoint (or deadlock) in this file due to 0 parameter + jl_read_codeinst_invoke(codeinst, &specsigflags, &invoke, &fptr, 0); + //if (specsig ? specsigflags & 0b1 : invoke == jl_fptr_args_addr) + if (invoke == jl_fptr_args_addr) { + preal_decl = jl_ExecutionEngine->getFunctionAtAddress((uintptr_t)fptr, invoke, codeinst); } - } - if (preal_decl.empty()) { - // there may be an equivalent method already compiled (or at least registered with the JIT to compile), in which case we should be using that instead - jl_code_instance_t *compiled_ci = jl_get_ci_equiv(codeinst, 0); - if (compiled_ci != codeinst) { - codeinst = compiled_ci; - uint8_t specsigflags; - void *fptr; - jl_read_codeinst_invoke(codeinst, &specsigflags, &invoke, &fptr, 0); - //if (specsig ? 
specsigflags & 0b1 : invoke == jl_fptr_args_addr) - if (invoke == jl_fptr_args_addr) { - preal_decl = jl_ExecutionEngine->getFunctionAtAddress((uintptr_t)fptr, invoke, codeinst); - } - else if (specsigflags & 0b1) { - preal_decl = jl_ExecutionEngine->getFunctionAtAddress((uintptr_t)fptr, invoke, codeinst); - preal_specsig = true; - } - if (preal_decl.empty()) { - auto it = invokenames.find(codeinst); - if (it != invokenames.end()) { - auto &decls = it->second; - invokeName = decls.functionObject; - if (decls.functionObject == "jl_fptr_args") { - preal_decl = decls.specFunctionObject; - isedge = true; - } - else if (decls.functionObject != "jl_fptr_sparam" && decls.functionObject != "jl_f_opaque_closure_call") { - preal_decl = decls.specFunctionObject; - preal_specsig = true; - isedge = true; - } - } - } + else if (specsigflags & 0b1) { + preal_decl = jl_ExecutionEngine->getFunctionAtAddress((uintptr_t)fptr, invoke, codeinst); + preal_specsig = true; } - } - if (!preal_decl.empty() || force) { - // if we have a prototype emitted, compare it to what we emitted earlier - Module *mod = proto.decl->getParent(); - assert(proto.decl->isDeclaration()); - Function *pinvoke = nullptr; + bool force = forceall || invoke != nullptr; if (preal_decl.empty()) { - if (invoke != nullptr && invokeName.empty()) { - assert(invoke != jl_fptr_args_addr); - if (invoke == jl_fptr_sparam_addr) - invokeName = "jl_fptr_sparam"; - else if (invoke == jl_f_opaque_closure_call_addr) - invokeName = "jl_f_opaque_closure_call"; - else - invokeName = jl_ExecutionEngine->getFunctionAtAddress((uintptr_t)invoke, invoke, codeinst); + auto it = invokenames.find(codeinst); + if (it != invokenames.end()) { + auto &decls = it->second; + invokeName = decls.functionObject; + if (decls.functionObject == "jl_fptr_args") { + preal_decl = decls.specFunctionObject; + isedge = true; + } + else if (decls.functionObject != "jl_fptr_sparam" && decls.functionObject != "jl_f_opaque_closure_call") { + preal_decl = decls.specFunctionObject; + preal_specsig = true; + isedge = true; + } + force = true; } - pinvoke = emit_tojlinvoke(codeinst, invokeName, mod, params); - if (!proto.specsig) - proto.decl->replaceAllUsesWith(pinvoke); - isedge = false; - } - if (proto.specsig && !preal_specsig) { - // get or build an fptr1 that can invoke codeinst - if (pinvoke == nullptr) - pinvoke = get_or_emit_fptr1(preal_decl, mod); - // emit specsig-to-(jl)invoke conversion - proto.decl->setLinkage(GlobalVariable::InternalLinkage); - //protodecl->setAlwaysInline(); - jl_init_function(proto.decl, params.TargetTriple); - // TODO: maybe this can be cached in codeinst->specfptr? 
- int8_t gc_state = jl_gc_unsafe_enter(ct->ptls); // codegen may contain safepoints (such as jl_subtype calls) - jl_method_instance_t *mi = jl_get_ci_mi(codeinst); - size_t nrealargs = jl_nparams(mi->specTypes); // number of actual arguments being passed - bool is_opaque_closure = jl_is_method(mi->def.value) && mi->def.method->is_for_opaque_closure; - emit_specsig_to_fptr1(proto.decl, proto.cc, proto.return_roots, mi->specTypes, codeinst->rettype, is_opaque_closure, nrealargs, params, pinvoke); - jl_gc_unsafe_leave(ct->ptls, gc_state); - preal_decl = ""; // no need to fixup the name } - if (!preal_decl.empty()) { - // merge and/or rename this prototype to the real function - if (Value *specfun = mod->getNamedValue(preal_decl)) { - if (proto.decl != specfun) - proto.decl->replaceAllUsesWith(specfun); - } - else { - proto.decl->setName(preal_decl); + if (preal_decl.empty()) { + // there may be an equivalent method already compiled (or at least registered with the JIT to compile), in which case we should be using that instead + jl_code_instance_t *compiled_ci = jl_get_ci_equiv(codeinst, 0); + if (compiled_ci != codeinst) { + codeinst = compiled_ci; + uint8_t specsigflags; + void *fptr; + jl_read_codeinst_invoke(codeinst, &specsigflags, &invoke, &fptr, 0); + //if (specsig ? specsigflags & 0b1 : invoke == jl_fptr_args_addr) + if (invoke == jl_fptr_args_addr) { + preal_decl = jl_ExecutionEngine->getFunctionAtAddress((uintptr_t)fptr, invoke, codeinst); + } + else if (specsigflags & 0b1) { + preal_decl = jl_ExecutionEngine->getFunctionAtAddress((uintptr_t)fptr, invoke, codeinst); + preal_specsig = true; + } + if (preal_decl.empty()) { + auto it = invokenames.find(codeinst); + if (it != invokenames.end()) { + auto &decls = it->second; + invokeName = decls.functionObject; + if (decls.functionObject == "jl_fptr_args") { + preal_decl = decls.specFunctionObject; + isedge = true; + } + else if (decls.functionObject != "jl_fptr_sparam" && decls.functionObject != "jl_f_opaque_closure_call") { + preal_decl = decls.specFunctionObject; + preal_specsig = true; + isedge = true; + } + } + } } } - if (proto.oc) { // additionally, if we are dealing with an OC constructor, then we might also need to fix up the fptr1 reference too - assert(proto.specsig); - StringRef ocinvokeDecl = invokeName; - if (invoke != nullptr && ocinvokeDecl.empty()) { - // check for some special tokens used by opaque_closure.c and convert those to their real functions - assert(invoke != jl_fptr_args_addr); - assert(invoke != jl_fptr_sparam_addr); - if (invoke == jl_fptr_interpret_call_addr) - ocinvokeDecl = "jl_fptr_interpret_call"; - else if (invoke == jl_fptr_const_return_addr) - ocinvokeDecl = "jl_fptr_const_return"; - else if (invoke == jl_f_opaque_closure_call_addr) - ocinvokeDecl = "jl_f_opaque_closure_call"; - //else if (invoke == jl_interpret_opaque_closure_addr) - else - ocinvokeDecl = jl_ExecutionEngine->getFunctionAtAddress((uintptr_t)invoke, invoke, codeinst); + if (!preal_decl.empty() || force) { + // if we have a prototype emitted, compare it to what we emitted earlier + Module *mod = proto.decl->getParent(); + Function *pinvoke = nullptr; + if (proto.decl->isDeclaration()) { + if (preal_decl.empty()) { + if (invoke != nullptr && invokeName.empty()) { + assert(invoke != jl_fptr_args_addr); + if (invoke == jl_fptr_sparam_addr) + invokeName = "jl_fptr_sparam"; + else if (invoke == jl_f_opaque_closure_call_addr) + invokeName = "jl_f_opaque_closure_call"; + else + invokeName = 
jl_ExecutionEngine->getFunctionAtAddress((uintptr_t)invoke, invoke, codeinst); + } + pinvoke = emit_tojlinvoke(codeinst, invokeName, mod, params); + if (!proto.specsig) { + proto.decl->replaceAllUsesWith(pinvoke); + proto.decl->eraseFromParent(); + proto.decl = pinvoke; + } + isedge = false; + } + if (proto.specsig && !preal_specsig) { + // get or build an fptr1 that can invoke codeinst + if (pinvoke == nullptr) + pinvoke = get_or_emit_fptr1(preal_decl, mod); + // emit specsig-to-(jl)invoke conversion + proto.decl->setLinkage(GlobalVariable::InternalLinkage); + //protodecl->setAlwaysInline(); + jl_init_function(proto.decl, params.TargetTriple); + // TODO: maybe this can be cached in codeinst->specfptr? + int8_t gc_state = jl_gc_unsafe_enter(ct->ptls); // codegen may contain safepoints (such as jl_subtype calls) + jl_method_instance_t *mi = jl_get_ci_mi(codeinst); + size_t nrealargs = jl_nparams(mi->specTypes); // number of actual arguments being passed + bool is_opaque_closure = jl_is_method(mi->def.value) && mi->def.method->is_for_opaque_closure; + emit_specsig_to_fptr1(proto.decl, proto.cc, proto.return_roots, mi->specTypes, codeinst->rettype, is_opaque_closure, nrealargs, params, pinvoke); + jl_gc_unsafe_leave(ct->ptls, gc_state); + preal_decl = ""; // no need to fixup the name + } } - // if OC expected a specialized specsig dispatch, but we don't have it, use the inner trampoline here too - // XXX: this invoke translation logic is supposed to exactly match new_opaque_closure - if (!preal_specsig || ocinvokeDecl == "jl_f_opaque_closure_call" || ocinvokeDecl == "jl_fptr_interpret_call" || ocinvokeDecl == "jl_fptr_const_return") { - if (pinvoke == nullptr) - ocinvokeDecl = get_or_emit_fptr1(preal_decl, mod)->getName(); - else - ocinvokeDecl = pinvoke->getName(); + else if (proto.specsig && !preal_specsig) { + // privatize our definition, since for some reason we couldn't use the external one but have an internal one + proto.decl->setLinkage(GlobalValue::PrivateLinkage); + preal_decl = ""; // no need to fixup the name } - assert(!ocinvokeDecl.empty()); - assert(ocinvokeDecl != "jl_fptr_args"); - assert(ocinvokeDecl != "jl_fptr_sparam"); - // merge and/or rename this prototype to the real function - if (Value *specfun = mod->getNamedValue(ocinvokeDecl)) { - if (proto.oc != specfun) - proto.oc->replaceAllUsesWith(specfun); + if (!preal_decl.empty()) { + // merge and/or rename this prototype to the real function + if (Function *specfun = cast_or_null(mod->getNamedValue(preal_decl))) { + if (proto.decl != specfun) { + proto.decl->replaceAllUsesWith(specfun); + if (!proto.decl->isDeclaration() && specfun->isDeclaration()) + linkFunctionBody(*specfun, *proto.decl); + proto.decl->eraseFromParent(); + proto.decl = specfun; + } + } + else { + proto.decl->setName(preal_decl); + } } - else { - proto.oc->setName(ocinvokeDecl); + if (proto.oc) { // additionally, if we are dealing with an OC constructor, then we might also need to fix up the fptr1 reference too + assert(proto.specsig); + StringRef ocinvokeDecl = invokeName; + if (invoke != nullptr && ocinvokeDecl.empty()) { + // check for some special tokens used by opaque_closure.c and convert those to their real functions + assert(invoke != jl_fptr_args_addr); + assert(invoke != jl_fptr_sparam_addr); + if (invoke == jl_fptr_interpret_call_addr) + ocinvokeDecl = "jl_fptr_interpret_call"; + else if (invoke == jl_fptr_const_return_addr) + ocinvokeDecl = "jl_fptr_const_return"; + else if (invoke == jl_f_opaque_closure_call_addr) + ocinvokeDecl = 
"jl_f_opaque_closure_call"; + //else if (invoke == jl_interpret_opaque_closure_addr) + else + ocinvokeDecl = jl_ExecutionEngine->getFunctionAtAddress((uintptr_t)invoke, invoke, codeinst); + } + // if OC expected a specialized specsig dispatch, but we don't have it, use the inner trampoline here too + // XXX: this invoke translation logic is supposed to exactly match new_opaque_closure + if (!preal_specsig || ocinvokeDecl == "jl_f_opaque_closure_call" || ocinvokeDecl == "jl_fptr_interpret_call" || ocinvokeDecl == "jl_fptr_const_return") { + if (pinvoke == nullptr) + ocinvokeDecl = get_or_emit_fptr1(preal_decl, mod)->getName(); + else + ocinvokeDecl = pinvoke->getName(); + } + assert(!ocinvokeDecl.empty()); + assert(ocinvokeDecl != "jl_fptr_args"); + assert(ocinvokeDecl != "jl_fptr_sparam"); + // merge and/or rename this prototype to the real function + if (Function *specfun = cast_or_null(mod->getNamedValue(ocinvokeDecl))) { + if (proto.oc != specfun) { + proto.oc->replaceAllUsesWith(specfun); + proto.oc->eraseFromParent(); + proto.oc = specfun; + } + } + else { + proto.oc->setName(ocinvokeDecl); + } } } + else { + isedge = true; + params.workqueue.push_back(it); + incomplete_rgraph[codeinst].push_back(callee); + } + if (isedge) + complete_graph[callee].push_back(codeinst); } - else { - isedge = true; - params.workqueue.push_back(it); - incomplete_rgraph[codeinst].push_back(callee); - } - if (isedge) - complete_graph[callee].push_back(codeinst); } return params.workqueue.size(); } @@ -580,10 +598,11 @@ static void complete_emit(jl_code_instance_t *edge) JL_NOTSAFEPOINT_LEAVE JL_NOT auto ¶ms = std::get<0>(it->second); params.tsctx_lock = params.tsctx.getLock(); assert(callee == it->first); + orc::ThreadSafeModule &M = emittedmodules[callee]; + emit_always_inline(M, params); // may safepoint int waiting = jl_analyze_workqueue(callee, params); // may safepoint assert(!waiting); (void)waiting; - Module *M = emittedmodules[callee].getModuleUnlocked(); - finish_params(M, params, sharedmodules); + finish_params(M.getModuleUnlocked(), params, sharedmodules); incompletemodules.erase(it); } } @@ -796,6 +815,7 @@ void jl_emit_codeinst_to_jit_impl( invokenames[codeinst] = std::move(decls); complete_emit(codeinst); params.tsctx_lock = params.tsctx.getLock(); // re-acquire lock + emit_always_inline(result_m, params); int waiting = jl_analyze_workqueue(codeinst, params); if (waiting) { auto release = std::move(params.tsctx_lock); // unlock again before moving from it @@ -1725,7 +1745,8 @@ struct JuliaOJIT::DLSymOptimizer { Thunk = cast(GV.getInitializer()->stripPointerCasts()); assert(++Thunk->uses().begin() == Thunk->uses().end() && "Thunk should only have one use in PLT initializer!"); assert(Thunk->hasLocalLinkage() && "Thunk should not have non-local linkage!"); - } else { + } + else { GV.setLinkage(GlobalValue::PrivateLinkage); } auto init = ConstantExpr::getIntToPtr(ConstantInt::get(M.getDataLayout().getIntPtrType(M.getContext()), (uintptr_t)addr), GV.getValueType()); @@ -2298,125 +2319,6 @@ void JuliaOJIT::optimizeDLSyms(Module &M) { JuliaOJIT *jl_ExecutionEngine; -// destructively move the contents of src into dest -// this assumes that the targets of the two modules are the same -// including the DataLayout and ModuleFlags (for example) -// and that there is no module-level assembly -// Comdat is also removed, since the JIT doesn't need it -void jl_merge_module(orc::ThreadSafeModule &destTSM, orc::ThreadSafeModule srcTSM) -{ - ++ModulesMerged; - destTSM.withModuleDo([&](Module &dest) 
JL_NOTSAFEPOINT { - srcTSM.withModuleDo([&](Module &src) JL_NOTSAFEPOINT { - assert(&dest != &src && "Cannot merge module with itself!"); - assert(&dest.getContext() == &src.getContext() && "Cannot merge modules with different contexts!"); - assert(dest.getDataLayout() == src.getDataLayout() && "Cannot merge modules with different data layouts!"); - assert(dest.getTargetTriple() == src.getTargetTriple() && "Cannot merge modules with different target triples!"); - - for (auto &SG : make_early_inc_range(src.globals())) { - GlobalVariable *dG = cast_or_null(dest.getNamedValue(SG.getName())); - if (SG.hasLocalLinkage()) { - dG = nullptr; - } - // Replace a declaration with the definition: - if (dG && !dG->hasLocalLinkage()) { - if (SG.isDeclaration()) { - SG.replaceAllUsesWith(dG); - SG.eraseFromParent(); - continue; - } - //// If we start using llvm.used, we need to enable and test this - //else if (!dG->isDeclaration() && dG->hasAppendingLinkage() && SG.hasAppendingLinkage()) { - // auto *dCA = cast(dG->getInitializer()); - // auto *sCA = cast(SG.getInitializer()); - // SmallVector Init; - // for (auto &Op : dCA->operands()) - // Init.push_back(cast_or_null(Op)); - // for (auto &Op : sCA->operands()) - // Init.push_back(cast_or_null(Op)); - // ArrayType *ATy = ArrayType::get(PointerType::get(dest.getContext()), Init.size()); - // GlobalVariable *GV = new GlobalVariable(dest, ATy, dG->isConstant(), - // GlobalValue::AppendingLinkage, ConstantArray::get(ATy, Init), "", - // dG->getThreadLocalMode(), dG->getType()->getAddressSpace()); - // GV->copyAttributesFrom(dG); - // SG.replaceAllUsesWith(GV); - // dG->replaceAllUsesWith(GV); - // GV->takeName(SG); - // SG.eraseFromParent(); - // dG->eraseFromParent(); - // continue; - //} - else { - assert(dG->isDeclaration() || dG->getInitializer() == SG.getInitializer()); - dG->replaceAllUsesWith(&SG); - dG->eraseFromParent(); - } - } - // Reparent the global variable: - SG.removeFromParent(); - dest.insertGlobalVariable(&SG); - // Comdat is owned by the Module - SG.setComdat(nullptr); - } - - for (auto &SG : make_early_inc_range(src)) { - Function *dG = cast_or_null(dest.getNamedValue(SG.getName())); - if (SG.hasLocalLinkage()) { - dG = nullptr; - } - // Replace a declaration with the definition: - if (dG && !dG->hasLocalLinkage()) { - if (SG.isDeclaration()) { - SG.replaceAllUsesWith(dG); - SG.eraseFromParent(); - continue; - } - else { - assert(dG->isDeclaration()); - dG->replaceAllUsesWith(&SG); - dG->eraseFromParent(); - } - } - // Reparent the global variable: - SG.removeFromParent(); - dest.getFunctionList().push_back(&SG); - // Comdat is owned by the Module - SG.setComdat(nullptr); - } - - for (auto &SG : make_early_inc_range(src.aliases())) { - GlobalAlias *dG = cast_or_null(dest.getNamedValue(SG.getName())); - if (SG.hasLocalLinkage()) { - dG = nullptr; - } - if (dG && !dG->hasLocalLinkage()) { - if (!dG->isDeclaration()) { // aliases are always definitions, so this test is reversed from the above two - SG.replaceAllUsesWith(dG); - SG.eraseFromParent(); - continue; - } - else { - dG->replaceAllUsesWith(&SG); - dG->eraseFromParent(); - } - } - SG.removeFromParent(); - dest.insertAlias(&SG); - } - - // metadata nodes need to be explicitly merged not just copied - // so there are special passes here for each known type of metadata - NamedMDNode *sNMD = src.getNamedMetadata("llvm.dbg.cu"); - if (sNMD) { - NamedMDNode *dNMD = dest.getOrInsertNamedMetadata("llvm.dbg.cu"); - for (MDNode *I : sNMD->operands()) { - dNMD->addOperand(I); - } - } - }); - 
}); -} - //TargetMachine pass-through methods std::unique_ptr JuliaOJIT::cloneTargetMachine() const diff --git a/src/jitlayers.h b/src/jitlayers.h index b411febd792b8..619e5f3757642 100644 --- a/src/jitlayers.h +++ b/src/jitlayers.h @@ -72,7 +72,6 @@ DEFINE_SIMPLE_CONVERSION_FUNCTIONS(orc::ThreadSafeContext, LLVMOrcThreadSafeCont DEFINE_SIMPLE_CONVERSION_FUNCTIONS(orc::ThreadSafeModule, LLVMOrcThreadSafeModuleRef) void addTargetPasses(legacy::PassManagerBase *PM, const Triple &triple, TargetIRAnalysis analysis) JL_NOTSAFEPOINT; -void jl_merge_module(orc::ThreadSafeModule &dest, orc::ThreadSafeModule src) JL_NOTSAFEPOINT; GlobalVariable *jl_emit_RTLD_DEFAULT_var(Module *M) JL_NOTSAFEPOINT; DataLayout jl_create_datalayout(TargetMachine &TM) JL_NOTSAFEPOINT; @@ -210,6 +209,12 @@ struct jl_codegen_call_target_t { llvm::Function *decl; llvm::Function *oc; bool specsig; + bool external_linkage; // whether codegen would like this edge to be externally-available + bool private_linkage; // whether codegen would like this edge to be internally-available + // external = ExternalLinkage (similar to "extern") + // private = InternalLinkage (similar to "static") + // external+private = AvailableExternallyLinkage+ExternalLinkage or ExternalLinkage (similar to "static inline") + // neither = unused }; // reification of a call to jl_jit_abi_convert, so that it isn't necessary to parse the Modules to recover this info @@ -231,7 +236,7 @@ struct jl_codegen_params_t { DataLayout DL; Triple TargetTriple; - inline LLVMContext &getContext() { + inline LLVMContext &getContext() JL_NOTSAFEPOINT { return *tsctx.getContext(); } typedef StringMap SymMapGV; @@ -268,6 +273,7 @@ struct jl_codegen_params_t { bool cache = false; bool external_linkage = false; bool imaging_mode; + bool safepoint_on_entry = true; bool use_swiftcc = true; jl_codegen_params_t(orc::ThreadSafeContext ctx, DataLayout DL, Triple triple) JL_NOTSAFEPOINT JL_NOTSAFEPOINT_ENTER : tsctx(std::move(ctx)), @@ -305,6 +311,9 @@ jl_llvm_functions_t jl_emit_codedecls( jl_code_instance_t *codeinst, jl_codegen_params_t ¶ms); +void linkFunctionBody(Function &Dst, Function &Src) JL_NOTSAFEPOINT; +void emit_always_inline(orc::ThreadSafeModule &result_m, jl_codegen_params_t ¶ms) JL_NOTSAFEPOINT_LEAVE JL_NOTSAFEPOINT_ENTER; + enum CompilationPolicy { Default = 0, Extern = 1, @@ -660,8 +669,8 @@ class JuliaOJIT { OptSelLayerT OptSelLayer; }; extern JuliaOJIT *jl_ExecutionEngine; -std::unique_ptr jl_create_llvm_module(StringRef name, LLVMContext &ctx, const DataLayout &DL = jl_ExecutionEngine->getDataLayout(), const Triple &triple = jl_ExecutionEngine->getTargetTriple()) JL_NOTSAFEPOINT; -inline orc::ThreadSafeModule jl_create_ts_module(StringRef name, orc::ThreadSafeContext ctx, const DataLayout &DL = jl_ExecutionEngine->getDataLayout(), const Triple &triple = jl_ExecutionEngine->getTargetTriple()) JL_NOTSAFEPOINT { +std::unique_ptr jl_create_llvm_module(StringRef name, LLVMContext &ctx, const DataLayout &DL, const Triple &triple) JL_NOTSAFEPOINT; +inline orc::ThreadSafeModule jl_create_ts_module(StringRef name, orc::ThreadSafeContext ctx, const DataLayout &DL, const Triple &triple) JL_NOTSAFEPOINT { auto lock = ctx.getLock(); return orc::ThreadSafeModule(jl_create_llvm_module(name, *ctx.getContext(), DL, triple), ctx); } diff --git a/src/julia.expmap.in b/src/julia.expmap.in index b28a714e75f69..5a3fbce0d1a82 100644 --- a/src/julia.expmap.in +++ b/src/julia.expmap.in @@ -30,7 +30,6 @@ _Z22jl_coverage_alloc_lineN4llvm9StringRefEi*; 
_Z22jl_malloc_data_pointerN4llvm9StringRefEi*; _jl_timing_*; - LLVMExtra*; JLJIT*; llvmGetPassPluginInfo*; diff --git a/src/llvm-expand-atomic-modify.cpp b/src/llvm-expand-atomic-modify.cpp new file mode 100644 index 0000000000000..7b7b3c8761c17 --- /dev/null +++ b/src/llvm-expand-atomic-modify.cpp @@ -0,0 +1,473 @@ +// This file is a part of Julia. License is MIT: https://julialang.org/license + +// TODO: move this feature into AtomicExpandImpl + +#include "llvm-version.h" +#include "passes.h" + +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "julia.h" +#include "julia_assert.h" + +#define DEBUG_TYPE "expand-atomic-modify" +#undef DEBUG + +using namespace llvm; + +// This pass takes fake call instructions that look like this which were emitted by the front end: +// (oldval, newval) = call atomicmodify.iN(ptr %op, ptr align(N) %ptr, i8 immarg %SSID, i8 immarg %Ordering, ...) !rmwattributes +// where op is a function with a prototype of `iN (iN arg, ...)` +// Then rewrite that to +// oldval = atomicrmw op ptr, val ordering syncscope +// newval = op oldval, val +// Or to an equivalent RMWCmpXchgLoop if `op` isn't valid for atomicrmw + + +// from AtomicExpandImpl, with modification of failure order and added Attributes +using CreateWeakCmpXchgInstFun = + std::function; + +static void createWeakCmpXchgInstFun(IRBuilderBase &Builder, Value *Addr, + Value *Loaded, Value *NewVal, Align AddrAlign, + AtomicOrdering MemOpOrder, SyncScope::ID SSID, Instruction &Attributes, + Value *&Success, Value *&NewLoaded) { + Type *OrigTy = NewVal->getType(); + + // This code can go away when cmpxchg supports FP types. + assert(!OrigTy->isPointerTy()); + bool NeedBitcast = OrigTy->isFloatingPointTy(); + if (NeedBitcast) { + IntegerType *IntTy = Builder.getIntNTy(OrigTy->getPrimitiveSizeInBits()); + NewVal = Builder.CreateBitCast(NewVal, IntTy); + Loaded = Builder.CreateBitCast(Loaded, IntTy); + } + + AtomicCmpXchgInst *Pair = Builder.CreateAtomicCmpXchg( + Addr, Loaded, NewVal, AddrAlign, MemOpOrder, + AtomicOrdering::Monotonic, // why does LLVM use AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder) here + SSID); + Pair->copyMetadata(Attributes); + Success = Builder.CreateExtractValue(Pair, 1, "success"); + NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded"); + + if (NeedBitcast) + NewLoaded = Builder.CreateBitCast(NewLoaded, OrigTy); +} + +// from AtomicExpandImpl, with modification of values returned +std::pair insertRMWCmpXchgLoop( + IRBuilderBase &Builder, Type *ResultTy, Value *Addr, Align AddrAlign, + AtomicOrdering MemOpOrder, SyncScope::ID SSID, Instruction &Attributes, + const std::function &PerformOp, + const CreateWeakCmpXchgInstFun &CreateWeakCmpXchg) { + LLVMContext &Ctx = Builder.getContext(); + BasicBlock *BB = Builder.GetInsertBlock(); + Function *F = BB->getParent(); + + // Given: atomicrmw some_op iN* %addr, iN %incr ordering + // + // The standard expansion we produce is: + // [...] + // %init_loaded = load atomic iN* %addr + // br label %loop + // loop: + // %loaded = phi iN [ %init_loaded, %entry ], [ %new_loaded, %loop ] + // %new = some_op iN %loaded, %incr + // %pair = cmpxchg iN* %addr, iN %loaded, iN %new + // %new_loaded = extractvalue { iN, i1 } %pair, 0 + // %success = extractvalue { iN, i1 } %pair, 1 + // br i1 %success, label %atomicrmw.end, label %loop + // atomicrmw.end: + // [...] 
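+  // Note (descriptive comment, inferred from the return statement below): unlike
+  // the corresponding AtomicExpandImpl helper, this variant returns both values:
+  // the value loaded before the exchange (%new_loaded above, i.e. the old value)
+  // and the result computed by PerformOp (%new above), so the caller can
+  // materialize either half of the (oldval, newval) pair without re-running op.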
+ BasicBlock *ExitBB = + BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end"); + BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB); + + // The split call above "helpfully" added a branch at the end of BB (to the + // wrong place), but we want a load. It's easiest to just remove + // the branch entirely. + std::prev(BB->end())->eraseFromParent(); + Builder.SetInsertPoint(BB); + LoadInst *InitLoaded = Builder.CreateAlignedLoad(ResultTy, Addr, AddrAlign); + InitLoaded->setOrdering(AtomicOrdering::Unordered); // n.b. the original LLVM pass is missing this call so is actually mildly UB + Builder.CreateBr(LoopBB); + + // Start the main loop block now that we've taken care of the preliminaries. + Builder.SetInsertPoint(LoopBB); + PHINode *Loaded = Builder.CreatePHI(ResultTy, 2, "loaded"); + Loaded->addIncoming(InitLoaded, BB); + + Value *NewVal = PerformOp(Builder, Loaded); + + Value *NewLoaded = nullptr; + Value *Success = nullptr; + + CreateWeakCmpXchg(Builder, Addr, Loaded, NewVal, AddrAlign, + MemOpOrder == AtomicOrdering::Unordered + ? AtomicOrdering::Monotonic + : MemOpOrder, + SSID, Attributes, Success, NewLoaded); + assert(Success && NewLoaded); + + Loaded->addIncoming(NewLoaded, LoopBB); + + Builder.CreateCondBr(Success, ExitBB, LoopBB); + + Builder.SetInsertPoint(ExitBB, ExitBB->begin()); + return {NewLoaded, NewVal}; +} + +// from AtomicExpandImpl +struct ReplacementIRBuilder : IRBuilder { + // Preserves the DebugLoc from I, and preserves still valid metadata. + explicit ReplacementIRBuilder(Instruction *I, const DataLayout &DL) + : IRBuilder(I->getContext(), DL) { + SetInsertPoint(I); + this->CollectMetadataToCopy(I, {LLVMContext::MD_pcsections}); + } +}; + +// Must check that either Target cannot observe or mutate global state +// or that no trailing instructions does so either. +// Depending on the choice, it can also decide whether it is better to move Target after RMW +// or to move RMW before Target (or meet somewhere in the middle). +// Currently conservatively implemented as there being no instruction in the +// function which writes memory (which includes any atomics). +// Excluding the Target itself, unless some other instruction might read memory to observe it. +static bool canReorderWithRMW(Instruction &Target, bool verifyop) +{ + if (!verifyop) + return true; + Function &Op = *Target.getFunction(); + // quick check: if Op is nosync and Target doesn't access any memory, then reordering is trivially valid + bool nosync = Op.hasNoSync(); + if (nosync && !Target.mayReadOrWriteMemory()) + return true; + // otherwise, scan the whole function to see if any function accesses memory + // in a way that would conflict with reordering the atomic read and write + bool mayRead = false; + for (auto &BB : Op) { + for (auto &I : BB) { + if (&I == &Target) + continue; + if (I.mayWriteToMemory()) + return false; + if (!mayRead) { + mayRead = I.mayReadFromMemory(); + if (!nosync && mayRead) + return false; + } + } + } + // if any other instruction read memory, then the ordering of any writes by the target instruction might be observed + return !(mayRead && Target.mayWriteToMemory()); +} + +static std::variant patternMatchAtomicRMWOp(Value *Old, Use **ValOp, Value *RetVal) +{ + bool verifyop = RetVal == nullptr; + assert(verifyop ? isa(Old) : isa(Old)); + Function *Op = verifyop ? 
cast(Old)->getParent() : nullptr; + if (verifyop && (Op->isDeclaration() || Op->isInterposable() || Op->isIntrinsic())) + return false; + // TODO: peek forward from Old through any trivial casts which don't affect the instruction (e.g. i64 to f64 and back) + if (RetVal == nullptr) { + if (Old->use_empty()) { + if (ValOp) *ValOp = nullptr; + return AtomicRMWInst::Xchg; + } + if (!Old->hasOneUse()) + return false; + ReturnInst *Ret = nullptr; + for (auto &BB : *Op) { + if (isa(BB.getTerminator())) { + if (Ret != nullptr) + return false; + Ret = cast(BB.getTerminator()); + } + } + if (Ret == nullptr) + return false; + // Now examine the instruction list + RetVal = Ret->getReturnValue(); + if (!RetVal->hasOneUse()) + return false; + } + if (RetVal == Old) { + // special token indicating to convert to an atomic fence + if (ValOp) *ValOp = nullptr; + return AtomicRMWInst::Or; + } + if (Old->use_empty()) { + if (ValOp) *ValOp = nullptr; + return AtomicRMWInst::Xchg; + } + if (auto BinOp = dyn_cast(RetVal)) { + if ((BinOp->getOperand(0) == Old || (BinOp->isCommutative() && BinOp->getOperand(1) == Old)) && canReorderWithRMW(*BinOp, verifyop)) { + if (ValOp) *ValOp = &BinOp->getOperandUse(BinOp->getOperand(0) == Old ? 1 : 0); + switch (BinOp->getOpcode()) { + case Instruction::Add: + return AtomicRMWInst::Add; + case Instruction::Sub: + return AtomicRMWInst::Sub; + case Instruction::And: + return AtomicRMWInst::And; + case Instruction::Or: + return AtomicRMWInst::Or; + case Instruction::Xor: + return AtomicRMWInst::Xor; + case Instruction::FAdd: + return AtomicRMWInst::FAdd; + case Instruction::FSub: + return AtomicRMWInst::FSub; + default: + break; + } + } + if (BinOp->getOpcode() == Instruction::Xor) { + if (auto CI = dyn_cast(BinOp->getOperand(1))) { + if (CI->isAllOnesValue()) { + BinOp = dyn_cast(BinOp->getOperand(0)); + if (BinOp && BinOp->hasOneUse() && BinOp->getOpcode() == Instruction::And) { + if ((BinOp->getOperand(0) == Old || (BinOp->isCommutative() && BinOp->getOperand(1) == Old)) && canReorderWithRMW(*BinOp, verifyop)) { + if (ValOp) *ValOp = &BinOp->getOperandUse(BinOp->getOperand(0) == Old ? 1 : 0); + return AtomicRMWInst::Nand; + } + } + } + } + } + return false; + } else if (auto Intr = dyn_cast(RetVal)) { + if (Intr->arg_size() == 2) { + if ((Intr->getOperand(0) == Old || (Intr->isCommutative() && Intr->getOperand(1) == Old)) && canReorderWithRMW(*Intr, verifyop)) { + if (ValOp) *ValOp = &Intr->getOperandUse(Intr->getOperand(0) == Old ? 1 : 0); + switch (Intr->getIntrinsicID()) { + case Intrinsic::minnum: + return AtomicRMWInst::FMin; + case Intrinsic::maxnum: + return AtomicRMWInst::FMax; + case Intrinsic::smax: + return AtomicRMWInst::Max; + case Intrinsic::umax: + return AtomicRMWInst::UMax; + case Intrinsic::smin: + return AtomicRMWInst::Min; + case Intrinsic::umin: + return AtomicRMWInst::UMin; +#if JL_LLVM_VERSION >= 200000 + case Intrinsic::usub_sat: + return AtomicRMWInst::USubSat; +#endif + } + } + } + return false; + } + else if (auto Intr = dyn_cast(RetVal)) { + // TODO: decide inlining cost of Op, or check alwaysinline/inlinehint, before this? + for (auto &Arg : Intr->args()) { + if (Arg == Old) { + if (canReorderWithRMW(*Intr, verifyop)) { + if (ValOp) *ValOp = &Arg; + return true; + } + return false; + } + } + } + // TODO: does this need to deal with F->hasFnAttribute(Attribute::StrictFP)? + // TODO: does Fneg and Neg have expansions? 
+ // TODO: be able to ignore some simple bitcasts (particularly f64 to i64) + // TODO: handle longer sequences (UIncWrap, UDecWrap, USubCond, and target-specific ones for CUDA) + return false; +} + +void expandAtomicModifyToCmpXchg(CallInst &Modify, + const CreateWeakCmpXchgInstFun &CreateWeakCmpXchg) { + Value *Ptr = Modify.getOperand(0); + Function *Op = dyn_cast(Modify.getOperand(1)); + if (!Op) { + Modify.getParent()->getParent()->print(errs()); + llvm_unreachable("expected immarg for function argument"); + } + AtomicOrdering Ordering = (AtomicOrdering)cast(Modify.getOperand(2))->getZExtValue(); + SyncScope::ID SSID = (SyncScope::ID)cast(Modify.getOperand(3))->getZExtValue(); + MaybeAlign Alignment = Modify.getParamAlign(0); + unsigned user_arg_start = Modify.getFunctionType()->getNumParams(); + Type *Ty = Modify.getFunctionType()->getReturnType()->getStructElementType(0); + + ReplacementIRBuilder Builder(&Modify, Modify.getModule()->getDataLayout()); + Builder.setIsFPConstrained(Modify.hasFnAttr(Attribute::StrictFP)); + + CallInst *ModifyOp; + { + SmallVector Args(1 + Modify.arg_size() - user_arg_start); + Args[0] = UndefValue::get(Ty); // Undef used as placeholder for Loaded / RMW; + for (size_t argi = 0; argi < Modify.arg_size() - user_arg_start; ++argi) { + Args[argi + 1] = Modify.getArgOperand(argi + user_arg_start); + } + SmallVector Defs; + Modify.getOperandBundlesAsDefs(Defs); + ModifyOp = Builder.CreateCall(Op, Args, Defs); + ModifyOp->setCallingConv(Op->getCallingConv()); + } + Use *LoadedOp = &ModifyOp->getOperandUse(0); + + Value *OldVal = nullptr; + Value *NewVal = nullptr; + auto BinOp = patternMatchAtomicRMWOp(Op->getArg(0), nullptr, nullptr); + if (BinOp != decltype(BinOp)(false)) { + Builder.SetInsertPoint(ModifyOp); + AtomicRMWInst *RMW = Builder.CreateAtomicRMW(AtomicRMWInst::Xchg, Ptr, UndefValue::get(Ty), Alignment, Ordering, SSID); // Undef used as placeholder + RMW->copyMetadata(Modify); + Builder.SetInsertPoint(&Modify); + LoadedOp->set(RMW); + for (int attempts = 0; ; ) { + FreezeInst *TrackReturn = Builder.Insert(new FreezeInst(ModifyOp)); // Create a temporary TrackingVH so we can recover the NewVal after inlining + InlineFunctionInfo IFI; + if (!InlineFunction(*ModifyOp, IFI).isSuccess()) { + // Undo the attempt, since inlining failed + BinOp = false; + TrackReturn->eraseFromParent(); + break; + } + ModifyOp = nullptr; + NewVal = TrackReturn->getOperand(0); + TrackReturn->eraseFromParent(); + // NewVal might have been folded away by inlining so redo patternMatchAtomicRMWOp here + // tracing from RMW to NewVal, in case instsimplify folded something + Use *ValOp; + BinOp = patternMatchAtomicRMWOp(RMW, &ValOp, NewVal); + if (BinOp == decltype(BinOp)(true)) { + ModifyOp = cast(ValOp->getUser()); + LoadedOp = ValOp; + assert(LoadedOp->get() == RMW); + RMW->moveBefore(ModifyOp); // NewValInst is a user of RMW, and RMW has no other dependants (per patternMatchAtomicRMWOp) + BinOp = false; + if (++attempts > 3) + break; + if (auto FOp = ModifyOp->getCalledFunction()) + BinOp = patternMatchAtomicRMWOp(FOp->getArg(LoadedOp->getOperandNo()), nullptr, nullptr); + else + break; + if (BinOp == decltype(BinOp)(false)) + break; + } else { + assert(BinOp != decltype(BinOp)(true)); + auto RMWOp = std::get(BinOp); + assert(RMWOp != AtomicRMWInst::BAD_BINOP); + assert(isa(RMW->getOperand(1))); // RMW was previously being used as the placeholder for Val + Value *Val; + if (ValOp != nullptr) { + RMW->moveBefore(cast(ValOp->getUser())); // ValOp is a user of RMW, and RMW has no 
other dependants (per patternMatchAtomicRMWOp) + Val = ValOp->get(); + } else if (RMWOp == AtomicRMWInst::Xchg) { + Val = NewVal; + } else { + // convert to an atomic fence of the form: atomicrmw or %ptr, 0 + assert(RMWOp == AtomicRMWInst::Or); + Val = ConstantInt::getNullValue(Ty); + } + RMW->setOperation(RMWOp); + RMW->setOperand(1, Val); + OldVal = RMW; + break; + } + } + if (BinOp == decltype(BinOp)(false)) { + LoadedOp->set(UndefValue::get(Ty)); + RMW->eraseFromParent(); + } + } + + if (BinOp == decltype(BinOp)(false)) { + // FIXME: If FP exceptions are observable, we should force them off for the + // loop for the FP atomics. + std::tie(OldVal, NewVal) = insertRMWCmpXchgLoop( + Builder, Ty, Ptr, *Alignment, Ordering, SSID, Modify, + [&](IRBuilderBase &Builder, Value *Loaded) JL_NOTSAFEPOINT { + LoadedOp->set(Loaded); + ModifyOp->moveBefore(*Builder.GetInsertBlock(), Builder.GetInsertPoint()); + return ModifyOp; + }, + CreateWeakCmpXchg); + } + + for (auto user : make_early_inc_range(Modify.users())) { + if (auto EV = dyn_cast(user)) { + if (EV->getNumIndices() == 1) { + if (EV->use_empty()) { + EV->eraseFromParent(); + continue; + } + else if (EV->getIndices()[0] == 0) { + EV->replaceAllUsesWith(OldVal); + EV->eraseFromParent(); + continue; + } else if (EV->getIndices()[0] == 1) { + EV->replaceAllUsesWith(NewVal); + EV->eraseFromParent(); + continue; + } + } + } + } + if (!Modify.use_empty()) { + auto OldNewVal = Builder.CreateInsertValue(UndefValue::get(Modify.getType()), OldVal, 0); + OldNewVal = Builder.CreateInsertValue(OldNewVal, NewVal, 1); + Modify.replaceAllUsesWith(OldNewVal); + } + Modify.eraseFromParent(); +} + +static bool expandAtomicModify(Function &F) { + SmallVector AtomicInsts; + + // Changing control-flow while iterating through it is a bad idea, so gather a + // list of all atomic instructions before we start. 
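+  // Only calls whose callee is one of the julia.atomicmodify.* pseudo-intrinsics
+  // emitted by codegen are collected here; all other instructions are left untouched.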
+ for (Instruction &I : instructions(F)) + if (auto CI = dyn_cast(&I)) { + auto callee = dyn_cast_or_null(CI->getCalledOperand()); + if (callee && callee->getName().starts_with("julia.atomicmodify.")) { + assert(CI->getFunctionType() == callee->getFunctionType()); + AtomicInsts.push_back(CI); + } + } + + bool MadeChange = !AtomicInsts.empty(); + for (auto *I : AtomicInsts) + expandAtomicModifyToCmpXchg(*I, createWeakCmpXchgInstFun); + return MadeChange; +} + +PreservedAnalyses ExpandAtomicModifyPass::run(Function &F, FunctionAnalysisManager &AM) +{ + if (expandAtomicModify(F)) { + return PreservedAnalyses::none(); + } + return PreservedAnalyses::all(); +} diff --git a/src/llvm-julia-passes.inc b/src/llvm-julia-passes.inc index 0cc36f799db00..bd223499f37af 100644 --- a/src/llvm-julia-passes.inc +++ b/src/llvm-julia-passes.inc @@ -16,6 +16,7 @@ FUNCTION_PASS("AllocOpt", AllocOptPass()) FUNCTION_PASS("PropagateJuliaAddrspaces", PropagateJuliaAddrspacesPass()) FUNCTION_PASS("GCInvariantVerifier", GCInvariantVerifierPass()) FUNCTION_PASS("FinalLowerGC", FinalLowerGCPass()) +FUNCTION_PASS("ExpandAtomicModify", ExpandAtomicModifyPass()) #endif //Loop passes diff --git a/src/passes.h b/src/passes.h index 83721525d6f7e..0c5a124ade952 100644 --- a/src/passes.h +++ b/src/passes.h @@ -43,6 +43,11 @@ struct FinalLowerGCPass : PassInfoMixin { static bool isRequired() { return true; } }; +struct ExpandAtomicModifyPass : PassInfoMixin { + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM) JL_NOTSAFEPOINT; +}; + + // Module Passes struct CPUFeaturesPass : PassInfoMixin { PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM) JL_NOTSAFEPOINT; diff --git a/src/pipeline.cpp b/src/pipeline.cpp index eb93943653b34..f91db6fc037d7 100644 --- a/src/pipeline.cpp +++ b/src/pipeline.cpp @@ -574,6 +574,7 @@ static void buildIntrinsicLoweringPipeline(ModulePassManager &MPM, PassBuilder * FunctionPassManager FPM; JULIA_PASS(FPM.addPass(LateLowerGCPass())); JULIA_PASS(FPM.addPass(FinalLowerGCPass())); + JULIA_PASS(FPM.addPass(ExpandAtomicModifyPass())); // after LateLowerGCPass so that all IPO is valid MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); } JULIA_PASS(MPM.addPass(LowerPTLSPass(options.dump_native))); @@ -590,7 +591,8 @@ static void buildIntrinsicLoweringPipeline(ModulePassManager &MPM, PassBuilder * FPM.addPass(SimplifyCFGPass(aggressiveSimplifyCFGOptions())); MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); } - } else if (!options.remove_ni) { + } + else if (!options.remove_ni) { JULIA_PASS(MPM.addPass(RemoveNIPass())); } MPM.addPass(AfterIntrinsicLoweringMarkerPass()); diff --git a/test/llvmpasses/atomic-modify.ll b/test/llvmpasses/atomic-modify.ll new file mode 100644 index 0000000000000..23e1949f3ad0a --- /dev/null +++ b/test/llvmpasses/atomic-modify.ll @@ -0,0 +1,288 @@ +; This file is a part of Julia. License is MIT: https://julialang.org/license + +; RUN: opt --load-pass-plugin=libjulia-codegen%shlibext -passes='ExpandAtomicModify' -S %s | FileCheck %s + +declare {i8, i8} @julia.atomicmodify.i8(ptr, ptr, i8, i8, ...) +declare {double, double} @julia.atomicmodify.f64(ptr, ptr, i8, i8, ...) 
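+; Operand layout, as read by expandAtomicModifyToCmpXchg in this patch:
+;   (%ptr, %op, ordering, syncscope id, op args...)
+; so in the calls below, i8 5 encodes "release" ordering and i8 1 the default
+; ("system") sync scope, matching the atomicrmw instructions in the CHECK lines.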
+declare double @llvm.maxnum.f64(double %Val0, double %Val1) + +define i8 @add.i8(i8 %x, i8 %y) { + %z = add i8 %x, %y + ret i8 %z +} + +define i8 @sub.i8(i8 %x, i8 %y) { + %z = sub i8 %x, %y + ret i8 %z +} + +define i8 @subx.i8(i8 %x, i8 %y) { + %z = sub i8 %y, %x + ret i8 %z +} + +define i8 @add.i8.zext(i8 %x, i1 %y) { + %y8 = zext i1 %y to i8 + %z = add i8 %x, %y8 + ret i8 %z +} + +define i8 @and.i8(i8 %x, i8 %y) { + %z = and i8 %x, %y + ret i8 %z +} + +define i8 @nand.i8(i8 %x, i8 %y) { + %z = and i8 %x, %y + %w = xor i8 %z, -1 + ret i8 %w +} + +define i8 @nand.i8.zext(i8 %x, i1 %y) { + %y8 = zext i1 %y to i8 + %z = and i8 %y8, %x + %w = xor i8 %z, -1 + ret i8 %w +} + +define i8 @xchg.i8(i8 %x, i8 %y) { + ret i8 %y +} + +define double @fadd.f64(double %x, double %y) { + %z = fadd double %y, %x + ret double %z +} + +define double @fmax.f64(double %x, double %y) { + %z = call double @llvm.maxnum.f64(double %y, double %x) + ret double %z +} + +define internal i8 @0(i8 %x, i8 %y) unnamed_addr { + %z = call i8 @add.i8(i8 %x, i8 %y) + ret i8 %z +} + +define internal i8 @1(i8 %x, i8 %y) unnamed_addr { + %z = call i8 @0(i8 %x, i8 %y) + ret i8 %z +} + +define internal i8 @2(i8 %x, i8 %y, ptr %f) unnamed_addr { + %z = call i8 %f(i8 %x, i8 %y) + ret i8 %z +} + +define i8 @mod_i8_add(ptr %a, i8 %b) { +; CHECK-LABEL: @mod_i8_add +; CHECK: %0 = atomicrmw add ptr %a, i8 %b release, align 1 +; CHECK: ret i8 %0 +top: + %oldnew = call {i8, i8} (ptr, ptr, i8, i8, ...) @julia.atomicmodify.i8(ptr align(1) %a, ptr @add.i8, i8 5, i8 1, i8 %b) + %oldval = extractvalue {i8, i8} %oldnew, 0 + ret i8 %oldval +} + +define i8 @mod_i8_add_new(ptr %a, i8 %b) { +; CHECK-LABEL: @mod_i8_add +; CHECK: %0 = atomicrmw add ptr %a, i8 %b release, align 1 +; CHECK-NEXT: [[newval:%.*]] = add i8 %0, %b +; CHECK-NEXT: ret i8 [[newval]] +top: + %oldnew = call {i8, i8} (ptr, ptr, i8, i8, ...) @julia.atomicmodify.i8(ptr align(1) %a, ptr @add.i8, i8 5, i8 1, i8 %b) + %newval = extractvalue {i8, i8} %oldnew, 1 + ret i8 %newval +} + +define i8 @mod_i8_addfence(ptr %a) { +; CHECK-LABEL: @mod_i8_addfence +; CHECK: %0 = atomicrmw or ptr %a, i8 0 release, align 1 +; CHECK-NEXT: ret i8 %0 +top: + %oldnew = call {i8, i8} (ptr, ptr, i8, i8, ...) @julia.atomicmodify.i8(ptr align(1) %a, ptr @add.i8, i8 5, i8 1, i8 0) + %oldval = extractvalue {i8, i8} %oldnew, 0 + ret i8 %oldval +} + +define i8 @mod_i8_add_zext(ptr %a, i1 %b) { +; CHECK-LABEL: @mod_i8_add_zext +; CHECK: [[b8:%.*]] = zext i1 %b to i8 +; CHECK: %0 = atomicrmw add ptr %a, i8 [[b8]] release, align 1 +; CHECK: ret i8 %0 +top: + %oldnew = call {i8, i8} (ptr, ptr, i8, i8, ...) @julia.atomicmodify.i8(ptr align(1) %a, ptr @add.i8.zext, i8 5, i8 1, i1 %b) + %oldval = extractvalue {i8, i8} %oldnew, 0 + ret i8 %oldval +} + +define i8 @mod_i8_add_zext_new(ptr %a, i1 %b) { +; CHECK-LABEL: @mod_i8_add_zext +; CHECK: [[b8:%.*]] = zext i1 %b to i8 +; CHECK-NEXT: %0 = atomicrmw add ptr %a, i8 [[b8]] release, align 1 +; CHECK-NEXT: [[newval:%.*]] = add i8 %0, [[b8]] +; CHECK-NEXT: ret i8 [[newval]] +top: + %oldnew = call {i8, i8} (ptr, ptr, i8, i8, ...) @julia.atomicmodify.i8(ptr align(1) %a, ptr @add.i8.zext, i8 5, i8 1, i1 %b) + %newval = extractvalue {i8, i8} %oldnew, 1 + ret i8 %newval +} + +define i8 @mod_i8_sub(ptr %a, i8 %b) { +; CHECK-LABEL: @mod_i8_sub +; CHECK: %0 = atomicrmw sub ptr %a, i8 %b release, align 1 +; CHECK: ret i8 %0 +top: + %oldnew = call {i8, i8} (ptr, ptr, i8, i8, ...) 
@julia.atomicmodify.i8(ptr align(1) %a, ptr @sub.i8, i8 5, i8 1, i8 %b) + %oldval = extractvalue {i8, i8} %oldnew, 0 + ret i8 %oldval +} + +define i8 @mod_i8_subx(ptr %a, i8 %b) { +; CHECK-LABEL: @mod_i8_subx +; CHECK: [[newval:%.*]] = call i8 @subx.i8(i8 %loaded, i8 %b) +; CHECK: [[success:%.*]] = cmpxchg ptr %a, i8 %loaded, i8 [[newval]] +; CHECK: [[oldval:%.*]] = extractvalue { i8, i1 } [[success:%.*]], 0 +; CHECK: ret i8 [[oldval]] +top: + %oldnew = call {i8, i8} (ptr, ptr, i8, i8, ...) @julia.atomicmodify.i8(ptr align(1) %a, ptr @subx.i8, i8 5, i8 1, i8 %b) + %oldval = extractvalue {i8, i8} %oldnew, 0 + ret i8 %oldval +} + +define i8 @mod_i8_subx_new(ptr %a, i8 %b) { +; CHECK-LABEL: @mod_i8_subx_new +; CHECK: [[newval:%.*]] = call i8 @subx.i8(i8 %loaded, i8 %b) +; CHECK: [[oldval:%.*]] = cmpxchg ptr %a, i8 %loaded, i8 [[newval]] +; CHECK: ret i8 [[newval]] +top: + %oldnew = call {i8, i8} (ptr, ptr, i8, i8, ...) @julia.atomicmodify.i8(ptr align(1) %a, ptr @subx.i8, i8 5, i8 1, i8 %b) + %newval = extractvalue {i8, i8} %oldnew, 1 + ret i8 %newval +} + +define i8 @mod_i8_nand(ptr %a, i8 %b) { +; CHECK-LABEL: @mod_i8_nand +; CHECK: %0 = atomicrmw nand ptr %a, i8 %b release, align 1 +; CHECK: ret i8 %0 +top: + %oldnew = call {i8, i8} (ptr, ptr, i8, i8, ...) @julia.atomicmodify.i8(ptr align(1) %a, ptr @nand.i8, i8 5, i8 1, i8 %b) + %oldval = extractvalue {i8, i8} %oldnew, 0 + ret i8 %oldval +} + +define i8 @mod_i8_nand_new(ptr %a, i1 %b) { +; CHECK-LABEL: @mod_i8_nand_new +; CHECK: [[b8:%.*]] = zext i1 %b to i8 +; CHECK: %0 = atomicrmw nand ptr %a, i8 [[b8]] release, align 1 +; CHECK: [[newand:%.*]] = and i8 [[b8]], %0 +; CHECK: [[newval:%.*]] = xor i8 [[newand:%.*]], -1 +; CHECK: ret i8 [[newval]] +top: + %oldnew = call {i8, i8} (ptr, ptr, i8, i8, ...) @julia.atomicmodify.i8(ptr align(1) %a, ptr @nand.i8.zext, i8 5, i8 1, i1 %b) + %newval = extractvalue {i8, i8} %oldnew, 1 + ret i8 %newval +} + +define i8 @mod_i8_andxchg(ptr %a) { +; CHECK-LABEL: @mod_i8_andxchg +; CHECK: %0 = atomicrmw xchg ptr %a, i8 0 release, align 1 +; CHECK-NEXT: ret i8 %0 +top: + %oldnew = call {i8, i8} (ptr, ptr, i8, i8, ...) @julia.atomicmodify.i8(ptr align(1) %a, ptr @and.i8, i8 5, i8 1, i8 0) + %oldval = extractvalue {i8, i8} %oldnew, 0 + ret i8 %oldval +} + +define i8 @mod_i8_xchg(ptr %a, i8 %b) { +; CHECK-LABEL: @mod_i8_xchg +; CHECK: %0 = atomicrmw xchg ptr %a, i8 %b release, align 1 +; CHECK-NEXT: ret i8 %0 +top: + %oldnew = call {i8, i8} (ptr, ptr, i8, i8, ...) @julia.atomicmodify.i8(ptr align(1) %a, ptr @xchg.i8, i8 5, i8 1, i8 %b) + %oldval = extractvalue {i8, i8} %oldnew, 0 + ret i8 %oldval +} + +define i8 @mod_i8_xchg_new(ptr %a, i8 %b) { +; CHECK-LABEL: @mod_i8_xchg_new +; CHECK: %0 = atomicrmw xchg ptr %a, i8 %b release, align 1 +; CHECK-NEXT: ret i8 %b +top: + %oldnew = call {i8, i8} (ptr, ptr, i8, i8, ...) @julia.atomicmodify.i8(ptr align(1) %a, ptr @xchg.i8, i8 5, i8 1, i8 %b) + %newval = extractvalue {i8, i8} %oldnew, 1 + ret i8 %newval +} + +define double @mod_i8_fadd(ptr %a, double %b) { +; CHECK-LABEL: @mod_i8_fadd +; CHECK: %0 = atomicrmw fadd ptr %a, double %b release, align 8 +; CHECK: ret double %0 +top: + %oldnew = call {double, double} (ptr, ptr, i8, i8, ...) 
@julia.atomicmodify.f64(ptr align(8) %a, ptr @fadd.f64, i8 5, i8 1, double %b) + %oldval = extractvalue {double, double} %oldnew, 0 + ret double %oldval +} + +define double @mod_i8_fmax(ptr %a, double %b) { +; CHECK-LABEL: @mod_i8_fmax +; CHECK: %0 = atomicrmw fmax ptr %a, double %b release, align 8 +; CHECK: ret double %0 +top: + %oldnew = call {double, double} (ptr, ptr, i8, i8, ...) @julia.atomicmodify.f64(ptr align(8) %a, ptr @fmax.f64, i8 5, i8 1, double %b) + %oldval = extractvalue {double, double} %oldnew, 0 + ret double %oldval +} + +define i8 @mod_i8_indirect0(ptr %a, i8 %b) { +; CHECK-LABEL: @mod_i8_indirect0 +; CHECK: %0 = atomicrmw add ptr %a, i8 %b release, align 1 +; CHECK: ret i8 %0 +top: + %oldnew = call {i8, i8} (ptr, ptr, i8, i8, ...) @julia.atomicmodify.i8(ptr align(1) %a, ptr @0, i8 5, i8 1, i8 %b) + %oldval = extractvalue {i8, i8} %oldnew, 0 + ret i8 %oldval +} + +define i8 @mod_i8_indirect1(ptr %a, i8 %b) { +; CHECK-LABEL: @mod_i8_indirect1 +; CHECK: %0 = atomicrmw add ptr %a, i8 %b release, align 1 +; CHECK: ret i8 %0 +top: + %oldnew = call {i8, i8} (ptr, ptr, i8, i8, ...) @julia.atomicmodify.i8(ptr align(1) %a, ptr @1, i8 5, i8 1, i8 %b) + %oldval = extractvalue {i8, i8} %oldnew, 0 + ret i8 %oldval +} + +define i8 @mod_i8_indirect2(ptr %a, i8 %b, ptr %f) { +; CHECK-LABEL: @mod_i8_indirect2 +; CHECK: [[newval:%.*]] = call i8 %f(i8 %loaded, i8 %b) +; CHECK: [[success:%.*]] = cmpxchg ptr %a, i8 %loaded, i8 [[newval]] +; CHECK: [[oldval:%.*]] = extractvalue { i8, i1 } [[success:%.*]], 0 +; CHECK: ret i8 [[oldval]] +top: + %oldnew = call {i8, i8} (ptr, ptr, i8, i8, ...) @julia.atomicmodify.i8(ptr align(1) %a, ptr @2, i8 5, i8 1, i8 %b, ptr %f) + %oldval = extractvalue {i8, i8} %oldnew, 0 + ret i8 %oldval +} + +define i8 @mod_i8_indirect2_new(ptr %a, i8 %b, ptr %f) { +; CHECK-LABEL: @mod_i8_indirect2_new +; CHECK: [[newval:%.*]] = call i8 %f(i8 %loaded, i8 %b) +; CHECK: [[oldval:%.*]] = cmpxchg ptr %a, i8 %loaded, i8 [[newval]] +; CHECK: ret i8 [[newval]] +top: + %oldnew = call {i8, i8} (ptr, ptr, i8, i8, ...) @julia.atomicmodify.i8(ptr align(1) %a, ptr @2, i8 5, i8 1, i8 %b, ptr %f) + %newval = extractvalue {i8, i8} %oldnew, 1 + ret i8 %newval +} + +define i8 @mod_i8_indirect3(ptr %a, i8 %b) { +; CHECK-LABEL: @mod_i8_indirect3 +; CHECK: %0 = atomicrmw add ptr %a, i8 %b release, align 1 +; CHECK: ret i8 %0 +top: + %oldnew = call {i8, i8} (ptr, ptr, i8, i8, ...) @julia.atomicmodify.i8(ptr align(1) %a, ptr @2, i8 5, i8 1, i8 %b, ptr @0) + %oldval = extractvalue {i8, i8} %oldnew, 0 + ret i8 %oldval +} From 5cfdf66d30f40f6a42f6f70ac522ac208d5dfc92 Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Thu, 9 Jan 2025 20:25:52 +0000 Subject: [PATCH 2/2] remove deprecated Threads.Atomics --- base/atomics.jl | 176 +++++-------------------------------------- test/threads_exec.jl | 38 +++------- 2 files changed, 30 insertions(+), 184 deletions(-) diff --git a/base/atomics.jl b/base/atomics.jl index e6f3a5654cbf7..432c9120939ac 100644 --- a/base/atomics.jl +++ b/base/atomics.jl @@ -1,7 +1,5 @@ # This file is a part of Julia. License is MIT: https://julialang.org/license -using Core.Intrinsics: llvmcall - import .Base: setindex!, getindex, unsafe_convert import .Base.Sys: ARCH, WORD_SIZE @@ -13,34 +11,6 @@ export atomic_and!, atomic_nand!, atomic_or!, atomic_xor!, atomic_max!, atomic_min!, atomic_fence -## -# Filter out unsupported atomic types on platforms -# - 128-bit atomics do not exist on AArch32. 
-# - Omitting 128-bit types on 32bit x86 and ppc64 -# - LLVM doesn't currently support atomics on floats for ppc64 -# C++20 is adding limited support for atomics on float, but as of -# now Clang does not support that yet. -if Sys.ARCH === :i686 || startswith(string(Sys.ARCH), "arm") || - Sys.ARCH === :powerpc64le || Sys.ARCH === :ppc64le - const inttypes = (Int8, Int16, Int32, Int64, - UInt8, UInt16, UInt32, UInt64) -else - const inttypes = (Int8, Int16, Int32, Int64, Int128, - UInt8, UInt16, UInt32, UInt64, UInt128) -end -const floattypes = (Float16, Float32, Float64) -const arithmetictypes = (inttypes..., floattypes...) -# TODO: Support Ptr -if Sys.ARCH === :powerpc64le || Sys.ARCH === :ppc64le - const atomictypes = (inttypes..., Bool) -else - const atomictypes = (arithmetictypes..., Bool) -end - -const IntTypes = Union{inttypes...} -const FloatTypes = Union{floattypes...} -const ArithmeticTypes = Union{arithmetictypes...} -const AtomicTypes = Union{atomictypes...} """ Threads.Atomic{T} @@ -48,10 +18,6 @@ const AtomicTypes = Union{atomictypes...} Holds a reference to an object of type `T`, ensuring that it is only accessed atomically, i.e. in a thread-safe manner. -Only certain "simple" types can be used atomically, namely the -primitive boolean, integer, and float-point types. These are `Bool`, -`Int8`...`Int128`, `UInt8`...`UInt128`, and `Float16`...`Float64`. - New atomic objects can be created from a non-atomic values; if none is specified, the atomic object is initialized with zero. @@ -72,10 +38,10 @@ julia> x[] Atomic operations use an `atomic_` prefix, such as [`atomic_add!`](@ref), [`atomic_xchg!`](@ref), etc. """ -mutable struct Atomic{T<:AtomicTypes} - value::T - Atomic{T}() where {T<:AtomicTypes} = new(zero(T)) - Atomic{T}(value) where {T<:AtomicTypes} = new(value) +mutable struct Atomic{T} + @atomic value::T + Atomic{T}() where {T} = new(zero(T)) + Atomic{T}(value) where {T} = new(value) end Atomic() = Atomic{Int}() @@ -332,120 +298,21 @@ julia> x[] """ function atomic_min! end -unsafe_convert(::Type{Ptr{T}}, x::Atomic{T}) where {T} = convert(Ptr{T}, pointer_from_objref(x)) -setindex!(x::Atomic{T}, v) where {T} = setindex!(x, convert(T, v)) - -const llvmtypes = IdDict{Any,String}( - Bool => "i8", # julia represents bools with 8-bits for now. # TODO: is this okay? - Int8 => "i8", UInt8 => "i8", - Int16 => "i16", UInt16 => "i16", - Int32 => "i32", UInt32 => "i32", - Int64 => "i64", UInt64 => "i64", - Int128 => "i128", UInt128 => "i128", - Float16 => "half", - Float32 => "float", - Float64 => "double", -) -inttype(::Type{T}) where {T<:Integer} = T -inttype(::Type{Float16}) = Int16 -inttype(::Type{Float32}) = Int32 -inttype(::Type{Float64}) = Int64 - - -import ..Base.gc_alignment - -# All atomic operations have acquire and/or release semantics, depending on -# whether the load or store values. Most of the time, this is what one wants -# anyway, and it's only moderately expensive on most hardware. 
-for typ in atomictypes - lt = llvmtypes[typ] - ilt = llvmtypes[inttype(typ)] - rt = "$lt, $lt*" - irt = "$ilt, $ilt*" - @eval getindex(x::Atomic{$typ}) = - GC.@preserve x llvmcall($""" - %ptr = bitcast i8* %0 to $lt* - %rv = load atomic $rt %ptr acquire, align $(gc_alignment(typ)) - ret $lt %rv - """, $typ, Tuple{Ptr{$typ}}, unsafe_convert(Ptr{$typ}, x)) - @eval setindex!(x::Atomic{$typ}, v::$typ) = - GC.@preserve x llvmcall($""" - %ptr = bitcast i8* %0 to $lt* - store atomic $lt %1, $lt* %ptr release, align $(gc_alignment(typ)) - ret void - """, Cvoid, Tuple{Ptr{$typ}, $typ}, unsafe_convert(Ptr{$typ}, x), v) - - # Note: atomic_cas! succeeded (i.e. it stored "new") if and only if the result is "cmp" - if typ <: Integer - @eval atomic_cas!(x::Atomic{$typ}, cmp::$typ, new::$typ) = - GC.@preserve x llvmcall($""" - %ptr = bitcast i8* %0 to $lt* - %rs = cmpxchg $lt* %ptr, $lt %1, $lt %2 acq_rel acquire - %rv = extractvalue { $lt, i1 } %rs, 0 - ret $lt %rv - """, $typ, Tuple{Ptr{$typ},$typ,$typ}, - unsafe_convert(Ptr{$typ}, x), cmp, new) - else - @eval atomic_cas!(x::Atomic{$typ}, cmp::$typ, new::$typ) = - GC.@preserve x llvmcall($""" - %iptr = bitcast i8* %0 to $ilt* - %icmp = bitcast $lt %1 to $ilt - %inew = bitcast $lt %2 to $ilt - %irs = cmpxchg $ilt* %iptr, $ilt %icmp, $ilt %inew acq_rel acquire - %irv = extractvalue { $ilt, i1 } %irs, 0 - %rv = bitcast $ilt %irv to $lt - ret $lt %rv - """, $typ, Tuple{Ptr{$typ},$typ,$typ}, - unsafe_convert(Ptr{$typ}, x), cmp, new) - end - - arithmetic_ops = [:add, :sub] - for rmwop in [arithmetic_ops..., :xchg, :and, :nand, :or, :xor, :max, :min] - rmw = string(rmwop) - fn = Symbol("atomic_", rmw, "!") - if (rmw == "max" || rmw == "min") && typ <: Unsigned - # LLVM distinguishes signedness in the operation, not the integer type. - rmw = "u" * rmw - end - if rmwop in arithmetic_ops && !(typ <: ArithmeticTypes) continue end - if typ <: Integer - @eval $fn(x::Atomic{$typ}, v::$typ) = - GC.@preserve x llvmcall($""" - %ptr = bitcast i8* %0 to $lt* - %rv = atomicrmw $rmw $lt* %ptr, $lt %1 acq_rel - ret $lt %rv - """, $typ, Tuple{Ptr{$typ}, $typ}, unsafe_convert(Ptr{$typ}, x), v) - else - rmwop === :xchg || continue - @eval $fn(x::Atomic{$typ}, v::$typ) = - GC.@preserve x llvmcall($""" - %iptr = bitcast i8* %0 to $ilt* - %ival = bitcast $lt %1 to $ilt - %irv = atomicrmw $rmw $ilt* %iptr, $ilt %ival acq_rel - %rv = bitcast $ilt %irv to $lt - ret $lt %rv - """, $typ, Tuple{Ptr{$typ}, $typ}, unsafe_convert(Ptr{$typ}, x), v) - end - end -end - -# Provide atomic floating-point operations via atomic_cas! -const opnames = Dict{Symbol, Symbol}(:+ => :add, :- => :sub) -for op in [:+, :-, :max, :min] - opname = get(opnames, op, op) - @eval function $(Symbol("atomic_", opname, "!"))(var::Atomic{T}, val::T) where T<:FloatTypes - IT = inttype(T) - old = var[] - while true - new = $op(old, val) - cmp = old - old = atomic_cas!(var, cmp, new) - reinterpret(IT, old) == reinterpret(IT, cmp) && return old - # Temporary solution before we have gc transition support in codegen. 
- ccall(:jl_gc_safepoint, Cvoid, ()) - end - end -end +#const nand = (~) ∘ (&) # ComposedFunction generated very poor code quality +nand(x, y) = ~(x & y) + +getindex(x::Atomic) = @atomic :acquire x.value +setindex!(x::Atomic, v) = (@atomic :release x.value = v; x) +atomic_cas!(x::Atomic, cmp, new) = (@atomicreplace :acquire_release :acquire x.value cmp => new).old +atomic_add!(x::Atomic, v) = (@atomic :acquire_release x.value + v).first +atomic_sub!(x::Atomic, v) = (@atomic :acquire_release x.value - v).first +atomic_and!(x::Atomic, v) = (@atomic :acquire_release x.value & v).first +atomic_or!(x::Atomic, v) = (@atomic :acquire_release x.value | v).first +atomic_xor!(x::Atomic, v) = (@atomic :acquire_release x.value ⊻ v).first +atomic_nand!(x::Atomic, v) = (@atomic :acquire_release x.value nand v).first +atomic_xchg!(x::Atomic, v) = (@atomicswap :acquire_release x.value = v) +atomic_min!(x::Atomic, v) = (@atomic :acquire_release x.value min v).first +atomic_max!(x::Atomic, v) = (@atomic :acquire_release x.value max v).first """ Threads.atomic_fence() @@ -462,7 +329,4 @@ fences should not be necessary in most cases. For further details, see LLVM's `fence` instruction. """ -atomic_fence() = llvmcall(""" - fence seq_cst - ret void - """, Cvoid, Tuple{}) +atomic_fence() = Core.Intrinsics.atomic_fence(:sequentially_consistent) diff --git a/test/threads_exec.jl b/test/threads_exec.jl index 629f474f53a38..dc0bc407d2fb5 100644 --- a/test/threads_exec.jl +++ b/test/threads_exec.jl @@ -334,29 +334,12 @@ using Base.Threads end end -# Ensure only LLVM-supported types can be atomic -@test_throws TypeError Atomic{BigInt} -@test_throws TypeError Atomic{ComplexF64} - -if Sys.ARCH === :i686 || startswith(string(Sys.ARCH), "arm") || - Sys.ARCH === :powerpc64le || Sys.ARCH === :ppc64le - - @test_throws TypeError Atomic{Int128}() - @test_throws TypeError Atomic{UInt128}() -end - -if Sys.ARCH === :powerpc64le || Sys.ARCH === :ppc64le - @test_throws TypeError Atomic{Float16}() - @test_throws TypeError Atomic{Float32}() - @test_throws TypeError Atomic{Float64}() -end - function test_atomic_bools() x = Atomic{Bool}(false) - # Arithmetic functions are not defined. 
- @test_throws MethodError atomic_add!(x, true) - @test_throws MethodError atomic_sub!(x, true) - # All the rest are: + # Arithmetic functions such as true+true returns Int + @test_throws TypeError atomic_add!(x, true) + @test_throws TypeError atomic_sub!(x, true) + # All the rest are supported: for v in [true, false] @test x[] == atomic_xchg!(x, v) @test v == atomic_cas!(x, v, !v) @@ -462,10 +445,9 @@ end test_fence() # Test load / store with various types -let atomictypes = intersect((Int8, Int16, Int32, Int64, Int128, - UInt8, UInt16, UInt32, UInt64, UInt128, - Float16, Float32, Float64), - Base.Threads.atomictypes) +let atomictypes = (Int8, Int16, Int32, Int64, Int128, + UInt8, UInt16, UInt32, UInt64, UInt128, + Float16, Float32, Float64) for T in atomictypes var = Atomic{T}() var[] = 42 @@ -493,7 +475,7 @@ function test_atomic_cas!(var::Atomic{T}, range::StepRange{Int,Int}) where T end end end -for T in intersect((Int32, Int64, Float32, Float64), Base.Threads.atomictypes) +for T in (Int32, Int64, Float32, Float64) var = Atomic{T}() nloops = 1000 di = threadpoolsize(:default) @@ -507,7 +489,7 @@ function test_atomic_xchg!(var::Atomic{T}, i::Int, accum::Atomic{Int}) where T old = atomic_xchg!(var, T(i)) atomic_add!(accum, Int(old)) end -for T in intersect((Int32, Int64, Float32, Float64), Base.Threads.atomictypes) +for T in (Int32, Int64, Float32, Float64) accum = Atomic{Int}() var = Atomic{T}() nloops = 1000 @@ -522,7 +504,7 @@ function test_atomic_float(varadd::Atomic{T}, varmax::Atomic{T}, varmin::Atomic{ atomic_max!(varmax, T(i)) atomic_min!(varmin, T(i)) end -for T in intersect((Int32, Int64, Float16, Float32, Float64), Base.Threads.atomictypes) +for T in (Int32, Int64, Float16, Float32, Float64) varadd = Atomic{T}() varmax = Atomic{T}() varmin = Atomic{T}()