diff --git a/src/aotcompile.cpp b/src/aotcompile.cpp index 6009bd435534c..4df5c9090e488 100644 --- a/src/aotcompile.cpp +++ b/src/aotcompile.cpp @@ -42,6 +42,11 @@ #include #include +#ifdef _OS_WINDOWS_ +#include +#else +#include +#endif using namespace llvm; @@ -1000,11 +1005,11 @@ static GlobalVariable *emit_ptls_table(Module &M, Type *T_size, Type *T_ptr) { } // See src/processor.h for documentation about this table. Corresponds to jl_image_header_t. -static GlobalVariable *emit_image_header(Module &M, unsigned threads, unsigned nfvars, unsigned ngvars) { +static GlobalVariable *emit_image_header(Module &M, unsigned shards, unsigned nfvars, unsigned ngvars) { constexpr uint32_t version = 1; std::array header{ version, - threads, + shards, nfvars, ngvars, }; @@ -1024,22 +1029,24 @@ static void get_fvars_gvars(Module &M, DenseMap &fvars, assert(gvars_gv); assert(fvars_idxs); assert(gvars_idxs); - auto fvars_init = cast(fvars_gv->getInitializer()); - auto gvars_init = cast(gvars_gv->getInitializer()); - for (unsigned i = 0; i < fvars_init->getNumOperands(); ++i) { - auto gv = cast(fvars_init->getOperand(i)->stripPointerCasts()); - assert(gv && gv->hasName() && "fvar must be a named global"); - assert(!fvars.count(gv) && "Duplicate fvar"); - fvars[gv] = i; - } - assert(fvars.size() == fvars_init->getNumOperands()); - for (unsigned i = 0; i < gvars_init->getNumOperands(); ++i) { - auto gv = cast(gvars_init->getOperand(i)->stripPointerCasts()); - assert(gv && gv->hasName() && "gvar must be a named global"); - assert(!gvars.count(gv) && "Duplicate gvar"); - gvars[gv] = i; - } - assert(gvars.size() == gvars_init->getNumOperands()); + if (auto fvars_init = dyn_cast(fvars_gv->getInitializer())) { + for (unsigned i = 0; i < fvars_init->getNumOperands(); ++i) { + auto gv = cast(fvars_init->getOperand(i)->stripPointerCasts()); + assert(gv && gv->hasName() && "fvar must be a named global"); + assert(!fvars.count(gv) && "Duplicate fvar"); + fvars[gv] = i; + } + assert(fvars.size() == fvars_init->getNumOperands()); + } + if (auto gvars_init = dyn_cast(gvars_gv->getInitializer())) { + for (unsigned i = 0; i < gvars_init->getNumOperands(); ++i) { + auto gv = cast(gvars_init->getOperand(i)->stripPointerCasts()); + assert(gv && gv->hasName() && "gvar must be a named global"); + assert(!gvars.count(gv) && "Duplicate gvar"); + gvars[gv] = i; + } + assert(gvars.size() == gvars_init->getNumOperands()); + } fvars_gv->eraseFromParent(); gvars_gv->eraseFromParent(); fvars_idxs->eraseFromParent(); @@ -1410,15 +1417,173 @@ struct ShardTimers { } }; +// If an AOTOutput is greater than this many bytes, madvise +// MADV_DONTNEED/MADV_COLD it on Unix to alleviate memory pressure. Except in +// rare cases, this should be triggered only by the output containing the heap +// image. +constexpr size_t jl_large_aotoutput = 64 * 1024 * 1024; // 64 MiB + +class AOTOutput { +public: + // If large = true and we are on Windows, use a temporary file. + AOTOutput(const Twine &prefix, const char *suffix, bool large = false) + : name((prefix + "." + suffix).str()) + { +#ifdef _OS_WINDOWS_ + if (large) { + SmallString<128> path; + SmallVector path_utf16; + auto model = prefix + "-%%%%%%." + suffix; + sys::fs::createUniquePath(model, path, true); + auto fail = [&]() { + jl_errorf("failed to create temporary file: %s", path.c_str()); + }; + if (sys::windows::widenPath(path, path_utf16)) + fail(); + file = + CreateFileW(path_utf16.begin(), GENERIC_READ | GENERIC_WRITE, + FILE_SHARE_READ, nullptr, CREATE_ALWAYS, + FILE_ATTRIBUTE_TEMPORARY | FILE_FLAG_DELETE_ON_CLOSE, nullptr); + if (file == INVALID_HANDLE_VALUE) + fail(); + fd = _open_osfhandle((intptr_t)file, 0); + state = TMP_OPEN; + return; + } +#endif + state = MEMORY; + } + ~AOTOutput() { remove(); } + AOTOutput(const AOTOutput &) = delete; + AOTOutput &operator=(const AOTOutput &) = delete; + AOTOutput(AOTOutput &&other) noexcept + : name(std::move(other.name)), + state(other.state), +#ifdef _OS_WINDOWS_ + file(other.file), + fd(other.fd), +#endif + buf(std::move(other.buf)) + + { + other.state = EMPTY; + } + AOTOutput &operator=(AOTOutput &&other) noexcept + { + remove(); + std::swap(name, other.name); + std::swap(state, other.state); +#ifdef _OS_WINDOWS_ + std::swap(file, other.file); + std::swap(fd, other.fd); +#endif + std::swap(buf, other.buf); + return *this; + } + + std::unique_ptr ostream() + { +#ifdef _OS_WINDOWS_ + if (state == TMP_OPEN) { + return std::make_unique(fd, false); + } +#endif + assert(state == MEMORY); + return std::make_unique(buf); + } + + ErrorOr> memorybuf() + { +#ifdef _OS_WINDOWS_ + if (state == TMP_OPEN) { + sys::fs::file_status status; + if (auto err = sys::fs::status(fd, status)) + return err; + return MemoryBuffer::getOpenFile(file, name, status.getSize(), false); + } +#endif + assert(state == MEMORY); + return MemoryBuffer::getMemBuffer(StringRef{buf.data(), buf.size()}, name, false); + } + + // Signal that we are done with writing to this output for the time being; + // inform the operating system it should page the memory out if we're + // running low. + void done() + { + if (state == MEMORY && buf.size() >= jl_large_aotoutput) { + void *p = (void *)((uintptr_t)buf.data() & ~(jl_page_size - 1)); + size_t s = LLT_ALIGN(buf.size(), jl_page_size); +#if defined(_OS_DARWIN_) || defined(_OS_FREEBSD_) || defined(_OS_OPENBSD_) + if (s > 0) + madvise(p, s, MADV_DONTNEED); +#elif defined(_OS_LINUX_) && defined(MADV_COLD) + if (s > 0) + madvise(p, s, MADV_COLD); +#else + (void)p; + (void)s; +#endif + } + } + + void remove() + { +#ifdef _OS_WINDOWS_ + if (state == TMP_OPEN) { + close(fd); + state = EMPTY; + return; + } +#endif + if (state == MEMORY) { + buf.clear(); + state = EMPTY; + } + } + +private: + std::string name; + enum { + EMPTY, // Temporary file removed/buffer freed + TMP_OPEN, // Temporary file exists and is open, but will be deleted on close (Windows). + MEMORY, // Contents are stored in memory + } state; +#ifdef _OS_WINDOWS_ + HANDLE file; + int fd; +#endif + SmallVector buf; +}; + struct AOTOutputs { - SmallVector unopt, opt, obj, asm_; + AOTOutputs(const char *bc_fname, const char *unopt_bc_fname, const char *obj_fname, + const char *asm_fname) + : bc_fname(bc_fname), + unopt_bc_fname(unopt_bc_fname), + obj_fname(obj_fname), + asm_fname(asm_fname) + { + if (bc_fname) + opt.emplace(); + if (unopt_bc_fname) + unopt.emplace(); + if (obj_fname) + obj.emplace(); + if (asm_fname) + asm_.emplace(); + } + + std::mutex lock; + const char *bc_fname, *unopt_bc_fname, *obj_fname, *asm_fname; + // If one of the vectors is present, this output is being requested. + std::optional> unopt, opt, obj, asm_; }; // Perform the actual optimization and emission of the output files -static AOTOutputs add_output_impl(Module &M, TargetMachine &SourceTM, ShardTimers &timers, - bool unopt, bool opt, bool obj, bool asm_) { - assert((unopt || opt || obj || asm_) && "no output requested"); - AOTOutputs out; +static void add_output_impl(AOTOutputs &outputs, Module &M, TargetMachine &SourceTM, + bool large = false, ShardTimers *timer = nullptr) +{ auto TM = std::unique_ptr( SourceTM.getTarget().createTargetMachine( SourceTM.getTargetTriple().str(), @@ -1429,23 +1594,35 @@ static AOTOutputs add_output_impl(Module &M, TargetMachine &SourceTM, ShardTimer SourceTM.getCodeModel(), SourceTM.getOptLevel())); fixupTM(*TM); - if (unopt) { - timers.unopt.startTimer(); - raw_svector_ostream OS(out.unopt); - PassBuilder PB; - AnalysisManagers AM{*TM, PB, OptimizationLevel::O0}; - ModulePassManager MPM; - MPM.addPass(BitcodeWriterPass(OS)); - MPM.run(M, AM.MAM); - timers.unopt.stopTimer(); - } - if (!opt && !obj && !asm_) { - return out; + if (outputs.unopt) { + if (timer) + timer->unopt.startTimer(); + AOTOutput out{M.getModuleIdentifier(), "unopt.bc", large}; + auto OS = out.ostream(); + { + PassBuilder PB; + AnalysisManagers AM{*TM, PB, OptimizationLevel::O0}; + ModulePassManager MPM; + MPM.addPass(BitcodeWriterPass(*OS)); + MPM.run(M, AM.MAM); + } + if (timer) + timer->unopt.stopTimer(); + OS->flush(); + out.done(); + { + std::lock_guard guard{outputs.lock}; + outputs.unopt->push_back(std::move(out)); + } + } + if (!outputs.opt && !outputs.obj && !outputs.asm_) { + return; } assert(!verifyLLVMIR(M)); { - timers.optimize.startTimer(); + if (timer) + timer->optimize.startTimer(); auto PMTM = std::unique_ptr( SourceTM.getTarget().createTargetMachine( @@ -1507,51 +1684,85 @@ static AOTOutputs add_output_impl(Module &M, TargetMachine &SourceTM, ShardTimer injectCRTAlias(M, "__truncsdbf2", "julia__truncdfbf2", FunctionType::get(Type::getBFloatTy(M.getContext()), { Type::getDoubleTy(M.getContext()) }, false)); } - timers.optimize.stopTimer(); + if (timer) + timer->optimize.stopTimer(); } - if (opt) { - timers.opt.startTimer(); - raw_svector_ostream OS(out.opt); - PassBuilder PB; - AnalysisManagers AM{*TM, PB, OptimizationLevel::O0}; - ModulePassManager MPM; - MPM.addPass(BitcodeWriterPass(OS)); - MPM.run(M, AM.MAM); - timers.opt.stopTimer(); + if (outputs.opt) { + if (timer) + timer->opt.startTimer(); + AOTOutput out{M.getModuleIdentifier(), "bc", large}; + auto OS = out.ostream(); + { + PassBuilder PB; + AnalysisManagers AM{*TM, PB, OptimizationLevel::O0}; + ModulePassManager MPM; + MPM.addPass(BitcodeWriterPass(*OS)); + MPM.run(M, AM.MAM); + } + OS->flush(); + out.done(); + if (timer) + timer->opt.stopTimer(); + { + std::lock_guard guard{outputs.lock}; + outputs.opt->push_back(std::move(out)); + } } - if (obj) { - timers.obj.startTimer(); - raw_svector_ostream OS(out.obj); - legacy::PassManager emitter; - addTargetPasses(&emitter, TM->getTargetTriple(), TM->getTargetIRAnalysis()); + if (outputs.obj) { + if (timer) + timer->obj.startTimer(); + AOTOutput out{M.getModuleIdentifier(), "o", large}; + auto OS = out.ostream(); + { + legacy::PassManager emitter; + addTargetPasses(&emitter, TM->getTargetTriple(), TM->getTargetIRAnalysis()); #if JL_LLVM_VERSION >= 180000 - if (TM->addPassesToEmitFile(emitter, OS, nullptr, CodeGenFileType::ObjectFile, false)) + if (TM->addPassesToEmitFile(emitter, *OS, nullptr, CodeGenFileType::ObjectFile, false)) #else - if (TM->addPassesToEmitFile(emitter, OS, nullptr, CGFT_ObjectFile, false)) + if (TM->addPassesToEmitFile(emitter, *OS, nullptr, CGFT_ObjectFile, false)) #endif - jl_safe_printf("ERROR: target does not support generation of object files\n"); - emitter.run(M); - timers.obj.stopTimer(); + jl_safe_printf("ERROR: target does not support generation of object files\n"); + emitter.run(M); + } + OS->flush(); + out.done(); + if (timer) + timer->obj.stopTimer(); + { + std::lock_guard guard{outputs.lock}; + outputs.obj->push_back(std::move(out)); + } } - if (asm_) { - timers.asm_.startTimer(); - raw_svector_ostream OS(out.asm_); - legacy::PassManager emitter; - addTargetPasses(&emitter, TM->getTargetTriple(), TM->getTargetIRAnalysis()); + if (outputs.asm_) { + if (timer) + timer->asm_.startTimer(); + AOTOutput out{M.getModuleIdentifier(), "s", large}; + auto OS = out.ostream(); + { + legacy::PassManager emitter; + addTargetPasses(&emitter, TM->getTargetTriple(), TM->getTargetIRAnalysis()); #if JL_LLVM_VERSION >= 180000 - if (TM->addPassesToEmitFile(emitter, OS, nullptr, CodeGenFileType::AssemblyFile, false)) + if (TM->addPassesToEmitFile(emitter, *OS, nullptr, + CodeGenFileType::AssemblyFile, false)) #else - if (TM->addPassesToEmitFile(emitter, OS, nullptr, CGFT_AssemblyFile, false)) + if (TM->addPassesToEmitFile(emitter, *OS, nullptr, CGFT_AssemblyFile, false)) #endif - jl_safe_printf("ERROR: target does not support generation of assembly files\n"); - emitter.run(M); - timers.asm_.stopTimer(); + jl_safe_printf( + "ERROR: target does not support generation of assembly files\n"); + emitter.run(M); + } + OS->flush(); + out.done(); + if (timer) + timer->asm_.stopTimer(); + { + std::lock_guard guard{outputs.lock}; + outputs.asm_->push_back(std::move(out)); + } } - - return out; } // serialize module to bitcode @@ -1724,33 +1935,29 @@ extern "C" void lambda_trampoline(void* arg) { delete func; } -// Entrypoint to optionally-multithreaded image compilation. This handles global coordination of the threading, -// as well as partitioning, serialization, and deserialization. template -static SmallVector add_output(Module &M, TargetMachine &TM, StringRef name, unsigned threads, - bool unopt_out, bool opt_out, bool obj_out, bool asm_out, ModuleReleasedFunc module_released) { - SmallVector outputs(threads); - assert(threads); - assert(unopt_out || opt_out || obj_out || asm_out); - // Timers for timing purposes - TimerGroup timer_group("add_output", ("Time to optimize and emit LLVM module " + name).str()); - SmallVector timers(threads); - for (unsigned i = 0; i < threads; ++i) { - auto idx = std::to_string(i); - timers[i].name = "shard_" + idx; - timers[i].desc = ("Timings for " + name + " module shard " + idx).str(); - timers[i].deserialize.init("deserialize_" + idx, "Deserialize module"); - timers[i].materialize.init("materialize_" + idx, "Materialize declarations"); - timers[i].construct.init("construct_" + idx, "Construct partitioned definitions"); - timers[i].unopt.init("unopt_" + idx, "Emit unoptimized bitcode"); - timers[i].optimize.init("optimize_" + idx, "Optimize shard"); - timers[i].opt.init("opt_" + idx, "Emit optimized bitcode"); - timers[i].obj.init("obj_" + idx, "Emit object file"); - timers[i].asm_.init("asm_" + idx, "Emit assembly file"); +static void add_output_no_partition(AOTOutputs &outputs, Module &M, TargetMachine &TM, + StringRef name, bool large, + ModuleReleasedFunc module_released) +{ + { + JL_TIMING(NATIVE_AOT, NATIVE_Opt); + // convert gvars to the expected offset table format for shard 0 + if (M.getGlobalVariable("jl_gvars")) { + auto gvars = consume_gv(M, "jl_gvars", false); + Type *T_size = M.getDataLayout().getIntPtrType(M.getContext()); + emit_offset_table(M, T_size, gvars, "jl_gvar", + "_0"); // module flag "julia.mv.suffix" + M.getGlobalVariable("jl_gvar_idxs")->setName("jl_gvar_idxs_0"); + } + add_output_impl(outputs, M, TM, large); } - Timer partition_timer("partition", "Partition module", timer_group); - Timer serialize_timer("serialize", "Serialize module", timer_group); - Timer output_timer("output", "Add outputs", timer_group); + // Don't need M anymore + module_released(M); +} + +static bool should_report_image_timings() +{ bool report_timings = false; if (auto env = getenv("JULIA_IMAGE_TIMINGS")) { char *endptr; @@ -1766,35 +1973,46 @@ static SmallVector add_output(Module &M, TargetMachine &TM, Stri errs() << "WARNING: Invalid value for JULIA_IMAGE_TIMINGS: " << env << "\n"; } } - // Single-threaded case - if (threads == 1) { - output_timer.startTimer(); - { - JL_TIMING(NATIVE_AOT, NATIVE_Opt); - // convert gvars to the expected offset table format for shard 0 - if (M.getGlobalVariable("jl_gvars")) { - auto gvars = consume_gv(M, "jl_gvars", false); - Type *T_size = M.getDataLayout().getIntPtrType(M.getContext()); - emit_offset_table(M, T_size, gvars, "jl_gvar", "_0"); // module flag "julia.mv.suffix" - M.getGlobalVariable("jl_gvar_idxs")->setName("jl_gvar_idxs_0"); - } - outputs[0] = add_output_impl(M, TM, timers[0], unopt_out, opt_out, obj_out, asm_out); - } - output_timer.stopTimer(); - // Don't need M anymore - module_released(M); + return report_timings; +} - if (!report_timings) { - timer_group.clear(); - } else { - timer_group.print(dbgs(), true); - for (auto &t : timers) { - t.print(dbgs(), true); - } - } - return outputs; +static void initialize_shard_timers(StringRef name, SmallVector &timers) +{ + for (unsigned i = 0; i < timers.size(); ++i) { + auto idx = std::to_string(i); + timers[i].name = "shard_" + idx; + timers[i].desc = ("Timings for " + name + " module shard " + idx).str(); + timers[i].deserialize.init("deserialize_" + idx, "Deserialize module"); + timers[i].materialize.init("materialize_" + idx, "Materialize declarations"); + timers[i].construct.init("construct_" + idx, "Construct partitioned definitions"); + timers[i].unopt.init("unopt_" + idx, "Emit unoptimized bitcode"); + timers[i].optimize.init("optimize_" + idx, "Optimize shard"); + timers[i].opt.init("opt_" + idx, "Emit optimized bitcode"); + timers[i].obj.init("obj_" + idx, "Emit object file"); + timers[i].asm_.init("asm_" + idx, "Emit assembly file"); + } +} + +// Entrypoint to optionally-multithreaded image compilation. This handles global coordination of the threading, +// as well as partitioning, serialization, and deserialization. +template +static void add_output(AOTOutputs &outputs, Module &M, TargetMachine &TM, StringRef name, + unsigned threads, unsigned shards, + ModuleReleasedFunc module_released) +{ + assert(threads); + if (shards <= 1) { + add_output_no_partition(outputs, M, TM, name, false, module_released); + return; } + // Timers for timing purposes + TimerGroup timer_group("add_output", ("Time to optimize and emit LLVM module " + name).str()); + Timer partition_timer("partition", "Partition module", timer_group); + Timer serialize_timer("serialize", "Serialize module", timer_group); + Timer output_timer("output", "Add outputs", timer_group); + bool report_timings = should_report_image_timings(); + partition_timer.startTimer(); uint64_t counter = 0; // Partitioning requires all globals to have names. @@ -1804,7 +2022,8 @@ static SmallVector add_output(Module &M, TargetMachine &TM, Stri G.setName("jl_ext_" + Twine(counter++)); } } - auto partitions = partitionModule(M, threads); + + auto partitions = partitionModule(M, shards); partition_timer.stopTimer(); serialize_timer.startTimer(); @@ -1814,46 +2033,59 @@ static SmallVector add_output(Module &M, TargetMachine &TM, Stri // Don't need M anymore, since we'll only read from serialized from now on module_released(M); + SmallVector timers(shards); + initialize_shard_timers(name, timers); + + std::atomic next_part = 0; + output_timer.startTimer(); // Start all of the worker threads { JL_TIMING(NATIVE_AOT, NATIVE_Opt); std::vector workers(threads); - for (unsigned i = 0; i < threads; i++) { - std::function func = [&, i]() { - LLVMContext ctx; - ctx.setDiscardValueNames(true); - // Lazily deserialize the entire module - timers[i].deserialize.startTimer(); - auto EM = getLazyBitcodeModule(MemoryBufferRef(StringRef(serialized.data(), serialized.size()), "Optimized"), ctx); - // Make sure this also fails with only julia, but not LLVM assertions enabled, - // otherwise, the first error we hit is the LLVM module verification failure, - // which will look very confusing, because the module was partially deserialized. - bool deser_succeeded = (bool)EM; - auto M = cantFail(std::move(EM), "Error loading module"); - assert(deser_succeeded); (void)deser_succeeded; - timers[i].deserialize.stopTimer(); - - timers[i].materialize.startTimer(); - materializePreserved(*M, partitions[i]); - timers[i].materialize.stopTimer(); - - timers[i].construct.startTimer(); - std::string suffix = "_" + std::to_string(i); - construct_vars(*M, partitions[i], suffix); - M->setModuleFlag(Module::Error, "julia.mv.suffix", MDString::get(M->getContext(), suffix)); - // The DICompileUnit file is not used for anything, but ld64 requires it be a unique string per object file - // or it may skip emitting debug info for that file. Here set it to ./julia#N - DIFile *topfile = DIFile::get(M->getContext(), "julia#" + std::to_string(i), "."); - for (DICompileUnit *CU : M->debug_compile_units()) - CU->replaceOperandWith(0, topfile); - timers[i].construct.stopTimer(); - - outputs[i] = add_output_impl(*M, TM, timers[i], unopt_out, opt_out, obj_out, asm_out); + for (unsigned tid = 0; tid < threads; tid++) { + std::function func = [&]() { + while (1) { + unsigned i = std::atomic_fetch_add(&next_part, 1); + if (i >= shards) + return; + + Partition &partition = partitions[i]; + LLVMContext ctx; + ctx.setDiscardValueNames(true); + // Lazily deserialize the entire module + timers[i].deserialize.startTimer(); + auto EM = getLazyBitcodeModule(MemoryBufferRef(StringRef(serialized.data(), serialized.size()), name), ctx); + // Make sure this also fails with only julia, but not LLVM assertions enabled, + // otherwise, the first error we hit is the LLVM module verification failure, + // which will look very confusing, because the module was partially deserialized. + bool deser_succeeded = (bool)EM; + auto M = cantFail(std::move(EM), "Error loading module"); + assert(deser_succeeded); (void)deser_succeeded; + timers[i].deserialize.stopTimer(); + + timers[i].materialize.startTimer(); + materializePreserved(*M, partition); + timers[i].materialize.stopTimer(); + + timers[i].construct.startTimer(); + std::string suffix = "_" + std::to_string(i); + construct_vars(*M, partition, suffix); + M->setModuleIdentifier((Twine(M->getModuleIdentifier()) + "#" + Twine(i)).str()); + M->setModuleFlag(Module::Error, "julia.mv.suffix", MDString::get(M->getContext(), suffix)); + // The DICompileUnit file is not used for anything, but ld64 requires it be a unique string per object file + // or it may skip emitting debug info for that file. Here set it to ./julia#N + DIFile *topfile = DIFile::get(M->getContext(), "julia#" + std::to_string(i), "."); + for (DICompileUnit *CU : M->debug_compile_units()) + CU->replaceOperandWith(0, topfile); + timers[i].construct.stopTimer(); + + add_output_impl(outputs, *M, TM, false, &timers[i]); + } }; auto arg = new std::function(func); - uv_thread_create(&workers[i], lambda_trampoline, arg); // Use libuv thread to avoid issues with stack sizes + uv_thread_create(&workers[tid], lambda_trampoline, arg); // Use libuv thread to avoid issues with stack sizes } // Wait for all of the worker threads to finish @@ -1881,7 +2113,6 @@ static SmallVector add_output(Module &M, TargetMachine &TM, Stri } dbgs() << "]\n"; } - return outputs; } extern int jl_is_timing_passes; @@ -2016,13 +2247,8 @@ void jl_dump_native_impl(void *native_code, OverrideStackAlignment = M.getOverrideStackAlignment(); }); - auto compile = [&](Module &M, StringRef name, unsigned threads, auto module_released) { - return add_output(M, *SourceTM, name, threads, !!unopt_bc_fname, !!bc_fname, !!obj_fname, !!asm_fname, module_released); - }; + AOTOutputs outputs{bc_fname, unopt_bc_fname, obj_fname, asm_fname}; - SmallVector sysimg_outputs; - SmallVector data_outputs; - SmallVector metadata_outputs; if (z) { JL_TIMING(NATIVE_AOT, NATIVE_Sysimg); LLVMContext Context; @@ -2053,14 +2279,16 @@ void jl_dump_native_impl(void *native_code, // Results in serious memory savings ios_close(z); free(z); - // Note that we don't set z to null, this allows the check in WRITE_ARCHIVE + // Note that we don't set z to null, this allows the check in write_archive // to function as expected // no need to free the module/context, destructor handles that - sysimg_outputs = compile(sysimgM, "sysimg", 1, [](Module &) {}); + add_output_no_partition(outputs, sysimgM, *SourceTM, "sysimg", true, + [](Module &) {}); } const bool imaging_mode = true; unsigned threads = 1; + unsigned nshards = 1; unsigned nfvars = 0; unsigned ngvars = 0; @@ -2118,6 +2346,17 @@ void jl_dump_native_impl(void *native_code, ); threads = compute_image_thread_count(module_info); LLVM_DEBUG(dbgs() << "Using " << threads << " to emit aot image\n"); + + char *weight_s = getenv("JULIA_IMAGE_PARTITION_WEIGHT"); + size_t weight = 500000; + char *end; + if (weight_s) { + size_t x = strtol(weight_s, &end, 10); + if (weight_s != end) + weight = x; + } + nshards = std::max(1, module_info.weight / weight); + nfvars = data->jl_sysimg_fvars.size(); ngvars = data->jl_sysimg_gvars.size(); emit_table(dataM, data->jl_sysimg_gvars, "jl_gvars", T_psize); @@ -2160,15 +2399,15 @@ void jl_dump_native_impl(void *native_code, auto TSCtx = data->M.getContext(); auto lock = TSCtx.getLock(); auto dataM = data->M.getModuleUnlocked(); - - data_outputs = compile(*dataM, "text", threads, [data, &lock, &TSCtx](Module &) { - // Delete data when add_output thinks it's done with it - // Saves memory for use when multithreading - auto lock2 = std::move(lock); - delete data; - // Drop last reference to shared LLVM::Context - auto TSCtx2 = std::move(TSCtx); - }); + add_output(outputs, *dataM, *SourceTM, "text", threads, nshards, + [data, &lock, &TSCtx](Module &) { + // Delete data when add_output thinks it's done with it + // Saves memory for use when multithreading + auto lock2 = std::move(lock); + delete data; + // Drop last reference to shared LLVM::Context + auto TSCtx2 = std::move(TSCtx); + }); } if (params->emit_metadata) { @@ -2229,9 +2468,9 @@ void jl_dump_native_impl(void *native_code, auto target_ids = new GlobalVariable(metadataM, value->getType(), true, GlobalVariable::InternalLinkage, value, "jl_dispatch_target_ids"); - auto shards = emit_shard_table(metadataM, T_size, T_psize, threads); + auto shards = emit_shard_table(metadataM, T_size, T_psize, nshards); auto ptls = emit_ptls_table(metadataM, T_size, T_ptr); - auto header = emit_image_header(metadataM, threads, nfvars, ngvars); + auto header = emit_image_header(metadataM, nshards, nfvars, ngvars); auto AT = ArrayType::get(T_size, sizeof(jl_small_typeof) / sizeof(void*)); auto jl_small_typeof_copy = new GlobalVariable(metadataM, AT, false, GlobalVariable::ExternalLinkage, @@ -2258,7 +2497,8 @@ void jl_dump_native_impl(void *native_code, } // no need to free module/context, destructor handles that - metadata_outputs = compile(metadataM, "data", 1, [](Module &) {}); + add_output_no_partition(outputs, metadataM, *SourceTM, "data", false, + [](Module &) {}); } { @@ -2270,32 +2510,32 @@ void jl_dump_native_impl(void *native_code, #else #define WritingMode true #endif -#define WRITE_ARCHIVE(fname, field, prefix, suffix) \ - if (fname) {\ - SmallVector archive; \ - SmallVector filenames; \ - SmallVector buffers; \ - for (size_t i = 0; i < threads; i++) { \ - filenames.push_back((StringRef("text") + prefix + "#" + Twine(i) + suffix).str()); \ - buffers.push_back(StringRef(data_outputs[i].field.data(), data_outputs[i].field.size())); \ - } \ - filenames.push_back("metadata" prefix suffix); \ - buffers.push_back(StringRef(metadata_outputs[0].field.data(), metadata_outputs[0].field.size())); \ - if (z) { \ - filenames.push_back("sysimg" prefix suffix); \ - buffers.push_back(StringRef(sysimg_outputs[0].field.data(), sysimg_outputs[0].field.size())); \ - } \ - for (size_t i = 0; i < filenames.size(); i++) { \ - archive.push_back(NewArchiveMember(MemoryBufferRef(buffers[i], filenames[i]))); \ - } \ - handleAllErrors(writeArchive(fname, archive, WritingMode, Kind, true, false), reportWriterError); \ - } - - WRITE_ARCHIVE(unopt_bc_fname, unopt, "_unopt", ".bc"); - WRITE_ARCHIVE(bc_fname, opt, "_opt", ".bc"); - WRITE_ARCHIVE(obj_fname, obj, "", ".o"); - WRITE_ARCHIVE(asm_fname, asm_, "", ".s"); -#undef WRITE_ARCHIVE + auto write_archive = [&](const char *fname, SmallVector &outputs) { + if (!fname) + return; + SmallVector archive; + // Must be SmallString<0> so StringRefs in NewArchiveMembers aren't invalidated + SmallVector, 0> buffers; + for (auto &out : outputs) { + auto buf = out.memorybuf(); + if (buf.getError()) + jl_errorf("failed to read temporary object file: %s", + buf.getError().message().c_str()); + buffers.push_back(std::move(*buf)); + archive.push_back(NewArchiveMember{*buffers.back()}); + } + handleAllErrors(writeArchive(fname, archive, WritingMode, Kind, true, false), + reportWriterError); + }; + + if (outputs.unopt) + write_archive(unopt_bc_fname, *outputs.unopt); + if (outputs.opt) + write_archive(bc_fname, *outputs.opt); + if (outputs.obj) + write_archive(obj_fname, *outputs.obj); + if (outputs.asm_) + write_archive(asm_fname, *outputs.asm_); } } diff --git a/src/cgmemmgr.cpp b/src/cgmemmgr.cpp index c257d2a2e3331..7cf358ddf1e95 100644 --- a/src/cgmemmgr.cpp +++ b/src/cgmemmgr.cpp @@ -3,7 +3,11 @@ #include "llvm-version.h" #include "platform.h" +#include +#include +#include #include + #include "julia.h" #include "julia_internal.h" @@ -458,18 +462,27 @@ struct Block { } }; +struct Allocation { + // Address to write to (the one returned by the allocation function) + void *wr_addr; + // Runtime address + void *rt_addr; + size_t sz; + bool relocated; +}; + class RWAllocator { static constexpr int nblocks = 8; Block blocks[nblocks]{}; public: RWAllocator() JL_NOTSAFEPOINT = default; - void *alloc(size_t size, size_t align) JL_NOTSAFEPOINT + Allocation alloc(size_t size, size_t align) JL_NOTSAFEPOINT { size_t min_size = (size_t)-1; int min_id = 0; for (int i = 0;i < nblocks && blocks[i].ptr;i++) { if (void *ptr = blocks[i].alloc(size, align)) - return ptr; + return {ptr, ptr, size, false}; if (blocks[i].avail < min_size) { min_size = blocks[i].avail; min_id = i; @@ -477,7 +490,8 @@ class RWAllocator { } size_t block_size = get_block_size(size); blocks[min_id].reset(map_anon_page(block_size), block_size); - return blocks[min_id].alloc(size, align); + void *ptr = blocks[min_id].alloc(size, align); + return {ptr, ptr, size, false}; } }; @@ -517,16 +531,6 @@ struct SplitPtrBlock : public Block { } }; -struct Allocation { - // Address to write to (the one returned by the allocation function) - void *wr_addr; - // Runtime address - void *rt_addr; - size_t sz; - bool relocated; -}; - -template class ROAllocator { protected: static constexpr int nblocks = 8; @@ -554,7 +558,7 @@ class ROAllocator { } // Allocations that have not been finalized yet. SmallVector allocations; - void *alloc(size_t size, size_t align) JL_NOTSAFEPOINT + Allocation alloc(size_t size, size_t align) JL_NOTSAFEPOINT { size_t min_size = (size_t)-1; int min_id = 0; @@ -570,8 +574,9 @@ class ROAllocator { wr_ptr = get_wr_ptr(block, ptr, size, align); } block.state |= SplitPtrBlock::Alloc; - allocations.push_back(Allocation{wr_ptr, ptr, size, false}); - return wr_ptr; + Allocation a{wr_ptr, ptr, size, false}; + allocations.push_back(a); + return a; } if (block.avail < min_size) { min_size = block.avail; @@ -592,18 +597,21 @@ class ROAllocator { #ifdef _OS_WINDOWS_ block.state = SplitPtrBlock::Alloc; void *wr_ptr = get_wr_ptr(block, ptr, size, align); - allocations.push_back(Allocation{wr_ptr, ptr, size, false}); + Allocation a{wr_ptr, ptr, size, false}; + allocations.push_back(a); ptr = wr_ptr; #else block.state = SplitPtrBlock::Alloc | SplitPtrBlock::InitAlloc; - allocations.push_back(Allocation{ptr, ptr, size, false}); + Allocation a{ptr, ptr, size, false}; + allocations.push_back(a); #endif - return ptr; + return a; } }; -template -class DualMapAllocator : public ROAllocator { +class DualMapAllocator : public ROAllocator { + bool exec; + protected: void *get_wr_ptr(SplitPtrBlock &block, void *rt_ptr, size_t, size_t) override JL_NOTSAFEPOINT { @@ -664,7 +672,7 @@ class DualMapAllocator : public ROAllocator { } } public: - DualMapAllocator() JL_NOTSAFEPOINT + DualMapAllocator(bool exec) JL_NOTSAFEPOINT : exec(exec) { assert(anon_hdl != -1); } @@ -677,13 +685,13 @@ class DualMapAllocator : public ROAllocator { finalize_block(block, true); block.reset(nullptr, 0); } - ROAllocator::finalize(); + ROAllocator::finalize(); } }; #ifdef _OS_LINUX_ -template -class SelfMemAllocator : public ROAllocator { +class SelfMemAllocator : public ROAllocator { + bool exec; SmallVector temp_buff; protected: void *get_wr_ptr(SplitPtrBlock &block, void *rt_ptr, @@ -720,9 +728,7 @@ class SelfMemAllocator : public ROAllocator { } } public: - SelfMemAllocator() JL_NOTSAFEPOINT - : ROAllocator(), - temp_buff() + SelfMemAllocator(bool exec) JL_NOTSAFEPOINT : exec(exec), temp_buff() { assert(get_self_mem_fd() != -1); } @@ -756,11 +762,25 @@ class SelfMemAllocator : public ROAllocator { } if (cached) temp_buff.resize(1); - ROAllocator::finalize(); + ROAllocator::finalize(); } }; #endif // _OS_LINUX_ +std::pair, std::unique_ptr> +get_preferred_allocators() JL_NOTSAFEPOINT +{ +#ifdef _OS_LINUX_ + if (get_self_mem_fd() != -1) + return {std::make_unique(false), + std::make_unique(true)}; +#endif + if (init_shared_map() != -1) + return {std::make_unique(false), + std::make_unique(true)}; + return {}; +} + class RTDyldMemoryManagerJL : public SectionMemoryManager { struct EHFrame { uint8_t *addr; @@ -770,8 +790,8 @@ class RTDyldMemoryManagerJL : public SectionMemoryManager { void operator=(const RTDyldMemoryManagerJL&) = delete; SmallVector pending_eh; RWAllocator rw_alloc; - std::unique_ptr> ro_alloc; - std::unique_ptr> exe_alloc; + std::unique_ptr ro_alloc; + std::unique_ptr exe_alloc; size_t total_allocated; public: @@ -779,20 +799,9 @@ class RTDyldMemoryManagerJL : public SectionMemoryManager { : SectionMemoryManager(), pending_eh(), rw_alloc(), - ro_alloc(), - exe_alloc(), total_allocated(0) { -#ifdef _OS_LINUX_ - if (!ro_alloc && get_self_mem_fd() != -1) { - ro_alloc.reset(new SelfMemAllocator()); - exe_alloc.reset(new SelfMemAllocator()); - } -#endif - if (!ro_alloc && init_shared_map() != -1) { - ro_alloc.reset(new DualMapAllocator()); - exe_alloc.reset(new DualMapAllocator()); - } + std::tie(ro_alloc, exe_alloc) = get_preferred_allocators(); } ~RTDyldMemoryManagerJL() override JL_NOTSAFEPOINT { @@ -845,7 +854,7 @@ uint8_t *RTDyldMemoryManagerJL::allocateCodeSection(uintptr_t Size, jl_timing_counter_inc(JL_TIMING_COUNTER_JITSize, Size); jl_timing_counter_inc(JL_TIMING_COUNTER_JITCodeSize, Size); if (exe_alloc) - return (uint8_t*)exe_alloc->alloc(Size, Alignment); + return (uint8_t*)exe_alloc->alloc(Size, Alignment).wr_addr; return SectionMemoryManager::allocateCodeSection(Size, Alignment, SectionID, SectionName); } @@ -860,9 +869,9 @@ uint8_t *RTDyldMemoryManagerJL::allocateDataSection(uintptr_t Size, jl_timing_counter_inc(JL_TIMING_COUNTER_JITSize, Size); jl_timing_counter_inc(JL_TIMING_COUNTER_JITDataSize, Size); if (!isReadOnly) - return (uint8_t*)rw_alloc.alloc(Size, Alignment); + return (uint8_t*)rw_alloc.alloc(Size, Alignment).wr_addr; if (ro_alloc) - return (uint8_t*)ro_alloc->alloc(Size, Alignment); + return (uint8_t*)ro_alloc->alloc(Size, Alignment).wr_addr; return SectionMemoryManager::allocateDataSection(Size, Alignment, SectionID, SectionName, isReadOnly); } @@ -917,6 +926,133 @@ void RTDyldMemoryManagerJL::deregisterEHFrames(uint8_t *Addr, } #endif +class JLJITLinkMemoryManager : public jitlink::JITLinkMemoryManager { + using OnFinalizedFunction = + jitlink::JITLinkMemoryManager::InFlightAlloc::OnFinalizedFunction; + + std::mutex Mutex; + RWAllocator RWAlloc; + std::unique_ptr ROAlloc; + std::unique_ptr ExeAlloc; + SmallVector FinalizedCallbacks; + uint32_t InFlight{0}; + +public: + class InFlightAlloc; + + static std::unique_ptr Create() + { + auto [ROAlloc, ExeAlloc] = get_preferred_allocators(); + if (ROAlloc && ExeAlloc) + return std::unique_ptr( + new JLJITLinkMemoryManager(std::move(ROAlloc), std::move(ExeAlloc))); + + return cantFail( + orc::MapperJITLinkMemoryManager::CreateWithMapper( + /*Reservation Granularity*/ 16 * 1024 * 1024)); + } + + void allocate(const jitlink::JITLinkDylib *JD, jitlink::LinkGraph &G, + OnAllocatedFunction OnAllocated) override; + + void deallocate(std::vector Allocs, + OnDeallocatedFunction OnDeallocated) override + { + jl_unreachable(); + } + +protected: + JLJITLinkMemoryManager(std::unique_ptr ROAlloc, + std::unique_ptr ExeAlloc) + : ROAlloc(std::move(ROAlloc)), ExeAlloc(std::move(ExeAlloc)) + { + } + + void finalize(OnFinalizedFunction OnFinalized) + { + SmallVector Callbacks; + { + std::unique_lock Lock{Mutex}; + FinalizedCallbacks.push_back(std::move(OnFinalized)); + + if (--InFlight > 0) + return; + + ROAlloc->finalize(); + ExeAlloc->finalize(); + Callbacks = std::move(FinalizedCallbacks); + } + + for (auto &CB : Callbacks) + std::move(CB)(FinalizedAlloc{}); + } +}; + +class JLJITLinkMemoryManager::InFlightAlloc + : public jitlink::JITLinkMemoryManager::InFlightAlloc { + JLJITLinkMemoryManager &MM; + jitlink::LinkGraph &G; + +public: + InFlightAlloc(JLJITLinkMemoryManager &MM, jitlink::LinkGraph &G) : MM(MM), G(G) {} + + void abandon(OnAbandonedFunction OnAbandoned) override { jl_unreachable(); } + + void finalize(OnFinalizedFunction OnFinalized) override + { + auto *GP = &G; + MM.finalize([GP, OnFinalized = + std::move(OnFinalized)](Expected FA) mutable { + if (!FA) + return OnFinalized(FA.takeError()); + // Need to handle dealloc actions when we GC code + auto E = orc::shared::runFinalizeActions(GP->allocActions()); + if (!E) + return OnFinalized(E.takeError()); + OnFinalized(std::move(FA)); + }); + } +}; + +using orc::MemProt; + +void JLJITLinkMemoryManager::allocate(const jitlink::JITLinkDylib *JD, + jitlink::LinkGraph &G, + OnAllocatedFunction OnAllocated) +{ + jitlink::BasicLayout BL{G}; + + { + std::unique_lock Lock{Mutex}; + for (auto &[AG, Seg] : BL.segments()) { + if (AG.getMemLifetime() == orc::MemLifetime::NoAlloc) + continue; + assert(AG.getMemLifetime() == orc::MemLifetime::Standard); + + auto Prot = AG.getMemProt(); + uint64_t Alignment = Seg.Alignment.value(); + uint64_t Size = Seg.ContentSize + Seg.ZeroFillSize; + Allocation Alloc; + if (Prot == (MemProt::Read | MemProt::Write)) + Alloc = RWAlloc.alloc(Size, Alignment); + else if (Prot == MemProt::Read) + Alloc = ROAlloc->alloc(Size, Alignment); + else if (Prot == (MemProt::Read | MemProt::Exec)) + Alloc = ExeAlloc->alloc(Size, Alignment); + else + abort(); + + Seg.Addr = orc::ExecutorAddr::fromPtr(Alloc.rt_addr); + Seg.WorkingMem = (char *)Alloc.wr_addr; + } + } + + if (auto Err = BL.apply()) + return OnAllocated(std::move(Err)); + + ++InFlight; + OnAllocated(std::make_unique(*this, G)); +} } RTDyldMemoryManager* createRTDyldMemoryManager() JL_NOTSAFEPOINT @@ -928,3 +1064,8 @@ size_t getRTDyldMemoryManagerTotalBytes(RTDyldMemoryManager *mm) JL_NOTSAFEPOINT { return ((RTDyldMemoryManagerJL*)mm)->getTotalBytes(); } + +std::unique_ptr createJITLinkMemoryManager() +{ + return JLJITLinkMemoryManager::Create(); +} diff --git a/src/jitlayers.cpp b/src/jitlayers.cpp index 3ea95ea42f596..299e5d39d2c9f 100644 --- a/src/jitlayers.cpp +++ b/src/jitlayers.cpp @@ -1156,12 +1156,6 @@ class JLMemoryUsagePlugin : public ObjectLinkingLayer::Plugin { #pragma clang diagnostic ignored "-Wunused-function" #endif -// TODO: Port our memory management optimisations to JITLink instead of using the -// default InProcessMemoryManager. -std::unique_ptr createJITLinkMemoryManager() JL_NOTSAFEPOINT { - return cantFail(orc::MapperJITLinkMemoryManager::CreateWithMapper(/*Reservation Granularity*/ 16 * 1024 * 1024)); -} - #ifdef _COMPILER_CLANG_ #pragma clang diagnostic pop #endif @@ -1185,6 +1179,7 @@ class JLEHFrameRegistrar final : public jitlink::EHFrameRegistrar { }; RTDyldMemoryManager *createRTDyldMemoryManager(void) JL_NOTSAFEPOINT; +std::unique_ptr createJITLinkMemoryManager() JL_NOTSAFEPOINT; // A simple forwarding class, since OrcJIT v2 needs a unique_ptr, while we have a shared_ptr class ForwardingMemoryManager : public RuntimeDyld::MemoryManager {