Skip to content

Commit

Permalink
Add extra profiling events to JIT/AOT compilation (#50610)
Browse files Browse the repository at this point in the history
  • Loading branch information
pchintalapudi authored Jul 21, 2023
1 parent 049de79 commit bf00ff4
Show file tree
Hide file tree
Showing 4 changed files with 160 additions and 138 deletions.
278 changes: 149 additions & 129 deletions src/aotcompile.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,7 @@ void *jl_create_native_impl(jl_array_t *methods, LLVMOrcThreadSafeModuleRef llvm
params.external_linkage = _external_linkage;
size_t compile_for[] = { jl_typeinf_world, _world };
for (int worlds = 0; worlds < 2; worlds++) {
JL_TIMING(NATIVE_AOT, NATIVE_Codegen);
params.world = compile_for[worlds];
if (!params.world)
continue;
Expand Down Expand Up @@ -390,37 +391,40 @@ void *jl_create_native_impl(jl_array_t *methods, LLVMOrcThreadSafeModuleRef llvm

// clones the contents of the module `m` to the shadow_output collector
// while examining and recording what kind of function pointer we have
Linker L(*clone.getModuleUnlocked());
for (auto &def : emitted) {
jl_merge_module(clone, std::move(std::get<0>(def.second)));
jl_code_instance_t *this_code = def.first;
jl_llvm_functions_t decls = std::get<1>(def.second);
StringRef func = decls.functionObject;
StringRef cfunc = decls.specFunctionObject;
uint32_t func_id = 0;
uint32_t cfunc_id = 0;
if (func == "jl_fptr_args") {
func_id = -1;
}
else if (func == "jl_fptr_sparam") {
func_id = -2;
}
else {
//Safe b/c context is locked by params
data->jl_sysimg_fvars.push_back(cast<Function>(clone.getModuleUnlocked()->getNamedValue(func)));
func_id = data->jl_sysimg_fvars.size();
{
JL_TIMING(NATIVE_AOT, NATIVE_Merge);
Linker L(*clone.getModuleUnlocked());
for (auto &def : emitted) {
jl_merge_module(clone, std::move(std::get<0>(def.second)));
jl_code_instance_t *this_code = def.first;
jl_llvm_functions_t decls = std::get<1>(def.second);
StringRef func = decls.functionObject;
StringRef cfunc = decls.specFunctionObject;
uint32_t func_id = 0;
uint32_t cfunc_id = 0;
if (func == "jl_fptr_args") {
func_id = -1;
}
else if (func == "jl_fptr_sparam") {
func_id = -2;
}
else {
//Safe b/c context is locked by params
data->jl_sysimg_fvars.push_back(cast<Function>(clone.getModuleUnlocked()->getNamedValue(func)));
func_id = data->jl_sysimg_fvars.size();
}
if (!cfunc.empty()) {
//Safe b/c context is locked by params
data->jl_sysimg_fvars.push_back(cast<Function>(clone.getModuleUnlocked()->getNamedValue(cfunc)));
cfunc_id = data->jl_sysimg_fvars.size();
}
data->jl_fvar_map[this_code] = std::make_tuple(func_id, cfunc_id);
}
if (!cfunc.empty()) {
//Safe b/c context is locked by params
data->jl_sysimg_fvars.push_back(cast<Function>(clone.getModuleUnlocked()->getNamedValue(cfunc)));
cfunc_id = data->jl_sysimg_fvars.size();
if (params._shared_module) {
bool error = L.linkInModule(std::move(params._shared_module));
assert(!error && "Error linking in shared module");
(void)error;
}
data->jl_fvar_map[this_code] = std::make_tuple(func_id, cfunc_id);
}
if (params._shared_module) {
bool error = L.linkInModule(std::move(params._shared_module));
assert(!error && "Error linking in shared module");
(void)error;
}

// now get references to the globals in the merged module
Expand Down Expand Up @@ -986,58 +990,60 @@ static AOTOutputs add_output_impl(Module &M, TargetMachine &SourceTM, ShardTimer
}
assert(!verifyLLVMIR(M));

timers.optimize.startTimer();
{
timers.optimize.startTimer();

#ifndef JL_USE_NEW_PM
legacy::PassManager optimizer;
addTargetPasses(&optimizer, TM->getTargetTriple(), TM->getTargetIRAnalysis());
addOptimizationPasses(&optimizer, jl_options.opt_level, true, true);
addMachinePasses(&optimizer, jl_options.opt_level);
legacy::PassManager optimizer;
addTargetPasses(&optimizer, TM->getTargetTriple(), TM->getTargetIRAnalysis());
addOptimizationPasses(&optimizer, jl_options.opt_level, true, true);
addMachinePasses(&optimizer, jl_options.opt_level);
#else

auto PMTM = std::unique_ptr<TargetMachine>(
SourceTM.getTarget().createTargetMachine(
SourceTM.getTargetTriple().str(),
SourceTM.getTargetCPU(),
SourceTM.getTargetFeatureString(),
SourceTM.Options,
SourceTM.getRelocationModel(),
SourceTM.getCodeModel(),
SourceTM.getOptLevel()));
NewPM optimizer{std::move(PMTM), getOptLevel(jl_options.opt_level), OptimizationOptions::defaults(true, true)};
auto PMTM = std::unique_ptr<TargetMachine>(
SourceTM.getTarget().createTargetMachine(
SourceTM.getTargetTriple().str(),
SourceTM.getTargetCPU(),
SourceTM.getTargetFeatureString(),
SourceTM.Options,
SourceTM.getRelocationModel(),
SourceTM.getCodeModel(),
SourceTM.getOptLevel()));
NewPM optimizer{std::move(PMTM), getOptLevel(jl_options.opt_level), OptimizationOptions::defaults(true, true)};
#endif
optimizer.run(M);
assert(!verifyLLVMIR(M));
bool inject_aliases = false;
for (auto &F : M.functions()) {
if (!F.isDeclaration() && F.getName() != "_DllMainCRTStartup") {
inject_aliases = true;
break;
optimizer.run(M);
assert(!verifyLLVMIR(M));
bool inject_aliases = false;
for (auto &F : M.functions()) {
if (!F.isDeclaration() && F.getName() != "_DllMainCRTStartup") {
inject_aliases = true;
break;
}
}
}
// no need to inject aliases if we have no functions
// no need to inject aliases if we have no functions

if (inject_aliases) {
if (inject_aliases) {
#if JULIA_FLOAT16_ABI == 1
// We would like to emit an alias or a weakref alias to redirect these symbols
// but LLVM doesn't let us emit a GlobalAlias to a declaration...
// So for now we inject a definition of these functions that calls our runtime
// functions. We do so after optimization to avoid cloning these functions.
injectCRTAlias(M, "__gnu_h2f_ieee", "julia__gnu_h2f_ieee",
FunctionType::get(Type::getFloatTy(M.getContext()), { Type::getHalfTy(M.getContext()) }, false));
injectCRTAlias(M, "__extendhfsf2", "julia__gnu_h2f_ieee",
FunctionType::get(Type::getFloatTy(M.getContext()), { Type::getHalfTy(M.getContext()) }, false));
injectCRTAlias(M, "__gnu_f2h_ieee", "julia__gnu_f2h_ieee",
FunctionType::get(Type::getHalfTy(M.getContext()), { Type::getFloatTy(M.getContext()) }, false));
injectCRTAlias(M, "__truncsfhf2", "julia__gnu_f2h_ieee",
FunctionType::get(Type::getHalfTy(M.getContext()), { Type::getFloatTy(M.getContext()) }, false));
injectCRTAlias(M, "__truncdfhf2", "julia__truncdfhf2",
FunctionType::get(Type::getHalfTy(M.getContext()), { Type::getDoubleTy(M.getContext()) }, false));
// We would like to emit an alias or a weakref alias to redirect these symbols
// but LLVM doesn't let us emit a GlobalAlias to a declaration...
// So for now we inject a definition of these functions that calls our runtime
// functions. We do so after optimization to avoid cloning these functions.
injectCRTAlias(M, "__gnu_h2f_ieee", "julia__gnu_h2f_ieee",
FunctionType::get(Type::getFloatTy(M.getContext()), { Type::getHalfTy(M.getContext()) }, false));
injectCRTAlias(M, "__extendhfsf2", "julia__gnu_h2f_ieee",
FunctionType::get(Type::getFloatTy(M.getContext()), { Type::getHalfTy(M.getContext()) }, false));
injectCRTAlias(M, "__gnu_f2h_ieee", "julia__gnu_f2h_ieee",
FunctionType::get(Type::getHalfTy(M.getContext()), { Type::getFloatTy(M.getContext()) }, false));
injectCRTAlias(M, "__truncsfhf2", "julia__gnu_f2h_ieee",
FunctionType::get(Type::getHalfTy(M.getContext()), { Type::getFloatTy(M.getContext()) }, false));
injectCRTAlias(M, "__truncdfhf2", "julia__truncdfhf2",
FunctionType::get(Type::getHalfTy(M.getContext()), { Type::getDoubleTy(M.getContext()) }, false));
#else
emitFloat16Wrappers(M, false);
emitFloat16Wrappers(M, false);
#endif
}
timers.optimize.stopTimer();
}
timers.optimize.stopTimer();

if (opt) {
timers.opt.startTimer();
Expand Down Expand Up @@ -1276,7 +1282,10 @@ static SmallVector<AOTOutputs, 16> add_output(Module &M, TargetMachine &TM, Stri
// Single-threaded case
if (threads == 1) {
output_timer.startTimer();
outputs[0] = add_output_impl(M, TM, timers[0], unopt_out, opt_out, obj_out, asm_out);
{
JL_TIMING(NATIVE_AOT, NATIVE_Opt);
outputs[0] = add_output_impl(M, TM, timers[0], unopt_out, opt_out, obj_out, asm_out);
}
output_timer.stopTimer();
// Don't need M anymore
module_released(M);
Expand Down Expand Up @@ -1314,40 +1323,43 @@ static SmallVector<AOTOutputs, 16> add_output(Module &M, TargetMachine &TM, Stri
output_timer.startTimer();

// Start all of the worker threads
std::vector<std::thread> workers(threads);
for (unsigned i = 0; i < threads; i++) {
workers[i] = std::thread([&, i]() {
LLVMContext ctx;
// Lazily deserialize the entire module
timers[i].deserialize.startTimer();
auto M = cantFail(getLazyBitcodeModule(MemoryBufferRef(StringRef(serialized.data(), serialized.size()), "Optimized"), ctx), "Error loading module");
timers[i].deserialize.stopTimer();

timers[i].materialize.startTimer();
materializePreserved(*M, partitions[i]);
timers[i].materialize.stopTimer();

timers[i].construct.startTimer();
construct_vars(*M, partitions[i]);
M->setModuleFlag(Module::Error, "julia.mv.suffix", MDString::get(M->getContext(), "_" + std::to_string(i)));
// The DICompileUnit file is not used for anything, but ld64 requires it be a unique string per object file
// or it may skip emitting debug info for that file. Here set it to ./julia#N
DIFile *topfile = DIFile::get(M->getContext(), "julia#" + std::to_string(i), ".");
for (DICompileUnit *CU : M->debug_compile_units())
CU->replaceOperandWith(0, topfile);
timers[i].construct.stopTimer();

timers[i].deletion.startTimer();
dropUnusedGlobals(*M);
timers[i].deletion.stopTimer();

outputs[i] = add_output_impl(*M, TM, timers[i], unopt_out, opt_out, obj_out, asm_out);
});
}
{
JL_TIMING(NATIVE_AOT, NATIVE_Opt);
std::vector<std::thread> workers(threads);
for (unsigned i = 0; i < threads; i++) {
workers[i] = std::thread([&, i]() {
LLVMContext ctx;
// Lazily deserialize the entire module
timers[i].deserialize.startTimer();
auto M = cantFail(getLazyBitcodeModule(MemoryBufferRef(StringRef(serialized.data(), serialized.size()), "Optimized"), ctx), "Error loading module");
timers[i].deserialize.stopTimer();

timers[i].materialize.startTimer();
materializePreserved(*M, partitions[i]);
timers[i].materialize.stopTimer();

timers[i].construct.startTimer();
construct_vars(*M, partitions[i]);
M->setModuleFlag(Module::Error, "julia.mv.suffix", MDString::get(M->getContext(), "_" + std::to_string(i)));
// The DICompileUnit file is not used for anything, but ld64 requires it be a unique string per object file
// or it may skip emitting debug info for that file. Here set it to ./julia#N
DIFile *topfile = DIFile::get(M->getContext(), "julia#" + std::to_string(i), ".");
for (DICompileUnit *CU : M->debug_compile_units())
CU->replaceOperandWith(0, topfile);
timers[i].construct.stopTimer();

timers[i].deletion.startTimer();
dropUnusedGlobals(*M);
timers[i].deletion.stopTimer();

outputs[i] = add_output_impl(*M, TM, timers[i], unopt_out, opt_out, obj_out, asm_out);
});
}

// Wait for all of the worker threads to finish
for (auto &w : workers)
w.join();
// Wait for all of the worker threads to finish
for (auto &w : workers)
w.join();
}

output_timer.stopTimer();

Expand Down Expand Up @@ -1488,6 +1500,7 @@ void jl_dump_native_impl(void *native_code,
SmallVector<AOTOutputs, 16> data_outputs;
SmallVector<AOTOutputs, 16> metadata_outputs;
if (z) {
JL_TIMING(NATIVE_AOT, NATIVE_Sysimg);
LLVMContext Context;
Module sysimgM("sysimg", Context);
sysimgM.setTargetTriple(TheTriple.str());
Expand Down Expand Up @@ -1526,6 +1539,7 @@ void jl_dump_native_impl(void *native_code,
bool has_veccall = false;

data->M.withModuleDo([&](Module &dataM) {
JL_TIMING(NATIVE_AOT, NATIVE_Setup);
dataM.setTargetTriple(TheTriple.str());
dataM.setDataLayout(DL);
auto &Context = dataM.getContext();
Expand Down Expand Up @@ -1616,6 +1630,7 @@ void jl_dump_native_impl(void *native_code,
}

{
JL_TIMING(NATIVE_AOT, NATIVE_Metadata);
LLVMContext Context;
Module metadataM("metadata", Context);
metadataM.setTargetTriple(TheTriple.str());
Expand Down Expand Up @@ -1690,32 +1705,37 @@ void jl_dump_native_impl(void *native_code,
metadata_outputs = compile(metadataM, "data", 1, [](Module &) {});
}

object::Archive::Kind Kind = getDefaultForHost(TheTriple);
{
JL_TIMING(NATIVE_AOT, NATIVE_Write);

object::Archive::Kind Kind = getDefaultForHost(TheTriple);
#define WRITE_ARCHIVE(fname, field, prefix, suffix) \
if (fname) {\
std::vector<NewArchiveMember> archive; \
SmallVector<std::string, 16> filenames; \
SmallVector<StringRef, 16> buffers; \
for (size_t i = 0; i < threads; i++) { \
filenames.push_back((StringRef("text") + prefix + "#" + Twine(i) + suffix).str()); \
buffers.push_back(StringRef(data_outputs[i].field.data(), data_outputs[i].field.size())); \
} \
filenames.push_back("metadata" prefix suffix); \
buffers.push_back(StringRef(metadata_outputs[0].field.data(), metadata_outputs[0].field.size())); \
if (z) { \
filenames.push_back("sysimg" prefix suffix); \
buffers.push_back(StringRef(sysimg_outputs[0].field.data(), sysimg_outputs[0].field.size())); \
} \
for (size_t i = 0; i < filenames.size(); i++) { \
archive.push_back(NewArchiveMember(MemoryBufferRef(buffers[i], filenames[i]))); \
} \
handleAllErrors(writeArchive(fname, archive, true, Kind, true, false), reportWriterError); \
}

WRITE_ARCHIVE(unopt_bc_fname, unopt, "_unopt", ".bc");
WRITE_ARCHIVE(bc_fname, opt, "_opt", ".bc");
WRITE_ARCHIVE(obj_fname, obj, "", ".o");
WRITE_ARCHIVE(asm_fname, asm_, "", ".s");
if (fname) {\
std::vector<NewArchiveMember> archive; \
SmallVector<std::string, 16> filenames; \
SmallVector<StringRef, 16> buffers; \
for (size_t i = 0; i < threads; i++) { \
filenames.push_back((StringRef("text") + prefix + "#" + Twine(i) + suffix).str()); \
buffers.push_back(StringRef(data_outputs[i].field.data(), data_outputs[i].field.size())); \
} \
filenames.push_back("metadata" prefix suffix); \
buffers.push_back(StringRef(metadata_outputs[0].field.data(), metadata_outputs[0].field.size())); \
if (z) { \
filenames.push_back("sysimg" prefix suffix); \
buffers.push_back(StringRef(sysimg_outputs[0].field.data(), sysimg_outputs[0].field.size())); \
} \
for (size_t i = 0; i < filenames.size(); i++) { \
archive.push_back(NewArchiveMember(MemoryBufferRef(buffers[i], filenames[i]))); \
} \
handleAllErrors(writeArchive(fname, archive, true, Kind, true, false), reportWriterError); \
}

WRITE_ARCHIVE(unopt_bc_fname, unopt, "_unopt", ".bc");
WRITE_ARCHIVE(bc_fname, opt, "_opt", ".bc");
WRITE_ARCHIVE(obj_fname, obj, "", ".o");
WRITE_ARCHIVE(asm_fname, asm_, "", ".s");
#undef WRITE_ARCHIVE
}
}

void addTargetPasses(legacy::PassManagerBase *PM, const Triple &triple, TargetIRAnalysis analysis)
Expand Down
16 changes: 9 additions & 7 deletions src/jitlayers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1214,12 +1214,13 @@ namespace {
}
}

JL_TIMING(LLVM_OPT, LLVM_OPT);

//Run the optimization
assert(!verifyLLVMIR(M));
(***PMs).run(M);
assert(!verifyLLVMIR(M));
{
JL_TIMING(LLVM_JIT, JIT_Opt);
//Run the optimization
assert(!verifyLLVMIR(M));
(***PMs).run(M);
assert(!verifyLLVMIR(M));
}

uint64_t end_time = 0;
{
Expand Down Expand Up @@ -1272,6 +1273,7 @@ namespace {
: orc::IRCompileLayer::IRCompiler(MO), TMs(TMCreator(TM, optlevel)) {}

Expected<std::unique_ptr<MemoryBuffer>> operator()(Module &M) override {
JL_TIMING(LLVM_JIT, JIT_Compile);
return orc::SimpleCompiler(***TMs)(M);
}

Expand Down Expand Up @@ -1459,7 +1461,7 @@ void JuliaOJIT::addGlobalMapping(StringRef Name, uint64_t Addr)

void JuliaOJIT::addModule(orc::ThreadSafeModule TSM)
{
JL_TIMING(LLVM_ORC, LLVM_ORC);
JL_TIMING(LLVM_JIT, JIT_Total);
++ModulesAdded;
orc::SymbolLookupSet NewExports;
TSM.withModuleDo([&](Module &M) JL_NOTSAFEPOINT {
Expand Down
Loading

0 comments on commit bf00ff4

Please sign in to comment.