Skip to content

Commit

Permalink
Faster incremental sysimg rebuilds
Browse files Browse the repository at this point in the history
This commit provides the ability
to rebuild system images much faster. The key observation
is that most of the time in sysimage build is spent in LLVM
generating native code (serializing julia's data structures
is quite fast). Thus if we can re-use the code already
generated for the system image we're currently running, we'll
save a fair amount of time.

Unfortunately, this is not 100% straightforward since we were
assuming that no linking happens in a number of places. This
PR hacks around that, but it is not a particularly satisfying
long term solution. That said, it should work fine, and I think
it's worth doing, so that we can explore the workflow
adjustments that would rely on this.

With that said, here's how to use this (at the low level; of
course, PackageCompiler would just handle this for you):

```shell
$ mkdir chained
$ time ./usr/bin/julia --sysimage-native-code=chained --sysimage=usr/lib/julia/sys.so --output-o chained/chained.o.a -e 'Base.__init_build();'
real	0m9.633s
user	0m8.613s
sys	0m1.020s
$ cp ../usr/lib/julia/sys-o.a . # Get the -o.a from the old sysimage
$ ar x sys-o.a # Extract it into text.o and data.o
$ rm data.o # rm the serialized sysimg data
$ mv text.o text-old.o
$ llvm-objcopy --remove-section .data.jl.unique text-old.o # rm the link between the native code and the old sysimg data
$ ar x chained.o.a # Extract new sysimage files
$ gcc -shared -o chained.so text.o data.o text-old.o # Link everything
$ ../julia --sysimage=chained.so
```

As can be seen, regenerating the system image took about 9s (the
subsequent commands aren't timed here, but take less than a second total).
This compares very favorably with a non-chained sysimg rebuild:

```
time ./usr/bin/julia --sysimage=usr/lib/julia/sys.so --output-o nonchained.o.a -e 'Base.__init_build();'

real	2m42.667s
user	2m39.211s
sys	0m3.452s
```

Of course, if you do load additional packages, the extra code
does still need to be compiled, so e.g. building a system image
for `Plots` goes from 3 minutes to 1 minute (building all of Plots,
plus everything in Base that got invalidated). That remaining time is
still all spent in LLVM though - it should be relatively straightforward
to multithread that after this PR (since linking the sysimg
in multiple pieces is allowed). That part is not implemented
yet though.
  • Loading branch information
Keno committed Apr 10, 2021
1 parent 2ca1f76 commit c2d1343
Show file tree
Hide file tree
Showing 15 changed files with 236 additions and 58 deletions.
8 changes: 8 additions & 0 deletions base/Base.jl
Original file line number Diff line number Diff line change
Expand Up @@ -458,6 +458,14 @@ function __init__()
nothing
end

# Initialization entry point for build sessions. It is not called automatically
# anywhere in this diff; the usage example in the commit message runs it
# explicitly via `-e 'Base.__init_build();'` when emitting a system image.
#
# Runs, in order: stdio re-initialization, the `Sys` module's own build-time
# initializer, then depot path, load path and active project setup.
# NOTE(review): presumably a reduced variant of `__init__` — confirm which
# steps of `__init__` are intentionally left out.
#
# The value of the last call (`init_active_project()`) is the return value;
# unlike `__init__` just above, there is no trailing `nothing`.
function __init_build()
reinit_stdio()
Sys.__init_build()
init_depot_path()
init_load_path()
init_active_project()
end

end

end # baremodule Base
95 changes: 82 additions & 13 deletions src/aotcompile.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,6 @@
#include <llvm/IR/LegacyPassManagers.h>
#include <llvm/Transforms/Utils/Cloning.h>


using namespace llvm;

// our passes
Expand Down Expand Up @@ -181,10 +180,12 @@ static void emit_offset_table(Module &mod, const std::vector<GlobalValue*> &vars
addrs[i] = ConstantExpr::getBitCast(var, T_psize);
}
ArrayType *vars_type = ArrayType::get(T_psize, nvars);
new GlobalVariable(mod, vars_type, true,
GlobalVariable *GV =
new GlobalVariable(mod, vars_type, true,
GlobalVariable::ExternalLinkage,
ConstantArray::get(vars_type, addrs),
name);
GV->setSection(JL_SYSIMG_LINK_SECTION);
}

static bool is_safe_char(unsigned char c)
Expand Down Expand Up @@ -276,6 +277,32 @@ static void jl_ci_cache_lookup(const jl_cgparams_t &cgparams, jl_method_instance
*ci_out = codeinst;
}

extern FunctionType *jl_func_sig;

// Translate a code pointer belonging to `codeinst` into a symbol name.
//
// The two generic entry points are recognised by address and reported under
// their own names; any other pointer is handed to the execution engine's
// address-to-name lookup together with `codeinst`.
StringRef lookup_sysimage_fname(void *ptr, jl_code_instance_t *codeinst)
{
    void *const args_entry = (void*)&jl_fptr_args;
    void *const sparam_entry = (void*)&jl_fptr_sparam;

    if (ptr == args_entry)
        return "jl_fptr_args";
    if (ptr == sparam_entry)
        return "jl_fptr_sparam";

    const uintptr_t address = (uintptr_t)ptr;
    return jl_ExecutionEngine->getFunctionAtAddress(address, codeinst);
}

extern Type *T_pjlvalue;
static void add_gv(void *ctx, void *mod, jl_value_t **gv_slot)
{
jl_codegen_params_t *params = (jl_codegen_params_t*)ctx;
Module *M = (Module *)mod;

GlobalVariable* &lgv = params->globals[*gv_slot];
assert(!lgv);
lgv = new GlobalVariable(*M, T_pjlvalue,
false, GlobalVariable::PrivateLinkage,
NULL, jl_ExecutionEngine->getGlobalAtAddress((uintptr_t)gv_slot));
lgv->setExternallyInitialized(true);
}

// takes the running content that has collected in the shadow module and dumps it to disk
// this builds the object file portion of the sysimage files for fast startup, and can
// also be used by external consumers like GPUCompiler.jl to obtain a module containing
Expand All @@ -302,6 +329,10 @@ void *jl_create_native(jl_array_t *methods, const jl_cgparams_t cgparams, int _p
imaging_mode = 1;
std::unique_ptr<Module> clone(jl_create_llvm_module("text"));

if (jl_options.use_sysimage_native_code==JL_OPTIONS_USE_SYSIMAGE_NATIVE_CODE_CHAINED) {
jl_foreach_sysimg_gvar_slot(add_gv, (void*)&params, (void*)clone.get());
}

// compile all methods for the current world and type-inference world
size_t compile_for[] = { jl_typeinf_world, jl_world_counter };
for (int worlds = 0; worlds < 2; worlds++) {
Expand Down Expand Up @@ -330,6 +361,18 @@ void *jl_create_native(jl_array_t *methods, const jl_cgparams_t cgparams, int _p
// find and prepare the source code to compile
jl_code_instance_t *codeinst = NULL;
jl_ci_cache_lookup(cgparams, mi, params.world, &codeinst, &src);
jl_llvm_functions_t fnames = {
lookup_sysimage_fname((void*)codeinst->invoke, codeinst).str(),
lookup_sysimage_fname(codeinst->specptr.fptr, codeinst).str(),
};
if (fnames.functionObject.rfind("jsys", 0) != 0 &&
fnames.specFunctionObject.rfind("jsys", 0) != 0) {
// Skip things already in the sysimage, we'll pick it up
// from there.
std::unique_ptr<Module> no_module(nullptr);
emitted[codeinst] = std::make_tuple(std::move(no_module), fnames);
continue;
}
if (src && !emitted.count(codeinst)) {
// now add it to our compilation results
JL_GC_PROMISE_ROOTED(codeinst->rettype);
Expand All @@ -356,11 +399,25 @@ void *jl_create_native(jl_array_t *methods, const jl_cgparams_t cgparams, int _p
// clones the contents of the module `m` to the shadow_output collector
// while examining and recording what kind of function pointer we have
for (auto &def : emitted) {
jl_merge_module(clone.get(), std::move(std::get<0>(def.second)));
jl_code_instance_t *this_code = def.first;
jl_llvm_functions_t decls = std::get<1>(def.second);
StringRef func = decls.functionObject;
StringRef cfunc = decls.specFunctionObject;
if (std::get<0>(def.second))
jl_merge_module(clone.get(), std::move(std::get<0>(def.second)));
else {
// TODO: Probably wait until all other modules were merged
// TODO: These signatures aren't actually right, but it's not worth
// trying to compute signatures for these. Maybe declare them as
// void* global variables instead and have jl_merge_module know
// how to merge them if it comes to it?
Function::Create(jl_func_sig,
GlobalVariable::ExternalLinkage,
func, clone.get());
Function::Create(jl_func_sig,
GlobalVariable::ExternalLinkage,
cfunc, clone.get());
}
uint32_t func_id = 0;
uint32_t cfunc_id = 0;
if (func == "jl_fptr_args") {
Expand Down Expand Up @@ -389,8 +446,10 @@ void *jl_create_native(jl_array_t *methods, const jl_cgparams_t cgparams, int _p
// and set them to be internalized and initialized at startup
for (auto &global : gvars) {
GlobalVariable *G = cast<GlobalVariable>(clone->getNamedValue(global));
G->setInitializer(ConstantPointerNull::get(cast<PointerType>(G->getValueType())));
G->setLinkage(GlobalVariable::InternalLinkage);
if (!G->isExternallyInitialized())
G->setInitializer(ConstantPointerNull::get(cast<PointerType>(G->getValueType())));
G->setLinkage(GlobalVariable::ExternalLinkage);
G->setVisibility(GlobalVariable::HiddenVisibility);
data->jl_sysimg_gvars.push_back(G);
}

Expand All @@ -409,8 +468,12 @@ void *jl_create_native(jl_array_t *methods, const jl_cgparams_t cgparams, int _p
if (policy == CompilationPolicy::Default) {
for (GlobalObject &G : clone->global_objects()) {
if (!G.isDeclaration()) {
G.setLinkage(Function::InternalLinkage);
makeSafeName(G);
if (G.getLinkage() != GlobalVariable::InternalLinkage) {
G.setLinkage(Function::ExternalLinkage);
G.setVisibility(GlobalVariable::HiddenVisibility);
}
if (isa<GlobalVariable>(&G) && !cast<GlobalVariable>(&G)->isExternallyInitialized())
makeSafeName(G);
addComdat(&G);
#if defined(_OS_WINDOWS_) && defined(_CPU_X86_64_)
// Add unwind exception personalities to functions to handle async exceptions
Expand Down Expand Up @@ -515,10 +578,13 @@ void jl_dump_native(void *native_code,
std::vector<NewArchiveMember> unopt_bc_Archive;
std::vector<std::string> outputs;

bool sysimg_chained = jl_options.use_sysimage_native_code ==
JL_OPTIONS_USE_SYSIMAGE_NATIVE_CODE_CHAINED;

if (unopt_bc_fname)
PM.add(createBitcodeWriterPass(unopt_bc_OS));
if (bc_fname || obj_fname || asm_fname) {
addOptimizationPasses(&PM, jl_options.opt_level, true, true);
addOptimizationPasses(&PM, jl_options.opt_level, true, true, sysimg_chained);
addMachinePasses(&PM, TM.get());
}
if (bc_fname)
Expand Down Expand Up @@ -550,12 +616,15 @@ void jl_dump_native(void *native_code,
// reflect the address of the jl_RTLD_DEFAULT_handle variable
// back to the caller, so that we can check for consistency issues
GlobalValue *jlRTLD_DEFAULT_var = jl_emit_RTLD_DEFAULT_var(data->M.get());
addComdat(new GlobalVariable(*data->M,
GlobalVariable *jlRTLD_DEFAULT_var_pointer =
new GlobalVariable(*data->M,
jlRTLD_DEFAULT_var->getType(),
true,
GlobalVariable::ExternalLinkage,
jlRTLD_DEFAULT_var,
"jl_RTLD_DEFAULT_handle_pointer"));
"jl_RTLD_DEFAULT_handle_pointer");
jlRTLD_DEFAULT_var_pointer->setSection(JL_SYSIMG_LINK_SECTION);
addComdat(jlRTLD_DEFAULT_var_pointer);
}

// do the actual work
Expand Down Expand Up @@ -627,7 +696,7 @@ void addMachinePasses(legacy::PassManagerBase *PM, TargetMachine *TM)
// this defines the set of optimization passes defined for Julia at various optimization levels.
// it assumes that the TLI and TTI wrapper passes have already been added.
void addOptimizationPasses(legacy::PassManagerBase *PM, int opt_level,
bool lower_intrinsics, bool dump_native)
bool lower_intrinsics, bool dump_native, bool chained)
{
#ifdef JL_DEBUG_BUILD
PM->add(createGCInvariantVerifierPass(true));
Expand Down Expand Up @@ -660,7 +729,7 @@ void addOptimizationPasses(legacy::PassManagerBase *PM, int opt_level,
PM->add(createRemoveNIPass());
}
PM->add(createLowerSimdLoopPass()); // Annotate loop marked with "loopinfo" as LLVM parallel loop
if (dump_native)
if (dump_native && !chained)
PM->add(createMultiVersioningPass());
#if defined(JL_ASAN_ENABLED)
PM->add(createAddressSanitizerFunctionPass());
Expand Down Expand Up @@ -695,7 +764,7 @@ void addOptimizationPasses(legacy::PassManagerBase *PM, int opt_level,
// consider AggressiveInstCombinePass at optlevel > 2
PM->add(createInstructionCombiningPass());
PM->add(createCFGSimplificationPass());
if (dump_native)
if (dump_native && !chained)
PM->add(createMultiVersioningPass());
PM->add(createSROAPass());
PM->add(createInstSimplifyLegacyPass());
Expand Down
4 changes: 3 additions & 1 deletion src/ccall.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@ extern const char jl_crtdll_basename[];
// somewhat unusual variable, in that aotcompile wants to get the address of this for a sanity check
GlobalVariable *jl_emit_RTLD_DEFAULT_var(Module *M)
{
return prepare_global_in(M, jlRTLD_DEFAULT_var);
GlobalVariable *var = prepare_global_in(M, jlRTLD_DEFAULT_var);
var->setSection(JL_SYSIMG_LINK_SECTION);
return var;
}

// Find or create the GVs for the library and symbol lookup.
Expand Down
29 changes: 21 additions & 8 deletions src/cgutils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ static Value *stringConstPtr(
StringRef ctxt(txt.c_str(), txt.size() + 1);
Constant *Data = ConstantDataArray::get(jl_LLVMContext, arrayRefFromStringRef(ctxt));
GlobalVariable *gv = get_pointer_to_constant(emission_context, Data, "_j_str", *M);
gv->setLinkage(GlobalVariable::InternalLinkage);
Value *zero = ConstantInt::get(Type::getInt32Ty(jl_LLVMContext), 0);
Value *Args[] = { zero, zero };
return irbuilder.CreateInBoundsGEP(gv->getValueType(), gv, Args);
Expand Down Expand Up @@ -206,16 +207,25 @@ static Value *julia_pgv(jl_codectx_t &ctx, const char *cname, void *addr)
if (!gv) {
raw_string_ostream(gvname) << cname << ctx.global_targets.size();
localname = StringRef(gvname);
gv = new GlobalVariable(*M, T_pjlvalue,
false, GlobalVariable::PrivateLinkage,
NULL, localname);
}
else {
localname = gv->getName();
if (gv->getParent() != M)
if (gv->getParent() != M) {
GlobalVariable *oldgv = gv;
gv = cast_or_null<GlobalVariable>(M->getNamedValue(localname));
if (!gv) {
gv = new GlobalVariable(*M, T_pjlvalue,
false, oldgv->getLinkage(),
NULL, localname);
if (oldgv->isExternallyInitialized()) {
gv->setExternallyInitialized(true);
}
}
}
}
if (gv == nullptr)
gv = new GlobalVariable(*M, T_pjlvalue,
false, GlobalVariable::PrivateLinkage,
NULL, localname);
// LLVM passes sometimes strip metadata when moving loads around
// since the load at the new location satisfies the same condition as the original one.
// Mark the global as constant to LLVM code using our own metadata
Expand Down Expand Up @@ -295,7 +305,7 @@ static Value *literal_pointer_val_slot(jl_codectx_t &ctx, jl_value_t *p)
return julia_pgv(ctx, "jl_sym#", addr, NULL, p);
}
// something else gets just a generic name
return julia_pgv(ctx, "jl_global#", p);
return julia_pgv(ctx, "jl_global#abc#", p);
}

static size_t dereferenceable_size(jl_value_t *jt)
Expand Down Expand Up @@ -1524,8 +1534,11 @@ static Value *data_pointer(jl_codectx_t &ctx, const jl_cgval_t &x)
Value *data = x.V;
if (x.constant) {
Constant *val = julia_const_to_llvm(ctx, x.constant);
if (val)
data = get_pointer_to_constant(ctx.emission_context, val, "_j_const", *jl_Module);
if (val) {
GlobalVariable *gv = get_pointer_to_constant(ctx.emission_context, val, "_j_const", *jl_Module);
gv->setLinkage(GlobalVariable::InternalLinkage);
data = gv;
}
else
data = literal_pointer_val(ctx, x.constant);
}
Expand Down
8 changes: 5 additions & 3 deletions src/codegen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -165,13 +165,13 @@ static DataLayout &jl_data_layout = *(new DataLayout(""));

// types
static Type *T_jlvalue;
static Type *T_pjlvalue;
Type *T_pjlvalue;
static Type *T_prjlvalue;
static Type *T_ppjlvalue;
static Type *T_pprjlvalue;
static Type *jl_array_llvmt;
static Type *jl_parray_llvmt;
static FunctionType *jl_func_sig;
FunctionType *jl_func_sig;
static FunctionType *jl_func_sig_sparams;
static Type *T_pvoidfunc;

Expand Down Expand Up @@ -1294,7 +1294,9 @@ static inline jl_cgval_t value_to_pointer(jl_codectx_t &ctx, Value *v, jl_value_
{
Value *loc;
if (valid_as_globalinit(v)) { // llvm can't handle all the things that could be inside a ConstantExpr
loc = get_pointer_to_constant(ctx.emission_context, cast<Constant>(v), "_j_const", *jl_Module);
GlobalVariable *gv = get_pointer_to_constant(ctx.emission_context, cast<Constant>(v), "_j_const", *jl_Module);
gv->setLinkage(GlobalVariable::InternalLinkage);
loc = gv;
}
else {
loc = emit_static_alloca(ctx, v->getType());
Expand Down
2 changes: 1 addition & 1 deletion src/debuginfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -734,7 +734,7 @@ openDebugInfo(StringRef debuginfopath, const debug_link_info &info)
std::move(SplitFile.get()));
}

static uint64_t jl_sysimage_base;
uint64_t jl_sysimage_base;
static jl_sysimg_fptrs_t sysimg_fptrs;
static jl_method_instance_t **sysimg_fvars_linfo;
static size_t sysimg_fvars_n;
Expand Down
45 changes: 43 additions & 2 deletions src/jitlayers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,18 @@ RTDyldMemoryManager* createRTDyldMemoryManager(void);

void jl_init_jit(void) { }

// Opens the object file at path `fname` and registers its symbols with the
// execution engine, relative to the load address `sysimg_base`.
// Raises a Julia error if the object file cannot be opened.
extern "C"
void jl_init_sysimage_chaining(void *sysimg_base, char *fname)
{
    auto loaded = llvm::object::ObjectFile::createObjectFile(fname);
    const bool opened = static_cast<bool>(loaded);
    if (!opened)
        jl_error("Failed to load sysimg symbol table");

    // `loaded` keeps owning the object file; only a borrowed pointer is passed on.
    jl_ExecutionEngine->addSysimgSymbolsByName(sysimg_base, loaded->getBinary());
}

// Snooping on which functions are being compiled, and how long it takes
JL_STREAM *dump_compiles_stream = NULL;
extern "C" JL_DLLEXPORT
Expand Down Expand Up @@ -757,9 +769,9 @@ void JuliaOJIT::addModule(std::unique_ptr<Module> M)
SectionMemoryManager::getSymbolAddressInProcess(
getMangledName(F->getName())))) {
llvm::errs() << "FATAL ERROR: "
<< "Symbol \"" << F->getName().str() << "\""
<< "Symbol \"" << F->getName().str() << "\" "
<< "not found";
abort();
abort();
}
}
}
Expand Down Expand Up @@ -849,6 +861,35 @@ StringRef JuliaOJIT::getFunctionAtAddress(uint64_t Addr, jl_code_instance_t *cod
return fname;
}

// Returns the name stored in the reverse symbol table for address `Addr`.
// An address with no (or an empty) recorded name is a caller error and trips
// the assertion.
// NOTE(review): the lookup goes through operator[], which presumably creates
// an empty entry for an unknown address — confirm against the table's type.
StringRef JuliaOJIT::getGlobalAtAddress(uint64_t Addr)
{
    void *const key = (void*)(uintptr_t)Addr;
    auto name = ReverseLocalSymbolTable[key];
    assert(!name.empty());
    return name;
}

// Registers the defined function and data symbols of `ofile` with the JIT.
//
//   sysimg_base - address the image is loaded at; each symbol's address as
//                 reported by the object file is added to it
//   ofile       - object file whose symbol table is scanned
//
// For every address that has no name recorded yet, the symbol's name is stored
// in the reverse symbol table and a global mapping name -> address is added.
// Symbols that are undefined, of another type, or whose properties cannot be
// read from the object file are skipped.
void JuliaOJIT::addSysimgSymbolsByName(void *sysimg_base, llvm::object::ObjectFile *ofile)
{
    // The SymbolRef accessors return llvm::Expected<T>. Such a value has to be
    // tested before it is dereferenced, and a failure has to be consumed;
    // calling `.get()` blindly is invalid on failure and is diagnosed by LLVM
    // builds that track unchecked Expected values.
    auto usable = [](auto &expected) {
        if (expected)
            return true;
        llvm::consumeError(expected.takeError());
        return false;
    };

    for (auto symbol : ofile->symbols()) {
        auto type = symbol.getType();
        if (!usable(type))
            continue;
        if (*type != llvm::object::SymbolRef::ST_Function &&
            *type != llvm::object::SymbolRef::ST_Data) {
            continue;
        }

        auto flags = symbol.getFlags();
        if (!usable(flags))
            continue;
        if (*flags & llvm::object::SymbolRef::SF_Undefined) {
            continue;
        }

        auto address = symbol.getAddress();
        if (!usable(address))
            continue;

        void *Addr = (void*)((char*)sysimg_base + *address);
        auto &fname = ReverseLocalSymbolTable[Addr];
        if (fname.empty()) {
            auto name = symbol.getName();
            if (!usable(name))
                continue; // entry stays empty, i.e. still counts as unregistered here
            StringRef symname = *name;
            // Store a reference to the Julia symbol's name rather than to
            // `symname`'s own storage.
            // NOTE(review): presumably so the name outlives the object file — confirm.
            jl_sym_t *symsym = jl_symbol_n(symname.data(), symname.size());
            fname = StringRef(jl_symbol_name(symsym), symname.size());
            assert(!fname.empty());
            addGlobalMapping(fname, (uintptr_t)Addr);
        }
    }
}


void JuliaOJIT::RegisterJITEventListener(JITEventListener *L)
{
Expand Down
Loading

0 comments on commit c2d1343

Please sign in to comment.