From 9187c4e49c7ef80b195ca0e8d1e24b06d8f4ce1f Mon Sep 17 00:00:00 2001
From: Oscar Blumberg
Date: Tue, 21 Jan 2014 13:23:09 +0000
Subject: [PATCH 01/17] Add incremental GC & write barrier.

This also includes various allocation changes that should improve
performance. There is also the start of generational behavior for
objects smaller than 2k. The collection heuristics were broken in the
process; this is still pretty much a WIP.
---
 src/alloc.c        |   27 +-
 src/array.c        |   17 +-
 src/ast.c          |   38 +-
 src/builtins.c     |   89 ++-
 src/cgutils.cpp    |   24 +-
 src/codegen.cpp    |  146 +++-
 src/dump.c         |   42 +-
 src/gc.c           | 1660 ++++++++++++++++++++++++++++++++++++--------
 src/gf.c           |   74 +-
 src/init.c         |    3 +-
 src/interpreter.c  |    9 +-
 src/intrinsics.cpp |    2 +-
 src/jltypes.c      |   11 +
 src/julia.expmap   |    2 +
 src/julia.h        |   61 +-
 src/module.c       |    8 +
 src/options.h      |    4 +-
 src/table.c        |   11 +-
 src/task.c         |   24 +-
 src/toplevel.c     |   12 +-
 20 files changed, 1861 insertions(+), 403 deletions(-)

diff --git a/src/alloc.c b/src/alloc.c
index daaa3f64ac007..654e12528dd0d 100644
--- a/src/alloc.c
+++ b/src/alloc.c
@@ -249,6 +249,7 @@ void jl_set_nth_field(jl_value_t *v, size_t i, jl_value_t *rhs)
     size_t offs = jl_field_offset(st,i) + sizeof(void*);
     if (st->fields[i].isptr) {
         *(jl_value_t**)((char*)v + offs) = rhs;
+        if(rhs != NULL) gc_wb(v, rhs);
     }
     else {
         jl_assign_bits((char*)v + offs, rhs);
@@ -488,7 +489,7 @@ static jl_sym_t *mk_symbol(const char *str)
 static void unmark_symbols_(jl_sym_t *root)
 {
     while (root != NULL) {
-        root->type = (jl_value_t*)(((uptrint_t)root->type)&~1UL);
+        root->type = (jl_value_t*)(((uptrint_t)root->type)&~3UL);
         unmark_symbols_(root->left);
         root = root->right;
     }
@@ -496,9 +497,10 @@ static void unmark_symbols_(jl_sym_t *root)
 
 void jl_unmark_symbols(void) { unmark_symbols_(symtab); }
 
-static jl_sym_t **symtab_lookup(jl_sym_t **ptree, const char *str)
+static jl_sym_t **symtab_lookup(jl_sym_t **ptree, const char *str, jl_sym_t **parent)
 {
     int x;
+    if (parent != NULL) *parent = NULL;
     uptrint_t h = hash_symbol(str, strlen(str));
     // Tree nodes sorted by major key of (int(hash)) and minor key of (str).
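
The gc_wb(parent, child) calls that this patch adds after nearly every
pointer store (as in the hunks above) are the write barrier itself. A
minimal sketch of the test it performs, assuming the gc_bits()/GC_CLEAN/
GC_MARKED encoding and the gc_queue_root() hook that appear later in this
patch; the helper name gc_wb_sketch and the exact inlining are
illustrative, not the patch's literal definition:

    /* illustrative sketch only -- not the patch's literal gc_wb */
    static inline void gc_wb_sketch(void *parent, void *child)
    {
        /* an incremental/generational mark phase will not revisit an
           already-marked (old) parent, so storing a not-yet-marked (young)
           child into it must be recorded for the next mark increment */
        if (gc_bits(parent) == GC_MARKED && gc_bits(child) == GC_CLEAN)
            gc_queue_root((jl_value_t*)child);
    }

The common case (parent not yet marked) falls through after a load and a
compare, which is why the barrier can be emitted inline after every
pointer store at tolerable cost.
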
@@ -509,6 +511,7 @@ static jl_sym_t **symtab_lookup(jl_sym_t **ptree, const char *str) if (x == 0) return ptree; } + if (parent != NULL) *parent = *ptree; if (x < 0) ptree = &(*ptree)->left; else @@ -520,16 +523,19 @@ static jl_sym_t **symtab_lookup(jl_sym_t **ptree, const char *str) jl_sym_t *jl_symbol(const char *str) { jl_sym_t **pnode; - - pnode = symtab_lookup(&symtab, str); - if (*pnode == NULL) + jl_sym_t *parent; + pnode = symtab_lookup(&symtab, str, &parent); + if (*pnode == NULL) { *pnode = mk_symbol(str); + if (parent != NULL) + gc_wb(parent, *pnode); + } return *pnode; } jl_sym_t *jl_symbol_lookup(const char *str) { - return *symtab_lookup(&symtab, str); + return *symtab_lookup(&symtab, str, NULL); } DLLEXPORT jl_sym_t *jl_symbol_n(const char *str, int32_t len) @@ -627,6 +633,7 @@ void jl_add_constructors(jl_datatype_t *t) cfactory = jl_instantiate_method((jl_function_t*)t->name->ctor_factory, env); cfactory->linfo->ast = jl_prepare_ast(cfactory->linfo, cfactory->linfo->sparams); + gc_wb(cfactory->linfo, cfactory->linfo->ast); } else { cfactory = ((jl_datatype_t*)t)->name->static_ctor_factory; @@ -715,25 +722,31 @@ jl_datatype_t *jl_new_datatype(jl_sym_t *name, jl_datatype_t *super, t = jl_bool_type; } if (t == NULL) { - t = jl_new_uninitialized_datatype(jl_tuple_len(fnames)); if (jl_is_typename(name)) tn = (jl_typename_t*)name; else tn = jl_new_typename((jl_sym_t*)name); + t = jl_new_uninitialized_datatype(jl_tuple_len(fnames)); t->name = tn; + gc_wb(t, t->name); } if (t->name->primary == NULL) t->name->primary = (jl_value_t*)t; t->super = super; + if(super != NULL) gc_wb(t, t->super); t->parameters = parameters; + gc_wb(t, t->parameters); t->names = fnames; + gc_wb(t, t->names); t->types = ftypes; + if(ftypes != NULL) gc_wb(t, t->types); t->abstract = abstract; t->mutabl = mutabl; t->pointerfree = 0; t->fptr = jl_f_no_function; t->env = (jl_value_t*)t; + gc_wb(t, t->env); t->linfo = NULL; t->instance = NULL; t->struct_decl = NULL; diff --git a/src/array.c b/src/array.c index e09798cdf5ee1..91cd09f589c51 100644 --- a/src/array.c +++ b/src/array.c @@ -99,6 +99,7 @@ static jl_array_t *_new_array_(jl_value_t *atype, uint32_t ndims, size_t *dims, memset(data, 0, tot); JL_GC_POP(); } + a->pooled = tsz <= 2048; a->data = data; if (elsz == 1) ((char*)data)[tot-1] = '\0'; @@ -147,8 +148,10 @@ jl_array_t *jl_reshape_array(jl_value_t *atype, jl_array_t *data, jl_tuple_t *di size_t ndims = jl_tuple_len(dims); int ndimwords = jl_array_ndimwords(ndims); - a = (jl_array_t*)allocobj((sizeof(jl_array_t) + sizeof(void*) + ndimwords*sizeof(size_t) + 15)&-16); + int tsz = (sizeof(jl_array_t) + sizeof(void*) + ndimwords*sizeof(size_t) + 15)&-16; + a = (jl_array_t*)allocobj(tsz); a->type = atype; + a->pooled = tsz <= 2048; a->ndims = ndims; a->offset = 0; a->data = NULL; @@ -211,8 +214,9 @@ jl_array_t *jl_ptr_to_array_1d(jl_value_t *atype, void *data, size_t nel, elsz = jl_datatype_size(el_type); else elsz = sizeof(void*); - - a = (jl_array_t*)allocobj((sizeof(jl_array_t)+jl_array_ndimwords(1)*sizeof(size_t)+15)&-16); + int tsz = (sizeof(jl_array_t)+jl_array_ndimwords(1)*sizeof(size_t)+15)&-16; + a = (jl_array_t*)allocobj(tsz); + a->pooled = tsz <= 2048; a->type = atype; a->data = data; #ifdef STORE_ARRAY_LEN @@ -260,7 +264,9 @@ jl_array_t *jl_ptr_to_array(jl_value_t *atype, void *data, jl_tuple_t *dims, elsz = sizeof(void*); int ndimwords = jl_array_ndimwords(ndims); - a = (jl_array_t*)allocobj((sizeof(jl_array_t) + ndimwords*sizeof(size_t)+15)&-16); + int tsz = (sizeof(jl_array_t) + 
ndimwords*sizeof(size_t)+15)&-16;
+    a = (jl_array_t*)allocobj(tsz);
+    a->pooled = tsz <= 2048;
     a->type = atype;
     a->data = data;
 #ifdef STORE_ARRAY_LEN
@@ -500,6 +506,7 @@ void jl_arrayset(jl_array_t *a, jl_value_t *rhs, size_t i)
     }
     else {
         ((jl_value_t**)a->data)[i] = rhs;
+        gc_wb(a, rhs);
     }
 }
@@ -569,6 +576,8 @@ static void array_resize_buffer(jl_array_t *a, size_t newlen, size_t oldlen, siz
     a->isshared = 0;
     if (a->ptrarray || es==1)
         memset(newdata+offsnb+oldnbytes, 0, nbytes-oldnbytes-offsnb);
+    if(a->how == 1)
+        gc_wb_buf(a, newdata); // to protect data: if a is already marked we won't mark newdata (in case how == 1) on the next collection
     a->maxsize = newlen;
 }
diff --git a/src/ast.c b/src/ast.c
index 773193375d40f..dd2cdb8beb32a 100644
--- a/src/ast.c
+++ b/src/ast.c
@@ -720,6 +720,7 @@ static jl_value_t *copy_ast(jl_value_t *expr, jl_tuple_t *sp, int do_sp)
         // of a top-level thunk that gets type inferred.
         li->def = li;
         li->ast = jl_prepare_ast(li, li->sparams);
+        gc_wb(li, li->ast);
         JL_GC_POP();
         return (jl_value_t*)li;
     }
@@ -738,17 +739,18 @@ static jl_value_t *copy_ast(jl_value_t *expr, jl_tuple_t *sp, int do_sp)
         jl_expr_t *ne = jl_exprn(e->head, jl_array_len(e->args));
         JL_GC_PUSH1(&ne);
         if (e->head == lambda_sym) {
-            jl_exprarg(ne, 0) = copy_ast(jl_exprarg(e,0), sp, 0);
-            jl_exprarg(ne, 1) = copy_ast(jl_exprarg(e,1), sp, 0);
-            jl_exprarg(ne, 2) = copy_ast(jl_exprarg(e,2), sp, 1);
+            jl_exprargset(ne, 0, copy_ast(jl_exprarg(e,0), sp, 0));
+            jl_exprargset(ne, 1, copy_ast(jl_exprarg(e,1), sp, 0));
+            jl_exprargset(ne, 2, copy_ast(jl_exprarg(e,2), sp, 1));
         }
         else if (e->head == assign_sym) {
-            jl_exprarg(ne, 0) = copy_ast(jl_exprarg(e,0), sp, 0);
-            jl_exprarg(ne, 1) = copy_ast(jl_exprarg(e,1), sp, 1);
+            jl_exprargset(ne, 0, copy_ast(jl_exprarg(e,0), sp, 0));
+            jl_exprargset(ne, 1, copy_ast(jl_exprarg(e,1), sp, 1));
         }
         else {
-            for(size_t i=0; i < jl_array_len(e->args); i++)
-                jl_exprarg(ne, i) = copy_ast(jl_exprarg(e,i), sp, 1);
+            for(size_t i=0; i < jl_array_len(e->args); i++) {
+                jl_exprargset(ne, i, copy_ast(jl_exprarg(e,i), sp, 1));
+            }
         }
         JL_GC_POP();
         return (jl_value_t*)ne;
@@ -766,10 +768,12 @@ DLLEXPORT jl_value_t *jl_copy_ast(jl_value_t *expr)
         ne = jl_exprn(e->head, l);
         if (l == 0) {
             ne->args = jl_alloc_cell_1d(0);
+            gc_wb(ne, ne->args);
         }
         else {
-            for(i=0; i < l; i++)
-                jl_exprarg(ne, i) = jl_copy_ast(jl_exprarg(e,i));
+            for(i=0; i < l; i++) {
+                jl_exprargset(ne, i, jl_copy_ast(jl_exprarg(e,i)));
+            }
         }
         JL_GC_POP();
         return (jl_value_t*)ne;
@@ -806,17 +810,18 @@ static jl_value_t *dont_copy_ast(jl_value_t *expr, jl_tuple_t *sp, int do_sp)
     else if (jl_is_expr(expr)) {
         jl_expr_t *e = (jl_expr_t*)expr;
         if (e->head == lambda_sym) {
-            jl_exprarg(e, 0) = dont_copy_ast(jl_exprarg(e,0), sp, 0);
-            jl_exprarg(e, 1) = dont_copy_ast(jl_exprarg(e,1), sp, 0);
-            jl_exprarg(e, 2) = dont_copy_ast(jl_exprarg(e,2), sp, 1);
+            jl_exprargset(e, 0, dont_copy_ast(jl_exprarg(e,0), sp, 0));
+            jl_exprargset(e, 1, dont_copy_ast(jl_exprarg(e,1), sp, 0));
+            jl_exprargset(e, 2, dont_copy_ast(jl_exprarg(e,2), sp, 1));
        }
        else if (e->head == assign_sym) {
-           jl_exprarg(e, 0) = dont_copy_ast(jl_exprarg(e,0), sp, 0);
-           jl_exprarg(e, 1) = dont_copy_ast(jl_exprarg(e,1), sp, 1);
+           jl_exprargset(e, 0, dont_copy_ast(jl_exprarg(e,0), sp, 0));
+           jl_exprargset(e, 1, dont_copy_ast(jl_exprarg(e,1), sp, 1));
        }
        else {
-           for(size_t i=0; i < jl_array_len(e->args); i++)
-               jl_exprarg(e, i) = dont_copy_ast(jl_exprarg(e,i), sp, 1);
+           for(size_t i=0; i < jl_array_len(e->args); i++) {
+               jl_exprargset(e, i, dont_copy_ast(jl_exprarg(e,i), 
sp, 1)); + } } return (jl_value_t*)e; } @@ -834,6 +839,7 @@ static void eval_decl_types(jl_array_t *vi, jl_value_t *ast, jl_tuple_t *spenv) (jl_value_t*)spenv, (jl_expr_t*)ast, 1, 1); if (ty != NULL && (jl_is_type(ty) || jl_is_typevar(ty))) { jl_cellref(v, 1) = ty; + gc_wb(v, ty); } else { jl_cellref(v, 1) = (jl_value_t*)jl_any_type; diff --git a/src/builtins.c b/src/builtins.c index 7225f486bf97e..709a7bb392c3e 100644 --- a/src/builtins.c +++ b/src/builtins.c @@ -490,6 +490,15 @@ JL_CALLABLE(jl_f_isdefined) return jl_boundp(m, s) ? jl_true : jl_false; } +/*jl_value_t* jl_tupleset(void* t, int i, void* x) { + if(x != 0) gc_wb(t, x); + return ((((jl_value_t**)(t))[2+(i)])=(jl_value_t*)(x)); + }*/ +/*jl_value_t* jl_cellset(void* a, int i, void* x) { + if(x != 0) gc_wb_back(a); + return ((((jl_value_t**)((jl_array_t*)a)->data)[(i)])=((jl_value_t*)(x))); + }*/ + // tuples --------------------------------------------------------------------- JL_CALLABLE(jl_f_tuple) @@ -797,6 +806,7 @@ void jl_trampoline_compile_function(jl_function_t *f, int always_infer, jl_tuple if (!jl_in_inference) { if (!jl_is_expr(f->linfo->ast)) { f->linfo->ast = jl_uncompress_ast(f->linfo, f->linfo->ast); + gc_wb(f->linfo, f->linfo->ast); } if (always_infer || jl_eval_with_compiler_p(jl_lam_body((jl_expr_t*)f->linfo->ast),1)) { jl_type_infer(f->linfo, sig, f->linfo); @@ -810,6 +820,7 @@ void jl_trampoline_compile_function(jl_function_t *f, int always_infer, jl_tuple jl_generate_fptr(f); if (jl_boot_file_loaded && jl_is_expr(f->linfo->ast)) { f->linfo->ast = jl_compress_ast(f->linfo, f->linfo->ast); + gc_wb(f->linfo, f->linfo->ast); } } @@ -1108,7 +1119,7 @@ void jl_init_primitives(void) // toys for debugging --------------------------------------------------------- // comma_one prints a comma for 1 element, e.g. 
"(x,)" -static size_t jl_show_tuple(JL_STREAM *out, jl_tuple_t *t, char *opn, char *cls, int comma_one) +static size_t jl_show_tuple(JL_STREAM *out, jl_tuple_t *t, char *opn, char *cls, int comma_one, int depth) { size_t i, n=0, len = jl_tuple_len(t); n += JL_PRINTF(out, "("); @@ -1124,35 +1135,42 @@ static size_t jl_show_tuple(JL_STREAM *out, jl_tuple_t *t, char *opn, char *cls, return n; } -DLLEXPORT size_t jl_static_show(JL_STREAM *out, jl_value_t *v) +#define MAX_DEPTH 5 + +size_t jl_static_show_x(JL_STREAM *out, jl_value_t *v, int depth) { // mimic jl_show, but never calling a julia method size_t n = 0; + if(depth > MAX_DEPTH) return 0; // cheap way of bailing out of cycles + depth++; if (v == NULL) { n += JL_PRINTF(out, "#"); } + else if(jl_typeof(v) == NULL) { + n += JL_PRINTF(out, ""); + } else if (jl_is_lambda_info(v)) { jl_lambda_info_t *li = (jl_lambda_info_t*)v; - n += jl_static_show(out, (jl_value_t*)li->module); + n += jl_static_show_x(out, (jl_value_t*)li->module, depth); n += JL_PRINTF(out, ".%s", li->name->name); if (li->specTypes) { - n += jl_static_show(out, (jl_value_t*)li->specTypes); + n += jl_static_show_x(out, (jl_value_t*)li->specTypes, depth); } else { n += JL_PRINTF(out, "(?)"); } } else if (jl_is_tuple(v)) { - n += jl_show_tuple(out, (jl_tuple_t*)v, "(", ")", 1); + n += jl_show_tuple(out, (jl_tuple_t*)v, "(", ")", 1, depth); } else if (jl_is_vararg_type(v)) { - n += jl_static_show(out, jl_tparam0(v)); + n += jl_static_show_x(out, jl_tparam0(v), depth); n += JL_PRINTF(out, "..."); } else if (jl_is_datatype(v)) { jl_datatype_t *dv = (jl_datatype_t*)v; if (dv->name->module != jl_core_module) { - n += jl_static_show(out, (jl_value_t*)dv->name->module); + n += jl_static_show_x(out, (jl_value_t*)dv->name->module, depth); JL_PUTS(".", out); n += 1; } n += JL_PRINTF(out, "%s", dv->name->name->name); @@ -1162,7 +1180,7 @@ DLLEXPORT size_t jl_static_show(JL_STREAM *out, jl_value_t *v) n += JL_PRINTF(out, "{"); for (j = 0; j < tlen; j++) { jl_value_t *p = jl_tupleref(dv->parameters,j); - n += jl_static_show(out, p); + n += jl_static_show_x(out, p, depth); if (j != tlen-1) n += JL_PRINTF(out, ", "); } @@ -1232,10 +1250,10 @@ DLLEXPORT size_t jl_static_show(JL_STREAM *out, jl_value_t *v) } else if (jl_is_uniontype(v)) { n += JL_PRINTF(out, "Union"); - n += jl_static_show(out, (jl_value_t*)((jl_uniontype_t*)v)->types); + n += jl_static_show_x(out, (jl_value_t*)((jl_uniontype_t*)v)->types, depth); } else if (jl_is_typector(v)) { - n += jl_static_show(out, ((jl_typector_t*)v)->body); + n += jl_static_show_x(out, ((jl_typector_t*)v)->body, depth); } else if (jl_is_typevar(v)) { n += jl_static_show(out, ((jl_tvar_t*)v)->lb); @@ -1245,7 +1263,7 @@ DLLEXPORT size_t jl_static_show(JL_STREAM *out, jl_value_t *v) else if (jl_is_module(v)) { jl_module_t *m = (jl_module_t*)v; if (m->parent != m && m->parent != jl_main_module) { - n += jl_static_show(out, (jl_value_t*)m->parent); + n += jl_static_show_x(out, (jl_value_t*)m->parent, depth); n += JL_PRINTF(out, "."); } n += JL_PRINTF(out, "%s", m->name->name); @@ -1255,13 +1273,13 @@ DLLEXPORT size_t jl_static_show(JL_STREAM *out, jl_value_t *v) } else if (jl_is_symbolnode(v)) { n += JL_PRINTF(out, "%s::", jl_symbolnode_sym(v)->name); - n += jl_static_show(out, jl_symbolnode_type(v)); + n += jl_static_show_x(out, jl_symbolnode_type(v), depth); } else if (jl_is_getfieldnode(v)) { - n += jl_static_show(out, jl_getfieldnode_val(v)); + n += jl_static_show_x(out, jl_getfieldnode_val(v), depth); n += JL_PRINTF(out, ".%s", 
jl_getfieldnode_name(v)->name); n += JL_PRINTF(out, "::"); - n += jl_static_show(out, jl_getfieldnode_type(v)); + n += jl_static_show_x(out, jl_getfieldnode_type(v), depth); } else if (jl_is_labelnode(v)) { n += JL_PRINTF(out, "%d:", jl_labelnode_label(v)); @@ -1272,12 +1290,17 @@ DLLEXPORT size_t jl_static_show(JL_STREAM *out, jl_value_t *v) else if (jl_is_quotenode(v)) { jl_value_t *qv = jl_fieldref(v,0); if (!jl_is_symbol(qv)) { n += JL_PRINTF(out, "quote "); } - n += jl_static_show(out, qv); + n += jl_static_show_x(out, jl_fieldref(v,0), depth); if (!jl_is_symbol(qv)) { n += JL_PRINTF(out, " end"); } } + else if (jl_is_newvarnode(v)) { + n += JL_PRINTF(out, ""); + } else if (jl_is_topnode(v)) { n += JL_PRINTF(out, "top("); - n += jl_static_show(out, jl_fieldref(v,0)); + n += jl_static_show_x(out, jl_fieldref(v,0), depth); n += JL_PRINTF(out, ")"); } else if (jl_is_linenode(v)) { @@ -1286,9 +1309,9 @@ DLLEXPORT size_t jl_static_show(JL_STREAM *out, jl_value_t *v) else if (jl_is_expr(v)) { jl_expr_t *e = (jl_expr_t*)v; if (e->head == assign_sym && jl_array_len(e->args) == 2) { - n += jl_static_show(out, jl_exprarg(e,0)); + n += jl_static_show_x(out, jl_exprarg(e,0), depth); n += JL_PRINTF(out, " = "); - n += jl_static_show(out, jl_exprarg(e,1)); + n += jl_static_show_x(out, jl_exprarg(e,1), depth); } else { char sep = ' '; @@ -1298,14 +1321,14 @@ DLLEXPORT size_t jl_static_show(JL_STREAM *out, jl_value_t *v) size_t i, len = jl_array_len(e->args); for (i = 0; i < len; i++) { n += JL_PRINTF(out, ",%c", sep); - n += jl_static_show(out, jl_exprarg(e,i)); + n += jl_static_show_x(out, jl_exprarg(e,i), depth); } n += JL_PRINTF(out, ")::"); - n += jl_static_show(out, e->etype); + n += jl_static_show_x(out, e->etype, depth); } } else if (jl_is_array(v)) { - n += jl_static_show(out, jl_typeof(v)); + n += jl_static_show_x(out, jl_typeof(v), depth); n += JL_PRINTF(out, "["); size_t j, tlen = jl_array_len(v); for (j = 0; j < tlen; j++) { @@ -1314,29 +1337,30 @@ DLLEXPORT size_t jl_static_show(JL_STREAM *out, jl_value_t *v) elt = jl_cellref(v, j); else elt = jl_arrayref((jl_array_t*)v,j); - n += jl_static_show(out, elt); + n += jl_static_show_x(out, elt, depth); if (j != tlen-1) n += JL_PRINTF(out, ", "); } + if(j < tlen) n += JL_PRINTF(out, " ..."); n += JL_PRINTF(out, "]"); } else if (jl_typeis(v,jl_loaderror_type)) { n += JL_PRINTF(out, "LoadError(at "); - n += jl_static_show(out, jl_fieldref(v, 0)); + n += jl_static_show_x(out, jl_fieldref(v, 0), depth); n += JL_PRINTF(out, " line "); - n += jl_static_show(out, jl_fieldref(v, 1)); + n += jl_static_show_x(out, jl_fieldref(v, 1), depth); n += JL_PRINTF(out, ": "); - n += jl_static_show(out, jl_fieldref(v, 2)); + n += jl_static_show_x(out, jl_fieldref(v, 2), depth); n += JL_PRINTF(out, ")"); } else if (jl_typeis(v,jl_errorexception_type)) { n += JL_PRINTF(out, "ErrorException("); - n += jl_static_show(out, jl_fieldref(v, 0)); + n += jl_static_show_x(out, jl_fieldref(v, 0), depth); n += JL_PRINTF(out, ")"); } else if (jl_is_datatype(jl_typeof(v))) { jl_datatype_t *t = (jl_datatype_t*)jl_typeof(v); - n += jl_static_show(out, (jl_value_t*)t); + n += jl_static_show_x(out, (jl_value_t*)t, depth); n += JL_PRINTF(out, "("); size_t nb = jl_datatype_size(t); size_t tlen = jl_tuple_len(t->names); @@ -1354,7 +1378,7 @@ DLLEXPORT size_t jl_static_show(JL_STREAM *out, jl_value_t *v) //jl_fielddesc_t f = t->fields[i]; n += JL_PRINTF(out, "="); fldval = jl_get_nth_field(v, i); - n += jl_static_show(out, fldval); + n += jl_static_show_x(out, fldval, depth); if (i != 
tlen-1) n += JL_PRINTF(out, ", "); } @@ -1364,12 +1388,19 @@ DLLEXPORT size_t jl_static_show(JL_STREAM *out, jl_value_t *v) } else { n += JL_PRINTF(out, ""); } return n; } + +DLLEXPORT size_t jl_static_show(JL_STREAM *out, jl_value_t *v) +{ + return jl_static_show_x(out, v, 0); +} + + int in_jl_ = 0; DLLEXPORT void jl_(void *jl_value) { diff --git a/src/cgutils.cpp b/src/cgutils.cpp index 7f2b519c34c97..cce4f7eed52d9 100644 --- a/src/cgutils.cpp +++ b/src/cgutils.cpp @@ -688,6 +688,8 @@ static Value *emit_typeof(Value *p) tt = builder. CreateLoad(builder.CreateGEP(tt,ConstantInt::get(T_size,0)), false); + tt = builder. + CreateIntToPtr(builder. CreateAnd(builder.CreatePtrToInt(tt, T_int64), ConstantInt::get(T_int64,~(uptrint_t)3)), jl_pvalue_llvmt); #ifdef OVERLAP_TUPLE_LEN tt = builder. CreateIntToPtr(builder. @@ -908,8 +910,8 @@ static Value *typed_load(Value *ptr, Value *idx_0based, jl_value_t *jltype, static Value *emit_unbox(Type *to, Value *x, jl_value_t *jt); -static void typed_store(Value *ptr, Value *idx_0based, Value *rhs, - jl_value_t *jltype, jl_codectx_t *ctx) +static Value *typed_store(Value *ptr, Value *idx_0based, Value *rhs, + jl_value_t *jltype, jl_codectx_t *ctx, Value* parent) // for the write barrier, NULL if no barrier needed { Type *elty = julia_type_to_llvm(jltype); assert(elty != NULL); @@ -918,8 +920,10 @@ static void typed_store(Value *ptr, Value *idx_0based, Value *rhs, if (elty==T_int1) { elty = T_int8; } if (jl_isbits(jltype) && ((jl_datatype_t*)jltype)->size > 0) rhs = emit_unbox(elty, rhs, jltype); - else + else { rhs = boxed(rhs,ctx); + if(parent != NULL) emit_write_barrier(ctx, parent, rhs); + } Value *data; if (ptr->getType()->getContainedType(0) != elty) data = builder.CreateBitCast(ptr, PointerType::get(elty, 0)); @@ -1653,3 +1657,17 @@ static void emit_cpointercheck(Value *x, const std::string &msg, ctx->f->getBasicBlockList().push_back(passBB); builder.SetInsertPoint(passBB); } + +// allocation for known size object +static Value* emit_allocobj(size_t static_size) +{ + if (static_size == sizeof(void*)*2) + return builder.CreateCall(prepare_call(jlalloc2w_func)); + else if (static_size == sizeof(void*)*3) + return builder.CreateCall(prepare_call(jlalloc3w_func)); + else if (static_size == sizeof(void*)*4) + return builder.CreateCall(prepare_call(jlalloc4w_func)); + else + return builder.CreateCall(prepare_call(jlallocobj_func), + ConstantInt::get(T_size, static_size)); +} diff --git a/src/codegen.cpp b/src/codegen.cpp index 1e275ac6459f4..62a03ddc6a15c 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -73,7 +73,7 @@ #include "llvm/IR/MDBuilder.h" #define LLVM33 1 #else -#include "llvm/DerivedTypes.h" +#include "llvm/DerivedTypes.h" #include "llvm/LLVMContext.h" #include "llvm/Module.h" #include "llvm/Intrinsics.h" @@ -277,6 +277,7 @@ static GlobalVariable *jldomerr_var; static GlobalVariable *jlovferr_var; static GlobalVariable *jlinexacterr_var; static GlobalVariable *jlboundserr_var; + static GlobalVariable *jlstderr_var; static GlobalVariable *jlRTLD_DEFAULT_var; #ifdef _OS_WINDOWS_ @@ -308,6 +309,7 @@ static Function *jlegal_func; static Function *jlallocobj_func; static Function *jlalloc2w_func; static Function *jlalloc3w_func; +static Function *jlalloc4w_func; static Function *jl_alloc_tuple_func; static Function *jlsubtype_func; static Function *setjmp_func; @@ -327,6 +329,9 @@ static Function *box16_func; static Function *box32_func; static Function *box64_func; static Function *jlputs_func; +static Function *wbfunc; +static Function 
*queuerootfun; +static Function *expect_func; static Function *jldlsym_func; static Function *jlnewbits_func; //static Function *jlgetnthfield_func; @@ -341,6 +346,11 @@ static Function *show_execution_point_func; static std::vector two_pvalue_llvmt; static std::vector three_pvalue_llvmt; +extern "C" DLLEXPORT void gc_wb_slow(void* parent, void* ptr) +{ + gc_wb(parent, ptr); +} + // --- code generation --- // per-local-variable information @@ -540,6 +550,8 @@ jl_value_t *jl_get_cpu_name(void) return jl_pchar_to_string(HostCPUName.data(), HostCPUName.size()); } +static void emit_write_barrier(jl_codectx_t*,Value*,Value*); + #include "cgutils.cpp" static void jl_rethrow_with_add(const char *fmt, ...) @@ -1346,6 +1358,65 @@ static Value *emit_boxed_rooted(jl_value_t *e, jl_codectx_t *ctx) return v; } +// if ptr is NULL this emits a write barrier _back_ +static void emit_write_barrier(jl_codectx_t* ctx, Value *parent, Value *ptr) +{ + #ifdef GC_INC + /* builder.CreateCall2(wbfunc, builder.CreateBitCast(parent, jl_pvalue_llvmt), builder.CreateBitCast(ptr, jl_pvalue_llvmt)); + return;*/ + parent = builder.CreateBitCast(parent, T_psize); + Value* parent_type = builder.CreateLoad(parent); + Value* parent_mark_bits = builder.CreateAnd(parent_type, 3); + + // the branch hint does not seem to make it to the generated code + //builder.CreateCall2(expect_func, parent_marked, ConstantInt::get(T_int1, 0)); + Value* parent_marked = builder.CreateICmpEQ(parent_mark_bits, ConstantInt::get(T_size, 1)); + + BasicBlock* cont = BasicBlock::Create(getGlobalContext(), "cont"); + BasicBlock* barrier_may_trigger; + if (ptr) barrier_may_trigger = BasicBlock::Create(getGlobalContext(), "wb_may_trigger", ctx->f); + BasicBlock* barrier_trigger = BasicBlock::Create(getGlobalContext(), "wb_trigger", ctx->f); + builder.CreateCondBr(parent_marked, ptr ? barrier_may_trigger : barrier_trigger, cont); + + if (ptr) { + builder.SetInsertPoint(barrier_may_trigger); + Value* ptr_mark_bits = builder.CreateAnd(builder.CreateLoad(builder.CreateBitCast(ptr, T_psize)), 3); + Value* ptr_not_marked = builder.CreateICmpEQ(ptr_mark_bits, ConstantInt::get(T_size, 0)); + // builder.CreateCall2(expect_func, ptr_not_marked, ConstantInt::get(T_int1, 0)); + builder.CreateCondBr(ptr_not_marked, barrier_trigger, cont); + } + builder.SetInsertPoint(barrier_trigger); + if (!ptr) { // clear the mark + builder.CreateStore(builder.CreateAnd(parent_type, ~(uintptr_t)3), parent); + } + builder.CreateCall(queuerootfun, ptr ? 
ptr : builder.CreateBitCast(parent, jl_pvalue_llvmt)); + builder.CreateBr(cont); + ctx->f->getBasicBlockList().push_back(cont); + builder.SetInsertPoint(cont); + #endif +} + +static void emit_checked_write_barrier(jl_codectx_t *ctx, Value *parent, Value *ptr) +{ +#ifdef GC_INC + BasicBlock *cont; + if (ptr) { + Value *not_null = builder.CreateICmpNE(ptr, V_null); + BasicBlock *if_not_null = BasicBlock::Create(getGlobalContext(), "wb_not_null", ctx->f); + cont = BasicBlock::Create(getGlobalContext(), "cont"); + builder.CreateCondBr(not_null, if_not_null, cont); + builder.SetInsertPoint(if_not_null); + } + emit_write_barrier(ctx, parent, ptr); + if (ptr) { + builder.CreateBr(cont); + ctx->f->getBasicBlockList().push_back(cont); + builder.SetInsertPoint(cont); + } +#endif +} + + // --- lambda --- static void jl_add_linfo_root(jl_lambda_info_t *li, jl_value_t *val) @@ -1353,6 +1424,7 @@ static void jl_add_linfo_root(jl_lambda_info_t *li, jl_value_t *val) li = li->def; if (li->roots == NULL) { li->roots = jl_alloc_cell_1d(1); + gc_wb(li, li->roots); jl_cellset(li->roots, 0, val); } else { @@ -1538,7 +1610,7 @@ static Value *emit_getfield(jl_value_t *expr, jl_sym_t *name, jl_codectx_t *ctx) } static void emit_setfield(jl_datatype_t *sty, Value *strct, size_t idx, - Value *rhs, jl_codectx_t *ctx, bool checked=true) + Value *rhs, jl_codectx_t *ctx, bool checked, bool wb) { if (sty->mutabl || !checked) { Value *addr = @@ -1546,11 +1618,13 @@ static void emit_setfield(jl_datatype_t *sty, Value *strct, size_t idx, ConstantInt::get(T_size, sty->fields[idx].offset + sizeof(void*))); jl_value_t *jfty = jl_tupleref(sty->types, idx); if (sty->fields[idx].isptr) { - builder.CreateStore(boxed(rhs,ctx), + rhs = boxed(rhs, ctx); + builder.CreateStore(rhs, builder.CreateBitCast(addr, jl_ppvalue_llvmt)); + if (wb) emit_checked_write_barrier(ctx, strct, rhs); } else { - typed_store(addr, ConstantInt::get(T_size, 0), rhs, jfty, ctx); + typed_store(addr, ConstantInt::get(T_size, 0), rhs, jfty, ctx, strct); } } else { @@ -1925,13 +1999,12 @@ static Value *emit_known_call(jl_value_t *ff, jl_value_t **args, size_t nargs, #else size_t nwords = nargs+2; #endif - Value *tup = - builder.CreateCall(prepare_call(jlallocobj_func), - ConstantInt::get(T_size, sizeof(void*)*nwords)); + Value *tup = emit_allocobj(sizeof(void*)*nwords); #ifdef OVERLAP_TUPLE_LEN builder.CreateStore(arg1, emit_nthptr_addr(tup, 1)); #else builder.CreateStore(arg1, emit_nthptr_addr(tup, 2)); + emit_write_barrier(ctx, tup, arg1); #endif ctx->argDepth = last_depth; #ifdef OVERLAP_TUPLE_LEN @@ -1968,6 +2041,7 @@ static Value *emit_known_call(jl_value_t *ff, jl_value_t **args, size_t nargs, } Value *argi = boxed(argval,ctx); builder.CreateStore(argi, emit_nthptr_addr(tup, i+offs)); + emit_write_barrier(ctx, tup, argi); } ctx->argDepth = last_depth; JL_GC_POP(); @@ -2094,9 +2168,13 @@ static Value *emit_known_call(jl_value_t *ff, jl_value_t **args, size_t nargs, emit_expr(args[2],ctx,false); } else { + Value* v = ety==(jl_value_t*)jl_any_type ? emit_expr(args[2],ctx) : emit_unboxed(args[2],ctx); typed_store(emit_arrayptr(ary,args[1],ctx), idx, - ety==(jl_value_t*)jl_any_type ? emit_expr(args[2],ctx) : emit_unboxed(args[2],ctx), - ety, ctx); + v, + ety, ctx, /*ety == (jl_value_t*)jl_any_type ? 
ary : */NULL); + if (ety == (jl_value_t*)jl_any_type) { + emit_write_barrier(ctx, ary, NULL); + } } JL_GC_POP(); return ary; @@ -2209,7 +2287,7 @@ static Value *emit_known_call(jl_value_t *ff, jl_value_t **args, size_t nargs, rhs = emit_expr(args[3], ctx); else rhs = emit_unboxed(args[3], ctx); - emit_setfield(sty, strct, idx, rhs, ctx); + emit_setfield(sty, strct, idx, rhs, ctx, true, true); JL_GC_POP(); return rhs; } @@ -2415,6 +2493,14 @@ static Value *global_binding_pointer(jl_module_t *m, jl_sym_t *s, return julia_binding_gv(b); } +static bool is_stack(Value *v) +{ + if (isa(v)) return true; + GetElementPtrInst *i = dyn_cast(v); + if (i && is_stack(i->getOperand(0))) return true; + return false; +} + // yields a jl_value_t** giving the binding location of a variable static Value *var_binding_pointer(jl_sym_t *s, jl_binding_t **pbnd, bool assign, jl_codectx_t *ctx) @@ -2575,7 +2661,9 @@ static void emit_assignment(jl_value_t *l, jl_value_t *r, jl_codectx_t *ctx) rval = emit_unbox(vt->getContainedType(0), emit_unboxed(r, ctx), vi.declType); } else { - rval = boxed(emit_expr(r, ctx, true),ctx,rt); + rval = boxed(emit_expr(r, ctx, true), ctx); + Value* box = builder.CreateGEP(bp, ConstantInt::get(T_size, -1)); + if (!is_stack(bp)) emit_write_barrier(ctx, box, rval); } if (builder.GetInsertBlock()->getTerminator() == NULL) { builder.CreateStore(rval, bp, vi.isVolatile); @@ -2891,16 +2979,13 @@ static Value *emit_expr(jl_value_t *expr, jl_codectx_t *ctx, bool isboxed, if (might_need_root(args[1]) || fval->getType() != jl_pvalue_llvmt) make_gcroot(f1, ctx); } - Value *strct = - builder.CreateCall(prepare_call(jlallocobj_func), - ConstantInt::get(T_size, - sizeof(void*)+sty->size)); + Value *strct = emit_allocobj(sizeof(void*)+sty->size); builder.CreateStore(literal_pointer_val((jl_value_t*)ty), emit_nthptr_addr(strct, (size_t)0)); if (f1) { if (!jl_subtype(expr_type(args[1],ctx), jl_t0(sty->types), 0)) emit_typecheck(f1, jl_t0(sty->types), "new", ctx); - emit_setfield(sty, strct, 0, f1, ctx, false); + emit_setfield(sty, strct, 0, f1, ctx, false, false); ctx->argDepth = fieldStart; if (nf > 1 && needroots) make_gcroot(strct, ctx); @@ -2910,7 +2995,7 @@ static Value *emit_expr(jl_value_t *expr, jl_codectx_t *ctx, bool isboxed, } for(size_t i=j; i < nf; i++) { if (sty->fields[i].isptr) { - emit_setfield(sty, strct, i, V_null, ctx, false); + emit_setfield(sty, strct, i, V_null, ctx, false, false); } } for(size_t i=j+1; i < nargs; i++) { @@ -2926,7 +3011,7 @@ static Value *emit_expr(jl_value_t *expr, jl_codectx_t *ctx, bool isboxed, if (!jl_subtype(expr_type(args[i],ctx), jl_tupleref(sty->types,i-1), 0)) emit_typecheck(rhs, jl_tupleref(sty->types,i-1), "new", ctx); } - emit_setfield(sty, strct, i-1, rhs, ctx, false); + emit_setfield(sty, strct, i-1, rhs, ctx, false, false); } ctx->argDepth = fieldStart; return strct; @@ -4003,6 +4088,7 @@ static Function *emit_function(jl_lambda_info_t *lam, bool cstyle) ctx.dbuilder->finalize(); JL_GC_POP(); + return f; } @@ -4093,6 +4179,7 @@ extern "C" DLLEXPORT jl_value_t *jl_new_box(jl_value_t *v) #else box->type = jl_box_any_type; #endif + if(v) gc_wb(box, v); ((jl_value_t**)box)[1] = v; return box; } @@ -4335,6 +4422,22 @@ static void init_julia_llvm_env(Module *m) jlcall_func_to_llvm("jl_apply_generic", (void*)&jl_apply_generic, m); jlgetfield_func = jlcall_func_to_llvm("jl_f_get_field", (void*)&jl_f_get_field, m); + std::vector wbargs(0); + wbargs.push_back(jl_pvalue_llvmt); + wbargs.push_back(jl_pvalue_llvmt); + queuerootfun = 
Function::Create(FunctionType::get(T_void, args_1ptr, false), + Function::ExternalLinkage, + "gc_queue_root", m); + jl_ExecutionEngine->addGlobalMapping(queuerootfun, (void*)&gc_queue_root); + wbfunc = Function::Create(FunctionType::get(T_void, wbargs, false), + Function::ExternalLinkage, + "gc_wb_slow", m); + jl_ExecutionEngine->addGlobalMapping(wbfunc, (void*)&gc_wb_slow); + + std::vector exp_args(0); + exp_args.push_back(T_int1); + expect_func = Intrinsic::getDeclaration(m, Intrinsic::expect, exp_args); + std::vector args3(0); args3.push_back(jl_pvalue_llvmt); jlbox_func = @@ -4449,6 +4552,12 @@ static void init_julia_llvm_env(Module *m) "alloc_3w", m); add_named_global(jlalloc3w_func, (void*)&alloc_3w); + jlalloc4w_func = + Function::Create(FunctionType::get(jl_pvalue_llvmt, empty_args, false), + Function::ExternalLinkage, + "alloc_4w", m); + add_named_global(jlalloc4w_func, (void*)&alloc_4w); + std::vector atargs(0); atargs.push_back(T_size); jl_alloc_tuple_func = @@ -4658,6 +4767,7 @@ extern "C" void jl_init_codegen(void) #endif options.NoFramePointerElim = true; #ifndef LLVM34 + options.JITExceptionHandling = 1; options.NoFramePointerElimNonLeaf = true; #endif #if defined(_OS_WINDOWS_) && !defined(_CPU_X86_64_) diff --git a/src/dump.c b/src/dump.c index 50c8f57e9c6c2..bd1535a569cd9 100644 --- a/src/dump.c +++ b/src/dump.c @@ -352,8 +352,10 @@ static int is_ast_node(jl_value_t *v) { if (jl_is_lambda_info(v)) { jl_lambda_info_t *li = (jl_lambda_info_t*)v; - if (jl_is_expr(li->ast)) + if (jl_is_expr(li->ast)) { li->ast = jl_compress_ast(li, li->ast); + gc_wb(li, li->ast); + } return 0; } return jl_is_symbol(v) || jl_is_expr(v) || jl_is_newvarnode(v) || @@ -639,7 +641,9 @@ static jl_value_t *jl_deserialize_datatype(ios_t *s, int pos) dt->alignment = read_int32(s); ios_read(s, (char*)&dt->fields[0], nf*sizeof(jl_fielddesc_t)); dt->names = (jl_tuple_t*)jl_deserialize_value(s); + gc_wb(dt, dt->names); dt->types = (jl_tuple_t*)jl_deserialize_value(s); + gc_wb(dt, dt->types); } else { dt->alignment = dt->size; @@ -648,10 +652,15 @@ static jl_value_t *jl_deserialize_datatype(ios_t *s, int pos) dt->names = dt->types = jl_null; } dt->parameters = (jl_tuple_t*)jl_deserialize_value(s); + gc_wb(dt, dt->parameters); dt->name = (jl_typename_t*)jl_deserialize_value(s); + gc_wb(dt, dt->name); dt->super = (jl_datatype_t*)jl_deserialize_value(s); + gc_wb(dt, dt->super); dt->env = jl_deserialize_value(s); + gc_wb(dt, dt->env); dt->linfo = (jl_lambda_info_t*)jl_deserialize_value(s); + if(dt->linfo != NULL) gc_wb(dt, dt->linfo); dt->fptr = jl_deserialize_fptr(s); if (dt->name == jl_array_type->name || dt->name == jl_pointer_type->name || dt->name == jl_type_type->name || dt->name == jl_vararg_type->name || @@ -757,6 +766,7 @@ static jl_value_t *jl_deserialize_value(ios_t *s) else { for(i=0; i < jl_array_len(a); i++) { ((jl_value_t**)a->data)[i] = jl_deserialize_value(s); + if(((jl_value_t**)a->data)[i] != NULL) gc_wb(a, ((jl_value_t**)a->data)[i]); } } return (jl_value_t*)a; @@ -772,6 +782,7 @@ static jl_value_t *jl_deserialize_value(ios_t *s) if (usetable) ptrhash_put(&backref_table, (void*)(ptrint_t)pos, (jl_value_t*)e); e->etype = jl_deserialize_value(s); + gc_wb(e, e->etype); for(i=0; i < len; i++) { jl_cellset(e->args, i, jl_deserialize_value(s)); } @@ -782,8 +793,11 @@ static jl_value_t *jl_deserialize_value(ios_t *s) if (usetable) ptrhash_put(&backref_table, (void*)(ptrint_t)pos, tv); tv->name = (jl_sym_t*)jl_deserialize_value(s); + gc_wb(tv, tv->name); tv->lb = jl_deserialize_value(s); + gc_wb(tv, 
tv->lb); tv->ub = jl_deserialize_value(s); + gc_wb(tv, tv->ub); tv->bound = read_int8(s); return (jl_value_t*)tv; } @@ -793,7 +807,9 @@ static jl_value_t *jl_deserialize_value(ios_t *s) if (usetable) ptrhash_put(&backref_table, (void*)(ptrint_t)pos, f); f->linfo = (jl_lambda_info_t*)jl_deserialize_value(s); + if(f->linfo != NULL) gc_wb(f, f->linfo); f->env = jl_deserialize_value(s); + gc_wb(f, f->env); f->fptr = jl_deserialize_fptr(s); return (jl_value_t*)f; } @@ -804,18 +820,30 @@ static jl_value_t *jl_deserialize_value(ios_t *s) if (usetable) ptrhash_put(&backref_table, (void*)(ptrint_t)pos, li); li->ast = jl_deserialize_value(s); + gc_wb(li, li->ast); li->sparams = (jl_tuple_t*)jl_deserialize_value(s); + gc_wb(li, li->sparams); li->tfunc = jl_deserialize_value(s); + gc_wb(li, li->tfunc); li->name = (jl_sym_t*)jl_deserialize_value(s); + gc_wb(li, li->name); li->specTypes = (jl_tuple_t*)jl_deserialize_value(s); + if(li->specTypes != NULL) gc_wb(li, li->specTypes); li->specializations = (jl_array_t*)jl_deserialize_value(s); + if(li->specializations != NULL) gc_wb(li, li->specializations); li->inferred = read_int8(s); li->file = (jl_sym_t*)jl_deserialize_value(s); + gc_wb(li, li->file); li->line = read_int32(s); li->module = (jl_module_t*)jl_deserialize_value(s); + gc_wb(li, li->module); li->roots = (jl_array_t*)jl_deserialize_value(s); + if(li->roots != NULL) gc_wb(li, li->roots); li->def = (jl_lambda_info_t*)jl_deserialize_value(s); + gc_wb(li, li->def); li->capt = jl_deserialize_value(s); + if(li->capt != NULL) gc_wb(li, li->capt); + li->fptr = &jl_trampoline; li->functionObject = NULL; li->cFunctionObject = NULL; @@ -836,14 +864,19 @@ static jl_value_t *jl_deserialize_value(ios_t *s) if (usetable) ptrhash_put(&backref_table, (void*)(ptrint_t)pos, m); m->parent = (jl_module_t*)jl_deserialize_value(s); + gc_wb(m, m->parent); while (1) { jl_sym_t *name = (jl_sym_t*)jl_deserialize_value(s); if (name == NULL) break; jl_binding_t *b = jl_get_binding_wr(m, name); b->value = jl_deserialize_value(s); + gc_wb_buf(m, b); + if(b->value != NULL) gc_wb(m, b->value); b->type = (jl_value_t*)jl_deserialize_value(s); + gc_wb(m, b->type); b->owner = (jl_module_t*)jl_deserialize_value(s); + if(b->owner != NULL) gc_wb(m, b->owner); int8_t flags = read_int8(s); b->constp = (flags>>2) & 1; b->exportp = (flags>>1) & 1; @@ -855,6 +888,7 @@ static jl_value_t *jl_deserialize_value(ios_t *s) arraylist_push(&m->usings, jl_deserialize_value(s)); } m->constant_table = (jl_array_t*)jl_deserialize_value(s); + if(m->constant_table != NULL) gc_wb(m, m->constant_table); return (jl_value_t*)m; } else if (vtag == (jl_value_t*)SmallInt64_tag) { @@ -1041,6 +1075,7 @@ void jl_restore_system_image(char *fname) datatype_list = jl_alloc_cell_1d(0); jl_array_type->env = jl_deserialize_value(&f); + gc_wb(jl_array_type, jl_array_type->env); jl_main_module = (jl_module_t*)jl_deserialize_value(&f); jl_internal_main_module = jl_main_module; @@ -1141,10 +1176,13 @@ jl_value_t *jl_compress_ast(jl_lambda_info_t *li, jl_value_t *ast) int en = jl_gc_is_enabled(); jl_gc_disable(); - if (li->module->constant_table == NULL) + if (li->module->constant_table == NULL) { li->module->constant_table = jl_alloc_cell_1d(0); + gc_wb(li->module, li->module->constant_table); + } tree_literal_values = li->module->constant_table; li->capt = (jl_value_t*)jl_lam_capt((jl_expr_t*)ast); + gc_wb(li, li->capt); if (jl_array_len(li->capt) == 0) li->capt = NULL; jl_serialize_value(&dest, jl_lam_body((jl_expr_t*)ast)->etype); diff --git a/src/gc.c b/src/gc.c index 
5661e39f674b5..4daa4695bccda 100644 --- a/src/gc.c +++ b/src/gc.c @@ -13,76 +13,165 @@ #include #include +#include #include -#ifdef USE_MMAP -# include -# include -#endif #include "julia.h" #include "julia_internal.h" +#ifndef _OS_WINDOWS_ +#include +#endif -#ifdef _P64 -# ifdef USE_MMAP -# define GC_PAGE_SZ 16384//bytes -# else -# define GC_PAGE_SZ 12288//bytes -# endif -#else -#define GC_PAGE_SZ 8192//bytes +//#define GC_VERIFY + + +#if defined(GC_TRACK_ESC) && !defined(GC_INC) +#undef GC_TRACK_ESC +#warning GC_TRACK_ESC requires GC_INC #endif #ifdef __cplusplus extern "C" { #endif +/*#ifdef _P64 +#define GC_PAGE_SZ (1536*sizeof(void*))//bytes +#else*/ +#define GC_PG_LG2 14 +#define GC_PAGE_SZ (4*4096) // ((1 << GC_PAGE_W) - 16) +#define SYS_PAGE_SZ 4096 + +#define PAGE_COOKIE 0xD1CED0 +#pragma pack(push, 1) +// the cookie field must be before the page data +// becaue we will be doing GC_PAGE(v)->cookie for +// some v not in a page and it must not segfault typedef struct _gcpage_t { - char data[GC_PAGE_SZ]; union { - struct _gcpage_t *next; - char _pad[8]; + uint32_t cookie; + struct { + // this is a bitwise | of all gc_bits in this page + uint32_t gc_bits : 2; + // if this is 1, the freelist in this page contains only 2 cells. + // one is the first free cell, it points to the last cell of the page + // every cell in between is free + uint32_t linear : 1; + }; }; + uint16_t nfree; + uint16_t nmarked; + + struct _gcpage_t *next; + // struct _gcpage_t **prev; // point to the next field of the previous page + char *data; // this is not strictly necessary + uint16_t osize; + + struct { + char bits; + } old[GC_PAGE_SZ/(8*8)]; // one bit per object } gcpage_t; -typedef struct _gcval_t { +#define PAGE_GROUP_COUNT 31 +// We pack pages by groups of 31 which means a little less than 512k = 32*4 vm pages +#define PAGE_GROUP_LG2 19 +#define PAGE_GROUP_SZ 1 << PAGE_GROUP_LG2 + +typedef struct { union { - struct _gcval_t *next; - uptrint_t flags; - uptrint_t data0; // overlapped - uptrint_t marked:1; + gcpage_t pages[PAGE_GROUP_COUNT]; + char _pad[GC_PAGE_SZ]; }; -} gcval_t; + char data[PAGE_GROUP_COUNT][GC_PAGE_SZ]; +} gcpages_t; -typedef struct _pool_t { - size_t osize; - gcpage_t *pages; - gcval_t *freelist; -} pool_t; +#define GC_PAGES(x) ((gcpage_t*)(((uintptr_t)x) >> PAGE_GROUP_LG2 << PAGE_GROUP_LG2)) +#define GC_PAGE_IDX(x) (((uintptr_t)(x) - (uintptr_t)GC_PAGES(x) - GC_PAGE_SZ)/GC_PAGE_SZ) +#define GC_PAGE(x) (gcpage_t*)(&(GC_PAGES(x)[GC_PAGE_IDX(x)])) +#define GC_PAGE_DATA(x) ((char*)((uintptr_t)(x) >> GC_PG_LG2 << GC_PG_LG2)) +#define GC_POOL_END_OFS(osize) (((GC_PAGE_SZ/osize) - 1)*osize) + +#define REGION_PG_COUNT 2*1024 + +typedef struct { + // union { + // uint32_t freemap[REGION_PG_COUNT/32]; + uint32_t freemap[SYS_PAGE_SZ/4]; + // char _pad[SYS_PAGE_SZ]; + // }; + char pages[REGION_PG_COUNT][GC_PAGE_SZ]; +} region_t; + +#define BVOFFS 4 -#ifdef _P64 -# define BVOFFS 2 -#else -# define BVOFFS 4 -#endif typedef struct _bigval_t { struct _bigval_t *next; + struct _bigval_t **prev; // pointer to the next field of the prev entry size_t sz; -#ifndef _P64 uptrint_t _pad0; - uptrint_t _pad1; -#endif + // must be 16-aligned here, in 32 & 64b + union { uptrint_t flags; - uptrint_t marked:1; + uptrint_t gc_bits:2; char _data[1]; }; } bigval_t; +#define bigval_header(data) ((bigval_t*)((char*)(data) - BVOFFS*sizeof(void*))) + +#pragma pack(pop) + +typedef struct { + union { + uintptr_t header; + struct { + uintptr_t gc_bits:2; + uintptr_t pooled:1; + }; + }; + char data[]; +} buff_t; + +typedef 
struct _gcval_t { + union { + struct _gcval_t *next; + uptrint_t flags; + uptrint_t gc_bits:2; + }; +} gcval_t; + +typedef struct _pool_t { + gcval_t *freelist ; + int32_t fl_linear; + int32_t nfree; + // size_t end_offset; // avoid to compute this at each allocation + gcpage_t *pages; + gcpage_t *needsweep; + uint16_t osize; +} pool_t; +#define HEAP_COUNT 64 +static region_t *heaps[HEAP_COUNT] = {NULL}; + //static int free_lb = 0; + // GC knobs and self-measurement variables -static size_t allocd_bytes = 0; -static int64_t total_allocd_bytes = 0; static int64_t last_gc_total_bytes = 0; +/*static size_t allocd_bytes = 0; +static int64_t total_allocd_bytes = 0; +static size_t allocd_bytes_since_sweep = 0; static size_t freed_bytes = 0; static uint64_t total_gc_time=0; +static size_t live_bytes = 0; +static size_t scanned_bytes = 0; +static size_t scanned_bytes_goal; +static size_t current_pg_count = 0; +static size_t max_pg_count = 0;*/ + +#ifdef GC_INC +static int gc_inc_steps = 1; +static int gc_quick_steps = 1; +static int gc_sweep_steps = 1; +#else +static const int gc_inc_steps = 1; +#endif #ifdef _P64 #define default_collect_interval (5600*1024*sizeof(void*)) static size_t max_collect_interval = 1250000000UL; @@ -90,23 +179,183 @@ static size_t max_collect_interval = 1250000000UL; #define default_collect_interval (3200*1024*sizeof(void*)) static size_t max_collect_interval = 500000000UL; #endif -static size_t collect_interval = default_collect_interval; +// keep those 3 together +static int allocd_bytes; +static size_t collect_interval; +static int gc_steps; +#define N_POOLS 42 +static __attribute__((aligned (64))) pool_t norm_pools[N_POOLS]; +static pool_t ephe_pools[N_POOLS]; +static const pool_t *pools = &norm_pools[0]; + +static int64_t total_allocd_bytes = 0; +static size_t allocd_bytes_since_sweep = 0; +static size_t freed_bytes = 0; +static uint64_t total_gc_time=0; +static size_t live_bytes = 0; +static size_t current_pg_count = 0; +static size_t max_pg_count = 0; + int jl_in_gc; // referenced from switchto task.c #ifdef OBJPROFILE -static htable_t obj_counts; +static htable_t obj_counts[2]; #endif #ifdef GC_FINAL_STATS +static double page_alloc_time=0; static size_t total_freed_bytes=0; +static double max_pause = 0.0; +static double total_sweep_time=0; +static double total_mark_time=0; +static double total_fin_time=0; #endif +static int n_pause = 0; // manipulating mark bits -#define gc_marked(o) (((gcval_t*)(o))->marked) -#define gc_setmark(o) (((gcval_t*)(o))->marked=1) +#define GC_CLEAN 0 +#define GC_MARKED 1 +#define GC_QUEUED 2 +#define GC_MARKED_NOESC (GC_MARKED | GC_QUEUED) + +int sweeping = 0; + +#ifdef GC_INC +size_t scanned_bytes; +static int prev_sweep_mask = GC_MARKED; +static size_t scanned_bytes_goal; +#else +const int prev_sweep_mask = GC_MARKED; +#endif + +#define gc_bits(o) ((gcval_t*)(o))->gc_bits +#define gc_marked(o) (((gcval_t*)(o))->gc_bits & GC_MARKED) +#define _gc_setmark(o, mark_mode) (((gcval_t*)(o))->gc_bits = mark_mode) + +// mark verification +#ifdef GC_VERIFY +static jl_value_t* lostval = 0; +static arraylist_t lostval_parents; +static arraylist_t lostval_parents_done; + +static void add_lostval_parent(jl_value_t* parent) +{ + for(int i = 0; i < lostval_parents_done.len; i++) { + if((jl_value_t*)lostval_parents_done.items[i] == parent) + return; + } + for(int i = 0; i < lostval_parents.len; i++) { + if((jl_value_t*)lostval_parents.items[i] == parent) + return; + } + arraylist_push(&lostval_parents, parent); +} + +#define verify_val(v) do { \ + 
if(lostval == (jl_value_t*)(v) && (v) != 0) { \ + JL_PRINTF(JL_STDOUT, "Found lostval 0x%lx at %s:%d\n", \ + (uintptr_t)(lostval), __FILE__, __LINE__); \ + } \ + } while(0); + + +#define verify_parent(ty, obj, slot, args...) do { \ + if(*(jl_value_t**)(slot) == lostval && (obj) != lostval) { \ + JL_PRINTF(JL_STDOUT, "Found parent %s 0x%lx at %s:%d\n", \ + ty, (uintptr_t)(obj), __FILE__, __LINE__); \ + JL_PRINTF(JL_STDOUT, "\tloc 0x%lx : ", (uintptr_t)(slot)); \ + JL_PRINTF(JL_STDOUT, args); \ + JL_PRINTF(JL_STDOUT, "\n"); \ + add_lostval_parent((jl_value_t*)(obj)); \ + } \ + } while(0); + +#else +#define verify_val(v) +#define verify_parent(ty,obj,slot,args...) +#endif + +static bigval_t *big_objects = NULL; +static bigval_t *big_objects_marked = NULL; + +static inline void objprofile_count(void* v, int old) +{ +#ifdef OBJPROFILE + if (jl_typeof(v) <= 0x10) return; + void **bp = ptrhash_bp(&obj_counts[old], jl_typeof(v)); + if (*bp == HT_NOTFOUND) + *bp = (void*)2; + else + (*((ptrint_t*)bp))++; +#endif +} + +static inline int gc_setmark_other(void *o, int mark_mode) +{ + _gc_setmark(o, mark_mode); + verify_val(o); + return mark_mode; +} + +static inline int gc_setmark_big(void *o, int mark_mode) +{ + bigval_t* hdr = bigval_header(o); + if ((mark_mode == GC_MARKED) & (gc_bits(o) != GC_MARKED)) { + *hdr->prev = hdr->next; + if (hdr->next) + hdr->next->prev = hdr->prev; + hdr->next = big_objects_marked; + hdr->prev = &big_objects_marked; + if (big_objects_marked) + big_objects_marked->prev = &hdr->next; + big_objects_marked = hdr; + } + _gc_setmark(o, mark_mode); + verify_val(o); + return mark_mode; +} + +static inline int gc_setmark_pool(void *o, int mark_mode) +{ + gcpage_t* page = GC_PAGE(o); + int obj_i = ((uintptr_t)o - (uintptr_t)page->data)/page->osize; + if (page->old[obj_i/8].bits & (1 << (obj_i % 8))) { + _gc_setmark(o, GC_MARKED); + mark_mode = GC_MARKED; + } + else { + if (!gc_marked(o)) + page->old[obj_i/8].bits |= 1 << (obj_i % 8); + _gc_setmark(o, mark_mode); + } + page->nmarked += (mark_mode == GC_MARKED); + page->cookie |= gc_bits(o); + verify_val(o); + return mark_mode; +} + + +static inline int gc_setmark(void *o, int sz, int mark_mode) +{ + if(sz <= 2048) + return gc_setmark_pool(o, mark_mode); + else + return gc_setmark_big(o, mark_mode); +} + +#define gc_typeof(v) ((jl_value_t*)(((uptrint_t)jl_typeof(v))&(~(uintptr_t)3))) #define gc_val_buf(o) ((gcval_t*)(((void**)(o))-1)) -#define gc_setmark_buf(o) gc_setmark(gc_val_buf(o)) -#define gc_typeof(v) ((jl_value_t*)(((uptrint_t)jl_typeof(v))&~1UL)) + +inline void gc_setmark_buf(void *o) +{ + buff_t *buf = (buff_t*)gc_val_buf(o); + // buffers are always old but it does not matter since they do not contain any reference + // directly, it is handled by the parent object + if (buf->pooled) + gc_setmark_pool(buf, GC_MARKED); + else + gc_setmark_big(buf, GC_MARKED); +} // malloc wrappers, aligned allocation @@ -134,10 +383,117 @@ static inline void *malloc_a16(size_t sz) #endif -DLLEXPORT void *jl_gc_counted_malloc(size_t sz) +static __attribute__((noinline)) void *malloc_page(void) +{ + void *ptr = (void*)0; + int i; +#ifdef GC_FINAL_STATS + double t0 = clock_now(); +#endif + region_t* heap; + int heap_i = 0; + while(heap_i < HEAP_COUNT) { + heap = heaps[heap_i]; + if (heap == NULL) { +#ifdef _OS_WINDOWS_ + char* mem = VirtualAlloc(NULL, sizeof(region_t) + GC_PAGE_SZ, MEM_RESERVE, PAGE_READWRITE); +#else + char* mem = mmap(NULL, sizeof(region_t) + GC_PAGE_SZ*32, PROT_READ | PROT_WRITE, MAP_NORESERVE | MAP_PRIVATE | MAP_ANONYMOUS, 
-1, 0); + mem = mem == MAP_FAILED ? NULL : mem; +#endif + if (mem == NULL) { + jl_printf(JL_STDERR, "could not allocate pools\n"); + abort(); + } + heap = (region_t*)((char*)GC_PAGES(mem + SYS_PAGE_SZ + GC_PAGE_SZ*32 - 1) - SYS_PAGE_SZ); + heaps[heap_i] = heap; +#ifdef _OS_WINDOWS_ + VirtualAlloc(heap->freemap, REGION_PG_COUNT/8, MEM_COMMIT, PAGE_READWRITE); +#endif + memset(heap->freemap, 0xff, REGION_PG_COUNT/8); + } + heap_i++; + for(i = 0; i < REGION_PG_COUNT/32; i++) { + if (heap->freemap[i]) break; + } + if (i == REGION_PG_COUNT/32) { + // heap full + continue; + } + break; + } + if (heap_i >= HEAP_COUNT) { + jl_printf(JL_STDERR, "increase HEAP_COUNT or allocate less memory\n"); + abort(); + } + int j = (ffs(heap->freemap[i]) - 1); + heap->freemap[i] &= ~(uint32_t)(1 << j); + if (j == 0) { // reserve a page for metadata (every 31 data pages) + j++; + heap->freemap[i] &= ~(uint32_t)(1 << j); + #ifdef _OS_WINDOWS_ + VirtualAlloc(heap->pages[32*i], GC_PAGE_SZ, MEM_COMMIT, PAGE_READWRITE); + #endif + } + ptr = heap->pages[i*32 + j]; +#ifdef _OS_WINDOWS_ + VirtualAlloc(ptr, GC_PAGE_SZ, MEM_COMMIT, PAGE_READWRITE); +#endif + current_pg_count++; + max_pg_count = max_pg_count < current_pg_count ? current_pg_count : max_pg_count; +#ifdef GC_FINAL_STATS + page_alloc_time += clock_now() - t0; +#endif + return ptr; +} + +static inline void free_page(void *p) { - if (allocd_bytes > collect_interval) + int pg_idx; + int i; + for(i = 0; i < HEAP_COUNT && heaps[i] != NULL; i++) { + pg_idx = ((uintptr_t)p - (uintptr_t)heaps[i]->pages[0])/GC_PAGE_SZ; + if (pg_idx >= 0 && pg_idx < 8*SYS_PAGE_SZ) break; + } + assert(i < HEAP_COUNT && heaps[i] != NULL); + region_t *heap = heaps[i]; + uint32_t msk = (uint32_t)(1 << ((pg_idx % 32))); + assert(!(heap->freemap[pg_idx/32] & msk)); + heap->freemap[pg_idx/32] ^= msk; +#ifdef _OS_WINDOWS_ + VirtualFree(p, GC_PAGE_SZ, MEM_DECOMMIT); +#else + madvise(p, GC_PAGE_SZ, MADV_DONTNEED); +#endif + if (heap->freemap[pg_idx/32] == ~(uint32_t)1) { // free the metadata page + heap->freemap[pg_idx/32] = ~(uint32_t)0; +#ifdef _OS_WINDOWS_ + VirtualFree(&heap->pages[pg_idx], GC_PAGE_SZ, MEM_DECOMMIT); +#else + madvise(&heap->pages[pg_idx], GC_PAGE_SZ, MADV_DONTNEED); +#endif + } + current_pg_count--; +} + +#ifdef GC_INC +//#define maybe_collect() if (__unlikely(T.allocd_bytes/**gc_steps*/ > collect_interval)) jl_gc_collect() +#define should_collect() (__unlikely(allocd_bytes > 0)) +static inline int maybe_collect(void) +{ + if (should_collect()) { jl_gc_collect(); + return 1; + } + return 0; +} +#else +#define maybe_collect() if (__unlikely(allocd_bytes > collect_interval)) jl_gc_collect() +#endif + +DLLEXPORT void *jl_gc_counted_malloc(size_t sz) +{ + maybe_collect(); allocd_bytes += sz; void *b = malloc(sz); if (b == NULL) @@ -153,8 +509,7 @@ DLLEXPORT void jl_gc_counted_free(void *p, size_t sz) DLLEXPORT void *jl_gc_counted_realloc(void *p, size_t sz) { - if (allocd_bytes > collect_interval) - jl_gc_collect(); + maybe_collect(); allocd_bytes += ((sz+1)/2); // NOTE: wild guess at growth amount void *b = realloc(p, sz); if (b == NULL) @@ -164,8 +519,7 @@ DLLEXPORT void *jl_gc_counted_realloc(void *p, size_t sz) DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size_t sz) { - if (allocd_bytes > collect_interval) - jl_gc_collect(); + maybe_collect(); if (sz > old) allocd_bytes += (sz-old); void *b = realloc(p, sz); @@ -176,8 +530,7 @@ DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size_t void *jl_gc_managed_malloc(size_t sz) { - 
if (allocd_bytes > collect_interval) - jl_gc_collect(); + maybe_collect(); sz = (sz+15) & -16; void *b = malloc_a16(sz); if (b == NULL) @@ -188,8 +541,7 @@ void *jl_gc_managed_malloc(size_t sz) void *jl_gc_managed_realloc(void *d, size_t sz, size_t oldsz, int isaligned) { - if (allocd_bytes > collect_interval) - jl_gc_collect(); + maybe_collect(); sz = (sz+15) & -16; void *b; #ifdef _P64 @@ -349,12 +701,9 @@ void jl_gc_add_finalizer(jl_value_t *v, jl_function_t *f) // big value list -static bigval_t *big_objects = NULL; - -static void *alloc_big(size_t sz) +static __attribute__((noinline)) void *alloc_big(size_t sz) { - if (allocd_bytes > collect_interval) - jl_gc_collect(); + maybe_collect(); size_t offs = BVOFFS*sizeof(void*); if (sz+offs+15 < offs+15) // overflow in adding offs, size was "negative" jl_throw(jl_memory_exception); @@ -364,35 +713,65 @@ static void *alloc_big(size_t sz) if (v == NULL) jl_throw(jl_memory_exception); #ifdef MEMDEBUG - //memset(v, 0xee, allocsz); + memset(v, 0xee, allocsz); #endif v->sz = sz; v->flags = 0; v->next = big_objects; + v->prev = &big_objects; + if (v->next) + v->next->prev = &v->next; big_objects = v; - return &v->_data[0]; + void* ptr = &v->_data[0]; + return ptr; } -static void sweep_big(void) +static int big_total; +static int big_freed; +static int big_reset; + +static jl_value_t** sweep_big_list(int sweep_mask, bigval_t** pv) { - bigval_t *v = big_objects; - bigval_t **pv = &big_objects; + bigval_t *v = *pv; while (v != NULL) { bigval_t *nxt = v->next; - if (v->marked) { + if (gc_marked(&v->_data)) { pv = &v->next; - v->marked = 0; + if ((gc_bits(&v->_data) & sweep_mask) == sweep_mask) { + gc_bits(&v->_data) = GC_CLEAN; + big_reset++; + } } else { *pv = nxt; + if (nxt) + nxt->prev = pv; freed_bytes += v->sz; #ifdef MEMDEBUG memset(v, 0xbb, v->sz+BVOFFS*sizeof(void*)); #endif free_a16(v); + big_freed++; } + big_total++; v = nxt; } + return pv; +} + +static void sweep_big(int sweep_mask) +{ + sweep_big_list(sweep_mask, &big_objects); + if (sweep_mask == GC_MARKED) { + jl_value_t** last_next = sweep_big_list(sweep_mask, &big_objects_marked); + if (big_objects) + big_objects->prev = last_next; + *last_next = big_objects; + big_objects = big_objects_marked; + if (big_objects) + big_objects->prev = &big_objects; + big_objects_marked = NULL; + } } // tracking Arrays with malloc'd storage @@ -440,7 +819,11 @@ void jl_gc_free_array(jl_array_t *a) } } -static void sweep_malloced_arrays() +static int mallocd_array_total; +static int mallocd_array_freed; + + +static void sweep_malloced_arrays(void) { mallocarray_t *ma = mallocarrays; mallocarray_t **pma = &mallocarrays; @@ -455,63 +838,94 @@ static void sweep_malloced_arrays() jl_gc_free_array(ma->a); ma->next = mafreelist; mafreelist = ma; + mallocd_array_freed++; } + mallocd_array_total++; ma = nxt; } } // pool allocation -#define N_POOLS 42 -static pool_t norm_pools[N_POOLS]; -static pool_t ephe_pools[N_POOLS]; -static pool_t *pools = &norm_pools[0]; +static inline gcval_t *reset_page(pool_t *p, gcpage_t *pg, gcval_t *fl) +{ + pg->nfree = GC_PAGE_SZ/p->osize; + pg->nmarked = 0; + pg->cookie = PAGE_COOKIE; + memset(pg->old, 0x0, GC_PAGE_SZ/(8*8)); + gcval_t *beg = (gcval_t*)pg->data; + gcval_t *end = (gcval_t*)((char*)beg + (pg->nfree - 1)*p->osize); + // madvise(beg, GC_PAGE_SZ, MADV_FREE); + end->next = fl; + pg->linear = 1; + return beg; +} -static void add_page(pool_t *p) +static __attribute__((noinline)) void add_page(pool_t *p) { -#ifdef USE_MMAP - gcpage_t *pg = (gcpage_t*)mmap(NULL, 
sizeof(gcpage_t), PROT_READ|PROT_WRITE, - MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); -#else - gcpage_t *pg = (gcpage_t*)malloc_a16(sizeof(gcpage_t)); -#endif - if (pg == NULL) + //gcpage_t *pg = (gcpage_t*)malloc_a16(sizeof(gcpage_t)); + void *data = malloc_page(); + if (data == NULL) jl_throw(jl_memory_exception); - gcval_t *v = (gcval_t*)&pg->data[0]; - char *lim = (char*)v + GC_PAGE_SZ - p->osize; - gcval_t *fl; - gcval_t **pfl = &fl; - while ((char*)v <= lim) { - *pfl = v; - pfl = &v->next; - v = (gcval_t*)((char*)v + p->osize); - } + gcpage_t *pg = GC_PAGE(data); +// jl_printf(JL_STDOUT, "add page [%d] : 0x%lx 0x%lx = 0x%lx hdr 0x%lx\n", GC_PAGE_IDX(data), pg, data, (uintptr_t)data - (uintptr_t)pg, GC_PAGES(data)); + pg->data = data; + pg->osize = p->osize; + gcval_t *fl = reset_page(p, pg, p->freelist); + p->nfree += pg->nfree; // these statements are ordered so that interrupting after any of them // leaves the system in a valid state - *pfl = p->freelist; pg->next = p->pages; p->pages = pg; p->freelist = fl; + p->fl_linear = 1; } -static inline void *pool_alloc(pool_t *p) +/*static inline void *_pool_alloc_fast(pool_t* p, int osize, int end_offset) { - if (allocd_bytes > collect_interval) - jl_gc_collect(); - allocd_bytes += p->osize; - if (p->freelist == NULL) { + gcval_t *v = p->freelist; + p->nfree--; + end = &(GC_PAGE_DATA(v)[end_offset]); + linear = (v != end) & p->fl_linear; + gcval_t *next_lin = (gcval_t*)((char*)v + osize); + allocd_bytes += osize; + p->freelist = next_lin; + }*/ + +static inline void *_pool_alloc(pool_t* p, int osize) +{ + gcval_t *v, *end; + int end_offset = GC_POOL_END_OFS(osize); + int ab = allocd_bytes; + p->nfree--; + allocd_bytes += osize; + maybe_collect(); + if (__unlikely(!p->freelist)) { add_page(p); } - assert(p->freelist != NULL); - gcval_t *v = p->freelist; - p->freelist = p->freelist->next; - v->flags = 0; + + v = p->freelist; + p->freelist = (char*)p->freelist + osize; + end = &(GC_PAGE_DATA(v)[end_offset]); + if (__unlikely(!((v != end) & (!!p->fl_linear)))) { + p->freelist = v->next; + if (p->freelist) + p->fl_linear = GC_PAGE(p->freelist)->linear; + } + // p->freelist = next; + // v->flags = 0; + // pg->nfree--; return v; } +static inline void *pool_alloc(pool_t *p) +{ + return _pool_alloc(p, p->osize); +} + static int szclass(size_t sz) { -#ifndef _P64 + #ifndef _P64 if (sz <= 8) return 0; #endif if (sz <= 56) return ((sz+3)/4) - 2; @@ -525,7 +939,23 @@ static int szclass(size_t sz) return 41; } -static void sweep_pool(pool_t *p) +#ifdef GC_INC +int check_timeout = 0; +#define should_timeout() (check_timeout && scanned_bytes >= scanned_bytes_goal) +#else +#define should_timeout() 0 +#endif + + +static int skipped_pages = 0; +static int total_pages = 0; +static int freed_pages = 0; +static int lazy_freed_pages = 0; +static int page_done = 0; +static int obj_old = 0; +static int obj_young = 0; + +static void sweep_pool(pool_t *p, int sweep_mask) { #ifdef FREE_PAGES_EAGER int freedall; @@ -534,115 +964,238 @@ static void sweep_pool(pool_t *p) #endif gcval_t **prev_pfl; gcval_t *v; - gcpage_t *pg = p->pages; - gcpage_t **ppg = &p->pages; + gcpage_t *pg = p->needsweep; + gcpage_t **ppg = &p->needsweep; gcval_t **pfl = &p->freelist; size_t osize = p->osize; - size_t nfreed = 0; - - size_t old_nfree = 0; - gcval_t *ofl = p->freelist; - while (ofl != NULL) { - old_nfree++; - ofl = ofl->next; - } - + size_t old_nfree = p->nfree; + int pg_freedall = 0, pg_total = 0; + int stats[4] = {0, 0, 0, 0}; + int pg_skpd = 0, pg_wont_skip = 0; + int obj_per_page = 
GC_PAGE_SZ/osize; + int whole_page = 0; while (pg != NULL) { - v = (gcval_t*)&pg->data[0]; + // if ((pg->cookie & ~(uint32_t)7) != PAGE_COOKIE) + // abort(); + v = (gcval_t*)pg->data; char *lim = (char*)v + GC_PAGE_SZ - osize; -#ifdef FREE_PAGES_EAGER freedall = 1; -#else - empty = 1; -#endif prev_pfl = pfl; + + if (gc_bits(&pg->cookie) == GC_MARKED) { + // skip + if (sweep_mask == GC_MARKED_NOESC && pg->nmarked > 0) { + freedall = 0; + pg_skpd++; + goto free_page; + } + } + else if(gc_bits(&pg->cookie) == GC_CLEAN) { + // if (whole_page) + // p->nfree += obj_per_page; // overestimation + // else + p->nfree++; // underestimation + whole_page = 1; + lazy_freed_pages++; + goto free_page; + } + int obj_i = 0; + pg->nmarked = 0; while ((char*)v <= lim) { - if (!v->marked) { -#ifndef FREE_PAGES_EAGER - // check that all but last object points to its next object, - // which is a heuristic check for being on the freelist. - if ((char*)v->next != (char*)v + osize && v->next != NULL && - (char*)v+osize <= lim) - empty = 0; -#endif + stats[gc_bits(v)]++; + // we can encounter a queued value at this point + // if a write barrier was moved back between two + // sweeping increments + if (!gc_marked(v) & (gc_bits(v) != GC_QUEUED)) { *pfl = v; pfl = &v->next; - nfreed++; + p->nfree++; + pg->old[obj_i/8].bits &= ~(1 << (obj_i % 8)); } else { - v->marked = 0; -#ifdef FREE_PAGES_EAGER + if ((sweep_mask & gc_bits(v)) == sweep_mask) + gc_bits(v) = GC_CLEAN; freedall = 0; -#else - empty = 0; -#endif } v = (gcval_t*)((char*)v + osize); + obj_i++; } - gcpage_t *nextpg = pg->next; + page_done++; + free_page: + // nfreed += this_page_nfree; + // pg->nfree = this_page_nfree; + if (sweep_mask == GC_MARKED) + pg->nmarked = 0; + pg_freedall += freedall; + + // lazy version: (empty) if the whole page was already unused, free it // eager version: (freedall) free page as soon as possible // the eager one uses less memory. 
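+ // in this incremental scheme the page cookie doubles as a page-level mark: + // a GC_MARKED page that still holds survivors (pg->nmarked > 0) can be + // skipped wholesale during a quick (GC_MARKED_NOESC) sweep, and a GC_CLEAN + // page is counted as lazily freed and released or reset without scanning + // its objects.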
- if ( -#ifdef FREE_PAGES_EAGER - freedall -#else - empty -#endif - ) { - pfl = prev_pfl; - *ppg = nextpg; -#ifdef MEMDEBUG - memset(pg, 0xbb, sizeof(gcpage_t)); -#endif -#ifdef USE_MMAP - munmap(pg, sizeof(gcpage_t)); -#else - free_a16(pg); -#endif - //freed_bytes += GC_PAGE_SZ; + gcpage_t *nextpg; + pg_total++; + nextpg = pg->next; + if (freedall) { + if (prev_sweep_mask == GC_MARKED_NOESC) { + gcval_t *begin = reset_page(p, pg, *prev_pfl); + p->nfree += pg->nfree; + pfl = (gcval_t**)((char*)begin + (pg->nfree - 1)*p->osize); + *prev_pfl = begin; + ppg = &pg->next; + } + else { + pfl = prev_pfl; + *ppg = nextpg; + #ifdef MEMDEBUG + memset(pg, 0xbb, sizeof(gcpage_t)); + #endif + free_page(pg->data); + } + freed_pages++; } else { + gc_bits(&pg->cookie) = GC_MARKED; ppg = &pg->next; + pg->linear = 0; + } + if (should_timeout() && nextpg) { + pg->next = NULL; + pg = nextpg; + break; } + scanned_bytes += GC_PAGE_SZ; pg = nextpg; } + //gcpage_t* pgs = p->pages; + *ppg = p->pages; + p->pages = p->needsweep; + if (pg == NULL) { + p->needsweep = NULL; + } else { + p->needsweep = pg; + } + skipped_pages += pg_skpd; + total_pages += pg_total; *pfl = NULL; - freed_bytes += (nfreed - old_nfree)*osize; + if (p->freelist) { + p->fl_linear = GC_PAGE(p->freelist)->linear; + } + /* if (stats[0] + stats[1] + stats[2] + stats[2] > 0) + jl_printf(JL_STDOUT, "Pool : %d %d %d %d\n", stats[0], stats[1], stats[2], stats[3]);*/ + freed_bytes += (p->nfree - old_nfree)*osize; } // sweep phase extern void jl_unmark_symbols(void); -static void gc_sweep(void) + +// if mark_bits & sweep_mask == sweep_mask we reset the mark while sweeping the heap +static void gc_sweep_once(int sweep_mask) { +#ifdef GC_TIME + double t0 = clock_now(); + mallocd_array_total = 0; + mallocd_array_freed = 0; +#endif sweep_malloced_arrays(); - sweep_big(); +#ifdef GC_TIME + JL_PRINTF(JL_STDOUT, "GC sweep arrays %.2f (freed %d/%d)\n", (clock_now() - t0)*1000, mallocd_array_freed, mallocd_array_total); + t0 = clock_now(); + big_total = 0; + big_freed = 0; + big_reset = 0; +#endif + sweep_big(sweep_mask); +#ifdef GC_TIME + JL_PRINTF(JL_STDOUT, "GC sweep big %.2f (freed %d/%d with %d rst)\n", (clock_now() - t0)*1000, big_freed, big_total, big_reset); + t0 = clock_now(); +#endif + if (sweep_mask == GC_MARKED) + jl_unmark_symbols(); +#ifdef GC_TIME + JL_PRINTF(JL_STDOUT, "GC sweep symbols %.2f\n", (clock_now() - t0)*1000); +#endif +} + +// returns 0 if not finished +static int gc_sweep_inc(int sweep_mask) +{ + double t0 = clock_now(); + skipped_pages = 0; + total_pages = 0; + freed_pages = 0; + lazy_freed_pages = 0; + page_done = 0; int i; + int finished = 1; +#ifdef GC_INC + int ct = check_timeout; + if (sweep_mask == GC_MARKED_NOESC || gc_steps == 1) check_timeout = 0; +#endif for(i=0; i < N_POOLS; i++) { - sweep_pool(&norm_pools[i]); - sweep_pool(&ephe_pools[i]); + sweep_pool(&norm_pools[i], sweep_mask); + finished &= !norm_pools[i].needsweep; + /* sweep_pool(&ephe_pools[i], sweep_mask); + finished &= !ephe_pools[i].needsweep;*/ } - jl_unmark_symbols(); +#ifdef GC_INC + check_timeout = ct; +#endif +#ifdef GC_TIME + JL_PRINTF(JL_STDOUT, "GC sweep pools %s %.2f (skipped %d%% of %d, done %d pgs, %d freed with %d lazily) mask %d\n", finished ? "end" : "inc", (clock_now() - t0)*1000, total_pages ? 
(skipped_pages*100)/total_pages : 0, total_pages, page_done, freed_pages, lazy_freed_pages, sweep_mask); +#endif + return finished; +} + +static void gc_sweep(int sweep_mask) +{ + gc_sweep_once(sweep_mask); + while (!gc_sweep_inc(sweep_mask)); } + + // mark phase -static jl_value_t **mark_stack = NULL; -static size_t mark_stack_size = 0; -static size_t mark_sp = 0; +jl_value_t **mark_stack = NULL; +jl_value_t **mark_stack_base = NULL; +size_t mark_stack_size = 0; +size_t mark_sp = 0; +size_t perm_marked = 0; -static void push_root(jl_value_t *v, int d); -#define gc_push_root(v,d) do { assert(v != NULL); if (!gc_marked(v)) { push_root((jl_value_t*)(v),d); } } while (0) +void grow_mark_stack(void) +{ + size_t newsz = mark_stack_size>0 ? mark_stack_size*2 : 32000; + size_t offset = mark_stack - mark_stack_base; + mark_stack_base = (jl_value_t**)realloc(mark_stack_base, newsz*sizeof(void*)); + if (mark_stack_base == NULL) { + JL_PRINTF(JL_STDERR, "Couldn't grow mark stack to %d\n", newsz); + exit(1); + } + mark_stack = mark_stack_base + offset; + mark_stack_size = newsz; +} -void jl_gc_setmark(jl_value_t *v) +DLLEXPORT void gc_queue_root(void *p) { - gc_setmark(v); + if(mark_sp + perm_marked >= mark_stack_size) grow_mark_stack(); + gc_bits((uintptr_t)p & ~(uintptr_t)3) = GC_QUEUED; + mark_stack[mark_sp++] = (jl_value_t*)p; } -static void gc_mark_stack(jl_gcframe_t *s, ptrint_t offset, int d) +#ifdef GC_INC +static arraylist_t tasks; +#endif +static void push_root(jl_value_t *v, int mark_mode, int d); +#define gc_push_root(v,mark_mode,d) do { assert((v) != NULL); verify_val(v); if ((!gc_marked(v)) | ((gc_bits(v) & mark_mode) != gc_bits(v))) { push_root((jl_value_t*)(v),mark_mode,d); } } while(0) + +void jl_gc_setmark(jl_value_t *v) // TODO rename this as it is misleading now +{ + gc_setmark_pool(v, GC_MARKED); +} + +static void gc_mark_stack(jl_value_t* ta, jl_gcframe_t *s, ptrint_t offset, int mark_mode, int d) { while (s != NULL) { s = (jl_gcframe_t*)((char*)s + offset); @@ -651,21 +1204,25 @@ static void gc_mark_stack(jl_gcframe_t *s, ptrint_t offset, int d) if (s->nroots & 1) { for(size_t i=0; i < nr; i++) { jl_value_t **ptr = (jl_value_t**)((char*)rts[i] + offset); + scanned_bytes += sizeof(void*); if (*ptr != NULL) - gc_push_root(*ptr, d); + gc_push_root(*ptr, mark_mode, d); } } else { for(size_t i=0; i < nr; i++) { - if (rts[i] != NULL) - gc_push_root(rts[i], d); + scanned_bytes += sizeof(void*); + if (rts[i] != NULL) { + verify_parent("task", ta, &rts[i], "stack(%d)", i); + gc_push_root(rts[i], mark_mode, d); + } } } s = s->prev; } } -static void gc_mark_module(jl_module_t *m, int d) +static void gc_mark_module(jl_module_t *m, int mark_mode, int d) { size_t i; void **table = m->bindings.table; @@ -673,10 +1230,15 @@ static void gc_mark_module(jl_module_t *m, int d) if (table[i] != HT_NOTFOUND) { jl_binding_t *b = (jl_binding_t*)table[i]; gc_setmark_buf(b); - if (b->value != NULL) - gc_push_root(b->value, d); + void* vb = gc_val_buf(b); + verify_parent("module", m, &vb, "binding_buff"); + scanned_bytes += sizeof(jl_binding_t); + if (b->value != NULL) { + verify_parent("module", m, &b->value, "binding(%s)", b->name->name); + gc_push_root(b->value, mark_mode, d); + } if (b->type != (jl_value_t*)jl_any_type) - gc_push_root(b->type, d); + gc_push_root(b->type, mark_mode, d); } } // this is only necessary because bindings for "using" modules @@ -684,22 +1246,14 @@ static void gc_mark_module(jl_module_t *m, int d) // after "using" it but before accessing it, this array might // contain the 
only reference. for(i=0; i < m->usings.len; i++) { - gc_push_root(m->usings.items[i], d); + gc_push_root(m->usings.items[i], mark_mode, d); } if (m->constant_table) - gc_push_root(m->constant_table, d); + gc_push_root(m->constant_table, mark_mode, d); } -static void gc_mark_task(jl_task_t *ta, int d) +static void gc_mark_task_stack(jl_task_t *ta, int mark_mode, int d) { - if (ta->parent) gc_push_root(ta->parent, d); - if (ta->last) gc_push_root(ta->last, d); - gc_push_root(ta->tls, d); - gc_push_root(ta->consumers, d); - gc_push_root(ta->donenotify, d); - gc_push_root(ta->exception, d); - if (ta->start) gc_push_root(ta->start, d); - if (ta->result) gc_push_root(ta->result, d); if (ta->stkbuf != NULL || ta == jl_current_task) { if (ta->stkbuf != NULL) gc_setmark_buf(ta->stkbuf); @@ -707,18 +1261,46 @@ static void gc_mark_task(jl_task_t *ta, int d) ptrint_t offset; if (ta == jl_current_task) { offset = 0; - gc_mark_stack(jl_pgcstack, offset, d); + gc_mark_stack((jl_value_t*)ta, jl_pgcstack, offset, mark_mode, d); } else { offset = (char *)ta->stkbuf - ((char *)ta->stackbase - ta->ssize); - gc_mark_stack(ta->gcstack, offset, d); + gc_mark_stack((jl_value_t*)ta, ta->gcstack, offset, mark_mode, d); } #else - gc_mark_stack(ta->gcstack, 0, d); + gc_mark_stack((jl_value_t*)ta, ta->gcstack, 0, mark_mode, d); #endif } } +static void mark_task_stacks(int mark_mode) { + for (int i = 0; i < tasks.len; i++) { + gc_mark_task_stack(tasks.items[i], mark_mode, 0); + } +} + +static void gc_mark_task(jl_task_t *ta, int mark_mode, int d) +{ + if (ta->parent) gc_push_root(ta->parent, mark_mode, d); + if (ta->last) gc_push_root(ta->last, mark_mode, d); + gc_push_root(ta->tls, mark_mode, d); + gc_push_root(ta->consumers, mark_mode, d); + gc_push_root(ta->donenotify, mark_mode, d); + gc_push_root(ta->exception, mark_mode, d); + if (ta->start) gc_push_root(ta->start, mark_mode, d); + if (ta->result) gc_push_root(ta->result, mark_mode, d); +#ifdef GC_INC + if (mark_mode == GC_MARKED_NOESC) { + gc_mark_task_stack(ta, mark_mode, d); + } else { + arraylist_push(&tasks, (void*)ta); + } +#else + gc_mark_task_stack(ta, mark_mode, d); +#endif +} + + // for chasing down unwanted references /* static jl_value_t *lookforme = NULL; @@ -727,51 +1309,59 @@ DLLEXPORT void jl_gc_lookfor(jl_value_t *v) { lookforme = v; } #define MAX_MARK_DEPTH 400 -static void push_root(jl_value_t *v, int d) +static void push_root(jl_value_t *v, int mark_mode, int d) { assert(v != NULL); jl_value_t *vt = (jl_value_t*)gc_typeof(v); + // gc_setmark(v); -#ifdef OBJPROFILE - if (!gc_marked(v)) { - void **bp = ptrhash_bp(&obj_counts, vt); - if (*bp == HT_NOTFOUND) - *bp = (void*)2; - else - (*((ptrint_t*)bp))++; + if (vt == (jl_value_t*)jl_weakref_type) { + mark_mode = gc_setmark(v, jl_datatype_size(jl_weakref_type), mark_mode); + goto ret; } -#endif - - gc_setmark(v); - - if (vt == (jl_value_t*)jl_weakref_type || - (jl_is_datatype(vt) && ((jl_datatype_t*)vt)->pointerfree)) { - return; + if ((jl_is_datatype(vt) && ((jl_datatype_t*)vt)->pointerfree)) { + int sz = jl_datatype_size(vt); + gc_setmark(v, sz, mark_mode); + scanned_bytes += sz; + goto ret; } if (d >= MAX_MARK_DEPTH) goto queue_the_root; + if (should_timeout()) + goto queue_the_root; + d++; // some values have special representations if (vt == (jl_value_t*)jl_tuple_type) { size_t l = jl_tuple_len(v); + mark_mode = gc_setmark(v, l*sizeof(void*) + sizeof(jl_tuple_t), mark_mode); jl_value_t **data = ((jl_tuple_t*)v)->data; for(size_t i=0; i < l; i++) { jl_value_t *elt = data[i]; - if (elt != NULL) 
- gc_push_root(elt, d); + scanned_bytes += sizeof(void*); + if (elt != NULL) { + verify_parent("tuple", v, &data[i], "elem(%d)", i); + gc_push_root(elt, mark_mode, d); + } } } else if (((jl_datatype_t*)(vt))->name == jl_array_typename) { jl_array_t *a = (jl_array_t*)v; + if (a->pooled) + mark_mode = gc_setmark_pool(a, mark_mode); + else + mark_mode = gc_setmark_big(a, mark_mode); if (a->how == 3) { jl_value_t *owner = jl_array_data_owner(a); - gc_push_root(owner, d); - return; + gc_push_root(owner, mark_mode, d); + goto ret; } else if (a->how == 1) { + void* val_buf = gc_val_buf((char*)a->data - a->offset*a->elsize); + verify_parent("array", v, &val_buf, "buffer ('loc' addr is meaningless)"); gc_setmark_buf((char*)a->data - a->offset*a->elsize); } if (a->ptrarray && a->data!=NULL) { @@ -785,47 +1375,99 @@ static void push_root(jl_value_t *v, int d) void *data = a->data; for(size_t i=0; i < l; i++) { jl_value_t *elt = ((jl_value_t**)data)[i]; - if (elt != NULL) gc_push_root(elt, d); + scanned_bytes += sizeof(void*); + if (elt != NULL){ + verify_parent("array", v, &((jl_value_t**)data)[i], "elem(%d)", i); + gc_push_root(elt, mark_mode, d); + } + // try to split large array marking + // if (should_timeout() && l > 1000) goto queue_the_root; } } } + else { + scanned_bytes += array_nbytes(a); + } } else if (vt == (jl_value_t*)jl_module_type) { - gc_mark_module((jl_module_t*)v, d); + mark_mode = gc_setmark(v, sizeof(jl_module_t), mark_mode); + gc_mark_module((jl_module_t*)v, mark_mode, d); + scanned_bytes += sizeof(jl_module_t); } else if (vt == (jl_value_t*)jl_task_type) { - gc_mark_task((jl_task_t*)v, d); + mark_mode = gc_setmark(v, sizeof(jl_task_t), mark_mode); + gc_mark_task((jl_task_t*)v, mark_mode, d); + scanned_bytes += sizeof(jl_task_t); } - else { + else if(vt == (jl_value_t*)jl_symbol_type) { + mark_mode = gc_setmark_other(v, mark_mode); // symbols are not pooled + } + else if( +#ifdef GC_VERIFY + // this check should not be needed but it helps catch corruption early + gc_typeof(vt) == (jl_value_t*)jl_datatype_type +#else + 1 +#endif + ) { jl_datatype_t *dt = (jl_datatype_t*)vt; + mark_mode = gc_setmark(v, jl_datatype_size(dt), mark_mode); int nf = (int)jl_tuple_len(dt->names); for(int i=0; i < nf; i++) { if (dt->fields[i].isptr) { - jl_value_t *fld = *(jl_value_t**)((char*)v + dt->fields[i].offset + sizeof(void*)); - if (fld) - gc_push_root(fld, d); + scanned_bytes += sizeof(void*); + jl_value_t **slot = (jl_value_t**)((char*)v + dt->fields[i].offset + sizeof(void*)); + jl_value_t *fld = *slot; + if (fld) { + verify_parent("object", v, slot, "field(%d)", i); + gc_push_root(fld, mark_mode, d); + } + } + else { + scanned_bytes += jl_field_size(dt, i); } } } +#ifdef GC_VERIFY + else { + JL_PRINTF(JL_STDOUT, "GC error (probable corruption):\n"); + jl_(vt); + abort(); + } +#endif + ret: + objprofile_count(v, gc_bits(v) == GC_MARKED ? 1 : 0); return; queue_the_root: - if (mark_sp >= mark_stack_size) { - size_t newsz = mark_stack_size>0 ? 
mark_stack_size*2 : 32000; - mark_stack = (jl_value_t**)realloc(mark_stack,newsz*sizeof(void*)); - if (mark_stack == NULL) exit(1); - mark_stack_size = newsz; - } - mark_stack[mark_sp++] = v; + scanned_bytes += sizeof(void*); + // save the mark mode in the lower bits of the pointer + gc_queue_root((void*)((uintptr_t)v | mark_mode)); } -static void visit_mark_stack() +static void visit_mark_stack_inc(int mark_mode) { - while (mark_sp > 0) { - push_root(mark_stack[--mark_sp], 0); + while(mark_sp > 0 && !should_timeout()) { + gcval_t* v = (gcval_t*)mark_stack[--mark_sp]; + // assert(gc_bits(v) == GC_QUEUED || gc_bits(v) == GC_MARKED || gc_bits(v) == GC_MARKED_NOESC); + int mode = ((uintptr_t)v & 3) ? ((uintptr_t)v & 3) : mark_mode; + push_root((jl_value_t*)((uintptr_t)v & ~(uintptr_t)3), mode, 0); } } +static void visit_mark_stack(int mark_mode) +{ +#ifdef GC_INC + int ct = check_timeout; + check_timeout = 0; +#endif + visit_mark_stack_inc(mark_mode); + // assert(!mark_sp); +#ifdef GC_INC + check_timeout = ct; +#endif +} + void jl_mark_box_caches(void); extern jl_value_t * volatile jl_task_arg_in_transit; @@ -837,76 +1479,291 @@ extern jl_module_t *jl_old_base_module; extern jl_array_t *typeToTypeId; extern jl_array_t *jl_module_init_order; -static void gc_mark(void) +static int inc_count = 0; +static int quick_count = 0; +static void pre_mark(int mark_mode) { - // mark all roots - - // active tasks - gc_push_root(jl_root_task, 0); - gc_push_root(jl_current_task, 0); - // modules - gc_push_root(jl_main_module, 0); - gc_push_root(jl_internal_main_module, 0); - gc_push_root(jl_current_module, 0); - if (jl_old_base_module) gc_push_root(jl_old_base_module, 0); + gc_push_root(jl_main_module, mark_mode, 0); + gc_push_root(jl_current_module, mark_mode, 0); + if (jl_old_base_module) gc_push_root(jl_old_base_module, mark_mode, 0); + gc_push_root(jl_internal_main_module, mark_mode, 0); + gc_push_root(jl_root_task, mark_mode, 0); + gc_push_root(jl_current_task, mark_mode, 0); // invisible builtin values - if (jl_an_empty_cell) gc_push_root(jl_an_empty_cell, 0); - gc_push_root(jl_exception_in_transit, 0); - gc_push_root(jl_task_arg_in_transit, 0); - gc_push_root(jl_unprotect_stack_func, 0); - gc_push_root(jl_bottom_func, 0); - gc_push_root(jl_typetype_type, 0); - gc_push_root(jl_tupletype_type, 0); - gc_push_root(typeToTypeId, 0); + if (jl_an_empty_cell) gc_push_root(jl_an_empty_cell, mark_mode, 0); + gc_push_root(jl_exception_in_transit, mark_mode, 0); + gc_push_root(jl_task_arg_in_transit, mark_mode, 0); + gc_push_root(typeToTypeId, mark_mode, 0); if (jl_module_init_order != NULL) - gc_push_root(jl_module_init_order, 0); - - // constants - gc_push_root(jl_null, 0); - gc_push_root(jl_true, 0); - gc_push_root(jl_false, 0); - - jl_mark_box_caches(); - + gc_push_root(jl_module_init_order, mark_mode, 0); + size_t i; // stuff randomly preserved for(i=0; i < preserved_values.len; i++) { - gc_push_root((jl_value_t*)preserved_values.items[i], 0); + gc_push_root((jl_value_t*)preserved_values.items[i], mark_mode, 0); } // objects currently being finalized for(i=0; i < to_finalize.len; i++) { - gc_push_root(to_finalize.items[i], 0); + gc_push_root(to_finalize.items[i], mark_mode, 0); + } + + // if (inc_count > 1) return; // the following roots are constant and will stay marked in between increments + jl_mark_box_caches(); + gc_push_root(jl_unprotect_stack_func, mark_mode, 0); + gc_push_root(jl_bottom_func, mark_mode, 0); + gc_push_root(jl_typetype_type, mark_mode, 0); + gc_push_root(jl_tupletype_type, mark_mode, 
0); + + // constants + gc_push_root(jl_null, mark_mode, 0); + gc_push_root(jl_true, mark_mode, 0); + gc_push_root(jl_false, mark_mode, 0); +} + +#ifdef GC_VERIFY +static arraylist_t bits_save[4]; + +// set all mark bits to bits +// record the state of the heap and can replay it in restore() +// restore _must_ be called as this will overwrite parts of the +// freelist in pools +static void clear_mark(int bits) +{ + size_t i; + pool_t* pool; + gcpage_t* pg; + gcval_t* pv; + for(int i = 0; i < 4; i++) + bits_save[i].len = 0; + + bigval_t *bigs[] = { big_objects, big_objects_marked }; + for (int i = 0; i < 2; i++) { + bigval_t *v = bigs[i]; + while (v != NULL) { + void* gcv = &v->_data; + arraylist_push(&bits_save[gc_bits(gcv)], gcv); + gc_bits(gcv) = bits; + v = v->next; + } } - visit_mark_stack(); + for(i = 0; i < 2*N_POOLS; i++) { + pool = i < N_POOLS ? &norm_pools[i] : &ephe_pools[i - N_POOLS]; + pg = pool->pages; + while (pg != NULL) { + pv = (gcval_t*)pg->data; + char *lim = (char*)pv + GC_PAGE_SZ - pool->osize; + while ((char*)pv <= lim) { + arraylist_push(&bits_save[gc_bits(pv)], pv); + gc_bits(pv) = bits; + pv = (gcval_t*)((char*)pv + pool->osize); + } + pg = pg->next; + } + } +} + +static void restore(void) +{ + for(int b = 0; b < 4; b++) { + for(int i = 0; i < bits_save[b].len; i++) { + gc_bits(bits_save[b].items[i]) = b; + } + } +} +#endif +static void post_mark(int mark_mode) +{ // find unmarked objects that need to be finalized. // this must happen last. - for(i=0; i < finalizer_table.size; i+=2) { + for(size_t i=0; i < finalizer_table.size; i+=2) { if (finalizer_table.table[i+1] != HT_NOTFOUND) { - jl_value_t *v = (jl_value_t*)finalizer_table.table[i]; + jl_value_t *v = finalizer_table.table[i]; if (!gc_marked(v)) { - jl_value_t *fin = (jl_value_t*)finalizer_table.table[i+1]; + jl_value_t *fin = finalizer_table.table[i+1]; if (gc_typeof(fin) == (jl_value_t*)jl_voidpointer_type) { - void *p = ((void**)fin)[1]; + void *p = jl_unbox_voidpointer(fin); if (p) ((void (*)(void*))p)(jl_data_ptr(v)); finalizer_table.table[i+1] = HT_NOTFOUND; continue; } - gc_push_root(v, 0); + gc_push_root(v, mark_mode, 0); schedule_finalization(v); } - gc_push_root(finalizer_table.table[i+1], 0); + gc_push_root(finalizer_table.table[i+1], mark_mode, 0); + } + } + visit_mark_stack(GC_MARKED); +} + +static void gc_mark(int finalize) +{ + // mark all roots + + // active tasks + gc_push_root(jl_root_task, GC_MARKED_NOESC, 0); + gc_push_root(jl_current_task, GC_MARKED_NOESC, 0); + + // modules + gc_push_root(jl_main_module, GC_MARKED_NOESC, 0); + gc_push_root(jl_internal_main_module, GC_MARKED_NOESC, 0); + gc_push_root(jl_current_module, GC_MARKED_NOESC, 0); + if (jl_old_base_module) gc_push_root(jl_old_base_module, GC_MARKED_NOESC, 0); + + // invisible builtin values + if (jl_an_empty_cell) gc_push_root(jl_an_empty_cell, GC_MARKED_NOESC, 0); + gc_push_root(jl_exception_in_transit, GC_MARKED_NOESC, 0); + gc_push_root(jl_task_arg_in_transit, GC_MARKED_NOESC, 0); + gc_push_root(jl_unprotect_stack_func, GC_MARKED_NOESC, 0); + gc_push_root(jl_bottom_func, GC_MARKED_NOESC, 0); + gc_push_root(jl_typetype_type, GC_MARKED_NOESC, 0); + gc_push_root(jl_tupletype_type, GC_MARKED_NOESC, 0); + gc_push_root(typeToTypeId, GC_MARKED_NOESC, 0); + if (jl_module_init_order != NULL) + gc_push_root(jl_module_init_order, GC_MARKED_NOESC, 0); + + // constants + gc_push_root(jl_null, GC_MARKED_NOESC, 0); + gc_push_root(jl_true, GC_MARKED_NOESC, 0); + gc_push_root(jl_false, GC_MARKED_NOESC, 0); + + jl_mark_box_caches(); + + size_t 
i; + + // stuff randomly preserved + for(i=0; i < preserved_values.len; i++) { + gc_push_root((jl_value_t*)preserved_values.items[i], GC_MARKED_NOESC, 0); + } + + // objects currently being finalized + for(i=0; i < to_finalize.len; i++) { + gc_push_root(to_finalize.items[i], GC_MARKED_NOESC, 0); + } + + visit_mark_stack(GC_MARKED_NOESC); + mark_task_stacks(GC_MARKED_NOESC); + visit_mark_stack(GC_MARKED_NOESC); + + // find unmarked objects that need to be finalized. + // this must happen last. + if (finalize) { + for(i=0; i < finalizer_table.size; i+=2) { + if (finalizer_table.table[i+1] != HT_NOTFOUND) { + jl_value_t *v = (jl_value_t*)finalizer_table.table[i]; + if (!gc_marked(v)) { + jl_value_t *fin = (jl_value_t*)finalizer_table.table[i+1]; + if (gc_typeof(fin) == (jl_value_t*)jl_voidpointer_type) { + void *p = ((void**)fin)[1]; + if (p) + ((void (*)(void*))p)(jl_data_ptr(v)); + finalizer_table.table[i+1] = HT_NOTFOUND; + continue; + } + gc_push_root(v, GC_MARKED_NOESC, 0); + schedule_finalization(v); + } + gc_push_root(finalizer_table.table[i+1], GC_MARKED_NOESC, 0); + } } + + visit_mark_stack(GC_MARKED_NOESC); } +} - visit_mark_stack(); + +/* + How to debug a missing write barrier: + (or rather how I do it; if you know of a better way, update this) + First, reproduce it with GC_VERIFY. It does change the allocation profile, so if the error + is rare enough this may not be straightforward. If the backtracking goes well you should know + which object and which of its slots was written to without being caught by the write + barrier. Most of the time this allows you to take a guess. If this type of object is modified + by C code directly, look for missing gc_wb() on pointer updates. Be aware that there are + innocent-looking functions which allocate (and thus trigger marking) only in special cases. + + If you can't find it, you can try the following: + - Ensure that should_timeout() is deterministic instead of clock-based. + - Once you have a completely deterministic program which crashes on gc_verify, the addresses + should stay constant between different runs (with same binary, same environment ...). + Do not forget to turn off ASLR (linux: echo 0 > /proc/sys/kernel/randomize_va_space). + At this point you should be able to run under gdb and use a hardware watchpoint to look for writes + at the exact addr of the slot (use something like watch *slot_addr if *slot_addr == val). + - If all went well, you are now stopped at the exact point the problem is happening. + Backtraces in JIT'd code won't work for me (but I'm not sure they should) so in that + case you can try to jl_throw(something) from gdb. + */ +// this does not yet detect missing writes from marked to marked_noesc +// the error is caught at the first long collection +#ifdef GC_VERIFY +static void gc_verify(void) +{ + lostval = NULL; + lostval_parents.len = 0; + lostval_parents_done.len = 0; + check_timeout = 0; + clear_mark(GC_CLEAN); + gc_mark(0); + + for(int i = 0; i < bits_save[GC_CLEAN].len; i++) { + gcval_t* v = (gcval_t*)bits_save[GC_CLEAN].items[i]; + if (gc_marked(v)) { + JL_PRINTF(JL_STDOUT, "Error. 
Early free of 0x%lx type:", (uptrint_t)v); + jl_(jl_typeof(v)); + JL_PRINTF(JL_STDOUT, "val: "); + jl_(v); + JL_PRINTF(JL_STDOUT, "Let's try to backtrack the missing write barrier:\n"); + lostval = v; + break; + } + } + if (lostval == NULL) { + restore(); // we did not miss anything + return; + } + restore(); + do { + arraylist_push(&lostval_parents_done, lostval); + JL_PRINTF(JL_STDOUT, "Now looking for 0x%lx =======\n", lostval); + clear_mark(GC_CLEAN); + gc_mark(0); + if (lostval_parents.len == 0) { + JL_PRINTF(JL_STDOUT, "Could not find the missing link. We missed a toplevel root. This is odd.\n"); + break; + } + jl_value_t* lostval_parent = NULL; + for(int i = 0; i < lostval_parents.len; i++) { + lostval_parent = (jl_value_t*)lostval_parents.items[i]; + for(int j = 0; j < bits_save[GC_CLEAN].len; j++) { + if (bits_save[GC_CLEAN].items[j] == lostval_parent) { + lostval = lostval_parent; + lostval_parent = NULL; + break; + } + } + if (lostval_parent != NULL) break; + } + if (lostval_parent == NULL) { // all parents of lostval were also scheduled for deletion + // lostval = arraylist_pop(&lostval_parents); + } + else { + JL_PRINTF(JL_STDOUT, "Missing write barrier found!\n"); + JL_PRINTF(JL_STDOUT, "0x%lx had a reference to 0x%lx stored into it without being recorded\n", lostval_parent, lostval); + JL_PRINTF(JL_STDOUT, "(details above)\n"); + lostval = NULL; + } + restore(); + } while(lostval != NULL); + abort(); } +#endif + // collector entry point and control @@ -927,8 +1784,8 @@ int64_t diff_gc_total_bytes(void) } void sync_gc_total_bytes(void) {last_gc_total_bytes = jl_gc_total_bytes();} -void jl_gc_ephemeral_on(void) { pools = &ephe_pools[0]; } -void jl_gc_ephemeral_off(void) { pools = &norm_pools[0]; } +void jl_gc_ephemeral_on(void) { }//pools = &ephe_pools[0]; } +void jl_gc_ephemeral_off(void) { }//pools = &norm_pools[0]; } #if defined(MEMPROFILE) static void all_pool_stats(void); @@ -936,17 +1793,207 @@ static void big_obj_stats(void); #endif #ifdef OBJPROFILE -static void print_obj_profile(void) +static void print_obj_profile(htable_t obj_counts) { for(int i=0; i < obj_counts.size; i+=2) { if (obj_counts.table[i+1] != HT_NOTFOUND) { - jl_printf(JL_STDERR, "%d ", obj_counts.table[i+1]-1); + jl_printf(JL_STDERR, " %d ", obj_counts.table[i+1]-1); jl_static_show(JL_STDERR, (jl_value_t*)obj_counts.table[i]); jl_printf(JL_STDERR, "\n"); } } } +static void print_obj_profiles(void) +{ + jl_printf(JL_STDERR, "Transient mark:\n"); + print_obj_profile(obj_counts[0]); + jl_printf(JL_STDERR, "Perm mark:\n"); + print_obj_profile(obj_counts[1]); +} +#endif + +int saved_mark_sp = 0; +int sweep_mask = GC_MARKED; +#define MIN_SCAN_BYTES (1024*1024) + +static void mark_task_stacks(int); +static void gc_mark_task_stack(jl_task_t*,int,int); + +void prepare_sweep(void) +{ + for(int i = 0; i < 2*N_POOLS; i++) { + pool_t *p = i < N_POOLS ? &norm_pools[i] : &ephe_pools[i - N_POOLS]; + if (p->pages) { + p->needsweep = p->pages; + p->pages = NULL; + p->freelist = NULL; + } + } +} + +#ifdef GC_INC +int residual = 0; + +void jl_gc_collect(void) +{ + if (!is_gc_enabled) return; + if (jl_in_gc) return; + jl_in_gc = 1; + JL_SIGATOMIC_BEGIN(); + double t0 = clock_now(); +#if defined(GC_TIME) || defined(GC_FINAL_STATS) + int wb_activations = mark_sp - saved_mark_sp; +#endif + if (!sweeping) { + + inc_count++; + quick_count++; + + scanned_bytes = 0; + scanned_bytes_goal = inc_count*(live_bytes/gc_inc_steps + mark_sp*sizeof(void*)); + scanned_bytes_goal = scanned_bytes_goal < MIN_SCAN_BYTES ? 
MIN_SCAN_BYTES : scanned_bytes_goal; + + check_timeout = 1; + double t = clock_now(); + + mark_stack -= perm_marked; + + mark_sp = perm_marked = perm_marked + mark_sp; + + if (live_bytes && gc_inc_steps > 1) visit_mark_stack_inc(GC_MARKED_NOESC); + else visit_mark_stack(GC_MARKED_NOESC); + + if (sweep_mask == GC_MARKED) + perm_marked = 0; + else + mark_stack += perm_marked; + + pre_mark(GC_MARKED_NOESC); + visit_mark_stack(GC_MARKED_NOESC); + + if (mark_sp == 0 || inc_count > gc_inc_steps) { // mark current stack last to avoid temporaries + visit_mark_stack(GC_MARKED_NOESC); // in case inc_count > inc_steps, we finish the marking in one go + + mark_task_stacks(GC_MARKED_NOESC); + visit_mark_stack(GC_MARKED_NOESC); + } + allocd_bytes_since_sweep += allocd_bytes + collect_interval/gc_steps; + allocd_bytes = -collect_interval/gc_steps; +#ifdef OBJPROFILE + print_obj_profiles(); + htable_reset(&obj_counts[0], 0); + htable_reset(&obj_counts[1], 0); +#endif + double mark_pause = (clock_now() - t0); +#ifdef GC_FINAL_STATS + total_mark_time += mark_pause; +#endif +#ifdef GC_TIME + JL_PRINTF(JL_STDOUT, "GC mark pause %.2f ms | stack %d -> %d (wb %d)\n", mark_pause*1000, saved_mark_sp, mark_sp, wb_activations); + saved_mark_sp = mark_sp; +#endif + } + int pct = -1, bonus = -1; + double post_time = 0.0, finalize_time = 0.0; + if(mark_sp == 0 || sweeping) { +#if defined(GC_TIME) || defined(GC_FINAL_STATS) + double sweep_t0 = clock_now(); +#endif + size_t actual_allocd = allocd_bytes_since_sweep; + if (!sweeping) { +#ifdef GC_TIME + post_time = clock_now(); +#endif + post_mark(GC_MARKED_NOESC); + +#ifdef GC_TIME + post_time = clock_now() - post_time; +#endif +#ifdef GC_VERIFY + gc_verify(); +#endif + +#if defined(MEMPROFILE) + all_pool_stats(); + big_obj_stats(); +#endif + + total_allocd_bytes += allocd_bytes_since_sweep; + + prepare_sweep(); + gc_sweep_once(sweep_mask); + sweeping = 1; + gc_steps = gc_sweep_steps; + } + scanned_bytes = 0; + if (gc_sweep_inc(sweep_mask)) { + // sweeping is over + int tasks_end = 0; + for (int i = 0; i < tasks.len; i++) { + jl_value_t* ta = (jl_value_t*)tasks.items[i]; + if (gc_marked(ta)) { + tasks.items[tasks_end] = tasks.items[i]; + tasks_end++; + } + } + tasks.len = tasks_end; + sweep_weak_refs(); + prev_sweep_mask = sweep_mask; + sweeping = 0; + if (sweep_mask == GC_MARKED) { + tasks.len = 0; + } + finalize_time = clock_now(); + run_finalizers(); + finalize_time = clock_now() - finalize_time; + pct = actual_allocd ? 
(freed_bytes*100)/actual_allocd : -1; + bonus = freed_bytes - (7*(actual_allocd/10)); + if (bonus - residual < 0 || quick_count > 10) { + if (collect_interval <= 2*(max_collect_interval/5)) { + collect_interval = 5*(collect_interval/2); + } + sweep_mask = GC_MARKED; // next collection is a full one + gc_steps = gc_inc_steps; + quick_count = 0; + residual = 0; + } + else { + residual += allocd_bytes_since_sweep - freed_bytes; + collect_interval = default_collect_interval; + sweep_mask = GC_MARKED_NOESC; // next collection is quick + gc_steps = gc_quick_steps; + } + + allocd_bytes = -collect_interval/gc_steps; + inc_count = 0; + live_bytes += -freed_bytes + allocd_bytes_since_sweep; + allocd_bytes_since_sweep = 0; + freed_bytes = 0; + } +#if defined(GC_FINAL_STATS) || defined(GC_TIME) + double sweep_pause = clock_now() - sweep_t0; +#endif +#ifdef GC_FINAL_STATS + total_sweep_time += sweep_pause - finalize_time - post_time; + total_fin_time += finalize_time + post_time; #endif +#ifdef GC_TIME + JL_PRINTF(JL_STDOUT, "GC sweep pause %.2f ms (free %d%% of alloc %d - %d) (%.2f ms in post_mark, %.2f ms in fin) (marked in %d inc) mask %d\n", sweep_pause*1000, pct, bonus, residual, post_time*1000, finalize_time*1000, inc_count, sweep_mask); +#endif + } + n_pause++; +#ifdef GC_FINAL_STATS + double pause = clock_now() - t0; + total_gc_time += pause*1000*1000*1000; // i don't think ns precision is really relevant here + pause -= finalize_time; + // do not count the first pause as it is always a full collection + max_pause = (max_pause < pause && n_pause > 1) ? pause : max_pause; +#endif + JL_SIGATOMIC_END(); + jl_in_gc = 0; +} + +#else void jl_gc_collect(void) { @@ -957,7 +2004,7 @@ void jl_gc_collect(void) JL_SIGATOMIC_BEGIN(); jl_in_gc = 1; uint64_t t0 = jl_hrtime(); - gc_mark(); + gc_mark(1); #ifdef GCTIME JL_PRINTF(JL_STDERR, "mark time %.3f ms\n", (jl_hrtime()-t0)*1.0e6); #endif @@ -969,16 +2016,19 @@ void jl_gc_collect(void) uint64_t t1 = jl_hrtime(); #endif sweep_weak_refs(); - gc_sweep(); + prepare_sweep(); + gc_sweep(GC_MARKED); #ifdef GCTIME JL_PRINTF(JL_STDERR, "sweep time %.3f ms\n", (jl_hrtime()-t1)*1.0e6); #endif int nfinal = to_finalize.len; run_finalizers(); jl_in_gc = 0; + JL_SIGATOMIC_END(); total_gc_time += (jl_hrtime()-t0); #if defined(GC_FINAL_STATS) + n_pause++; total_freed_bytes += freed_bytes; #endif #ifdef OBJPROFILE @@ -1007,23 +2057,27 @@ void jl_gc_collect(void) } } +#endif + // allocator entry points void *allocb(size_t sz) { - void *b; + buff_t *b; sz += sizeof(void*); #ifdef MEMDEBUG b = alloc_big(sz); #else if (sz > 2048) { - b = alloc_big(sz); + b = (buff_t*)alloc_big(sz); + b->pooled = 0; } else { - b = pool_alloc(&pools[szclass(sz)]); + b = (buff_t*)pool_alloc(&pools[szclass(sz)]); + b->pooled = 1; } #endif - return (void*)((void**)b + 1); + return b->data; } DLLEXPORT void *allocobj(size_t sz) @@ -1031,9 +2085,10 @@ DLLEXPORT void *allocobj(size_t sz) #ifdef MEMDEBUG return alloc_big(sz); #endif - if (sz > 2048) + if (sz <= 2048) + return pool_alloc(&pools[szclass(sz)]); + else return alloc_big(sz); - return pool_alloc(&pools[szclass(sz)]); } DLLEXPORT void *alloc_2w(void) @@ -1042,9 +2097,9 @@ DLLEXPORT void *alloc_2w(void) return alloc_big(2*sizeof(void*)); #endif #ifdef _P64 - return pool_alloc(&pools[2]); + return _pool_alloc(&pools[2], 2*sizeof(void*)); #else - return pool_alloc(&pools[0]); + return _pool_alloc(&pools[0], 2*sizeof(void*)); #endif } @@ -1054,10 +2109,11 @@ DLLEXPORT void *alloc_3w(void) return alloc_big(3*sizeof(void*)); #endif #ifdef _P64 - 
return pool_alloc(&pools[4]); + return _pool_alloc(&pools[4], 3*sizeof(void*)); #else - return pool_alloc(&pools[1]); + return _pool_alloc(&pools[1], 3*sizeof(void*)); #endif + } DLLEXPORT void *alloc_4w(void) @@ -1066,12 +2122,12 @@ DLLEXPORT void *alloc_4w(void) return alloc_big(4*sizeof(void*)); #endif #ifdef _P64 - return pool_alloc(&pools[6]); + return _pool_alloc(&pools[6], 4*sizeof(void*)); #else return pool_alloc(&pools[2]); #endif } - +#define NS_TO_S(t) ((double)(t/1000)/(1000*1000)) #ifdef GC_FINAL_STATS static double process_t0; #include @@ -1081,9 +2137,14 @@ void jl_print_gc_stats(JL_STREAM *s) malloc_stats(); double ptime = clock_now()-process_t0; jl_printf(s, "exec time\t%.5f sec\n", ptime); - jl_printf(s, "gc time \t%.5f sec (%2.1f%%)\n", gct, (gct/ptime)*100); + jl_printf(s, "gc time \t%.5f sec (%2.1f%%)\n", NS_TO_S(total_gc_time), + (NS_TO_S(total_gc_time)/ptime)*100); + jl_printf(s, "gc pause \t%.2f ms avg\n\t\t%.2f ms max\n", (NS_TO_S(total_gc_time)/n_pause)*1000, max_pause*1000); + jl_printf(s, "\t\t(%2.1f%% mark, %2.1f%% sweep, %2.1f%% finalizers)\n", (total_mark_time/NS_TO_S(total_gc_time))*100, (total_sweep_time/NS_TO_S(total_gc_time))*100, (total_fin_time/NS_TO_S(total_gc_time))*100); + jl_printf(s, "alloc pause\t%.2f ms\n", page_alloc_time); struct mallinfo mi = mallinfo(); jl_printf(s, "malloc size\t%d MB\n", mi.uordblks/1024/1024); + jl_printf(s, "max page alloc\t%ld MB\n", max_pg_count*GC_PAGE_SZ/1024/1024); jl_printf(s, "total freed\t%llu b\n", total_freed_bytes); jl_printf(s, "free rate\t%.1f MB/sec\n", (total_freed_bytes/gct)/1024/1024); } @@ -1104,23 +2165,46 @@ void jl_gc_init(void) 1536, 2048 }; int i; + for(i=0; i < N_POOLS; i++) { + assert(szc[i] % 4 == 0); norm_pools[i].osize = szc[i]; norm_pools[i].pages = NULL; norm_pools[i].freelist = NULL; + norm_pools[i].needsweep = NULL; + norm_pools[i].fl_linear = 1; + norm_pools[i].nfree = 0; + // norm_pools[i].end_offset = ((GC_PAGE_SZ/szc[i]) - 1)*szc[i]; ephe_pools[i].osize = szc[i]; ephe_pools[i].pages = NULL; ephe_pools[i].freelist = NULL; + ephe_pools[i].needsweep = NULL; + // ephe_pools[i].end_offset = ((GC_PAGE_SZ/szc[i]) - 1)*szc[i]; } + assert(offsetof(gcpages_t, data) == GC_PAGE_SZ); + +#ifdef GC_INC + gc_steps = gc_inc_steps; +#endif htable_new(&finalizer_table, 0); arraylist_new(&to_finalize, 0); arraylist_new(&preserved_values, 0); arraylist_new(&weak_refs, 0); +#ifdef GC_VERIFY + for(int i = 0; i < 4; i++) + arraylist_new(&bits_save[i], 0); + arraylist_new(&lostval_parents, 0); + arraylist_new(&lostval_parents_done, 0); +#endif +#ifdef GC_INC + arraylist_new(&tasks, 0); +#endif #ifdef OBJPROFILE - htable_new(&obj_counts, 0); + htable_new(&obj_counts[0], 0); + htable_new(&obj_counts[1], 0); #endif #ifdef GC_FINAL_STATS process_t0 = clock_now(); @@ -1137,7 +2221,7 @@ void jl_gc_init(void) // GC summary stats #if defined(MEMPROFILE) -static size_t pool_stats(pool_t *p, size_t *pwaste) +static size_t pool_stats(pool_t *p, size_t *pwaste, int *np) { gcval_t *v; gcpage_t *pg = p->pages; @@ -1146,10 +2230,11 @@ static size_t pool_stats(pool_t *p, size_t *pwaste) while (pg != NULL) { npgs++; - v = (gcval_t*)&pg->data[0]; + v = (gcval_t*)pg->data; char *lim = (char*)v + GC_PAGE_SZ - osize; + // this is not accurate anymore and can underestimate waste while ((char*)v <= lim) { - if (!v->marked) { + if (!gc_marked(v)) { nfree++; } else { @@ -1161,6 +2246,7 @@ static size_t pool_stats(pool_t *p, size_t *pwaste) pg = nextpg; } *pwaste = npgs*GC_PAGE_SZ - (nused*p->osize); + *np = npgs; JL_PRINTF(JL_STDOUT, 
"%4d : %7d/%7d objects, %5d pages, %8d bytes, %8d waste\n", p->osize, @@ -1175,21 +2261,23 @@ static size_t pool_stats(pool_t *p, size_t *pwaste) static void all_pool_stats(void) { int i; - size_t nb=0, w, tw=0, no=0, b; + size_t nb=0, w, tw=0, no=0,tp=0, b, np; for(i=0; i < N_POOLS; i++) { - b = pool_stats(&norm_pools[i], &w); + b = pool_stats(&norm_pools[i], &w, &np); nb += b; no += (b/norm_pools[i].osize); tw += w; + tp += np; - b = pool_stats(&ephe_pools[i], &w); + b = pool_stats(&ephe_pools[i], &w, &np); nb += b; no += (b/ephe_pools[i].osize); tw += w; + tp += np; } JL_PRINTF(JL_STDOUT, - "%d objects, %d total allocated, %d total fragments\n", - no, nb, tw); + "%d objects, %d total allocated, %d total fragments, in %d pages\n", + no, nb, tw, tp); } static void big_obj_stats(void) @@ -1197,7 +2285,7 @@ static void big_obj_stats(void) bigval_t *v = big_objects; size_t nused=0, nbytes=0; while (v != NULL) { - if (v->marked) { + if (gc_marked(&v->_data)) { nused++; nbytes += v->sz; } diff --git a/src/gf.c b/src/gf.c index 9efc13e8c87bb..2b5c31e7cc3e3 100644 --- a/src/gf.c +++ b/src/gf.c @@ -1,5 +1,5 @@ /* - Generic Functions + GENERIC Functions . method table and lookup . GF constructor, add_method . dispatch @@ -156,7 +156,7 @@ jl_methlist_t *mtcache_hash_lookup(jl_array_t *a, jl_value_t *ty, int tparam) return (jl_methlist_t*)JL_NULL; } -static void mtcache_rehash(jl_array_t **pa) +static void mtcache_rehash(jl_array_t **pa, jl_value_t* parent) { size_t len = (*pa)->nrows; jl_value_t **d = (jl_value_t**)(*pa)->data; @@ -173,11 +173,12 @@ static void mtcache_rehash(jl_array_t **pa) nd[uid & (len*2-1)] = (jl_value_t*)ml; } } + gc_wb(parent, n); *pa = n; } static jl_methlist_t **mtcache_hash_bp(jl_array_t **pa, jl_value_t *ty, - int tparam) + int tparam, jl_value_t* parent) { uptrint_t uid; if (jl_is_datatype(ty) && (uid = ((jl_datatype_t*)ty)->uid)) { @@ -191,7 +192,7 @@ static jl_methlist_t **mtcache_hash_bp(jl_array_t **pa, jl_value_t *ty, if (tparam) t = jl_tparam0(t); if (t == ty) return pml; - mtcache_rehash(pa); + mtcache_rehash(pa, parent); } } return NULL; @@ -309,6 +310,7 @@ jl_function_t *jl_instantiate_method(jl_function_t *f, jl_tuple_t *sp) jl_function_t *nf = jl_new_closure(f->fptr, f->env, NULL); JL_GC_PUSH1(&nf); nf->linfo = jl_add_static_parameters(f->linfo, sp); + gc_wb(nf, nf->linfo); JL_GC_POP(); return nf; } @@ -322,13 +324,14 @@ jl_function_t *jl_reinstantiate_method(jl_function_t *f, jl_lambda_info_t *li) static jl_methlist_t *jl_method_list_insert(jl_methlist_t **pml, jl_tuple_t *type, jl_function_t *method, jl_tuple_t *tvars, - int check_amb, int8_t isstaged); + int check_amb, int8_t isstaged, jl_value_t* parent); static jl_function_t *jl_method_cache_insert(jl_methtable_t *mt, jl_tuple_t *type, jl_function_t *method) { jl_methlist_t **pml = &mt->cache; + jl_value_t* cache_array = NULL; if (jl_tuple_len(type) > 0) { jl_value_t *t0 = jl_t0(type); uptrint_t uid=0; @@ -340,22 +343,28 @@ jl_function_t *jl_method_cache_insert(jl_methtable_t *mt, jl_tuple_t *type, if (jl_is_datatype(a0)) uid = ((jl_datatype_t*)a0)->uid; if (uid > 0) { - if (mt->cache_targ == JL_NULL) + if (mt->cache_targ == JL_NULL) { mt->cache_targ = jl_alloc_cell_1d(16); - pml = mtcache_hash_bp(&mt->cache_targ, a0, 1); + gc_wb(mt, mt->cache_targ); + } + pml = mtcache_hash_bp(&mt->cache_targ, a0, 1, (jl_value_t*)mt); + cache_array = (jl_value_t*)mt->cache_targ; goto ml_do_insert; } } if (jl_is_datatype(t0)) uid = ((jl_datatype_t*)t0)->uid; if (uid > 0) { - if (mt->cache_arg1 == JL_NULL) + if 
(mt->cache_arg1 == JL_NULL) { mt->cache_arg1 = jl_alloc_cell_1d(16); - pml = mtcache_hash_bp(&mt->cache_arg1, t0, 0); + gc_wb(mt, mt->cache_arg1); + } + pml = mtcache_hash_bp(&mt->cache_arg1, t0, 0, (jl_value_t*)mt); + cache_array = (jl_value_t*)mt->cache_arg1; } } ml_do_insert: - return jl_method_list_insert(pml, type, method, jl_null, 0, 0)->func; + return jl_method_list_insert(pml, type, method, jl_null, 0, 0, cache_array ? cache_array : (jl_value_t*)mt)->func; } extern jl_function_t *jl_typeinf_func; @@ -390,6 +399,7 @@ void jl_type_infer(jl_lambda_info_t *li, jl_tuple_t *argtypes, #ifdef ENABLE_INFERENCE jl_value_t *newast = jl_apply(jl_typeinf_func, fargs, 4); li->ast = jl_tupleref(newast, 0); + gc_wb(li, li->ast); li->inferred = 1; #endif li->inInference = 0; @@ -853,8 +863,10 @@ static jl_function_t *cache_method(jl_methtable_t *mt, jl_tuple_t *type, if (method->linfo->unspecialized == NULL) { method->linfo->unspecialized = jl_instantiate_method(method, jl_null); + gc_wb(method->linfo, method->linfo->unspecialized); } newmeth->linfo->unspecialized = method->linfo->unspecialized; + gc_wb(newmeth->linfo, newmeth->linfo->unspecialized); } if (newmeth->linfo != NULL && newmeth->linfo->ast != NULL) { @@ -868,6 +880,7 @@ static jl_function_t *cache_method(jl_methtable_t *mt, jl_tuple_t *type, jl_cell_1d_push(spe, (jl_value_t*)newmeth->linfo); } method->linfo->specializations = spe; + gc_wb(method->linfo, method->linfo->specializations); jl_type_infer(newmeth->linfo, type, method->linfo); } JL_GC_POP(); @@ -1183,7 +1196,7 @@ static int has_unions(jl_tuple_t *type) static jl_methlist_t *jl_method_list_insert(jl_methlist_t **pml, jl_tuple_t *type, jl_function_t *method, jl_tuple_t *tvars, - int check_amb, int8_t isstaged) + int check_amb, int8_t isstaged, jl_value_t* parent) { jl_methlist_t *l, **pl; @@ -1211,13 +1224,16 @@ jl_methlist_t *jl_method_list_insert(jl_methlist_t **pml, jl_tuple_t *type, } JL_SIGATOMIC_BEGIN(); l->sig = type; + gc_wb(l, l->sig); l->tvars = tvars; + gc_wb(l, l->tvars); l->va = (jl_tuple_len(type) > 0 && jl_is_vararg_type(jl_tupleref(type,jl_tuple_len(type)-1))) ? 1 : 0; l->isstaged = isstaged; l->invokes = (struct _jl_methtable_t *)JL_NULL; l->func = method; + gc_wb(l, l->func); JL_SIGATOMIC_END(); return l; } @@ -1225,6 +1241,7 @@ jl_methlist_t *jl_method_list_insert(jl_methlist_t **pml, jl_tuple_t *type, } pl = pml; l = *pml; + jl_value_t *pa = parent; while (l != JL_NULL) { if (jl_args_morespecific((jl_value_t*)type, (jl_value_t*)l->sig)) break; @@ -1234,6 +1251,7 @@ jl_methlist_t *jl_method_list_insert(jl_methlist_t **pml, jl_tuple_t *type, anonymous_sym, method->linfo); } pl = &l->next; + pa = (jl_value_t*)l; l = l->next; } jl_methlist_t *newrec = (jl_methlist_t*)allocobj(sizeof(jl_methlist_t)); @@ -1248,34 +1266,47 @@ jl_methlist_t *jl_method_list_insert(jl_methlist_t **pml, jl_tuple_t *type, newrec->invokes = (struct _jl_methtable_t*)JL_NULL; newrec->next = l; JL_SIGATOMIC_BEGIN(); + JL_GC_PUSH1(&newrec); *pl = newrec; + gc_wb(pa, newrec); // if this contains Union types, methods after it might actually be // more specific than it. we need to re-sort them. 
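+ // each pointer relink below pairs the store with a write barrier on the + // object that owns the mutated slot (item_parent / pa); an already marked + // owner could otherwise keep an unrecorded reference to the young newrec.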
if (has_unions(type)) { + jl_value_t* item_parent = newrec; + jl_value_t* next_parent = 0; jl_methlist_t *item = newrec->next, *next; jl_methlist_t **pitem = &newrec->next, **pnext; while (item != JL_NULL) { pl = pml; l = *pml; + pa = parent; next = item->next; pnext = &item->next; + next_parent = item; while (l != newrec->next) { if (jl_args_morespecific((jl_value_t*)item->sig, (jl_value_t*)l->sig)) { // reinsert item earlier in the list *pitem = next; + gc_wb(item_parent, next); item->next = l; + gc_wb(item, item->next); *pl = item; + gc_wb(pa, item); pnext = pitem; + next_parent = item_parent; break; } pl = &l->next; + pa = (jl_value_t*)l; l = l->next; } item = next; pitem = pnext; + item_parent = next_parent; } } + JL_GC_POP(); JL_SIGATOMIC_END(); return newrec; } @@ -1302,21 +1333,26 @@ jl_methlist_t *jl_method_table_insert(jl_methtable_t *mt, jl_tuple_t *type, if (jl_tuple_len(tvars) == 1) tvars = (jl_tuple_t*)jl_t0(tvars); JL_SIGATOMIC_BEGIN(); - jl_methlist_t *ml = jl_method_list_insert(&mt->defs,type,method,tvars,1,isstaged); + jl_methlist_t *ml = jl_method_list_insert(&mt->defs,type,method,tvars,1,isstaged,(jl_value_t*)mt); // invalidate cached methods that overlap this definition remove_conflicting(&mt->cache, (jl_value_t*)type); + gc_wb(mt, mt->cache); if (mt->cache_arg1 != JL_NULL) { for(int i=0; i < jl_array_len(mt->cache_arg1); i++) { jl_methlist_t **pl = (jl_methlist_t**)&jl_cellref(mt->cache_arg1,i); - if (*pl && *pl != JL_NULL) + if (*pl && *pl != JL_NULL) { remove_conflicting(pl, (jl_value_t*)type); + gc_wb(mt->cache_arg1, jl_cellref(mt->cache_arg1,i)); + } } } if (mt->cache_targ != JL_NULL) { for(int i=0; i < jl_array_len(mt->cache_targ); i++) { jl_methlist_t **pl = (jl_methlist_t**)&jl_cellref(mt->cache_targ,i); - if (*pl && *pl != JL_NULL) + if (*pl && *pl != JL_NULL) { remove_conflicting(pl, (jl_value_t*)type); + gc_wb(mt->cache_targ, jl_cellref(mt->cache_targ,i)); + } } } // update max_args @@ -1344,10 +1380,10 @@ jl_value_t *jl_no_method_error(jl_function_t *f, jl_value_t **args, size_t na) static jl_tuple_t *arg_type_tuple(jl_value_t **args, size_t nargs) { jl_tuple_t *tt = jl_alloc_tuple(nargs); - JL_GC_PUSH1(&tt); + jl_value_t *a = NULL; + JL_GC_PUSH2(&tt, &a); size_t i; for(i=0; i < nargs; i++) { - jl_value_t *a; if (jl_is_type(args[i])) { a = (jl_value_t*)jl_wrap_Type(args[i]); } @@ -1613,6 +1649,7 @@ JL_CALLABLE(jl_apply_generic) jl_lambda_info_t *li = mfunc->linfo; if (li->unspecialized == NULL) { li->unspecialized = jl_instantiate_method(mfunc, li->sparams); + gc_wb(li, li->unspecialized); } mfunc = li->unspecialized; assert(mfunc != jl_bottom_func); @@ -1695,6 +1732,7 @@ jl_value_t *jl_gf_invoke(jl_function_t *gf, jl_tuple_t *types, jl_lambda_info_t *li = mfunc->linfo; if (li->unspecialized == NULL) { li->unspecialized = jl_instantiate_method(mfunc, li->sparams); + gc_wb(li, li->unspecialized); } mfunc = li->unspecialized; } @@ -1707,8 +1745,9 @@ jl_value_t *jl_gf_invoke(jl_function_t *gf, jl_tuple_t *types, if (m->invokes == JL_NULL) { m->invokes = new_method_table(mt->name); + gc_wb(m, m->invokes); // this private method table has just this one definition - jl_method_list_insert(&m->invokes->defs,m->sig,m->func,m->tvars,0,0); + jl_method_list_insert(&m->invokes->defs,m->sig,m->func,m->tvars,0,0,(jl_value_t*)m->invokes); } tt = arg_type_tuple(args, nargs); @@ -1752,6 +1791,7 @@ void jl_initialize_generic_function(jl_function_t *f, jl_sym_t *name) { f->fptr = jl_apply_generic; f->env = (jl_value_t*)new_method_table(name); + gc_wb(f, f->env); } 
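The gf.c hunks above all apply the same discipline: every store of a heap pointer into a field of an object that may already be marked is followed by gc_wb(owner, value), which queues the newly referenced value for marking so an incremental collection cannot sweep it early. A minimal sketch of the pattern on a hypothetical two-field struct (the cell_t type and its fields are illustrative, not taken from the patch):

    typedef struct {
        jl_value_t *type;      // header word; its low bits hold the GC mark
        jl_value_t *car, *cdr; // hypothetical pointer fields
    } cell_t;

    static void cell_set_car(cell_t *c, jl_value_t *v)
    {
        c->car = v;        // the ordinary store
        if (v != NULL)
            gc_wb(c, v);   // queue v if c is already marked and v is clean
    }

On the fast path the barrier (defined in julia.h later in this patch) is just two masked header loads and a compare, which is intended to keep it cheap enough to follow every pointer store.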
jl_function_t *jl_new_generic_function(jl_sym_t *name) diff --git a/src/init.c b/src/init.c index 02dee85ab1ab7..4873961824934 100644 --- a/src/init.c +++ b/src/init.c @@ -85,7 +85,8 @@ jl_compileropts_t jl_compileropts = { NULL, // build_path JL_COMPILEROPT_CHECK_BOUNDS_DEFAULT, JL_COMPILEROPT_DUMPBITCODE_OFF, 0, // int_literals - JL_COMPILEROPT_COMPILE_DEFAULT + JL_COMPILEROPT_COMPILE_DEFAULT, + 0, // int32_literals }; int jl_boot_file_loaded = 0; diff --git a/src/interpreter.c b/src/interpreter.c index 0f412890e27e9..04f08d3e9c287 100644 --- a/src/interpreter.c +++ b/src/interpreter.c @@ -146,6 +146,7 @@ static jl_value_t *eval(jl_value_t *e, jl_value_t **locals, size_t nl) jl_lambda_info_t *li = (jl_lambda_info_t*)e; if (jl_boot_file_loaded && li->ast && jl_is_expr(li->ast)) { li->ast = jl_compress_ast(li, li->ast); + gc_wb(li, li->ast); } return (jl_value_t*)jl_new_closure(NULL, (jl_value_t*)jl_null, li); } @@ -187,6 +188,7 @@ static jl_value_t *eval(jl_value_t *e, jl_value_t **locals, size_t nl) } for(int i=0; i < na; i++) { ar[i*2+1] = eval(args[i+1], locals, nl); + gc_wb(ex->args, ar[i*2+1]); } if (na != nreq) { jl_error("wrong number of arguments"); @@ -212,7 +214,9 @@ static jl_value_t *eval(jl_value_t *e, jl_value_t **locals, size_t nl) size_t i; for (i=0; i < nl; i++) { if (locals[i*2] == sym) { - return (locals[i*2+1] = eval(args[1], locals, nl)); + locals[i*2+1] = eval(args[1], locals, nl); + gc_wb(jl_current_module, locals[i*2+1]); // not sure about jl_current_module + return locals[i*2+1]; } } jl_binding_t *b = jl_get_binding_wr(jl_current_module, (jl_sym_t*)sym); @@ -381,6 +385,7 @@ static jl_value_t *eval(jl_value_t *e, jl_value_t **locals, size_t nl) 0, args[6]==jl_true ? 1 : 0); dt->fptr = jl_f_ctor_trampoline; dt->name->ctor_factory = eval(args[3], locals, nl); + gc_wb(dt->name, dt->name->ctor_factory); jl_binding_t *b = jl_get_binding_wr(jl_current_module, (jl_sym_t*)name); temp = b->value; // save old value @@ -392,6 +397,7 @@ static jl_value_t *eval(jl_value_t *e, jl_value_t **locals, size_t nl) // operations that can fail inside_typedef = 1; dt->types = (jl_tuple_t*)eval(args[5], locals, nl); + gc_wb(dt, dt->types); inside_typedef = 0; jl_check_type_tuple(dt->types, dt->name->name, "type definition"); super = eval(args[4], locals, nl); @@ -430,6 +436,7 @@ static jl_value_t *eval(jl_value_t *e, jl_value_t **locals, size_t nl) f->linfo && f->linfo->ast && jl_is_expr(f->linfo->ast)) { jl_lambda_info_t *li = f->linfo; li->ast = jl_compress_ast(li, li->ast); + gc_wb(li, li->ast); li->name = nm; } jl_set_global(jl_current_module, nm, (jl_value_t*)f); diff --git a/src/intrinsics.cpp b/src/intrinsics.cpp index a171a1b3a10e2..8f66874985df5 100644 --- a/src/intrinsics.cpp +++ b/src/intrinsics.cpp @@ -803,7 +803,7 @@ static Value *emit_pointerset(jl_value_t *e, jl_value_t *x, jl_value_t *i, jl_co else val = emit_unboxed(x,ctx); } - typed_store(thePtr, im1, val, ety, ctx); + typed_store(thePtr, im1, val, ety, ctx, NULL); } return mark_julia_type(thePtr, aty); } diff --git a/src/jltypes.c b/src/jltypes.c index 745d18e865098..a88aca67a27bd 100644 --- a/src/jltypes.c +++ b/src/jltypes.c @@ -1760,6 +1760,7 @@ static void cache_type_(jl_value_t *type) memcpy(nc->data, ((jl_tuple_t*)cache)->data, sizeof(void*)*jl_tuple_len(cache)); cache = (jl_value_t*)nc; ((jl_datatype_t*)type)->name->cache = cache; + gc_wb(((jl_datatype_t*)type)->name, cache); } assert(jl_is_array(cache)); jl_cell_1d_push((jl_array_t*)cache, (jl_value_t*)type); @@ -1771,6 +1772,7 @@ static void 
cache_type_(jl_value_t *type) memcpy(nc->data, ((jl_tuple_t*)cache)->data, sizeof(void*) * n); jl_tupleset(nc, n, (jl_value_t*)type); ((jl_datatype_t*)type)->name->cache = (jl_value_t*)nc; + gc_wb(((jl_datatype_t*)type)->name, nc); } } @@ -1927,9 +1929,12 @@ static jl_value_t *inst_type_w_(jl_value_t *t, jl_value_t **env, size_t n, top.prev = stack; stack = ⊤ ndt->name = tn; + gc_wb(ndt, ndt->name); ndt->super = jl_any_type; ndt->parameters = iparams_tuple; + gc_wb(ndt, ndt->parameters); ndt->names = dt->names; + gc_wb(ndt, ndt->names); ndt->types = jl_null; // to be filled in below if (isabstract || !jl_is_function(tn->ctor_factory)) ndt->fptr = jl_f_no_function; @@ -1938,16 +1943,20 @@ static jl_value_t *inst_type_w_(jl_value_t *t, jl_value_t **env, size_t n, ndt->mutabl = dt->mutabl; ndt->abstract = dt->abstract; ndt->env = (jl_value_t*)ndt; + gc_wb(ndt, ndt->env); ndt->linfo = NULL; ndt->instance = NULL; ndt->uid = 0; ndt->struct_decl = NULL; ndt->size = ndt->alignment = 0; + ndt->super = (jl_datatype_t*)inst_type_w_((jl_value_t*)dt->super, env,n,stack, 1); + gc_wb(ndt, ndt->super); ftypes = dt->types; if (ftypes != NULL) { // recursively instantiate the types of the fields ndt->types = (jl_tuple_t*)inst_type_w_((jl_value_t*)ftypes, env, n, stack, 1); + gc_wb(ndt, ndt->types); if (!isabstract) { if (jl_tuple_len(ftypes) == 0) { ndt->alignment = ndt->size = dt->size; @@ -2014,9 +2023,11 @@ void jl_reinstantiate_inner_types(jl_datatype_t *t) env[i*2+1] = env[i*2]; } t->super = (jl_datatype_t*)inst_type_w_((jl_value_t*)t->super, env, n, &top, 1); + gc_wb(t, t->super); if (jl_is_datatype(t)) { jl_datatype_t *st = (jl_datatype_t*)t; st->types = (jl_tuple_t*)inst_type_w_((jl_value_t*)st->types, env, n, &top, 1); + gc_wb(st, st->types); } } diff --git a/src/julia.expmap b/src/julia.expmap index 08f54c22f5a0c..305bd5a7600e0 100644 --- a/src/julia.expmap +++ b/src/julia.expmap @@ -32,6 +32,8 @@ uv_*; add_library_mapping; utf8proc_*; + gc_queue_root; + gc_wb_slow; /* freebsd */ environ; diff --git a/src/julia.h b/src/julia.h index f12ccbd3ae884..58cd21fe78103 100644 --- a/src/julia.h +++ b/src/julia.h @@ -89,7 +89,8 @@ typedef struct { size_t length; #endif - unsigned short ndims:11; + unsigned short ndims:10; + unsigned short pooled:1; unsigned short ptrarray:1; // representation is pointer array /* how - allocation style @@ -431,16 +432,27 @@ extern jl_sym_t *arrow_sym; extern jl_sym_t *ldots_sym; #ifdef OVERLAP_TUPLE_LEN #define jl_typeof(v) ((jl_value_t*)((uptrint_t)((jl_value_t*)(v))->type & 0x000ffffffffffffeULL)) #else +#ifdef GC_INC +#define jl_typeof(v) ((jl_value_t*)((uptrint_t)((jl_value_t*)(v))->type & ((uintptr_t)~3))) +#else #define jl_typeof(v) (((jl_value_t*)(v))->type) #endif +#endif + #define jl_typeis(v,t) (jl_typeof(v)==(jl_value_t*)(t)) + + #ifdef OVERLAP_TUPLE_LEN #define jl_tupleref(t,i) (((jl_value_t**)(t))[1+(i)]) #define jl_tupleset(t,i,x) ((((jl_value_t**)(t))[1+(i)])=(jl_value_t*)(x)) #else #define jl_tupleref(t,i) (((jl_value_t**)(t))[2+(i)]) -#define jl_tupleset(t,i,x) ((((jl_value_t**)(t))[2+(i)])=(jl_value_t*)(x)) +#define jl_tupleset(t,i,x) do { \ + jl_value_t *xx = (jl_value_t*)(x); \ + if (xx) gc_wb(t, xx); \ + (((jl_value_t**)(t))[2+(i)])=xx; \ + } while(0) #endif #define jl_t0(t) jl_tupleref(t,0) #define jl_t1(t) jl_tupleref(t,1) @@ -449,9 +461,14 @@ extern jl_sym_t *arrow_sym; extern jl_sym_t *ldots_sym; #define jl_tuple_set_len_unsafe(t,n) (((jl_tuple_t*)(t))->length=(n)) #define jl_cellref(a,i) (((jl_value_t**)((jl_array_t*)a)->data)[(i)]) -#define 
jl_cellset(a,i,x) ((((jl_value_t**)((jl_array_t*)a)->data)[(i)])=((jl_value_t*)(x))) +#define jl_cellset(a,i,x) do { \ + jl_value_t *xx = (jl_value_t*)(x); \ + if (xx) gc_wb_back(a); \ + (((jl_value_t**)((jl_array_t*)a)->data)[(i)])=xx; \ + } while(0) #define jl_exprarg(e,n) jl_cellref(((jl_expr_t*)(e))->args,n) +#define jl_exprargset(e, n, v) jl_cellset(((jl_expr_t*)(e))->args, n, v) #define jl_fieldref(s,i) jl_get_nth_field(((jl_value_t*)s),i) @@ -467,6 +484,7 @@ extern jl_sym_t *arrow_sym; extern jl_sym_t *ldots_sym; #define jl_tparam0(t) jl_tupleref(((jl_datatype_t*)(t))->parameters, 0) #define jl_tparam1(t) jl_tupleref(((jl_datatype_t*)(t))->parameters, 1) + #define jl_cell_data(a) ((jl_value_t**)((jl_array_t*)a)->data) #define jl_string_data(s) ((char*)((jl_array_t*)((jl_value_t**)(s))[1])->data) #define jl_iostr_data(s) ((char*)((jl_array_t*)((jl_value_t**)(s))[1])->data) @@ -483,7 +501,6 @@ extern jl_sym_t *arrow_sym; extern jl_sym_t *ldots_sym; #define jl_datatype_size(t) (((jl_datatype_t*)t)->size) // basic predicates ----------------------------------------------------------- - #define jl_is_null(v) (((jl_value_t*)(v)) == ((jl_value_t*)jl_null)) #define jl_is_nothing(v) (((jl_value_t*)(v)) == ((jl_value_t*)jl_nothing)) #define jl_is_tuple(v) jl_typeis(v,jl_tuple_type) @@ -1236,7 +1253,6 @@ void jl_longjmp(jmp_buf _Buf,int _Value); for (i__ca=1, jl_eh_restore_state(&__eh); i__ca; i__ca=0) #endif - // I/O system ----------------------------------------------------------------- #define JL_STREAM uv_stream_t @@ -1329,7 +1345,6 @@ void jl_print_gc_stats(JL_STREAM *s); void show_execution_point(char *filename, int lno); // compiler options ----------------------------------------------------------- - typedef struct { char *build_path; int8_t code_coverage; @@ -1358,6 +1373,40 @@ extern DLLEXPORT jl_compileropts_t jl_compileropts; #define JL_COMPILEROPT_DUMPBITCODE_ON 1 #define JL_COMPILEROPT_DUMPBITCODE_OFF 2 +DLLEXPORT void gc_queue_root(void *root); +void gc_setmark_buf(void *buf); +DLLEXPORT void gc_wb_slow(void* parent, void* ptr); + +static inline void gc_wb(void* parent, void* ptr) +{ + #ifdef GC_INC + // if parent is marked and ptr is clean + if(__unlikely((*((uintptr_t*)parent) & 3) == 1 && (*((uintptr_t*)ptr) & 3) == 0)) { + gc_queue_root(ptr); + } + #endif +} + +static inline void gc_wb_buf(void *parent, void *bufptr) +{ + #ifdef GC_INC + // if parent is marked + if((*((uintptr_t*)parent) & 3) == 1) + gc_setmark_buf(bufptr); + #endif +} + +static inline void gc_wb_back(void *ptr) +{ + #ifdef GC_INC + // if ptr is marked + if((*((uintptr_t*)ptr) & 3) == 1) { + *((uintptr_t*)ptr) &= ~(uintptr_t)3; // clear the mark + gc_queue_root(ptr); + } + #endif +} + #ifdef __cplusplus } #endif diff --git a/src/module.c b/src/module.c index 41aa9513e707d..d151812db7fa1 100644 --- a/src/module.c +++ b/src/module.c @@ -47,6 +47,7 @@ JL_CALLABLE(jl_f_new_module) } jl_module_t *m = jl_new_module(name); m->parent = jl_main_module; + gc_wb(m, m->parent); jl_add_standard_imports(m); return (jl_value_t*)m; } @@ -89,6 +90,7 @@ jl_binding_t *jl_get_binding_wr(jl_module_t *m, jl_sym_t *var) b = new_binding(var); b->owner = m; *bp = b; + gc_wb_buf(m, b); return *bp; } @@ -116,6 +118,7 @@ jl_binding_t *jl_get_binding_for_method_def(jl_module_t *m, jl_sym_t *var) b = new_binding(var); b->owner = m; *bp = b; + gc_wb_buf(m, b); return *bp; } @@ -232,6 +235,7 @@ static void module_import_(jl_module_t *to, jl_module_t *from, jl_sym_t *s, nb->owner = b->owner; nb->imported = (explici!=0); *bp = nb; + 
gc_wb_buf(to, nb); } } } @@ -300,6 +304,7 @@ void jl_module_export(jl_module_t *from, jl_sym_t *s) // don't yet know who the owner is b->owner = NULL; *bp = b; + gc_wb_buf(from, b); } assert(*bp != HT_NOTFOUND); (*bp)->exportp = 1; @@ -337,6 +342,7 @@ void jl_set_global(jl_module_t *m, jl_sym_t *var, jl_value_t *val) jl_binding_t *bp = jl_get_binding_wr(m, var); if (!bp->constp) { bp->value = val; + gc_wb(m, val); } } @@ -346,6 +352,7 @@ void jl_set_const(jl_module_t *m, jl_sym_t *var, jl_value_t *val) if (!bp->constp) { bp->value = val; bp->constp = 1; + gc_wb(m, val); } } @@ -367,6 +374,7 @@ DLLEXPORT void jl_checked_assignment(jl_binding_t *b, jl_value_t *rhs) JL_PRINTF(JL_STDERR,"Warning: redefining constant %s\n",b->name->name); } } + gc_wb(((void**)b)-1, rhs); b->value = rhs; } diff --git a/src/options.h b/src/options.h index eba5ea1e4f23b..33a4d64ab91ba 100644 --- a/src/options.h +++ b/src/options.h @@ -25,7 +25,9 @@ // only one GC is supported at this time #define JL_GC_MARKSWEEP - +#ifndef GC_NO_INC +#define GC_INC +#endif // debugging options // with MEMDEBUG, every object is allocated explicitly with malloc, and diff --git a/src/table.c b/src/table.c index 1e0ecd319eba9..ff8e86199b226 100644 --- a/src/table.c +++ b/src/table.c @@ -14,9 +14,16 @@ void jl_idtable_rehash(jl_array_t **pa, size_t newsz) size_t i; void **ol = (void**)(*pa)->data; *pa = jl_alloc_cell_1d(newsz); + // we do not check the write barrier here + // because pa always points to a C stack location + // (see eqtable_put) + // it should be changed if this assumption no longer holds for(i=0; i < sz; i+=2) { if (ol[i+1] != NULL) { (*jl_table_lookup_bp(pa, ol[i])) = ol[i+1]; + gc_wb(*pa, ol[i+1]); + // it is however necessary here because allocation + // can (and will) occur in a recursive call inside table_lookup_bp } } } @@ -40,6 +47,7 @@ static void **jl_table_lookup_bp(jl_array_t **pa, void *key) do { if (tab[index+1] == NULL) { tab[index] = key; + gc_wb(a, key); return &tab[index+1]; } @@ -66,7 +74,7 @@ static void **jl_table_lookup_bp(jl_array_t **pa, void *key) jl_idtable_rehash(pa, newsz); a = *pa; - tab = (void**)a->data; + tab = (void**)a->data; sz = hash_size(a); maxprobe = max_probe(sz); @@ -108,6 +116,7 @@ jl_array_t *jl_eqtable_put(jl_array_t *h, void *key, void *val) { void **bp = jl_table_lookup_bp(&h, key); *bp = val; + gc_wb(h, val); return h; } diff --git a/src/task.c b/src/task.c index 4af206a582d64..ac859b4a07a33 100644 --- a/src/task.c +++ b/src/task.c @@ -7,7 +7,6 @@ #include #include #include -//#include #include #include #include "julia.h" @@ -39,16 +38,16 @@ static int _stack_grows_up; static size_t _frame_offset; struct _probe_data { - intptr_t low_bound; /* below probe on stack */ - intptr_t probe_local; /* local to probe on stack */ - intptr_t high_bound; /* above probe on stack */ - intptr_t prior_local; /* value of probe_local from earlier call */ + intptr_t low_bound; /* below probe on stack */ + intptr_t probe_local; /* local to probe on stack */ + intptr_t high_bound; /* above probe on stack */ + intptr_t prior_local; /* value of probe_local from earlier call */ - jl_jmp_buf probe_env; /* saved environment of probe */ - jl_jmp_buf probe_sameAR; /* second environment saved by same call */ - jl_jmp_buf probe_samePC; /* environment saved on previous call */ + jl_jmp_buf probe_env; /* saved environment of probe */ + jl_jmp_buf probe_sameAR; /* second environment saved by same call */ + jl_jmp_buf probe_samePC; /* environment saved on previous call */ - jl_jmp_buf * ref_probe; /* 
switches between probes */ + jl_jmp_buf * ref_probe; /* switches between probes */ }; static void boundhigh(struct _probe_data *p) @@ -176,6 +175,10 @@ static void save_stack(jl_task_t *t) } t->ssize = nb; memcpy(buf, (char*)&_x, nb); + // this task's stack could have been modified after + // it was marked by an incremental collection + // move the barrier back instead of walking it again here + gc_wb_back(t); } #if defined(_OS_WINDOWS_) && !defined(_COMPILER_MINGW_) @@ -258,6 +261,7 @@ static void ctx_switch(jl_task_t *t, jl_jmp_buf *where) } t->last = jl_current_task; + gc_wb(t, t->last); jl_current_task = t; #ifdef COPY_STACKS @@ -390,6 +394,7 @@ static void finish_task(jl_task_t *t, jl_value_t *resultval) else t->state = done_sym; t->result = resultval; + gc_wb(t, t->result); // TODO: early free of t->stkbuf #ifdef COPY_STACKS t->stkbuf = NULL; @@ -801,6 +806,7 @@ jl_task_t *jl_new_task(jl_function_t *start, size_t ssize) char *stk = allocb(ssize+pagesz+(pagesz-1)); t->stkbuf = stk; + gc_wb_buf(t, t->stkbuf); stk = (char*)LLT_ALIGN((uptrint_t)stk, pagesz); // add a guard page to detect stack overflow // the GC might read this area, which is ok, just prevent writes diff --git a/src/toplevel.c b/src/toplevel.c index c7753a2d500d0..bdb869a5a2f66 100644 --- a/src/toplevel.c +++ b/src/toplevel.c @@ -109,6 +109,9 @@ jl_value_t *jl_eval_module_expr(jl_expr_t *ex) jl_module_t *newm = jl_new_module(name); newm->parent = parent_module; b->value = (jl_value_t*)newm; + + gc_wb(parent_module, newm); + if (parent_module == jl_main_module && name == jl_symbol("Base")) { // pick up Base module during bootstrap jl_old_base_module = jl_base_module; @@ -397,7 +400,7 @@ jl_value_t *jl_toplevel_eval_flex(jl_value_t *e, int fast) jl_error("syntax: malformed \"importall\" statement"); m = (jl_module_t*)jl_eval_global_var(m, name); if (!jl_is_module(m)) - jl_errorf("invalid %s statement: name exists but does not refer to a module", ex->head->name); + jl_errorf("invalid %s statement: name exists but does not refer to a module", ex->head->name); jl_module_importall(jl_current_module, m); return jl_nothing; } @@ -615,6 +618,7 @@ void jl_set_datatype_super(jl_datatype_t *tt, jl_value_t *super) jl_errorf("invalid subtyping in definition of %s",tt->name->name->name); } tt->super = (jl_datatype_t*)super; + gc_wb(tt, tt->super); if (jl_tuple_len(tt->parameters) > 0) { tt->name->cache = (jl_value_t*)jl_null; jl_reinstantiate_inner_types(tt); @@ -701,6 +705,11 @@ DLLEXPORT jl_value_t *jl_method_def(jl_sym_t *name, jl_value_t **bp, jl_binding_ if (*bp == NULL) { gf = (jl_value_t*)jl_new_generic_function(name); *bp = gf; + #ifdef GC_INC + // this would be better as gc_wb(whatever_jlvalue_bp_points_into, *bp); but this function is used in several places so this will do for now + // (in case changing the sig of this function do not forget methodfunc in codegen) + gc_queue_root(gf); + #endif } JL_GC_PUSH1(&gf); assert(jl_is_function(f)); @@ -712,6 +721,7 @@ DLLEXPORT jl_value_t *jl_method_def(jl_sym_t *name, jl_value_t **bp, jl_binding_ f->linfo && f->linfo->ast && jl_is_expr(f->linfo->ast)) { jl_lambda_info_t *li = f->linfo; li->ast = jl_compress_ast(li, li->ast); + gc_wb(li, li->ast); } JL_GC_POP(); return gf; From 894fc4bdd7c606247681cb03fb97a7de65238cc2 Mon Sep 17 00:00:00 2001 From: Oscar Blumberg Date: Tue, 29 Jul 2014 15:11:32 +0200 Subject: [PATCH 02/17] All wb are now backward for quick collections, big objects can be transients, bugfixes... 
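For context, the two barrier flavors this switches between can be condensed from the julia.h hunk below. This is a simplified sketch (the GC_INC guards and __unlikely hints are dropped); gc_queue_root and the two-low-bit mark encoding are taken directly from this patch:

    #include <stdint.h>
    void gc_queue_root(void *root); /* re-greys an object for the next incremental mark */

    /* forward barrier: on parent->field = ptr, grey the *child*
       when an already-marked parent stores a still-clean pointer */
    static inline void gc_wb_fwd(void *parent, void *ptr)
    {
        if ((*(uintptr_t*)parent & 3) == 1 && (*(uintptr_t*)ptr & 3) == 0)
            gc_queue_root(ptr);
    }

    /* backward barrier: clear the *parent's* mark and re-queue the
       parent itself, so the next quick collection rescans the whole
       object; cheaper when many stores hit the same parent, at the
       cost of rescanning all of its fields */
    static inline void gc_wb_back(void *ptr)
    {
        if ((*(uintptr_t*)ptr & 3) == 1) {
            *(uintptr_t*)ptr &= ~(uintptr_t)3;
            gc_queue_root(ptr);
        }
    }
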
Still in a pretty broken state (at the very least incremental codepath isn't working) @trrousse :-) --- src/array.c | 4 +- src/builtins.c | 3 + src/codegen.cpp | 3 + src/gc.c | 524 +++++++++++++++++++++++++++++------------------- src/julia.h | 8 +- src/module.c | 2 +- src/options.h | 4 +- 7 files changed, 333 insertions(+), 215 deletions(-) diff --git a/src/array.c b/src/array.c index 91cd09f589c51..9bf2dddc47593 100644 --- a/src/array.c +++ b/src/array.c @@ -576,8 +576,8 @@ static void array_resize_buffer(jl_array_t *a, size_t newlen, size_t oldlen, siz a->isshared = 0; if (a->ptrarray || es==1) memset(newdata+offsnb+oldnbytes, 0, nbytes-oldnbytes-offsnb); - if(a->how == 1) - gc_wb_buf(a, newdata); // to protect data : if a is already marked we wont mark newdata (in cases how = 1) on the next collection + if (a->how == 1) + gc_wb_buf(a, newdata); a->maxsize = newlen; } diff --git a/src/builtins.c b/src/builtins.c index 709a7bb392c3e..6934f79243dc7 100644 --- a/src/builtins.c +++ b/src/builtins.c @@ -1146,6 +1146,9 @@ size_t jl_static_show_x(JL_STREAM *out, jl_value_t *v, int depth) if (v == NULL) { n += JL_PRINTF(out, "#"); } + else if((uintptr_t)v <= 0x10) { + n += JL_PRINTF(out, "#", v); + } else if(jl_typeof(v) == NULL) { n += JL_PRINTF(out, ""); } diff --git a/src/codegen.cpp b/src/codegen.cpp index 62a03ddc6a15c..81bfaecc550ec 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -1364,6 +1364,7 @@ static void emit_write_barrier(jl_codectx_t* ctx, Value *parent, Value *ptr) #ifdef GC_INC /* builder.CreateCall2(wbfunc, builder.CreateBitCast(parent, jl_pvalue_llvmt), builder.CreateBitCast(ptr, jl_pvalue_llvmt)); return;*/ + ptr = NULL; parent = builder.CreateBitCast(parent, T_psize); Value* parent_type = builder.CreateLoad(parent); Value* parent_mark_bits = builder.CreateAnd(parent_type, 3); @@ -1423,7 +1424,9 @@ static void jl_add_linfo_root(jl_lambda_info_t *li, jl_value_t *val) { li = li->def; if (li->roots == NULL) { + JL_GC_PUSH1(&val); li->roots = jl_alloc_cell_1d(1); + JL_GC_POP(); gc_wb(li, li->roots); jl_cellset(li->roots, 0, val); } diff --git a/src/gc.c b/src/gc.c index 4daa4695bccda..683d0556167b5 100644 --- a/src/gc.c +++ b/src/gc.c @@ -21,75 +21,16 @@ #include #endif -//#define GC_VERIFY - - -#if defined(GC_TRACK_ESC) && !defined(GC_INC) -#undef GC_TRACK_ESC -#warning GC_TRACK_ESC requires GC_INC -#endif - #ifdef __cplusplus extern "C" { #endif -/*#ifdef _P64 -#define GC_PAGE_SZ (1536*sizeof(void*))//bytes -#else*/ +#pragma pack(push, 1) + #define GC_PG_LG2 14 #define GC_PAGE_SZ (4*4096) // ((1 << GC_PAGE_W) - 16) #define SYS_PAGE_SZ 4096 - -#define PAGE_COOKIE 0xD1CED0 -#pragma pack(push, 1) -// the cookie field must be before the page data -// becaue we will be doing GC_PAGE(v)->cookie for -// some v not in a page and it must not segfault -typedef struct _gcpage_t { - union { - uint32_t cookie; - struct { - // this is a bitwise | of all gc_bits in this page - uint32_t gc_bits : 2; - // if this is 1, the freelist in this page contains only 2 cells. 
- // one is the first free cell, it points to the last cell of the page - // every cell in between is free - uint32_t linear : 1; - }; - }; - uint16_t nfree; - uint16_t nmarked; - - struct _gcpage_t *next; - // struct _gcpage_t **prev; // point to the next field of the previous page - char *data; // this is not strictly necessary - uint16_t osize; - - struct { - char bits; - } old[GC_PAGE_SZ/(8*8)]; // one bit per object -} gcpage_t; - -#define PAGE_GROUP_COUNT 31 -// We pack pages by groups of 31 which means a little less than 512k = 32*4 vm pages -#define PAGE_GROUP_LG2 19 -#define PAGE_GROUP_SZ 1 << PAGE_GROUP_LG2 - -typedef struct { - union { - gcpage_t pages[PAGE_GROUP_COUNT]; - char _pad[GC_PAGE_SZ]; - }; - char data[PAGE_GROUP_COUNT][GC_PAGE_SZ]; -} gcpages_t; - -#define GC_PAGES(x) ((gcpage_t*)(((uintptr_t)x) >> PAGE_GROUP_LG2 << PAGE_GROUP_LG2)) -#define GC_PAGE_IDX(x) (((uintptr_t)(x) - (uintptr_t)GC_PAGES(x) - GC_PAGE_SZ)/GC_PAGE_SZ) -#define GC_PAGE(x) (gcpage_t*)(&(GC_PAGES(x)[GC_PAGE_IDX(x)])) -#define GC_PAGE_DATA(x) ((char*)((uintptr_t)(x) >> GC_PG_LG2 << GC_PG_LG2)) -#define GC_POOL_END_OFS(osize) (((GC_PAGE_SZ/osize) - 1)*osize) - -#define REGION_PG_COUNT 2*1024 +#define REGION_PG_COUNT 8*4096 typedef struct { // union { @@ -100,15 +41,18 @@ typedef struct { char pages[REGION_PG_COUNT][GC_PAGE_SZ]; } region_t; -#define BVOFFS 4 +#define HEAP_COUNT 64 +static region_t *heaps[HEAP_COUNT] = {NULL}; typedef struct _bigval_t { struct _bigval_t *next; struct _bigval_t **prev; // pointer to the next field of the prev entry size_t sz; - uptrint_t _pad0; + union { + uptrint_t _pad0; + uptrint_t old : 1; + }; // must be 16-aligned here, in 32 & 64b - union { uptrint_t flags; uptrint_t gc_bits:2; @@ -116,6 +60,7 @@ typedef struct _bigval_t { }; } bigval_t; +#define BVOFFS (offsetof(bigval_t, _data)/sizeof(void*)) #define bigval_header(data) ((bigval_t*)((char*)(data) - BVOFFS*sizeof(void*))) #pragma pack(pop) @@ -142,14 +87,63 @@ typedef struct _gcval_t { typedef struct _pool_t { gcval_t *freelist ; int32_t fl_linear; - int32_t nfree; - // size_t end_offset; // avoid to compute this at each allocation - gcpage_t *pages; - gcpage_t *needsweep; + uint16_t end_offset; // avoid to compute this at each allocation + struct _gcpage_t *pages; + struct _gcpage_t *needsweep; uint16_t osize; } pool_t; -#define HEAP_COUNT 64 -static region_t *heaps[HEAP_COUNT] = {NULL}; + +/*#ifdef _P64 +#define GC_PAGE_SZ (1536*sizeof(void*))//bytes +#else*/ + +// the cookie field must be before the page data +// becaue we will be doing GC_PAGE(v)->cookie for +// some v not in a page and it must not segfault +typedef struct _gcpage_t { + union { + struct { + uint32_t pool_n : 8; + // this is a bitwise | of all gc_bits in this page + uint32_t gc_bits : 2; + // if this is 1, the freelist in this page contains only 2 cells. 
+ // one is the first free cell, it points to the last cell of the page + // every cell in between is free + uint32_t linear : 1; + }; + }; + uint16_t nfree; + uint16_t nmarked; + + struct _gcpage_t *next; + // struct _gcpage_t **prev; // point to the next field of the previous page + char *data; // this is not strictly necessary + uint16_t osize; + + struct { + char bits; + } old[GC_PAGE_SZ/(8*8)]; // one bit per object +} gcpage_t; + +#define PAGE_GROUP_COUNT 31 +// We pack pages by groups of 31 which means a little less than 512k = 32*4 vm pages +#define PAGE_GROUP_LG2 19 +#define PAGE_GROUP_SZ 1 << PAGE_GROUP_LG2 + +typedef struct { + union { + gcpage_t pages[PAGE_GROUP_COUNT]; + char _pad[GC_PAGE_SZ]; + }; + char data[PAGE_GROUP_COUNT][GC_PAGE_SZ]; +} gcpages_t; + +#define GC_PAGES(x) ((gcpage_t*)(((uintptr_t)x) >> PAGE_GROUP_LG2 << PAGE_GROUP_LG2)) +#define GC_PAGE_IDX(x) (((uintptr_t)(x) - (uintptr_t)GC_PAGES(x) - GC_PAGE_SZ)/GC_PAGE_SZ) +#define GC_PAGE(x) ((gcpage_t*)(&(GC_PAGES(x)[GC_PAGE_IDX(x)]))) +#define GC_PAGE_DATA(x) ((char*)((uintptr_t)(x) >> GC_PG_LG2 << GC_PG_LG2)) +#define GC_POOL_END_OFS(osize) (((GC_PAGE_SZ/osize) - 1)*osize) + //static int free_lb = 0; // GC knobs and self-measurement variables @@ -167,7 +161,7 @@ static size_t max_pg_count = 0;*/ #ifdef GC_INC static int gc_inc_steps = 1; -static int gc_quick_steps = 1; +static int gc_quick_steps = 16; static int gc_sweep_steps = 1; #else static const int gc_inc_steps = 1; @@ -180,8 +174,8 @@ static size_t max_collect_interval = 1250000000UL; static size_t max_collect_interval = 500000000UL; #endif // keep those 3 together -static int allocd_bytes; -static size_t collect_interval; +static int64_t allocd_bytes; +static size_t collect_interval = default_collect_interval; static int gc_steps; #define N_POOLS 42 static __attribute__((aligned (64))) pool_t norm_pools[N_POOLS]; @@ -189,8 +183,8 @@ static pool_t ephe_pools[N_POOLS]; static const pool_t *pools = &norm_pools[0]; static int64_t total_allocd_bytes = 0; -static size_t allocd_bytes_since_sweep = 0; -static size_t freed_bytes = 0; +static int64_t allocd_bytes_since_sweep = 0; +static int64_t freed_bytes = 0; static uint64_t total_gc_time=0; static size_t live_bytes = 0; static size_t current_pg_count = 0; @@ -228,7 +222,7 @@ static size_t scanned_bytes_goal; const int prev_sweep_mask = GC_MARKED; #endif -#define gc_bits(o) ((gcval_t*)(o))->gc_bits +#define gc_bits(o) (((gcval_t*)(o))->gc_bits) #define gc_marked(o) (((gcval_t*)(o))->gc_bits & GC_MARKED) #define _gc_setmark(o, mark_mode) (((gcval_t*)(o))->gc_bits = mark_mode) @@ -237,6 +231,7 @@ const int prev_sweep_mask = GC_MARKED; static jl_value_t* lostval = 0; static arraylist_t lostval_parents; static arraylist_t lostval_parents_done; +static int verifying; static void add_lostval_parent(jl_value_t* parent) { @@ -281,6 +276,9 @@ static bigval_t *big_objects_marked = NULL; static inline void objprofile_count(void* v, int old) { #ifdef OBJPROFILE +#ifdef GC_VERIFY + if (verifying) return; +#endif if (jl_typeof(v) <= 0x10) return; void **bp = ptrhash_bp(&obj_counts[old], jl_typeof(v)); if (*bp == HT_NOTFOUND) @@ -299,7 +297,21 @@ static inline int gc_setmark_other(void *o, int mark_mode) static inline int gc_setmark_big(void *o, int mark_mode) { +#ifdef GC_VERIFY + if (verifying) { + _gc_setmark(o, mark_mode); + return mark_mode; + } +#endif bigval_t* hdr = bigval_header(o); + if (hdr->old) { + mark_mode = GC_MARKED; + } + else { + if (!gc_marked(o)) + hdr->old = 1; + } + if ((mark_mode == GC_MARKED) & 
(gc_bits(o) != GC_MARKED)) { *hdr->prev = hdr->next; if (hdr->next) @@ -317,8 +329,14 @@ static inline int gc_setmark_big(void *o, int mark_mode) static inline int gc_setmark_pool(void *o, int mark_mode) { +#ifdef GC_VERIFY + if (verifying) { + _gc_setmark(o, mark_mode); + return mark_mode; + } +#endif gcpage_t* page = GC_PAGE(o); - int obj_i = ((uintptr_t)o - (uintptr_t)page->data)/page->osize; + int obj_i = ((uintptr_t)o - (uintptr_t)page->data)/8; if (page->old[obj_i/8].bits & (1 << (obj_i % 8))) { _gc_setmark(o, GC_MARKED); mark_mode = GC_MARKED; @@ -329,7 +347,7 @@ static inline int gc_setmark_pool(void *o, int mark_mode) _gc_setmark(o, mark_mode); } page->nmarked += (mark_mode == GC_MARKED); - page->cookie |= gc_bits(o); + page->gc_bits |= gc_bits(o); verify_val(o); return mark_mode; } @@ -346,15 +364,13 @@ static inline int gc_setmark(void *o, int sz, int mark_mode) #define gc_typeof(v) ((jl_value_t*)(((uptrint_t)jl_typeof(v))&(~(uintptr_t)3))) #define gc_val_buf(o) ((gcval_t*)(((void**)(o))-1)) -inline void gc_setmark_buf(void *o) +inline void gc_setmark_buf(void *o, int mark_mode) { buff_t *buf = (buff_t*)gc_val_buf(o); - // buffers are always old but it does not matter since they do not contain any reference - // directly, it is handled by the parent object if (buf->pooled) - gc_setmark_pool(buf, GC_MARKED); + gc_setmark_pool(buf, mark_mode); else - gc_setmark_big(buf, GC_MARKED); + gc_setmark_big(buf, mark_mode); } // malloc wrappers, aligned allocation @@ -563,7 +579,7 @@ void *jl_gc_managed_realloc(void *d, size_t sz, size_t oldsz, int isaligned) #endif if (b == NULL) jl_throw(jl_memory_exception); - allocd_bytes += sz; + allocd_bytes += (sz - oldsz); return b; } @@ -851,7 +867,7 @@ static inline gcval_t *reset_page(pool_t *p, gcpage_t *pg, gcval_t *fl) { pg->nfree = GC_PAGE_SZ/p->osize; pg->nmarked = 0; - pg->cookie = PAGE_COOKIE; + pg->pool_n = p - norm_pools; memset(pg->old, 0x0, GC_PAGE_SZ/(8*8)); gcval_t *beg = (gcval_t*)pg->data; gcval_t *end = (gcval_t*)((char*)beg + (pg->nfree - 1)*p->osize); @@ -868,11 +884,10 @@ static __attribute__((noinline)) void add_page(pool_t *p) if (data == NULL) jl_throw(jl_memory_exception); gcpage_t *pg = GC_PAGE(data); -// jl_printf(JL_STDOUT, "add page [%d] : 0x%lx 0x%lx = 0x%lx hdr 0x%lx\n", GC_PAGE_IDX(data), pg, data, (uintptr_t)data - (uintptr_t)pg, GC_PAGES(data)); + //jl_printf(JL_STDOUT, "add page [%d] : 0x%lx 0x%lx = 0x%lx hdr 0x%lx\n", GC_PAGE_IDX(data), pg, data, (uintptr_t)data - (uintptr_t)pg, GC_PAGES(data)); pg->data = data; pg->osize = p->osize; gcval_t *fl = reset_page(p, pg, p->freelist); - p->nfree += pg->nfree; // these statements are ordered so that interrupting after any of them // leaves the system in a valid state pg->next = p->pages; @@ -892,35 +907,38 @@ static __attribute__((noinline)) void add_page(pool_t *p) p->freelist = next_lin; }*/ -static inline void *_pool_alloc(pool_t* p, int osize) +static inline void *__pool_alloc(pool_t* p, int osize, int end_offset) { gcval_t *v, *end; - int end_offset = GC_POOL_END_OFS(osize); - int ab = allocd_bytes; - p->nfree--; - allocd_bytes += osize; - maybe_collect(); + if (__unlikely((allocd_bytes += osize) >= 0)) { + jl_gc_collect(); + } if (__unlikely(!p->freelist)) { add_page(p); } - v = p->freelist; - p->freelist = (char*)p->freelist + osize; + v->flags = 0; + p->freelist = (char*)v + osize; + GC_PAGE(v)->nfree--; end = &(GC_PAGE_DATA(v)[end_offset]); - if (__unlikely(!((v != end) & (!!p->fl_linear)))) { + if (__unlikely(!((v != end) & (!!GC_PAGE(v)->linear)))) { 
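        // slow path: either v was the last cell of this page's linear
        // run (v == end), or the freelist is a real linked list
        // (linear == 0); in both cases the speculative bump of
        // p->freelist above is replaced by the cell's explicit next
        // pointer.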
p->freelist = v->next; if (p->freelist) p->fl_linear = GC_PAGE(p->freelist)->linear; } // p->freelist = next; - // v->flags = 0; // pg->nfree--; return v; } +static inline void *_pool_alloc(pool_t *p, int osize) +{ + return __pool_alloc(p, osize, GC_POOL_END_OFS(osize)); +} + static inline void *pool_alloc(pool_t *p) { - return _pool_alloc(p, p->osize); + return __pool_alloc(p, p->osize, p->end_offset); } static int szclass(size_t sz) @@ -968,49 +986,49 @@ static void sweep_pool(pool_t *p, int sweep_mask) gcpage_t **ppg = &p->needsweep; gcval_t **pfl = &p->freelist; size_t osize = p->osize; - size_t old_nfree = p->nfree; + size_t old_nfree = 0, nfree = 0; int pg_freedall = 0, pg_total = 0; - int stats[4] = {0, 0, 0, 0}; int pg_skpd = 0, pg_wont_skip = 0; int obj_per_page = GC_PAGE_SZ/osize; int whole_page = 0; while (pg != NULL) { // if ((pg->cookie & ~(uint32_t)7) != PAGE_COOKIE) // abort(); - v = (gcval_t*)pg->data; + char *data = pg->data; + char *old = pg->old; + v = (gcval_t*)data; char *lim = (char*)v + GC_PAGE_SZ - osize; freedall = 1; prev_pfl = pfl; - - if (gc_bits(&pg->cookie) == GC_MARKED) { + old_nfree += pg->nfree; + + if (pg->gc_bits == GC_MARKED) { // skip - if (sweep_mask == GC_MARKED_NOESC && pg->nmarked > 0) { + if (sweep_mask == GC_MARKED_NOESC && pg->nmarked*10 >= obj_per_page*8) { freedall = 0; pg_skpd++; goto free_page; } } - else if(gc_bits(&pg->cookie) == GC_CLEAN) { + else if(pg->gc_bits == GC_CLEAN) { // if (whole_page) // p->nfree += obj_per_page; // overestimation // else - p->nfree++; // underestimation - whole_page = 1; - lazy_freed_pages++; goto free_page; } - int obj_i = 0; - pg->nmarked = 0; + if (sweep_mask == GC_MARKED) + pg->nmarked = 0; + int pg_nfree = 0; while ((char*)v <= lim) { - stats[gc_bits(v)]++; + int obj_i = ((uintptr_t)v - (uintptr_t)data)/8; // we can encouter a queued value at this point // if a write barrier was moved back between two // sweeping increments if (!gc_marked(v) & (gc_bits(v) != GC_QUEUED)) { *pfl = v; pfl = &v->next; - p->nfree++; - pg->old[obj_i/8].bits &= ~(1 << (obj_i % 8)); + pg_nfree++; + old[obj_i/8] &= ~(1 << (obj_i % 8)); } else { if ((sweep_mask & gc_bits(v)) == sweep_mask) @@ -1018,8 +1036,8 @@ static void sweep_pool(pool_t *p, int sweep_mask) freedall = 0; } v = (gcval_t*)((char*)v + osize); - obj_i++; } + pg->nfree = pg_nfree; page_done++; free_page: // nfreed += this_page_nfree; @@ -1035,12 +1053,12 @@ static void sweep_pool(pool_t *p, int sweep_mask) pg_total++; nextpg = pg->next; if (freedall) { - if (prev_sweep_mask == GC_MARKED_NOESC) { - gcval_t *begin = reset_page(p, pg, *prev_pfl); - p->nfree += pg->nfree; - pfl = (gcval_t**)((char*)begin + (pg->nfree - 1)*p->osize); + if (0 && sweep_mask == GC_MARKED_NOESC) { + gcval_t *begin = reset_page(p, pg, NULL); *prev_pfl = begin; + pfl = (gcval_t**)((char*)begin + (pg->nfree - 1)*osize); ppg = &pg->next; + lazy_freed_pages++; } else { pfl = prev_pfl; @@ -1051,17 +1069,19 @@ static void sweep_pool(pool_t *p, int sweep_mask) free_page(pg->data); } freed_pages++; + nfree += obj_per_page; } else { - gc_bits(&pg->cookie) = GC_MARKED; + pg->gc_bits = GC_MARKED; ppg = &pg->next; pg->linear = 0; + nfree += pg->nfree; } - if (should_timeout() && nextpg) { + /* if (should_timeout() && nextpg) { pg->next = NULL; pg = nextpg; break; - } + }*/ scanned_bytes += GC_PAGE_SZ; pg = nextpg; } @@ -1081,7 +1101,7 @@ static void sweep_pool(pool_t *p, int sweep_mask) } /* if (stats[0] + stats[1] + stats[2] + stats[2] > 0) jl_printf(JL_STDOUT, "Pool : %d %d %d %d\n", stats[0], stats[1], 
stats[2], stats[3]);*/ - freed_bytes += (p->nfree - old_nfree)*osize; + freed_bytes += (nfree - old_nfree)*osize; } // sweep phase @@ -1177,18 +1197,23 @@ void grow_mark_stack(void) mark_stack_size = newsz; } +int max_msp = 0; DLLEXPORT void gc_queue_root(void *p) { + void* p2 = (uintptr_t)p & ~(uintptr_t)3; + if (gc_bits(p2) == GC_QUEUED) return; if(mark_sp + perm_marked >= mark_stack_size) grow_mark_stack(); - gc_bits((uintptr_t)p & ~(uintptr_t)3) = GC_QUEUED; + gc_bits(p2) = GC_QUEUED; mark_stack[mark_sp++] = (jl_value_t*)p; + max_msp = max_msp > mark_sp ? max_msp : mark_sp; } #ifdef GC_INC static arraylist_t tasks; +static arraylist_t remset; #endif static void push_root(jl_value_t *v, int mark_mode, int d); -#define gc_push_root(v,mark_mode,d) do { assert((v) != NULL); verify_val(v); if ((!gc_marked(v)) | ((gc_bits(v) & mark_mode) != gc_bits(v))) { push_root((jl_value_t*)(v),mark_mode,d); } } while(0) +#define gc_push_root(v,mark_mode,d) do { assert((v) != NULL); verify_val(v); if (!gc_bits(v)) { push_root((jl_value_t*)(v),mark_mode,d); } } while(0) void jl_gc_setmark(jl_value_t *v) // TODO rename this as it is misleading now { @@ -1222,23 +1247,27 @@ static void gc_mark_stack(jl_value_t* ta, jl_gcframe_t *s, ptrint_t offset, int } } -static void gc_mark_module(jl_module_t *m, int mark_mode, int d) +static int gc_mark_module(jl_module_t *m, int mark_mode, int d) { size_t i; + int refyoung = 0; void **table = m->bindings.table; for(i=1; i < m->bindings.size; i+=2) { if (table[i] != HT_NOTFOUND) { jl_binding_t *b = (jl_binding_t*)table[i]; - gc_setmark_buf(b); + gc_setmark_buf(b, gc_bits(m)); void* vb = gc_val_buf(b); verify_parent("module", m, &vb, "binding_buff"); scanned_bytes += sizeof(jl_binding_t); if (b->value != NULL) { verify_parent("module", m, &b->value, "binding(%s)", b->name->name); gc_push_root(b->value, mark_mode, d); + refyoung |= gc_bits(b->value) == GC_MARKED_NOESC; } - if (b->type != (jl_value_t*)jl_any_type) + if (b->type != (jl_value_t*)jl_any_type) { gc_push_root(b->type, mark_mode, d); + refyoung |= gc_bits(b->type) == GC_MARKED_NOESC; + } } } // this is only necessary because bindings for "using" modules @@ -1247,16 +1276,21 @@ static void gc_mark_module(jl_module_t *m, int mark_mode, int d) // contain the only reference. 
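    // (gc_mark_module's return value reports whether any binding or
    // used module it scanned is still young, i.e. GC_MARKED_NOESC;
    // push_root uses that to keep an old module in the remembered
    // set so that quick collections will rescan it.)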
for(i=0; i < m->usings.len; i++) { gc_push_root(m->usings.items[i], mark_mode, d); + refyoung |= gc_bits(m->usings.items[i]) == GC_MARKED_NOESC; } - if (m->constant_table) + if (m->constant_table) { + verify_parent("module", m, &m->constant_table, "constant_table"); gc_push_root(m->constant_table, mark_mode, d); + refyoung |= gc_bits(m->constant_table) == GC_MARKED_NOESC; + } + return refyoung; } static void gc_mark_task_stack(jl_task_t *ta, int mark_mode, int d) { if (ta->stkbuf != NULL || ta == jl_current_task) { if (ta->stkbuf != NULL) - gc_setmark_buf(ta->stkbuf); + gc_setmark_buf(ta->stkbuf, gc_bits(ta)); #ifdef COPY_STACKS ptrint_t offset; if (ta == jl_current_task) { @@ -1290,7 +1324,7 @@ static void gc_mark_task(jl_task_t *ta, int mark_mode, int d) if (ta->start) gc_push_root(ta->start, mark_mode, d); if (ta->result) gc_push_root(ta->result, mark_mode, d); #ifdef GC_INC - if (mark_mode == GC_MARKED_NOESC) { + if (0 && mark_mode == GC_MARKED_NOESC) { gc_mark_task_stack(ta, mark_mode, d); } else { arraylist_push(&tasks, (void*)ta); @@ -1314,9 +1348,18 @@ static void push_root(jl_value_t *v, int mark_mode, int d) assert(v != NULL); jl_value_t *vt = (jl_value_t*)gc_typeof(v); // gc_setmark(v); + int remember = 0; + int refyoung = 0; + + if (mark_mode == GC_MARKED) { + // abort(); + mark_mode = GC_MARKED_NOESC; + // remember = 1; + // if (gc_marked(v)) goto ret; + } if (vt == (jl_value_t*)jl_weakref_type) { - mark_mode = gc_setmark(v, jl_datatype_size(jl_weakref_type), mark_mode); + gc_setmark(v, jl_datatype_size(jl_weakref_type), mark_mode); goto ret; } if ((jl_is_datatype(vt) && ((jl_datatype_t*)vt)->pointerfree)) { @@ -1337,7 +1380,7 @@ static void push_root(jl_value_t *v, int mark_mode, int d) // some values have special representations if (vt == (jl_value_t*)jl_tuple_type) { size_t l = jl_tuple_len(v); - mark_mode = gc_setmark(v, l*sizeof(void*) + sizeof(jl_tuple_t), mark_mode); + gc_setmark(v, l*sizeof(void*) + sizeof(jl_tuple_t), mark_mode); jl_value_t **data = ((jl_tuple_t*)v)->data; for(size_t i=0; i < l; i++) { jl_value_t *elt = data[i]; @@ -1345,24 +1388,26 @@ static void push_root(jl_value_t *v, int mark_mode, int d) if (elt != NULL) { verify_parent("tuple", v, &data[i], "elem(%d)", i); gc_push_root(elt, mark_mode, d); + refyoung |= gc_bits(elt) == GC_MARKED_NOESC; } } } else if (((jl_datatype_t*)(vt))->name == jl_array_typename) { jl_array_t *a = (jl_array_t*)v; if (a->pooled) - mark_mode = gc_setmark_pool(a, mark_mode); + gc_setmark_pool(a, mark_mode); else - mark_mode = gc_setmark_big(a, mark_mode); + gc_setmark_big(a, mark_mode); if (a->how == 3) { jl_value_t *owner = jl_array_data_owner(a); gc_push_root(owner, mark_mode, d); + refyoung |= gc_bits(owner) == GC_MARKED_NOESC; goto ret; } else if (a->how == 1) { void* val_buf = gc_val_buf((char*)a->data - a->offset*a->elsize); verify_parent("array", v, &val_buf, "buffer ('loc' addr is meaningless)"); - gc_setmark_buf((char*)a->data - a->offset*a->elsize); + gc_setmark_buf((char*)a->data - a->offset*a->elsize, gc_bits(v)); } if (a->ptrarray && a->data!=NULL) { size_t l = jl_array_len(a); @@ -1373,12 +1418,14 @@ static void push_root(jl_value_t *v, int mark_mode, int d) } else { void *data = a->data; + int has_young_elt = 0; for(size_t i=0; i < l; i++) { jl_value_t *elt = ((jl_value_t**)data)[i]; scanned_bytes += sizeof(void*); - if (elt != NULL){ + if (elt != NULL) { verify_parent("array", v, &((jl_value_t**)data)[i], "elem(%d)", i); - gc_push_root(elt, mark_mode, d); + gc_push_root(elt, GC_MARKED_NOESC, d); + refyoung |= 
gc_bits(elt) == GC_MARKED_NOESC; } // try to split large array marking // if (should_timeout() && l > 1000) goto queue_the_root; @@ -1390,17 +1437,18 @@ static void push_root(jl_value_t *v, int mark_mode, int d) } } else if (vt == (jl_value_t*)jl_module_type) { - mark_mode = gc_setmark(v, sizeof(jl_module_t), mark_mode); - gc_mark_module((jl_module_t*)v, mark_mode, d); + gc_setmark(v, sizeof(jl_module_t), mark_mode); + refyoung |= gc_mark_module((jl_module_t*)v, mark_mode, d); scanned_bytes += sizeof(jl_module_t); } else if (vt == (jl_value_t*)jl_task_type) { - mark_mode = gc_setmark(v, sizeof(jl_task_t), mark_mode); + gc_setmark(v, sizeof(jl_task_t), mark_mode); gc_mark_task((jl_task_t*)v, mark_mode, d); + refyoung = 1; scanned_bytes += sizeof(jl_task_t); } else if(vt == (jl_value_t*)jl_symbol_type) { - mark_mode = gc_setmark_other(v, mark_mode); // symbols are not pooled + gc_setmark_other(v, GC_MARKED); // symbols are not pooled } else if( #ifdef GC_VERIFY @@ -1411,7 +1459,7 @@ static void push_root(jl_value_t *v, int mark_mode, int d) #endif ) { jl_datatype_t *dt = (jl_datatype_t*)vt; - mark_mode = gc_setmark(v, jl_datatype_size(dt), mark_mode); + gc_setmark(v, jl_datatype_size(dt), mark_mode); int nf = (int)jl_tuple_len(dt->names); for(int i=0; i < nf; i++) { if (dt->fields[i].isptr) { @@ -1421,6 +1469,7 @@ static void push_root(jl_value_t *v, int mark_mode, int d) if (fld) { verify_parent("object", v, slot, "field(%d)", i); gc_push_root(fld, mark_mode, d); + refyoung |= gc_bits(fld) == GC_MARKED_NOESC; } } else { @@ -1435,14 +1484,25 @@ static void push_root(jl_value_t *v, int mark_mode, int d) abort(); } #endif + ret: +#ifdef GC_VERIFY + if (verifying) return; +#endif + if (refyoung && gc_bits(v) == GC_MARKED) { + /*for (int i = 0; i < remset.len; i++) { + if (remset.items[i] == v) + abort(); + }*/ + arraylist_push(&remset, (void*)v); + } objprofile_count(v, gc_bits(v) == GC_MARKED ? 1 : 0); return; queue_the_root: scanned_bytes += sizeof(void*); // save the mark mode in the lower bits of the pointer - gc_queue_root((void*)((uintptr_t)v | mark_mode)); + gc_queue_root((void*)((uintptr_t)v | gc_bits(v))); } static void visit_mark_stack_inc(int mark_mode) @@ -1511,8 +1571,9 @@ static void pre_mark(int mark_mode) gc_push_root(to_finalize.items[i], mark_mode, 0); } - // if (inc_count > 1) return; // the following roots are constant and will stay marked in between increments - jl_mark_box_caches(); + //if (inc_count > 1 || quick_count > 1) return; // the following roots are constant and will stay marked in between increments + if (prev_sweep_mask == GC_MARKED) + jl_mark_box_caches(); gc_push_root(jl_unprotect_stack_func, mark_mode, 0); gc_push_root(jl_bottom_func, mark_mode, 0); gc_push_root(jl_typetype_type, mark_mode, 0); @@ -1647,34 +1708,31 @@ static void gc_mark(int finalize) gc_push_root(to_finalize.items[i], GC_MARKED_NOESC, 0); } - visit_mark_stack(GC_MARKED_NOESC); - mark_task_stacks(GC_MARKED_NOESC); visit_mark_stack(GC_MARKED_NOESC); // find unmarked objects that need to be finalized. // this must happen last. 
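    // ("last" because only now is the mark phase complete: anything
    // still unmarked is unreachable from the roots, so its finalizer
    // may be scheduled; re-pushing v below keeps the object and
    // whatever it references alive until the finalizer has run.)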
- if (finalize) { - for(i=0; i < finalizer_table.size; i+=2) { - if (finalizer_table.table[i+1] != HT_NOTFOUND) { - jl_value_t *v = (jl_value_t*)finalizer_table.table[i]; - if (!gc_marked(v)) { - jl_value_t *fin = (jl_value_t*)finalizer_table.table[i+1]; - if (gc_typeof(fin) == (jl_value_t*)jl_voidpointer_type) { - void *p = ((void**)fin)[1]; - if (p) - ((void (*)(void*))p)(jl_data_ptr(v)); - finalizer_table.table[i+1] = HT_NOTFOUND; - continue; - } - gc_push_root(v, GC_MARKED_NOESC, 0); - schedule_finalization(v); + for(i=0; i < finalizer_table.size; i+=2) { + if (finalizer_table.table[i+1] != HT_NOTFOUND) { + jl_value_t *v = (jl_value_t*)finalizer_table.table[i]; + if (!gc_marked(v)) { + jl_value_t *fin = (jl_value_t*)finalizer_table.table[i+1]; + if (finalize && gc_typeof(fin) == (jl_value_t*)jl_voidpointer_type) { + void *p = ((void**)fin)[1]; + if (p) + ((void (*)(void*))p)(jl_data_ptr(v)); + finalizer_table.table[i+1] = HT_NOTFOUND; + continue; } - gc_push_root(finalizer_table.table[i+1], GC_MARKED_NOESC, 0); + gc_push_root(v, GC_MARKED_NOESC, 0); + if (finalize) schedule_finalization(v); } + gc_push_root(finalizer_table.table[i+1], GC_MARKED_NOESC, 0); } - - visit_mark_stack(GC_MARKED_NOESC); } + visit_mark_stack(GC_MARKED_NOESC); + mark_task_stacks(GC_MARKED_NOESC); + visit_mark_stack(GC_MARKED_NOESC); } @@ -1704,15 +1762,16 @@ static void gc_mark(int finalize) #ifdef GC_VERIFY static void gc_verify(void) { + verifying = 1; lostval = NULL; lostval_parents.len = 0; lostval_parents_done.len = 0; check_timeout = 0; clear_mark(GC_CLEAN); gc_mark(0); - - for(int i = 0; i < bits_save[GC_CLEAN].len; i++) { - gcval_t* v = (gcval_t*)bits_save[GC_CLEAN].items[i]; + int clean_len = bits_save[GC_CLEAN].len; + for(int i = 0; i < clean_len + bits_save[GC_QUEUED].len; i++) { + gcval_t* v = (gcval_t*)bits_save[i >= clean_len ? GC_QUEUED : GC_CLEAN].items[i >= clean_len ? i - clean_len : i]; if (gc_marked(v)) { JL_PRINTF(JL_STDOUT, "Error. Early free of 0x%lx type :", (uptrint_t)v); jl_(jl_typeof(v)); @@ -1725,6 +1784,7 @@ static void gc_verify(void) } if (lostval == NULL) { restore(); // we did not miss anything + verifying = 0; return; } restore(); @@ -1750,7 +1810,7 @@ static void gc_verify(void) if (lostval_parent != NULL) break; } if (lostval_parent == NULL) { // all parents of lostval were also scheduled for deletion - // lostval = arraylist_pop(&lostval_parents); + lostval = arraylist_pop(&lostval_parents); } else { JL_PRINTF(JL_STDOUT, "Missing write barrier found !\n"); @@ -1832,7 +1892,7 @@ void prepare_sweep(void) } #ifdef GC_INC -int residual = 0; +int64_t residual = 0; void jl_gc_collect(void) { @@ -1852,21 +1912,32 @@ void jl_gc_collect(void) scanned_bytes = 0; scanned_bytes_goal = inc_count*(live_bytes/gc_inc_steps + mark_sp*sizeof(void*)); scanned_bytes_goal = scanned_bytes_goal < MIN_SCAN_BYTES ? 
MIN_SCAN_BYTES : scanned_bytes_goal; - - check_timeout = 1; + if (gc_inc_steps > 1) + check_timeout = 1; double t = clock_now(); mark_stack -= perm_marked; - + mark_sp = perm_marked = perm_marked + mark_sp; - + void** scratch = 0; + if (sweep_mask != GC_MARKED) { + scratch = malloc(sizeof(void*)*perm_marked); + memcpy(scratch, mark_stack, perm_marked*sizeof(void*)); + } + if (live_bytes && gc_inc_steps > 1) visit_mark_stack_inc(GC_MARKED_NOESC); else visit_mark_stack(GC_MARKED_NOESC); if (sweep_mask == GC_MARKED) perm_marked = 0; - else + else { + for (int i = 0; i < perm_marked; i++) { + gc_bits((uintptr_t)scratch[i] & ~(uintptr_t)3) = GC_MARKED; + } + memcpy(mark_stack, scratch, perm_marked*sizeof(void*)); + free(scratch); mark_stack += perm_marked; + } pre_mark(GC_MARKED_NOESC); visit_mark_stack(GC_MARKED_NOESC); @@ -1877,8 +1948,8 @@ void jl_gc_collect(void) mark_task_stacks(GC_MARKED_NOESC); visit_mark_stack(GC_MARKED_NOESC); } - allocd_bytes_since_sweep += allocd_bytes + collect_interval/gc_steps; - allocd_bytes = -collect_interval/gc_steps; + allocd_bytes_since_sweep += allocd_bytes + (int)collect_interval/gc_steps; + allocd_bytes = -(int)collect_interval/gc_steps; #ifdef OBJPROFILE print_obj_profiles(); htable_reset(&obj_counts[0], 0); @@ -1889,21 +1960,21 @@ void jl_gc_collect(void) total_mark_time += mark_pause; #endif #ifdef GC_TIME - JL_PRINTF(JL_STDOUT, "GC mark pause %.2f ms | stack %d -> %d (wb %d)\n", mark_pause*1000, saved_mark_sp, mark_sp, wb_activations); + JL_PRINTF(JL_STDOUT, "GC mark pause %.2f ms | scanned %d kB | stack %d -> %d (wb %d) | remset %d %d %d\n", mark_pause*1000, scanned_bytes/1024, saved_mark_sp, mark_sp, wb_activations, remset.len, max_msp, allocd_bytes/1024); saved_mark_sp = mark_sp; #endif } - int pct = -1, bonus = -1; + int64_t pct = -1, bonus = -1, SAVE = -1, SAVE2 = -1; double post_time = 0.0, finalize_time = 0.0; if(mark_sp == 0 || sweeping) { #if defined(GC_TIME) || defined(GC_FINAL_STATS) double sweep_t0 = clock_now(); #endif - size_t actual_allocd = allocd_bytes_since_sweep; + int64_t actual_allocd = allocd_bytes_since_sweep; if (!sweeping) { #ifdef GC_TIME post_time = clock_now(); -#endif +#endif post_mark(GC_MARKED_NOESC); #ifdef GC_TIME @@ -1927,7 +1998,17 @@ void jl_gc_collect(void) } scanned_bytes = 0; if (gc_sweep_inc(sweep_mask)) { - // sweeping is over + if (sweep_mask == GC_MARKED_NOESC) { + for (int i = 0; i < perm_marked; i++) { + gc_bits((mark_stack - perm_marked)[i]) = GC_QUEUED; + } + for (int i = 0; i < remset.len; i++) { + gc_queue_root(remset.items[i]); + } + } + remset.len = 0; + + // sweeping is over int tasks_end = 0; for (int i = 0; i < tasks.len; i++) { jl_value_t* ta = (jl_value_t*)tasks.items[i]; @@ -1945,10 +2026,13 @@ void jl_gc_collect(void) } finalize_time = clock_now(); run_finalizers(); + finalize_time = clock_now() - finalize_time; pct = actual_allocd ? 
(freed_bytes*100)/actual_allocd : -1; bonus = freed_bytes - (7*(actual_allocd/10)); - if (bonus - residual < 0 || quick_count > 10) { + SAVE = residual; + SAVE2 = freed_bytes; + if (bonus - residual < 0 && sweep_mask == GC_MARKED_NOESC) { if (collect_interval <= 2*(max_collect_interval/5)) { collect_interval = 5*(collect_interval/2); } @@ -1958,13 +2042,17 @@ void jl_gc_collect(void) residual = 0; } else { - residual += allocd_bytes_since_sweep - freed_bytes; + if (sweep_mask == GC_MARKED) + residual = 0; + else + residual = actual_allocd - freed_bytes; collect_interval = default_collect_interval; sweep_mask = GC_MARKED_NOESC; // next collection is quick gc_steps = gc_quick_steps; } - allocd_bytes = -collect_interval/gc_steps; + allocd_bytes = -(int64_t)collect_interval/gc_steps; + // jl_printf(JL_STDOUT, "ALLOCD %ld %ld %ld\n", allocd_bytes, collect_interval, default_collect_interval); inc_count = 0; live_bytes += -freed_bytes + allocd_bytes_since_sweep; allocd_bytes_since_sweep = 0; @@ -1978,7 +2066,7 @@ void jl_gc_collect(void) total_fin_time += finalize_time + post_time; #endif #ifdef GC_TIME - JL_PRINTF(JL_STDOUT, "GC sweep pause %.2f ms (free %d%% of alloc %d - %d) (%.2f ms in post_mark, %.2f ms in fin) (marked in %d inc) mask %d\n", sweep_pause*1000, pct, bonus, residual, post_time*1000, finalize_time*1000, inc_count, sweep_mask); + JL_PRINTF(JL_STDOUT, "GC sweep pause %.2f ms (freed %d kB = %d%% of alloc b/r %d/%d) (%.2f ms in post_mark, %.2f ms in fin) (marked in %d inc) mask %d\n", sweep_pause*1000, SAVE2/1024, pct, bonus/1024, SAVE/1024, post_time*1000, finalize_time*1000, inc_count, sweep_mask); #endif } n_pause++; @@ -2173,14 +2261,13 @@ void jl_gc_init(void) norm_pools[i].freelist = NULL; norm_pools[i].needsweep = NULL; norm_pools[i].fl_linear = 1; - norm_pools[i].nfree = 0; - // norm_pools[i].end_offset = ((GC_PAGE_SZ/szc[i]) - 1)*szc[i]; + norm_pools[i].end_offset = ((GC_PAGE_SZ/szc[i]) - 1)*szc[i]; ephe_pools[i].osize = szc[i]; ephe_pools[i].pages = NULL; ephe_pools[i].freelist = NULL; ephe_pools[i].needsweep = NULL; - // ephe_pools[i].end_offset = ((GC_PAGE_SZ/szc[i]) - 1)*szc[i]; + ephe_pools[i].end_offset = ((GC_PAGE_SZ/szc[i]) - 1)*szc[i]; } assert(offsetof(gcpages_t, data) == GC_PAGE_SZ); @@ -2200,6 +2287,7 @@ void jl_gc_init(void) #endif #ifdef GC_INC arraylist_new(&tasks, 0); + arraylist_new(&remset, 0); #endif #ifdef OBJPROFILE @@ -2221,63 +2309,73 @@ void jl_gc_init(void) // GC summary stats #if defined(MEMPROFILE) -static size_t pool_stats(pool_t *p, size_t *pwaste, int *np) +static size_t pool_stats(pool_t *p, size_t *pwaste, size_t *np, size_t *pnold) { gcval_t *v; gcpage_t *pg = p->pages; size_t osize = p->osize; - size_t nused=0, nfree=0, npgs=0; + size_t nused=0, nfree=0, npgs=0, nold = 0; while (pg != NULL) { npgs++; v = (gcval_t*)pg->data; char *lim = (char*)v + GC_PAGE_SZ - osize; - // this is not accurate anymore and can underestimate waste + int i = 0; while ((char*)v <= lim) { if (!gc_marked(v)) { nfree++; } else { nused++; + if (gc_bits(v) == GC_MARKED) { + nold++; + } } v = (gcval_t*)((char*)v + osize); + i++; } gcpage_t *nextpg = pg->next; pg = nextpg; } *pwaste = npgs*GC_PAGE_SZ - (nused*p->osize); *np = npgs; - JL_PRINTF(JL_STDOUT, - "%4d : %7d/%7d objects, %5d pages, %8d bytes, %8d waste\n", - p->osize, - nused, - nused+nfree, - npgs, - nused*p->osize, - *pwaste); + *pnold = nold; + if (npgs != 0) { + JL_PRINTF(JL_STDOUT, + "%4d : %7d/%7d objects (%3d%% old), %5d pages, %5d kB, %5d kB waste\n", + p->osize, + nused, + nused+nfree, + nused ? 
(nold*100)/nused : 0, + npgs, + (nused*p->osize)/1024, + *pwaste/1024); + } return nused*p->osize; } static void all_pool_stats(void) { int i; - size_t nb=0, w, tw=0, no=0,tp=0, b, np; + size_t nb=0, w, tw=0, no=0,tp=0, nold=0,noldbytes=0, b, np, nol; for(i=0; i < N_POOLS; i++) { - b = pool_stats(&norm_pools[i], &w, &np); + b = pool_stats(&norm_pools[i], &w, &np, &nol); nb += b; no += (b/norm_pools[i].osize); tw += w; tp += np; - + nold += nol; + noldbytes += nol*norm_pools[i].osize; + /* b = pool_stats(&ephe_pools[i], &w, &np); nb += b; no += (b/ephe_pools[i].osize); tw += w; - tp += np; + tp += np;*/ } JL_PRINTF(JL_STDOUT, - "%d objects, %d total allocated, %d total fragments, in %d pages\n", - no, nb, tw, tp); + "%d objects (%d%% old), %d kB (%d%% old) total allocated, %d total fragments (%d%% overhead), in %d pages\n", + no, (nold*100)/no, nb/1024, (noldbytes*100)/nb, tw, (tw*100)/nb, tp); } static void big_obj_stats(void) @@ -2291,6 +2389,16 @@ static void big_obj_stats(void) } v = v->next; } + v = big_objects_marked; + size_t nused_old=0, nbytes_old=0; + while (v != NULL) { + if (gc_marked(&v->_data)) { + nused_old++; + nbytes_old += v->sz; + } + v = v->next; + } + mallocarray_t *ma = mallocarrays; while (ma != NULL) { if (gc_marked(ma->a)) { @@ -2300,7 +2408,7 @@ static void big_obj_stats(void) ma = ma->next; } - JL_PRINTF(JL_STDOUT, "%d bytes in %d large objects\n", nbytes, nused); + JL_PRINTF(JL_STDOUT, "%d kB (%d%% old) in %d large objects (%d%% old)\n", (nbytes + nbytes_old)/1024, nbytes + nbytes_old ? (nbytes_old*100)/(nbytes + nbytes_old) : 0, nused + nused_old, nused+nused_old ? (nused_old*100)/(nused + nused_old) : 0); } #endif //MEMPROFILE diff --git a/src/julia.h b/src/julia.h index 58cd21fe78103..c184572a9abd4 100644 --- a/src/julia.h +++ b/src/julia.h @@ -1374,10 +1374,10 @@ extern DLLEXPORT jl_compileropts_t jl_compileropts; #define JL_COMPILEROPT_DUMPBITCODE_OFF 2 DLLEXPORT void gc_queue_root(void *root); -void gc_setmark_buf(void *buf); +void gc_setmark_buf(void *buf, int); DLLEXPORT void gc_wb_slow(void* parent, void* ptr); -static inline void gc_wb(void* parent, void* ptr) +static inline void gc_wb_fwd(void* parent, void* ptr) { #ifdef GC_INC // if parent is marked and ptr is clean @@ -1387,12 +1387,14 @@ static inline void gc_wb(void* parent, void* ptr) #endif } +#define gc_wb(a,b) gc_wb_back(a) + static inline void gc_wb_buf(void *parent, void *bufptr) { #ifdef GC_INC // if parent is marked if((*((uintptr_t*)parent) & 3) == 1) - gc_setmark_buf(bufptr); + gc_setmark_buf(bufptr, *(uintptr_t*)parent & 3); #endif } diff --git a/src/module.c b/src/module.c index d151812db7fa1..43bc764af62d8 100644 --- a/src/module.c +++ b/src/module.c @@ -374,7 +374,7 @@ DLLEXPORT void jl_checked_assignment(jl_binding_t *b, jl_value_t *rhs) JL_PRINTF(JL_STDERR,"Warning: redefining constant %s\n",b->name->name); } } - gc_wb(((void**)b)-1, rhs); + gc_wb_fwd(((void**)b)-1, rhs); b->value = rhs; } diff --git a/src/options.h b/src/options.h index 33a4d64ab91ba..805a5906b191f 100644 --- a/src/options.h +++ b/src/options.h @@ -35,6 +35,8 @@ // catch invalid accesses. 
//#define MEMDEBUG +//#define GC_VERIFY + // profiling options // GC_FINAL_STATS prints total GC stats at exit @@ -44,7 +46,7 @@ //#define MEMPROFILE // GCTIME prints time taken by each phase of GC -//#define GCTIME +#define GC_TIME // OBJPROFILE counts objects by type //#define OBJPROFILE From a6bd8397f5384f829cd41c72da4bbdd90492412d Mon Sep 17 00:00:00 2001 From: Oscar Blumberg Date: Sat, 9 Aug 2014 19:46:27 +0200 Subject: [PATCH 03/17] working version of promotion at sweep --- src/array.c | 2 + src/builtins.c | 9 - src/cgutils.cpp | 2 +- src/codegen.cpp | 59 +- src/gc.c | 1190 ++++++++++++++++++++++++++--------------- src/interpreter.c | 11 +- src/julia.h | 29 +- src/julia_internal.h | 2 +- src/options.h | 2 +- src/toplevel.c | 12 +- test/perf/perfcomp.jl | 2 +- 11 files changed, 816 insertions(+), 504 deletions(-) diff --git a/src/array.c b/src/array.c index 9bf2dddc47593..885c0d2c69214 100644 --- a/src/array.c +++ b/src/array.c @@ -230,6 +230,7 @@ jl_array_t *jl_ptr_to_array_1d(jl_value_t *atype, void *data, size_t nel, if (own_buffer) { a->how = 2; jl_gc_track_malloced_array(a); + jl_gc_count_allocd(nel*elsz + (elsz == 1 ? 1 : 0)); } else { a->how = 0; @@ -281,6 +282,7 @@ jl_array_t *jl_ptr_to_array(jl_value_t *atype, void *data, jl_tuple_t *dims, if (own_buffer) { a->how = 2; jl_gc_track_malloced_array(a); + jl_gc_count_allocd(nel*elsz + (elsz == 1 ? 1 : 0)); } else { a->how = 0; diff --git a/src/builtins.c b/src/builtins.c index 6934f79243dc7..bd9527ac1e372 100644 --- a/src/builtins.c +++ b/src/builtins.c @@ -490,15 +490,6 @@ JL_CALLABLE(jl_f_isdefined) return jl_boundp(m, s) ? jl_true : jl_false; } -/*jl_value_t* jl_tupleset(void* t, int i, void* x) { - if(x != 0) gc_wb(t, x); - return ((((jl_value_t**)(t))[2+(i)])=(jl_value_t*)(x)); - }*/ -/*jl_value_t* jl_cellset(void* a, int i, void* x) { - if(x != 0) gc_wb_back(a); - return ((((jl_value_t**)((jl_array_t*)a)->data)[(i)])=((jl_value_t*)(x))); - }*/ - // tuples --------------------------------------------------------------------- JL_CALLABLE(jl_f_tuple) diff --git a/src/cgutils.cpp b/src/cgutils.cpp index cce4f7eed52d9..5c7789feb3679 100644 --- a/src/cgutils.cpp +++ b/src/cgutils.cpp @@ -910,7 +910,7 @@ static Value *typed_load(Value *ptr, Value *idx_0based, jl_value_t *jltype, static Value *emit_unbox(Type *to, Value *x, jl_value_t *jt); -static Value *typed_store(Value *ptr, Value *idx_0based, Value *rhs, +static void typed_store(Value *ptr, Value *idx_0based, Value *rhs, jl_value_t *jltype, jl_codectx_t *ctx, Value* parent) // for the write barrier, NULL if no barrier needed { Type *elty = julia_type_to_llvm(jltype); diff --git a/src/codegen.cpp b/src/codegen.cpp index 81bfaecc550ec..f1cabd03e64c8 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -1364,7 +1364,6 @@ static void emit_write_barrier(jl_codectx_t* ctx, Value *parent, Value *ptr) #ifdef GC_INC /* builder.CreateCall2(wbfunc, builder.CreateBitCast(parent, jl_pvalue_llvmt), builder.CreateBitCast(ptr, jl_pvalue_llvmt)); return;*/ - ptr = NULL; parent = builder.CreateBitCast(parent, T_psize); Value* parent_type = builder.CreateLoad(parent); Value* parent_mark_bits = builder.CreateAnd(parent_type, 3); @@ -1374,23 +1373,16 @@ static void emit_write_barrier(jl_codectx_t* ctx, Value *parent, Value *ptr) Value* parent_marked = builder.CreateICmpEQ(parent_mark_bits, ConstantInt::get(T_size, 1)); BasicBlock* cont = BasicBlock::Create(getGlobalContext(), "cont"); - BasicBlock* barrier_may_trigger; - if (ptr) barrier_may_trigger = BasicBlock::Create(getGlobalContext(), 
"wb_may_trigger", ctx->f); + BasicBlock* barrier_may_trigger = BasicBlock::Create(getGlobalContext(), "wb_may_trigger", ctx->f); BasicBlock* barrier_trigger = BasicBlock::Create(getGlobalContext(), "wb_trigger", ctx->f); - builder.CreateCondBr(parent_marked, ptr ? barrier_may_trigger : barrier_trigger, cont); - - if (ptr) { - builder.SetInsertPoint(barrier_may_trigger); - Value* ptr_mark_bits = builder.CreateAnd(builder.CreateLoad(builder.CreateBitCast(ptr, T_psize)), 3); - Value* ptr_not_marked = builder.CreateICmpEQ(ptr_mark_bits, ConstantInt::get(T_size, 0)); - // builder.CreateCall2(expect_func, ptr_not_marked, ConstantInt::get(T_int1, 0)); - builder.CreateCondBr(ptr_not_marked, barrier_trigger, cont); - } + builder.CreateCondBr(parent_marked, barrier_may_trigger, cont); + + builder.SetInsertPoint(barrier_may_trigger); + Value* ptr_mark_bits = builder.CreateAnd(builder.CreateLoad(builder.CreateBitCast(ptr, T_psize)), 3); + Value* ptr_not_marked = builder.CreateICmpEQ(ptr_mark_bits, ConstantInt::get(T_size, 0)); + builder.CreateCondBr(ptr_not_marked, barrier_trigger, cont); builder.SetInsertPoint(barrier_trigger); - if (!ptr) { // clear the mark - builder.CreateStore(builder.CreateAnd(parent_type, ~(uintptr_t)3), parent); - } - builder.CreateCall(queuerootfun, ptr ? ptr : builder.CreateBitCast(parent, jl_pvalue_llvmt)); + builder.CreateCall(prepare_call(queuerootfun), builder.CreateBitCast(parent, jl_pvalue_llvmt)); builder.CreateBr(cont); ctx->f->getBasicBlockList().push_back(cont); builder.SetInsertPoint(cont); @@ -1401,19 +1393,15 @@ static void emit_checked_write_barrier(jl_codectx_t *ctx, Value *parent, Value * { #ifdef GC_INC BasicBlock *cont; - if (ptr) { - Value *not_null = builder.CreateICmpNE(ptr, V_null); - BasicBlock *if_not_null = BasicBlock::Create(getGlobalContext(), "wb_not_null", ctx->f); - cont = BasicBlock::Create(getGlobalContext(), "cont"); - builder.CreateCondBr(not_null, if_not_null, cont); - builder.SetInsertPoint(if_not_null); - } + Value *not_null = builder.CreateICmpNE(ptr, V_null); + BasicBlock *if_not_null = BasicBlock::Create(getGlobalContext(), "wb_not_null", ctx->f); + cont = BasicBlock::Create(getGlobalContext(), "cont"); + builder.CreateCondBr(not_null, if_not_null, cont); + builder.SetInsertPoint(if_not_null); emit_write_barrier(ctx, parent, ptr); - if (ptr) { - builder.CreateBr(cont); - ctx->f->getBasicBlockList().push_back(cont); - builder.SetInsertPoint(cont); - } + builder.CreateBr(cont); + ctx->f->getBasicBlockList().push_back(cont); + builder.SetInsertPoint(cont); #endif } @@ -2174,10 +2162,7 @@ static Value *emit_known_call(jl_value_t *ff, jl_value_t **args, size_t nargs, Value* v = ety==(jl_value_t*)jl_any_type ? emit_expr(args[2],ctx) : emit_unboxed(args[2],ctx); typed_store(emit_arrayptr(ary,args[1],ctx), idx, v, - ety, ctx, /*ety == (jl_value_t*)jl_any_type ? ary : */NULL); - if (ety == (jl_value_t*)jl_any_type) { - emit_write_barrier(ctx, ary, NULL); - } + ety, ctx, ety == (jl_value_t*)jl_any_type ? 
ary : NULL); } JL_GC_POP(); return ary; @@ -2879,12 +2864,13 @@ static Value *emit_expr(jl_value_t *expr, jl_codectx_t *ctx, bool isboxed, int last_depth = ctx->argDepth; Value *name = literal_pointer_val(mn); jl_binding_t *bnd = NULL; - Value *bp; + Value *bp, *bp_owner = V_null; if (iskw) { // fenv = theF->env Value *fenv = emit_nthptr(theF, 2, tbaa_func); // bp = &((jl_methtable_t*)fenv)->kwsorter bp = emit_nthptr_addr(fenv, 7); + bp_owner = fenv; } else if (theF != NULL) { bp = make_gcroot(theF, ctx); @@ -2893,6 +2879,7 @@ static Value *emit_expr(jl_value_t *expr, jl_codectx_t *ctx, bool isboxed, if (is_global((jl_sym_t*)mn, ctx)) { bnd = jl_get_binding_for_method_def(ctx->module, (jl_sym_t*)mn); bp = julia_binding_gv(bnd); + bp_owner = literal_pointer_val((jl_value_t*)ctx->module); } else { bp = var_binding_pointer((jl_sym_t*)mn, &bnd, false, ctx); @@ -2902,7 +2889,7 @@ static Value *emit_expr(jl_value_t *expr, jl_codectx_t *ctx, bool isboxed, make_gcroot(a1, ctx); Value *a2 = boxed(emit_expr(args[2], ctx),ctx); make_gcroot(a2, ctx); - Value *mdargs[6] = { name, bp, literal_pointer_val(bnd), a1, a2, literal_pointer_val(args[3]) }; + Value *mdargs[6] = { name, bp, bp_owner, literal_pointer_val(bnd), a1, a2, literal_pointer_val(args[3]) }; ctx->argDepth = last_depth; return builder.CreateCall(prepare_call(jlmethod_func), ArrayRef(&mdargs[0], 6)); } @@ -4431,11 +4418,11 @@ static void init_julia_llvm_env(Module *m) queuerootfun = Function::Create(FunctionType::get(T_void, args_1ptr, false), Function::ExternalLinkage, "gc_queue_root", m); - jl_ExecutionEngine->addGlobalMapping(queuerootfun, (void*)&gc_queue_root); + add_named_global(queuerootfun, (void*)&gc_queue_root); wbfunc = Function::Create(FunctionType::get(T_void, wbargs, false), Function::ExternalLinkage, "gc_wb_slow", m); - jl_ExecutionEngine->addGlobalMapping(wbfunc, (void*)&gc_wb_slow); + add_named_global(wbfunc, (void*)&gc_wb_slow); std::vector exp_args(0); exp_args.push_back(T_int1); diff --git a/src/gc.c b/src/gc.c index 683d0556167b5..169b35db53aaa 100644 --- a/src/gc.c +++ b/src/gc.c @@ -50,7 +50,7 @@ typedef struct _bigval_t { size_t sz; union { uptrint_t _pad0; - uptrint_t old : 1; + uptrint_t age : 2; }; // must be 16-aligned here, in 32 & 64b union { @@ -86,11 +86,17 @@ typedef struct _gcval_t { typedef struct _pool_t { gcval_t *freelist ; - int32_t fl_linear; uint16_t end_offset; // avoid to compute this at each allocation - struct _gcpage_t *pages; - struct _gcpage_t *needsweep; uint16_t osize; + union { + struct _gcpage_t *pages; + struct { + uint16_t allocd : 1; + uint16_t linear : 1; + }; + }; + struct _gcpage_t *needsweep; + uint16_t nfree; } pool_t; /*#ifdef _P64 @@ -101,29 +107,29 @@ typedef struct _pool_t { // becaue we will be doing GC_PAGE(v)->cookie for // some v not in a page and it must not segfault typedef struct _gcpage_t { - union { - struct { - uint32_t pool_n : 8; - // this is a bitwise | of all gc_bits in this page - uint32_t gc_bits : 2; - // if this is 1, the freelist in this page contains only 2 cells. - // one is the first free cell, it points to the last cell of the page - // every cell in between is free - uint32_t linear : 1; - }; + struct { + uint16_t pool_n : 8; + uint16_t allocd : 1; + // this is a bitwise | of all gc_bits in this page + uint16_t gc_bits : 2; + // if this is 1, the freelist in this page contains only 2 cells. 
+ // one is the first free cell, it points to the last cell of the page + // every cell in between is free + uint16_t linear : 1; }; uint16_t nfree; uint16_t nmarked; - - struct _gcpage_t *next; - // struct _gcpage_t **prev; // point to the next field of the previous page - char *data; // this is not strictly necessary uint16_t osize; - - struct { - char bits; - } old[GC_PAGE_SZ/(8*8)]; // one bit per object + uint16_t fl_begin_offset; + uint16_t fl_end_offset; + // struct _gcpage_t **prev; // point to the next field of the previous page + uint32_t data_offset; // this is not strictly necessary + char age[2*GC_PAGE_SZ/(8*8)]; // two bits per object } gcpage_t; +#define PAGE_DATA_PRE(p) ((char*)(p) + (p)->data_offset) +#define PAGE_DATA(p) ((char*)GC_PAGES(p) + GC_PAGE_SZ*(((char*)(p) - (char*)GC_PAGES(p))/sizeof(gcpage_t) + 1)) +#define PAGE_PFL_BEG(p) ((gcval_t**)(PAGE_DATA(p) + (p)->fl_begin_offset)) +#define PAGE_PFL_END(p) ((gcval_t**)(PAGE_DATA(p) + (p)->fl_end_offset)) #define PAGE_GROUP_COUNT 31 // We pack pages by groups of 31 which means a little less than 512k = 32*4 vm pages @@ -168,6 +174,7 @@ static const int gc_inc_steps = 1; #endif #ifdef _P64 #define default_collect_interval (5600*1024*sizeof(void*)) +//#define default_collect_interval (560*1024*sizeof(void*)) static size_t max_collect_interval = 1250000000UL; #else #define default_collect_interval (3200*1024*sizeof(void*)) @@ -175,7 +182,8 @@ static size_t max_collect_interval = 500000000UL; #endif // keep those 3 together static int64_t allocd_bytes; -static size_t collect_interval = default_collect_interval; +static size_t collect_interval; +static size_t long_collect_interval; static int gc_steps; #define N_POOLS 42 static __attribute__((aligned (64))) pool_t norm_pools[N_POOLS]; @@ -186,14 +194,16 @@ static int64_t total_allocd_bytes = 0; static int64_t allocd_bytes_since_sweep = 0; static int64_t freed_bytes = 0; static uint64_t total_gc_time=0; -static size_t live_bytes = 0; +static int64_t live_bytes = 0; +static int64_t live_bytes2 = 0; static size_t current_pg_count = 0; static size_t max_pg_count = 0; int jl_in_gc; // referenced from switchto task.c #ifdef OBJPROFILE -static htable_t obj_counts[2]; +static htable_t obj_counts[3]; +static htable_t obj_sizes[3]; #endif #ifdef GC_FINAL_STATS @@ -215,7 +225,8 @@ static int n_pause = 0; int sweeping = 0; #ifdef GC_INC -size_t scanned_bytes; +static int64_t scanned_bytes; +static int64_t perm_scanned_bytes; static int prev_sweep_mask = GC_MARKED; static size_t scanned_bytes_goal; #else @@ -272,47 +283,60 @@ static void add_lostval_parent(jl_value_t* parent) static bigval_t *big_objects = NULL; static bigval_t *big_objects_marked = NULL; +const void *BUFFTY = (void*)0xdeadb00f; +const void *MATY = (void*)0xdeadaa01; -static inline void objprofile_count(void* v, int old) +static size_t array_nbytes(jl_array_t*); +static inline void objprofile_count(void* ty, int old, int sz) { #ifdef OBJPROFILE #ifdef GC_VERIFY if (verifying) return; #endif - if (jl_typeof(v) <= 0x10) return; - void **bp = ptrhash_bp(&obj_counts[old], jl_typeof(v)); + if ((intptr_t)ty <= 0x10) + ty = BUFFTY; + void **bp = ptrhash_bp(&obj_counts[old], ty); if (*bp == HT_NOTFOUND) *bp = (void*)2; else (*((ptrint_t*)bp))++; + bp = ptrhash_bp(&obj_sizes[old], ty); + if (*bp == HT_NOTFOUND) + *bp = (void*)(1 + sz); + else + *((ptrint_t*)bp) += sz; #endif } -static inline int gc_setmark_other(void *o, int mark_mode) +static inline void gc_setmark_other(void *o, int mark_mode) { _gc_setmark(o, mark_mode); 
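    // (mark-bit encoding, as far as its use in this patch shows: the
    // two low bits of the type word are 0 = GC_CLEAN, 1 = GC_MARKED
    // (old), 2 = GC_QUEUED (grey, sitting on the mark stack), and
    // 3 = GC_MARKED_NOESC (young mark); gc_marked() tests bit 0.)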
    verify_val(o);
-    return mark_mode;
 }

+#define inc_sat(v,s) v = (v) >= s ? s : (v)+1;
+#define PROMOTE_AGE 2
+
 static inline int gc_setmark_big(void *o, int mark_mode)
 {
 #ifdef GC_VERIFY
     if (verifying) {
         _gc_setmark(o, mark_mode);
-        return mark_mode;
+        return 0;
     }
 #endif
     bigval_t* hdr = bigval_header(o);
-    if (hdr->old) {
+    int bits = gc_bits(o);
+    /* if (hdr->age >= PROMOTE_AGE) {
         mark_mode = GC_MARKED;
     }
     else {
-        if (!gc_marked(o))
-            hdr->old = 1;
-    }
-
-    if ((mark_mode == GC_MARKED) & (gc_bits(o) != GC_MARKED)) {
+        if (!bits)
+            inc_sat(hdr->age, PROMOTE_AGE);
+    }*/
+    if (bits == GC_QUEUED || bits == GC_MARKED)
+        mark_mode = GC_MARKED;
+    if ((mark_mode == GC_MARKED) & (bits != GC_MARKED)) {
         *hdr->prev = hdr->next;
         if (hdr->next)
             hdr->next->prev = hdr->prev;
@@ -322,6 +346,15 @@ static inline int gc_setmark_big(void *o, int mark_mode)
         big_objects_marked->prev = &hdr->next;
         big_objects_marked = hdr;
     }
+#ifdef OBJPROFILE
+    if (!bits) {
+        if (mark_mode == GC_MARKED)
+            perm_scanned_bytes += hdr->sz;
+        else
+            scanned_bytes += hdr->sz;
+        objprofile_count(jl_typeof(o), mark_mode == GC_MARKED, hdr->sz);
+    }
+#endif
     _gc_setmark(o, mark_mode);
     verify_val(o);
     return mark_mode;
@@ -332,22 +365,39 @@ static inline int gc_setmark_pool(void *o, int mark_mode)
 #ifdef GC_VERIFY
     if (verifying) {
         _gc_setmark(o, mark_mode);
-        return mark_mode;
+        return 0;
     }
 #endif
     gcpage_t* page = GC_PAGE(o);
-    int obj_i = ((uintptr_t)o - (uintptr_t)page->data)/8;
-    if (page->old[obj_i/8].bits & (1 << (obj_i % 8))) {
-        _gc_setmark(o, GC_MARKED);
+    int bits = gc_bits(o);
+    if (bits == GC_QUEUED || bits == GC_MARKED)
+        mark_mode = GC_MARKED;
+    /* int obj_i = ((uintptr_t)o - (uintptr_t)GC_PAGE_DATA(o))/8;
+    int sh = (obj_i % 4)*2;
+    char *ages = page->age;
+    int age = (ages[obj_i/4] >> sh) & 3;
+    if (age >= PROMOTE_AGE) {
         mark_mode = GC_MARKED;
     }
     else {
-        if (!gc_marked(o))
-            page->old[obj_i/8].bits |= 1 << (obj_i % 8);
-        _gc_setmark(o, mark_mode);
-    }
-    page->nmarked += (mark_mode == GC_MARKED);
-    page->gc_bits |= gc_bits(o);
+        if (!bits) {
+            inc_sat(age, PROMOTE_AGE);
+            ages[obj_i/4] &= ~(3 << sh);
+            ages[obj_i/4] |= age << sh;
+        }
+    }*/
+#ifdef OBJPROFILE
+    if (!bits) {
+        if (mark_mode == GC_MARKED)
+            perm_scanned_bytes += page->osize;
+        else
+            scanned_bytes += page->osize;
+        objprofile_count(jl_typeof(o), mark_mode == GC_MARKED, page->osize);
+    }
+#endif
+    _gc_setmark(o, mark_mode);
+    // page->nmarked += (mark_mode == GC_MARKED);
+    page->gc_bits |= mark_mode;
     verify_val(o);
     return mark_mode;
 }
@@ -355,7 +405,7 @@
 static inline int gc_setmark(void *o, int sz, int mark_mode)
 {
-    if(sz <= 2048)
+    if (sz <= 2048)
         return gc_setmark_pool(o, mark_mode);
     else
         return gc_setmark_big(o, mark_mode);
@@ -371,6 +421,7 @@ inline void gc_setmark_buf(void *o, int mark_mode)
         gc_setmark_pool(buf, mark_mode);
     else
         gc_setmark_big(buf, mark_mode);
+    // objprofile_count(BUFFTY, gc_bits(buf) == GC_MARKED);
 }

 // malloc wrappers, aligned allocation
@@ -412,7 +463,7 @@ static __attribute__((noinline)) void *malloc_page(void)
         heap = heaps[heap_i];
         if (heap == NULL) {
 #ifdef _OS_WINDOWS_
-            char* mem = VirtualAlloc(NULL, sizeof(region_t) + GC_PAGE_SZ, MEM_RESERVE, PAGE_READWRITE);
+            char* mem = VirtualAlloc(NULL, sizeof(region_t) + GC_PAGE_SZ*32, MEM_RESERVE, PAGE_READWRITE);
 #else
             char* mem = mmap(NULL, sizeof(region_t) + GC_PAGE_SZ*32, PROT_READ | PROT_WRITE,
                              MAP_NORESERVE | MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
             mem = mem == MAP_FAILED ?
NULL : mem; @@ -421,6 +472,7 @@ static __attribute__((noinline)) void *malloc_page(void) jl_printf(JL_STDERR, "could not allocate pools\n"); abort(); } + // we may waste up to around 500k of virtual address space for alignment but those pages are never committed heap = (region_t*)((char*)GC_PAGES(mem + SYS_PAGE_SZ + GC_PAGE_SZ*32 - 1) - SYS_PAGE_SZ); heaps[heap_i] = heap; #ifdef _OS_WINDOWS_ @@ -510,7 +562,7 @@ static inline int maybe_collect(void) DLLEXPORT void *jl_gc_counted_malloc(size_t sz) { maybe_collect(); - allocd_bytes += sz; + // allocd_bytes += sz; void *b = malloc(sz); if (b == NULL) jl_throw(jl_memory_exception); @@ -520,13 +572,13 @@ DLLEXPORT void *jl_gc_counted_malloc(size_t sz) DLLEXPORT void jl_gc_counted_free(void *p, size_t sz) { free(p); - freed_bytes += sz; + // freed_bytes += sz; } DLLEXPORT void *jl_gc_counted_realloc(void *p, size_t sz) { maybe_collect(); - allocd_bytes += ((sz+1)/2); // NOTE: wild guess at growth amount + // allocd_bytes += ((sz+1)/2); // NOTE: wild guess at growth amount void *b = realloc(p, sz); if (b == NULL) jl_throw(jl_memory_exception); @@ -536,8 +588,7 @@ DLLEXPORT void *jl_gc_counted_realloc(void *p, size_t sz) DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size_t sz) { maybe_collect(); - if (sz > old) - allocd_bytes += (sz-old); + // allocd_bytes += (sz-old); void *b = realloc(p, sz); if (b == NULL) jl_throw(jl_memory_exception); @@ -719,6 +770,7 @@ void jl_gc_add_finalizer(jl_value_t *v, jl_function_t *f) static __attribute__((noinline)) void *alloc_big(size_t sz) { + // jl_printf(JL_STDOUT, "BIG: %d\n", sz); maybe_collect(); size_t offs = BVOFFS*sizeof(void*); if (sz+offs+15 < offs+15) // overflow in adding offs, size was "negative" @@ -731,7 +783,7 @@ static __attribute__((noinline)) void *alloc_big(size_t sz) #ifdef MEMDEBUG memset(v, 0xee, allocsz); #endif - v->sz = sz; + v->sz = allocsz; v->flags = 0; v->next = big_objects; v->prev = &big_objects; @@ -753,10 +805,27 @@ static jl_value_t** sweep_big_list(int sweep_mask, bigval_t** pv) bigval_t *nxt = v->next; if (gc_marked(&v->_data)) { pv = &v->next; - if ((gc_bits(&v->_data) & sweep_mask) == sweep_mask) { - gc_bits(&v->_data) = GC_CLEAN; - big_reset++; + // objprofile_count(&v->_data, gc_bits(&v->_data) == GC_MARKED, v->sz); + live_bytes2 += v->sz; + int age = v->age; + int bits = gc_bits(&v->_data); + if (age >= PROMOTE_AGE) { + if (sweep_mask == GC_MARKED) { + bits = GC_CLEAN; + big_reset++; + } + else if (bits == GC_MARKED_NOESC) + bits = GC_QUEUED; } + else { + inc_sat(age, PROMOTE_AGE); + v->age = age; + if ((sweep_mask & bits) == sweep_mask) { + bits = GC_CLEAN; + big_reset++; + } + } + gc_bits(&v->_data) = bits; } else { *pv = nxt; @@ -815,12 +884,19 @@ void jl_gc_track_malloced_array(jl_array_t *a) mallocarrays = ma; } +void jl_gc_count_allocd(size_t sz) +{ + allocd_bytes += sz; +} + static size_t array_nbytes(jl_array_t *a) { + size_t sz = 0; if (jl_array_ndims(a)==1) - return a->elsize * a->maxsize; + sz = a->elsize * a->maxsize + (a->elsize == 1 ? 
1 : 0);
     else
-        return a->elsize * jl_array_len(a);
+        sz = a->elsize * jl_array_len(a);
+    return sz;
 }

 void jl_gc_free_array(jl_array_t *a)
@@ -847,6 +923,8 @@ static void sweep_malloced_arrays(void)
         mallocarray_t *nxt = ma->next;
         if (gc_marked(ma->a)) {
             pma = &ma->next;
+            // objprofile_count(&MATY, MATY, array_nbytes(ma->a));
+            live_bytes2 += array_nbytes(ma->a);
         }
         else {
             *pma = nxt;
@@ -861,39 +939,88 @@
     }
 }

+int isinfl(gcval_t* v, void* needle)
+{
+    while(v != NULL) {
+        if (v == needle)
+            return 1;
+        v = v->next;
+    }
+    return 0;
+}
+
 // pool allocation
+#ifdef __SSE__
+#include <emmintrin.h>
+#endif
+// gcc/libc doesn't do this (on this setup at least)
+// even with __builtin_assume_aligned
+// assumes p is 16-byte aligned and sz % 16 == 0
+static inline void bzero_small_a16(char *p, size_t sz)
+{
+#ifndef __SSE__
+    memset(p, 0, sz);
+#else
+    __m128i c = _mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
+    for(int i=0; i < sz/16; i++)
+        _mm_store_si128(((__m128i*)p) + i, c); // advance one 16-byte store per iteration
+#endif
+}

 static inline gcval_t *reset_page(pool_t *p, gcpage_t *pg, gcval_t *fl)
 {
+    pg->gc_bits = 0;
     pg->nfree = GC_PAGE_SZ/p->osize;
     pg->nmarked = 0;
     pg->pool_n = p - norm_pools;
-    memset(pg->old, 0x0, GC_PAGE_SZ/(8*8));
-    gcval_t *beg = (gcval_t*)pg->data;
+    bzero_small_a16(pg->age, 2*GC_PAGE_SZ/(8*8));
+    gcval_t *beg = (gcval_t*)PAGE_DATA(pg);
     gcval_t *end = (gcval_t*)((char*)beg + (pg->nfree - 1)*p->osize);
-    // madvise(beg, GC_PAGE_SZ, MADV_FREE);
     end->next = fl;
     pg->linear = 1;
+    pg->allocd = 0;
+    pg->fl_begin_offset = 0;
+    pg->fl_end_offset = (char*)end - (char*)beg;
     return beg;
 }
+
+static inline void _update_freelist(pool_t* p, gcval_t* next)
+{
+    gcval_t *cur = p->freelist;
+    p->freelist = next;
+    if (__likely(GC_PAGE_DATA(cur) == GC_PAGE_DATA(next))) return;
+    gcpage_t *cpg = cur ? GC_PAGE(cur) : NULL;
+    gcpage_t *npg = next ?
GC_PAGE(next) : NULL; + if (npg == cpg) return; + if (cpg) { + cpg->linear = p->linear; + cpg->nfree = p->nfree; + cpg->allocd = p->allocd; + } + if (npg) { + p->linear = npg->linear; + p->nfree = npg->nfree; + p->allocd = npg->allocd; + } +} + static __attribute__((noinline)) void add_page(pool_t *p) { //gcpage_t *pg = (gcpage_t*)malloc_a16(sizeof(gcpage_t)); - void *data = malloc_page(); + char *data = malloc_page(); if (data == NULL) jl_throw(jl_memory_exception); gcpage_t *pg = GC_PAGE(data); //jl_printf(JL_STDOUT, "add page [%d] : 0x%lx 0x%lx = 0x%lx hdr 0x%lx\n", GC_PAGE_IDX(data), pg, data, (uintptr_t)data - (uintptr_t)pg, GC_PAGES(data)); - pg->data = data; + pg->data_offset = data - (char*)pg; pg->osize = p->osize; gcval_t *fl = reset_page(p, pg, p->freelist); // these statements are ordered so that interrupting after any of them // leaves the system in a valid state - pg->next = p->pages; - p->pages = pg; - p->freelist = fl; - p->fl_linear = 1; + // pg->next = p->pages; + // p->pages = pg; + _update_freelist(p, fl); } /*static inline void *_pool_alloc_fast(pool_t* p, int osize, int end_offset) @@ -909,6 +1036,7 @@ static __attribute__((noinline)) void add_page(pool_t *p) static inline void *__pool_alloc(pool_t* p, int osize, int end_offset) { + // jl_printf(JL_STDOUT, "POOL: %d\n", osize); gcval_t *v, *end; if (__unlikely((allocd_bytes += osize) >= 0)) { jl_gc_collect(); @@ -917,15 +1045,16 @@ static inline void *__pool_alloc(pool_t* p, int osize, int end_offset) add_page(p); } v = p->freelist; - v->flags = 0; - p->freelist = (char*)v + osize; - GC_PAGE(v)->nfree--; + p->nfree--; + p->allocd = 1; end = &(GC_PAGE_DATA(v)[end_offset]); - if (__unlikely(!((v != end) & (!!GC_PAGE(v)->linear)))) { + if ((v == end) | (!p->linear)) { + _update_freelist(p, v->next); p->freelist = v->next; - if (p->freelist) - p->fl_linear = GC_PAGE(p->freelist)->linear; + } else { + p->freelist = (char*)v + osize; } + v->flags = 0; // p->freelist = next; // pg->nfree--; return v; @@ -941,6 +1070,18 @@ static inline void *pool_alloc(pool_t *p) return __pool_alloc(p, p->osize, p->end_offset); } +static int sizeclasses[N_POOLS] = { + 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, + 64, 72, 80, 88, 96, //#=18 + + 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, + + 288, 320, 352, 384, 416, 448, 480, 512, + + 640, 768, 896, 1024, + + 1536, 2048 }; + static int szclass(size_t sz) { #ifndef _P64 @@ -957,9 +1098,16 @@ static int szclass(size_t sz) return 41; } +static int allocdsz(size_t sz) +{ + if (sz > 2048) return sz; + return sizeclasses[szclass(sz)]; +} + #ifdef GC_INC int check_timeout = 0; -#define should_timeout() (check_timeout && scanned_bytes >= scanned_bytes_goal) +//#define should_timeout() (check_timeout && scanned_bytes >= scanned_bytes_goal) +#define should_timeout() 0 #else #define should_timeout() 0 #endif @@ -972,136 +1120,192 @@ static int lazy_freed_pages = 0; static int page_done = 0; static int obj_old = 0; static int obj_young = 0; +static gcval_t** sweep_page(pool_t* p, gcpage_t* pg, gcval_t **pfl,int,int); +static void _update_freelist(pool_t* p, gcval_t* next); +static void sweep_pool_region(region_t* heap, int sweep_mask) +{ + gcval_t **pfl[N_POOLS]; + for (int i = 0; i < N_POOLS; i++) { + _update_freelist(&norm_pools[i], NULL); + pfl[i] = &norm_pools[i].freelist; + } + for (int pg_i = 0; pg_i < REGION_PG_COUNT/32; pg_i++) { + uint32_t line = heap->freemap[pg_i]; + if (!!~line) { + for (int j = 1; j < 32; j++) { + if (!((line >> j) & 1)) { + gcpage_t *pg = 
GC_PAGE(heap->pages[pg_i*32 + j]);
+                    int p_n = pg->pool_n;
+                    pool_t *p = &norm_pools[p_n];
+                    int osize = pg->osize;
+                    pfl[p_n] = sweep_page(p, pg, pfl[p_n], sweep_mask, osize);
+                }
+            }
+        }
+    }
+    int i = 0;
+    for (pool_t* p = norm_pools; p < norm_pools + N_POOLS; p++) {
+        *pfl[i++] = NULL;
+        if (p->freelist) {
+            gcval_t *begin = p->freelist;
+            p->freelist = NULL;
+            _update_freelist(p, begin);
+        }
+    }
+}

-static void sweep_pool(pool_t *p, int sweep_mask)
+static gcval_t** sweep_page(pool_t* p, gcpage_t* pg, gcval_t **pfl, int sweep_mask, int osize)
 {
 #ifdef FREE_PAGES_EAGER
     int freedall;
 #else
     int empty;
 #endif
-    gcval_t **prev_pfl;
+    gcval_t **prev_pfl = pfl;
     gcval_t *v;
-    gcpage_t *pg = p->needsweep;
-    gcpage_t **ppg = &p->needsweep;
-    gcval_t **pfl = &p->freelist;
-    size_t osize = p->osize;
+    // gcpage_t **ppg = &p->needsweep;
     size_t old_nfree = 0, nfree = 0;
     int pg_freedall = 0, pg_total = 0;
     int pg_skpd = 0, pg_wont_skip = 0;
     int obj_per_page = GC_PAGE_SZ/osize;
     int whole_page = 0;
-    while (pg != NULL) {
-        // if ((pg->cookie & ~(uint32_t)7) != PAGE_COOKIE)
-        //     abort();
-        char *data = pg->data;
-        char *old = pg->old;
-        v = (gcval_t*)data;
-        char *lim = (char*)v + GC_PAGE_SZ - osize;
-        freedall = 1;
-        prev_pfl = pfl;
-        old_nfree += pg->nfree;
-
-        if (pg->gc_bits == GC_MARKED) {
-            // skip
-            if (sweep_mask == GC_MARKED_NOESC && pg->nmarked*10 >= obj_per_page*8) {
-                freedall = 0;
+    char *data = PAGE_DATA_PRE(pg);
+    char *ages = pg->age;
+    v = (gcval_t*)data;
+    char *lim = (char*)v + GC_PAGE_SZ - osize;
+    freedall = 1;
+    old_nfree += pg->nfree;
+    prev_pfl = pfl;
+    if (pg->gc_bits == GC_MARKED) {
+        // skip
+        if (sweep_mask == GC_MARKED_NOESC && (!pg->allocd/* || pg->nmarked >= (8*obj_per_page)/10)*/)) {
+            // pg->allocd = 0;
+            if (!pg->allocd) {
                 pg_skpd++;
+                freedall = 0;
+                if (pg->fl_begin_offset != (uint16_t)-1) {
+                    *pfl = (gcval_t*)PAGE_PFL_BEG(pg);
+                    pfl = prev_pfl = PAGE_PFL_END(pg);
+                }
                 goto free_page;
             }
         }
-        else if(pg->gc_bits == GC_CLEAN) {
-            // if (whole_page)
-            //     p->nfree += obj_per_page; // overestimation
-            // else
-            goto free_page;
+        pg->allocd = 0;
+    }
+    else if(pg->gc_bits == GC_CLEAN) {
+        // if (whole_page)
+        //     p->nfree += obj_per_page; // overestimation
+        // else
+        pg->allocd = 0;
+        goto free_page;
+    }
+    if (sweep_mask == GC_MARKED)
+        pg->nmarked = 0;
+    int pg_nfree = 0;
+    gcval_t **pfl_begin = NULL;
+    while ((char*)v <= lim) {
+        int obj_i = ((uintptr_t)v - (uintptr_t)data)/8;
+        // we can encounter a queued value at this point
+        // if a write barrier was moved back between two
+        // sweeping increments
+        int bits = gc_bits(v);
+        int sh = (obj_i % 4)*2;
+        int age = (ages[obj_i/4] >> sh) & 3;
+        if (!bits) {
+            *pfl = v;
+            pfl = &v->next;
+            pfl_begin = pfl_begin ?
pfl_begin : pfl; + pg_nfree++; + ages[obj_i/4] &= ~(3 << (obj_i % 4)*2); } - if (sweep_mask == GC_MARKED) - pg->nmarked = 0; - int pg_nfree = 0; - while ((char*)v <= lim) { - int obj_i = ((uintptr_t)v - (uintptr_t)data)/8; - // we can encouter a queued value at this point - // if a write barrier was moved back between two - // sweeping increments - if (!gc_marked(v) & (gc_bits(v) != GC_QUEUED)) { - *pfl = v; - pfl = &v->next; - pg_nfree++; - old[obj_i/8] &= ~(1 << (obj_i % 8)); - } - else { - if ((sweep_mask & gc_bits(v)) == sweep_mask) + else { + if (age >= PROMOTE_AGE) { + if (sweep_mask == GC_MARKED) gc_bits(v) = GC_CLEAN; - freedall = 0; - } - v = (gcval_t*)((char*)v + osize); + else if (bits == GC_MARKED_NOESC) + gc_bits(v) = GC_QUEUED; + } else if ((sweep_mask & bits) == sweep_mask) + gc_bits(v) = GC_CLEAN; + + // else { + inc_sat(age, PROMOTE_AGE); + ages[obj_i/4] &= ~(3 << sh); + ages[obj_i/4] |= age << sh; + // } + freedall = 0; } - pg->nfree = pg_nfree; - page_done++; - free_page: - // nfreed += this_page_nfree; - // pg->nfree = this_page_nfree; - if (sweep_mask == GC_MARKED) - pg->nmarked = 0; - pg_freedall += freedall; - - // lazy version: (empty) if the whole page was already unused, free it - // eager version: (freedall) free page as soon as possible - // the eager one uses less memory. - gcpage_t *nextpg; - pg_total++; - nextpg = pg->next; - if (freedall) { - if (0 && sweep_mask == GC_MARKED_NOESC) { - gcval_t *begin = reset_page(p, pg, NULL); - *prev_pfl = begin; - pfl = (gcval_t**)((char*)begin + (pg->nfree - 1)*osize); - ppg = &pg->next; - lazy_freed_pages++; - } - else { - pfl = prev_pfl; - *ppg = nextpg; - #ifdef MEMDEBUG - memset(pg, 0xbb, sizeof(gcpage_t)); - #endif - free_page(pg->data); - } - freed_pages++; - nfree += obj_per_page; + v = (gcval_t*)((char*)v + osize); + } + + pg->fl_begin_offset = pfl_begin ? (char*)pfl_begin - data : (uint16_t)-1; + pg->fl_end_offset = pfl_begin ? (char*)pfl - data : (uint16_t)-1; + + pg->nfree = pg_nfree; + page_done++; + free_page: + // nfreed += this_page_nfree; + // pg->nfree = this_page_nfree; + if (sweep_mask == GC_MARKED) + pg->nmarked = 0; + pg_freedall += freedall; + + // lazy version: (empty) if the whole page was already unused, free it + // eager version: (freedall) free page as soon as possible + // the eager one uses less memory. 
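    // Concretely (a reading of the code below, not a spec): after a quick
    // collection (prev_sweep_mask == GC_MARKED_NOESC) a fully-freed page is
    // recycled lazily -- reset_page() threads it straight back onto the
    // pool's freelist so the next allocation reuses it without touching the
    // OS -- but only while lazy_freed_pages stays under a budget derived
    // from default_collect_interval; past that, or after a full collection,
    // free_page() returns the memory to the region bitmap eagerly. A sketch
    // of the two outcomes for one empty page:
    //
    //     if (under_lazy_budget)             // lazy: page stays hot
    //         *prev_pfl = reset_page(p, pg, /* tail freelist */ NULL);
    //     else
    //         free_page(data);               // eager: lower footprint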
+ pg_total++; + if (freedall) { + if (prev_sweep_mask == GC_MARKED_NOESC && lazy_freed_pages <= default_collect_interval/4*4096) { + gcval_t *begin = reset_page(p, pg, 0x1234); + *prev_pfl = begin; + pfl = (gcval_t**)((char*)begin + ((int)pg->nfree - 1)*osize); + begin->next = (gcval_t*)0xdeadbeef; + // jl_printf(JL_STDOUT, "SZ: 0x%lx 0x%lx 0x%lx\n", begin, prev_pfl, ((intptr_t)pfl - (intptr_t)begin)); + // if (!isinfl(p->freelist, begin)) + // abort(); + // ppg = &pg->next; + lazy_freed_pages++; } else { - pg->gc_bits = GC_MARKED; - ppg = &pg->next; - pg->linear = 0; - nfree += pg->nfree; + pfl = prev_pfl; +#ifdef MEMDEBUG + memset(pg->data, 0xbb, GC_PAGE_SZ); +#endif + free_page(data); +#ifdef MEMDEBUG + memset(pg, 0xbb, sizeof(gcpage_t)); +#endif } - /* if (should_timeout() && nextpg) { + freed_pages++; + nfree += obj_per_page; + } + else { + pg->gc_bits = GC_MARKED; + // ppg = &pg->next; + pg->linear = 0; + nfree += pg->nfree; + } + /* if (should_timeout() && nextpg) { pg->next = NULL; pg = nextpg; break; }*/ - scanned_bytes += GC_PAGE_SZ; - pg = nextpg; - } + // scanned_bytes += GC_PAGE_SZ; + // pg = nextpg; //gcpage_t* pgs = p->pages; - *ppg = p->pages; - p->pages = p->needsweep; + // *ppg = p->pages; + /* p->pages = p->needsweep; if (pg == NULL) { p->needsweep = NULL; } else { p->needsweep = pg; - } + }*/ skipped_pages += pg_skpd; total_pages += pg_total; - *pfl = NULL; - if (p->freelist) { - p->fl_linear = GC_PAGE(p->freelist)->linear; - } + // *pfl = NULL; /* if (stats[0] + stats[1] + stats[2] + stats[2] > 0) jl_printf(JL_STDOUT, "Pool : %d %d %d %d\n", stats[0], stats[1], stats[2], stats[3]);*/ freed_bytes += (nfree - old_nfree)*osize; + return pfl; } // sweep phase @@ -1151,13 +1355,17 @@ static int gc_sweep_inc(int sweep_mask) #ifdef GC_INC int ct = check_timeout; if (sweep_mask == GC_MARKED_NOESC || gc_steps == 1) check_timeout = 0; + check_timeout = 0; #endif - for(i=0; i < N_POOLS; i++) { + for (int i = 0; i < HEAP_COUNT; i++) { + if (heaps[i]) + sweep_pool_region(heaps[i], sweep_mask); + } + /* for(i=0; i < N_POOLS; i++) { sweep_pool(&norm_pools[i], sweep_mask); finished &= !norm_pools[i].needsweep; - /* sweep_pool(&ephe_pools[i], sweep_mask); - finished &= !ephe_pools[i].needsweep;*/ - } + }*/ + finished = 1; #ifdef GC_INC check_timeout = ct; #endif @@ -1198,29 +1406,50 @@ void grow_mark_stack(void) } int max_msp = 0; +#ifdef GC_INC +static arraylist_t tasks; +static arraylist_t _remset[2]; +static arraylist_t *remset = &_remset[0]; +static arraylist_t *last_remset = &_remset[1]; +void reset_remset(void) +{ + arraylist_t *tmp = remset; + remset = last_remset; + last_remset = tmp; + remset->len = 0; +} +#endif DLLEXPORT void gc_queue_root(void *p) { - void* p2 = (uintptr_t)p & ~(uintptr_t)3; - if (gc_bits(p2) == GC_QUEUED) return; - if(mark_sp + perm_marked >= mark_stack_size) grow_mark_stack(); - gc_bits(p2) = GC_QUEUED; - mark_stack[mark_sp++] = (jl_value_t*)p; - max_msp = max_msp > mark_sp ? 
max_msp : mark_sp; + void *ptr = (void*)((uintptr_t)p & ~(uintptr_t)1); + if (gc_bits(ptr) == GC_QUEUED) return; + gc_bits(ptr) = GC_QUEUED; + arraylist_push(remset, p); } -#ifdef GC_INC -static arraylist_t tasks; -static arraylist_t remset; -#endif -static void push_root(jl_value_t *v, int mark_mode, int d); -#define gc_push_root(v,mark_mode,d) do { assert((v) != NULL); verify_val(v); if (!gc_bits(v)) { push_root((jl_value_t*)(v),mark_mode,d); } } while(0) +static int push_root(jl_value_t *v, int d, int); +static inline int gc_push_root(void *v, int d) +{ + assert((v) != NULL); + verify_val(v); + int bits = gc_bits(v); + if (!gc_marked(v)) { + return push_root((jl_value_t*)(v),d, bits); + } + return bits; +} void jl_gc_setmark(jl_value_t *v) // TODO rename this as it is misleading now { - gc_setmark_pool(v, GC_MARKED); + // int64_t s = perm_scanned_bytes; + if (!gc_marked(v)) { + objprofile_count(jl_typeof(v), 1, 16); + gc_setmark_pool(v, GC_MARKED_NOESC); + } + // perm_scanned_bytes = s; } -static void gc_mark_stack(jl_value_t* ta, jl_gcframe_t *s, ptrint_t offset, int mark_mode, int d) +static void gc_mark_stack(jl_value_t* ta, jl_gcframe_t *s, ptrint_t offset, int d) { while (s != NULL) { s = (jl_gcframe_t*)((char*)s + offset); @@ -1229,17 +1458,17 @@ static void gc_mark_stack(jl_value_t* ta, jl_gcframe_t *s, ptrint_t offset, int if (s->nroots & 1) { for(size_t i=0; i < nr; i++) { jl_value_t **ptr = (jl_value_t**)((char*)rts[i] + offset); - scanned_bytes += sizeof(void*); + // scanned_bytes += sizeof(void*); if (*ptr != NULL) - gc_push_root(*ptr, mark_mode, d); + gc_push_root(*ptr, d); } } else { for(size_t i=0; i < nr; i++) { - scanned_bytes += sizeof(void*); + // scanned_bytes += sizeof(void*); if (rts[i] != NULL) { verify_parent("task", ta, &rts[i], "stack(%d)", i); - gc_push_root(rts[i], mark_mode, d); + gc_push_root(rts[i], d); } } } @@ -1247,7 +1476,7 @@ static void gc_mark_stack(jl_value_t* ta, jl_gcframe_t *s, ptrint_t offset, int } } -static int gc_mark_module(jl_module_t *m, int mark_mode, int d) +__attribute__((noinline)) static int gc_mark_module(jl_module_t *m, int d) { size_t i; int refyoung = 0; @@ -1258,15 +1487,13 @@ static int gc_mark_module(jl_module_t *m, int mark_mode, int d) gc_setmark_buf(b, gc_bits(m)); void* vb = gc_val_buf(b); verify_parent("module", m, &vb, "binding_buff"); - scanned_bytes += sizeof(jl_binding_t); + // scanned_bytes += allocdsz(sizeof(jl_binding_t) + sizeof(void*)); if (b->value != NULL) { verify_parent("module", m, &b->value, "binding(%s)", b->name->name); - gc_push_root(b->value, mark_mode, d); - refyoung |= gc_bits(b->value) == GC_MARKED_NOESC; + refyoung |= gc_push_root(b->value, d); } if (b->type != (jl_value_t*)jl_any_type) { - gc_push_root(b->type, mark_mode, d); - refyoung |= gc_bits(b->type) == GC_MARKED_NOESC; + refyoung |= gc_push_root(b->type, d); } } } @@ -1275,62 +1502,62 @@ static int gc_mark_module(jl_module_t *m, int mark_mode, int d) // after "using" it but before accessing it, this array might // contain the only reference. 
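    // The loop below, like the rest of this function, folds each child's
    // resulting mark bits into refyoung: gc_push_root() returns the child's
    // gc_bits, and push_root() ends by putting any old container that was
    // found pointing at a young object onto the remembered set. A minimal
    // sketch of the pattern, with `obj` and `field` as placeholder names:
    //
    //     int refyoung = 0;
    //     if (obj->field)
    //         refyoung |= gc_push_root(obj->field, d);
    //     if (gc_bits(obj) == GC_MARKED && refyoung == GC_MARKED_NOESC)
    //         arraylist_push(remset, obj);   // old object, young child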
for(i=0; i < m->usings.len; i++) { - gc_push_root(m->usings.items[i], mark_mode, d); - refyoung |= gc_bits(m->usings.items[i]) == GC_MARKED_NOESC; + refyoung |= gc_push_root(m->usings.items[i], d); } if (m->constant_table) { verify_parent("module", m, &m->constant_table, "constant_table"); - gc_push_root(m->constant_table, mark_mode, d); - refyoung |= gc_bits(m->constant_table) == GC_MARKED_NOESC; + refyoung |= gc_push_root(m->constant_table, d); } return refyoung; } -static void gc_mark_task_stack(jl_task_t *ta, int mark_mode, int d) +static void gc_mark_task_stack(jl_task_t *ta, int d) { if (ta->stkbuf != NULL || ta == jl_current_task) { - if (ta->stkbuf != NULL) + if (ta->stkbuf != NULL) { gc_setmark_buf(ta->stkbuf, gc_bits(ta)); + // scanned_bytes += ta->ssize + 2*4096 - 1; + } #ifdef COPY_STACKS ptrint_t offset; if (ta == jl_current_task) { offset = 0; - gc_mark_stack((jl_value_t*)ta, jl_pgcstack, offset, mark_mode, d); + gc_mark_stack((jl_value_t*)ta, jl_pgcstack, offset, d); } else { offset = (char *)ta->stkbuf - ((char *)ta->stackbase - ta->ssize); - gc_mark_stack((jl_value_t*)ta, ta->gcstack, offset, mark_mode, d); + gc_mark_stack((jl_value_t*)ta, ta->gcstack, offset, d); } #else - gc_mark_stack((jl_value_t*)ta, ta->gcstack, 0, mark_mode, d); + gc_mark_stack((jl_value_t*)ta, ta->gcstack, 0, d); #endif } } -static void mark_task_stacks(int mark_mode) { +static void mark_task_stacks(void) { for (int i = 0; i < tasks.len; i++) { - gc_mark_task_stack(tasks.items[i], mark_mode, 0); + gc_mark_task_stack(tasks.items[i], 0); } } -static void gc_mark_task(jl_task_t *ta, int mark_mode, int d) +__attribute__((noinline)) static void gc_mark_task(jl_task_t *ta, int d) { - if (ta->parent) gc_push_root(ta->parent, mark_mode, d); - if (ta->last) gc_push_root(ta->last, mark_mode, d); - gc_push_root(ta->tls, mark_mode, d); - gc_push_root(ta->consumers, mark_mode, d); - gc_push_root(ta->donenotify, mark_mode, d); - gc_push_root(ta->exception, mark_mode, d); - if (ta->start) gc_push_root(ta->start, mark_mode, d); - if (ta->result) gc_push_root(ta->result, mark_mode, d); + if (ta->parent) gc_push_root(ta->parent, d); + if (ta->last) gc_push_root(ta->last, d); + gc_push_root(ta->tls, d); + gc_push_root(ta->consumers, d); + gc_push_root(ta->donenotify, d); + gc_push_root(ta->exception, d); + if (ta->start) gc_push_root(ta->start, d); + if (ta->result) gc_push_root(ta->result, d); #ifdef GC_INC - if (0 && mark_mode == GC_MARKED_NOESC) { - gc_mark_task_stack(ta, mark_mode, d); - } else { + // if (1 || mark_mode == GC_MARKED_NOESC) { + gc_mark_task_stack(ta, d); + /* } else { arraylist_push(&tasks, (void*)ta); - } + }*/ #else - gc_mark_task_stack(ta, mark_mode, d); + gc_mark_task_stack(ta, d); #endif } @@ -1342,66 +1569,69 @@ DLLEXPORT void jl_gc_lookfor(jl_value_t *v) { lookforme = v; } */ #define MAX_MARK_DEPTH 400 - -static void push_root(jl_value_t *v, int mark_mode, int d) +// returns 1 if v is young after this marking +static int push_root(jl_value_t *v, int d, int bits) { assert(v != NULL); jl_value_t *vt = (jl_value_t*)gc_typeof(v); // gc_setmark(v); - int remember = 0; int refyoung = 0; - if (mark_mode == GC_MARKED) { - // abort(); - mark_mode = GC_MARKED_NOESC; - // remember = 1; - // if (gc_marked(v)) goto ret; - } - if (vt == (jl_value_t*)jl_weakref_type) { - gc_setmark(v, jl_datatype_size(jl_weakref_type), mark_mode); + bits = gc_setmark(v, jl_datatype_size(jl_weakref_type), GC_MARKED_NOESC); goto ret; } if ((jl_is_datatype(vt) && ((jl_datatype_t*)vt)->pointerfree)) { int sz = 
jl_datatype_size(vt); - gc_setmark(v, sz, mark_mode); - scanned_bytes += sz; + bits = gc_setmark(v, sz, GC_MARKED_NOESC); + // scanned_bytes += allocdsz(sz); goto ret; } - - if (d >= MAX_MARK_DEPTH) - goto queue_the_root; - - if (should_timeout()) - goto queue_the_root; + int marked = 0; +#define MARK(v, s) do { \ + s; \ + if (d >= MAX_MARK_DEPTH) \ + goto queue_the_root; \ + if (should_timeout()) \ + goto queue_the_root; \ + } while (0) d++; - + // __builtin_prefetch(&(GC_PAGE(v)->age[v - PAGE_DATA); // some values have special representations if (vt == (jl_value_t*)jl_tuple_type) { size_t l = jl_tuple_len(v); - gc_setmark(v, l*sizeof(void*) + sizeof(jl_tuple_t), mark_mode); + MARK(v, bits = gc_setmark(v, l*sizeof(void*) + sizeof(jl_tuple_t), GC_MARKED_NOESC)); jl_value_t **data = ((jl_tuple_t*)v)->data; for(size_t i=0; i < l; i++) { jl_value_t *elt = data[i]; - scanned_bytes += sizeof(void*); if (elt != NULL) { verify_parent("tuple", v, &data[i], "elem(%d)", i); - gc_push_root(elt, mark_mode, d); - refyoung |= gc_bits(elt) == GC_MARKED_NOESC; + refyoung |= gc_push_root(elt, d); } } } else if (((jl_datatype_t*)(vt))->name == jl_array_typename) { jl_array_t *a = (jl_array_t*)v; if (a->pooled) - gc_setmark_pool(a, mark_mode); + MARK(a, bits = gc_setmark_pool(a, GC_MARKED_NOESC); if (a->how == 2) { + objprofile_count(MATY, gc_bits(a) == GC_MARKED, array_nbytes(a)); + if (gc_bits(a) == GC_MARKED) + perm_scanned_bytes += array_nbytes(a); + else + scanned_bytes += array_nbytes(a); + }); else - gc_setmark_big(a, mark_mode); + MARK(a, bits = gc_setmark_big(a, GC_MARKED_NOESC); if (a->how == 2) { + objprofile_count(MATY, gc_bits(a) == GC_MARKED, array_nbytes(a)); + if (gc_bits(a) == GC_MARKED) + perm_scanned_bytes += array_nbytes(a); + else + scanned_bytes += array_nbytes(a); + }); if (a->how == 3) { jl_value_t *owner = jl_array_data_owner(a); - gc_push_root(owner, mark_mode, d); - refyoung |= gc_bits(owner) == GC_MARKED_NOESC; + refyoung |= gc_push_root(owner, d); goto ret; } else if (a->how == 1) { @@ -1411,7 +1641,7 @@ static void push_root(jl_value_t *v, int mark_mode, int d) } if (a->ptrarray && a->data!=NULL) { size_t l = jl_array_len(a); - if (l > 100000 && d > MAX_MARK_DEPTH-10) { + if (0 && l > 100000 && d > MAX_MARK_DEPTH-10) { // don't mark long arrays at high depth, to try to avoid // copying the whole array into the mark queue goto queue_the_root; @@ -1421,11 +1651,10 @@ static void push_root(jl_value_t *v, int mark_mode, int d) int has_young_elt = 0; for(size_t i=0; i < l; i++) { jl_value_t *elt = ((jl_value_t**)data)[i]; - scanned_bytes += sizeof(void*); + // scanned_bytes += sizeof(void*); if (elt != NULL) { verify_parent("array", v, &((jl_value_t**)data)[i], "elem(%d)", i); - gc_push_root(elt, GC_MARKED_NOESC, d); - refyoung |= gc_bits(elt) == GC_MARKED_NOESC; + refyoung |= gc_push_root(elt, d); } // try to split large array marking // if (should_timeout() && l > 1000) goto queue_the_root; @@ -1433,19 +1662,18 @@ static void push_root(jl_value_t *v, int mark_mode, int d) } } else { - scanned_bytes += array_nbytes(a); } } else if (vt == (jl_value_t*)jl_module_type) { - gc_setmark(v, sizeof(jl_module_t), mark_mode); - refyoung |= gc_mark_module((jl_module_t*)v, mark_mode, d); - scanned_bytes += sizeof(jl_module_t); + MARK(v, bits = gc_setmark(v, sizeof(jl_module_t), GC_MARKED_NOESC)); + refyoung |= gc_mark_module((jl_module_t*)v, d); + // scanned_bytes += allocdsz(sizeof(jl_module_t)); } else if (vt == (jl_value_t*)jl_task_type) { - gc_setmark(v, sizeof(jl_task_t), mark_mode); - 
gc_mark_task((jl_task_t*)v, mark_mode, d); - refyoung = 1; - scanned_bytes += sizeof(jl_task_t); + MARK(v, bits = gc_setmark(v, sizeof(jl_task_t), GC_MARKED_NOESC)); + gc_mark_task((jl_task_t*)v, d); + refyoung = GC_MARKED_NOESC; + // scanned_bytes += allocdsz(sizeof(jl_task_t)); } else if(vt == (jl_value_t*)jl_symbol_type) { gc_setmark_other(v, GC_MARKED); // symbols are not pooled @@ -1459,23 +1687,29 @@ static void push_root(jl_value_t *v, int mark_mode, int d) #endif ) { jl_datatype_t *dt = (jl_datatype_t*)vt; - gc_setmark(v, jl_datatype_size(dt), mark_mode); + MARK(v, bits = gc_setmark(v, jl_datatype_size(dt), GC_MARKED_NOESC)); int nf = (int)jl_tuple_len(dt->names); + int fdsz = sizeof(void*)*nf; + // void** children = alloca(fdsz); + jl_fielddesc_t* fields = dt->fields; + int ci = 0; for(int i=0; i < nf; i++) { - if (dt->fields[i].isptr) { - scanned_bytes += sizeof(void*); - jl_value_t **slot = (jl_value_t**)((char*)v + dt->fields[i].offset + sizeof(void*)); + if (fields[i].isptr) { + // scanned_bytes += sizeof(void*); + jl_value_t **slot = (jl_value_t**)((char*)v + fields[i].offset + sizeof(void*)); jl_value_t *fld = *slot; if (fld) { verify_parent("object", v, slot, "field(%d)", i); - gc_push_root(fld, mark_mode, d); - refyoung |= gc_bits(fld) == GC_MARKED_NOESC; + // children[ci++] = fld; + refyoung |= gc_push_root(fld, d); } } else { - scanned_bytes += jl_field_size(dt, i); + // scanned_bytes += jl_field_size(dt, i); } } + // while(ci) + // refyoung |= gc_push_root(children[--ci], d); } #ifdef GC_VERIFY else { @@ -1489,20 +1723,22 @@ static void push_root(jl_value_t *v, int mark_mode, int d) #ifdef GC_VERIFY if (verifying) return; #endif - if (refyoung && gc_bits(v) == GC_MARKED) { + // objprofile_count(jl_typeof(v), gc_bits(v) == GC_MARKED ? 1 : 0, ); + if ((bits == GC_MARKED) && (refyoung == GC_MARKED_NOESC)) { /*for (int i = 0; i < remset.len; i++) { if (remset.items[i] == v) abort(); }*/ - arraylist_push(&remset, (void*)v); + arraylist_push(remset, v); } - objprofile_count(v, gc_bits(v) == GC_MARKED ? 1 : 0); - return; + return bits; queue_the_root: - scanned_bytes += sizeof(void*); - // save the mark mode in the lower bits of the pointer - gc_queue_root((void*)((uintptr_t)v | gc_bits(v))); + scanned_bytes += 0;//sizeof(void*); + if(mark_sp >= mark_stack_size) grow_mark_stack(); + mark_stack[mark_sp++] = (jl_value_t*)v; + max_msp = max_msp > mark_sp ? max_msp : mark_sp; + return bits; } static void visit_mark_stack_inc(int mark_mode) @@ -1510,8 +1746,7 @@ static void visit_mark_stack_inc(int mark_mode) while(mark_sp > 0 && !should_timeout()) { gcval_t* v = (gcval_t*)mark_stack[--mark_sp]; // assert(gc_bits(v) == GC_QUEUED || gc_bits(v) == GC_MARKED || gc_bits(v) == GC_MARKED_NOESC); - int mode = ((uintptr_t)v & 3) ? 
((uintptr_t)v & 3) : mark_mode; - push_root((jl_value_t*)((uintptr_t)v & ~(uintptr_t)3), mode, 0); + push_root(v, 0, gc_bits(v)); } } @@ -1541,51 +1776,51 @@ extern jl_array_t *jl_module_init_order; static int inc_count = 0; static int quick_count = 0; -static void pre_mark(int mark_mode) +static void pre_mark(void) { // modules - gc_push_root(jl_main_module, mark_mode, 0); - gc_push_root(jl_current_module, mark_mode, 0); - if (jl_old_base_module) gc_push_root(jl_old_base_module, mark_mode, 0); - gc_push_root(jl_internal_main_module, mark_mode, 0); - gc_push_root(jl_root_task, mark_mode, 0); - gc_push_root(jl_current_task, mark_mode, 0); + gc_push_root(jl_main_module, 0); + gc_push_root(jl_current_module, 0); + if (jl_old_base_module) gc_push_root(jl_old_base_module, 0); + gc_push_root(jl_internal_main_module, 0); + gc_push_root(jl_root_task, 0); + gc_push_root(jl_current_task, 0); // invisible builtin values - if (jl_an_empty_cell) gc_push_root(jl_an_empty_cell, mark_mode, 0); - gc_push_root(jl_exception_in_transit, mark_mode, 0); - gc_push_root(jl_task_arg_in_transit, mark_mode, 0); - gc_push_root(typeToTypeId, mark_mode, 0); + if (jl_an_empty_cell) gc_push_root(jl_an_empty_cell, 0); + gc_push_root(jl_exception_in_transit, 0); + gc_push_root(jl_task_arg_in_transit, 0); + gc_push_root(typeToTypeId, 0); if (jl_module_init_order != NULL) - gc_push_root(jl_module_init_order, mark_mode, 0); + gc_push_root(jl_module_init_order, 0); size_t i; // stuff randomly preserved for(i=0; i < preserved_values.len; i++) { - gc_push_root((jl_value_t*)preserved_values.items[i], mark_mode, 0); + gc_push_root((jl_value_t*)preserved_values.items[i], 0); } // objects currently being finalized for(i=0; i < to_finalize.len; i++) { - gc_push_root(to_finalize.items[i], mark_mode, 0); + gc_push_root(to_finalize.items[i], 0); } //if (inc_count > 1 || quick_count > 1) return; // the following roots are constant and will stay marked in between increments - if (prev_sweep_mask == GC_MARKED) - jl_mark_box_caches(); - gc_push_root(jl_unprotect_stack_func, mark_mode, 0); - gc_push_root(jl_bottom_func, mark_mode, 0); - gc_push_root(jl_typetype_type, mark_mode, 0); - gc_push_root(jl_tupletype_type, mark_mode, 0); + // if (prev_sweep_mask == GC_MARKED) + jl_mark_box_caches(); + gc_push_root(jl_unprotect_stack_func, 0); + gc_push_root(jl_bottom_func, 0); + gc_push_root(jl_typetype_type, 0); + gc_push_root(jl_tupletype_type, 0); // constants - gc_push_root(jl_null, mark_mode, 0); - gc_push_root(jl_true, mark_mode, 0); - gc_push_root(jl_false, mark_mode, 0); + gc_push_root(jl_null, 0); + gc_push_root(jl_true, 0); + gc_push_root(jl_false, 0); } -#ifdef GC_VERIFY + static arraylist_t bits_save[4]; // set all mark bits to bits @@ -1611,23 +1846,30 @@ static void clear_mark(int bits) v = v->next; } } - - for(i = 0; i < 2*N_POOLS; i++) { - pool = i < N_POOLS ? 
&norm_pools[i] : &ephe_pools[i - N_POOLS]; - pg = pool->pages; - while (pg != NULL) { - pv = (gcval_t*)pg->data; - char *lim = (char*)pv + GC_PAGE_SZ - pool->osize; - while ((char*)pv <= lim) { - arraylist_push(&bits_save[gc_bits(pv)], pv); - gc_bits(pv) = bits; - pv = (gcval_t*)((char*)pv + pool->osize); + for (int h = 0; h < HEAP_COUNT; h++) { + region_t* heap = heaps[h]; + if (!heap) break; + for (int pg_i = 0; pg_i < REGION_PG_COUNT/32; pg_i++) { + uint32_t line = heap->freemap[pg_i]; + if (!!~line) { + for (int j = 1; j < 32; j++) { + if (!((line >> j) & 1)) { + gcpage_t *pg = GC_PAGE(heap->pages[pg_i*32 + j]); + pool_t *pool = &norm_pools[pg->pool_n]; + pv = (gcval_t*)PAGE_DATA(pg); + char *lim = (char*)pv + GC_PAGE_SZ - pool->osize; + while ((char*)pv <= lim) { + arraylist_push(&bits_save[gc_bits(pv)], pv); + gc_bits(pv) = bits; + pv = (gcval_t*)((char*)pv + pool->osize); + } + } + } } - pg = pg->next; } } } - +#ifdef GC_VERIFY static void restore(void) { for(int b = 0; b < 4; b++) { @@ -1638,8 +1880,11 @@ static void restore(void) } #endif -static void post_mark(int mark_mode) +static int n_finalized; + +static void post_mark(void) { + n_finalized = 0; // find unmarked objects that need to be finalized. // this must happen last. for(size_t i=0; i < finalizer_table.size; i+=2) { @@ -1648,19 +1893,26 @@ static void post_mark(int mark_mode) if (!gc_marked(v)) { jl_value_t *fin = finalizer_table.table[i+1]; if (gc_typeof(fin) == (jl_value_t*)jl_voidpointer_type) { + /* jl_printf(JL_STDOUT, "CFINA: "); + jl_static_show(JL_STDOUT, v); + jl_printf(JL_STDOUT, "\n");*/ void *p = jl_unbox_voidpointer(fin); if (p) ((void (*)(void*))p)(jl_data_ptr(v)); finalizer_table.table[i+1] = HT_NOTFOUND; continue; } - gc_push_root(v, mark_mode, 0); + gc_push_root(v, 0); schedule_finalization(v); + //jl_printf(JL_STDOUT, "FINA: "); + //jl_static_show(JL_STDOUT, v); + //jl_printf(JL_STDOUT, "\n"); + n_finalized++; } - gc_push_root(finalizer_table.table[i+1], mark_mode, 0); + gc_push_root(finalizer_table.table[i+1], 0); } } - visit_mark_stack(GC_MARKED); + visit_mark_stack(GC_MARKED_NOESC); } static void gc_mark(int finalize) @@ -1668,31 +1920,31 @@ static void gc_mark(int finalize) // mark all roots // active tasks - gc_push_root(jl_root_task, GC_MARKED_NOESC, 0); - gc_push_root(jl_current_task, GC_MARKED_NOESC, 0); + gc_push_root(jl_root_task, 0); + gc_push_root(jl_current_task, 0); // modules - gc_push_root(jl_main_module, GC_MARKED_NOESC, 0); - gc_push_root(jl_internal_main_module, GC_MARKED_NOESC, 0); - gc_push_root(jl_current_module, GC_MARKED_NOESC, 0); - if (jl_old_base_module) gc_push_root(jl_old_base_module, GC_MARKED_NOESC, 0); + gc_push_root(jl_main_module, 0); + gc_push_root(jl_internal_main_module, 0); + gc_push_root(jl_current_module, 0); + if (jl_old_base_module) gc_push_root(jl_old_base_module, 0); // invisible builtin values - if (jl_an_empty_cell) gc_push_root(jl_an_empty_cell, GC_MARKED_NOESC, 0); - gc_push_root(jl_exception_in_transit, GC_MARKED_NOESC, 0); - gc_push_root(jl_task_arg_in_transit, GC_MARKED_NOESC, 0); - gc_push_root(jl_unprotect_stack_func, GC_MARKED_NOESC, 0); - gc_push_root(jl_bottom_func, GC_MARKED_NOESC, 0); - gc_push_root(jl_typetype_type, GC_MARKED_NOESC, 0); - gc_push_root(jl_tupletype_type, GC_MARKED_NOESC, 0); - gc_push_root(typeToTypeId, GC_MARKED_NOESC, 0); + if (jl_an_empty_cell) gc_push_root(jl_an_empty_cell, 0); + gc_push_root(jl_exception_in_transit, 0); + gc_push_root(jl_task_arg_in_transit, 0); + gc_push_root(jl_unprotect_stack_func, 0); + 
gc_push_root(jl_bottom_func, 0); + gc_push_root(jl_typetype_type, 0); + gc_push_root(jl_tupletype_type, 0); + gc_push_root(typeToTypeId, 0); if (jl_module_init_order != NULL) - gc_push_root(jl_module_init_order, GC_MARKED_NOESC, 0); + gc_push_root(jl_module_init_order, 0); // constants - gc_push_root(jl_null, GC_MARKED_NOESC, 0); - gc_push_root(jl_true, GC_MARKED_NOESC, 0); - gc_push_root(jl_false, GC_MARKED_NOESC, 0); + gc_push_root(jl_null, 0); + gc_push_root(jl_true, 0); + gc_push_root(jl_false, 0); jl_mark_box_caches(); @@ -1700,12 +1952,12 @@ static void gc_mark(int finalize) // stuff randomly preserved for(i=0; i < preserved_values.len; i++) { - gc_push_root((jl_value_t*)preserved_values.items[i], GC_MARKED_NOESC, 0); + gc_push_root((jl_value_t*)preserved_values.items[i], 0); } // objects currently being finalized for(i=0; i < to_finalize.len; i++) { - gc_push_root(to_finalize.items[i], GC_MARKED_NOESC, 0); + gc_push_root(to_finalize.items[i], 0); } visit_mark_stack(GC_MARKED_NOESC); @@ -1724,14 +1976,14 @@ static void gc_mark(int finalize) finalizer_table.table[i+1] = HT_NOTFOUND; continue; } - gc_push_root(v, GC_MARKED_NOESC, 0); + gc_push_root(v, 0); if (finalize) schedule_finalization(v); } - gc_push_root(finalizer_table.table[i+1], GC_MARKED_NOESC, 0); + gc_push_root(finalizer_table.table[i+1], 0); } } visit_mark_stack(GC_MARKED_NOESC); - mark_task_stacks(GC_MARKED_NOESC); + mark_task_stacks(); visit_mark_stack(GC_MARKED_NOESC); } @@ -1853,22 +2105,41 @@ static void big_obj_stats(void); #endif #ifdef OBJPROFILE -static void print_obj_profile(htable_t obj_counts) +static void reset_obj_profile() { - for(int i=0; i < obj_counts.size; i+=2) { - if (obj_counts.table[i+1] != HT_NOTFOUND) { - jl_printf(JL_STDERR, " %d ", obj_counts.table[i+1]-1); - jl_static_show(JL_STDERR, (jl_value_t*)obj_counts.table[i]); + for(int g=0; g < 3; g++) { + htable_reset(&obj_counts[g], 0); + htable_reset(&obj_sizes[g], 0); + } +} + +static void print_obj_profile(htable_t nums, htable_t sizes) +{ + for(int i=0; i < nums.size; i+=2) { + if (nums.table[i+1] != HT_NOTFOUND) { + void* ty = nums.table[i]; + int num = (int)nums.table[i+1] - 1; + size_t sz = (int)ptrhash_get(&sizes, ty) - 1; + jl_printf(JL_STDERR, " %6d : %4d kB of ", num, sz/1024); + if (ty == BUFFTY) + jl_printf(JL_STDERR, "buffer"); + else if (ty == MATY) + jl_printf(JL_STDERR, "malloc"); + else + jl_static_show(JL_STDERR, (jl_value_t*)ty); jl_printf(JL_STDERR, "\n"); } } } -static void print_obj_profiles(void) + +void print_obj_profiles(void) { jl_printf(JL_STDERR, "Transient mark :\n"); - print_obj_profile(obj_counts[0]); + print_obj_profile(obj_counts[0], obj_sizes[0]); jl_printf(JL_STDERR, "Perm mark :\n"); - print_obj_profile(obj_counts[1]); + print_obj_profile(obj_counts[1], obj_sizes[1]); + jl_printf(JL_STDERR, "Remset :\n"); + print_obj_profile(obj_counts[2], obj_sizes[2]); } #endif @@ -1876,24 +2147,25 @@ int saved_mark_sp = 0; int sweep_mask = GC_MARKED; #define MIN_SCAN_BYTES 1024*1024 -static void mark_task_stacks(int); -static void gc_mark_task_stack(jl_task_t*,int,int); +static void mark_task_stacks(); +static void gc_mark_task_stack(jl_task_t*,int); void prepare_sweep(void) { for(int i = 0; i < 2*N_POOLS; i++) { pool_t *p = i < N_POOLS ? 
&norm_pools[i] : &ephe_pools[i - N_POOLS]; - if (p->pages) { + /* if (p->pages) { p->needsweep = p->pages; p->pages = NULL; - p->freelist = NULL; - } + p->freelist = NULL; + }*/ } } #ifdef GC_INC int64_t residual = 0; - +static int lr = 0; +static void clear_mark(int); void jl_gc_collect(void) { if (!is_gc_enabled) return; @@ -1904,31 +2176,38 @@ void jl_gc_collect(void) #if defined(GC_TIME) || defined(GC_FINAL_STATS) int wb_activations = mark_sp - saved_mark_sp; #endif + int64_t last_perm_scanned = perm_scanned_bytes; if (!sweeping) { inc_count++; quick_count++; - - scanned_bytes = 0; + scanned_bytes_goal = inc_count*(live_bytes/gc_inc_steps + mark_sp*sizeof(void*)); scanned_bytes_goal = scanned_bytes_goal < MIN_SCAN_BYTES ? MIN_SCAN_BYTES : scanned_bytes_goal; if (gc_inc_steps > 1) check_timeout = 1; double t = clock_now(); - - mark_stack -= perm_marked; - - mark_sp = perm_marked = perm_marked + mark_sp; - void** scratch = 0; - if (sweep_mask != GC_MARKED) { - scratch = malloc(sizeof(void*)*perm_marked); - memcpy(scratch, mark_stack, perm_marked*sizeof(void*)); + assert(mark_sp == 0); + /*if (live_bytes && gc_inc_steps > 1) visit_mark_stack_inc(GC_MARKED_NOESC); + else visit_mark_stack(GC_MARKED_NOESC);*/ + reset_remset(); + // jl_printf(JL_STDOUT, "remset : %d %d\n", last_remset->len, sweep_mask); + int SA = perm_scanned_bytes; + for(int i = 0; i < last_remset->len; i++) { + uintptr_t item = (uintptr_t)last_remset->items[i]; + void* ptr = (void*)(item & ~(uintptr_t)1); + objprofile_count(jl_typeof(ptr), 2, 0); + /* jl_printf(JL_STDOUT, "rem : "); + jl_(ptr); + jl_printf(JL_STDOUT, "\n");*/ + if (item & 1) { + arraylist_push(remset, item); + } + gc_bits(ptr) = GC_MARKED; + push_root(ptr, 0, gc_bits(ptr)); } - - if (live_bytes && gc_inc_steps > 1) visit_mark_stack_inc(GC_MARKED_NOESC); - else visit_mark_stack(GC_MARKED_NOESC); - - if (sweep_mask == GC_MARKED) + perm_scanned_bytes = SA; + /* if (sweep_mask == GC_MARKED) perm_marked = 0; else { for (int i = 0; i < perm_marked; i++) { @@ -1937,49 +2216,48 @@ void jl_gc_collect(void) memcpy(mark_stack, scratch, perm_marked*sizeof(void*)); free(scratch); mark_stack += perm_marked; - } + }*/ - pre_mark(GC_MARKED_NOESC); + pre_mark(); visit_mark_stack(GC_MARKED_NOESC); if (mark_sp == 0 || inc_count > gc_inc_steps) { // mark current stack last to avoid temporaries visit_mark_stack(GC_MARKED_NOESC); // in case inc_count > inc_steps, we finish the marking in one go - mark_task_stacks(GC_MARKED_NOESC); - visit_mark_stack(GC_MARKED_NOESC); + /* mark_task_stacks(GC_MARKED_NOESC); + visit_mark_stack(GC_MARKED_NOESC);*/ } - allocd_bytes_since_sweep += allocd_bytes + (int)collect_interval/gc_steps; - allocd_bytes = -(int)collect_interval/gc_steps; -#ifdef OBJPROFILE - print_obj_profiles(); - htable_reset(&obj_counts[0], 0); - htable_reset(&obj_counts[1], 0); -#endif + allocd_bytes_since_sweep += allocd_bytes + (int64_t)collect_interval/gc_steps; + // allocd_bytes = -(int64_t)collect_interval/gc_steps; double mark_pause = (clock_now() - t0); #ifdef GC_FINAL_STATS total_mark_time += mark_pause; #endif #ifdef GC_TIME - JL_PRINTF(JL_STDOUT, "GC mark pause %.2f ms | scanned %d kB | stack %d -> %d (wb %d) | remset %d %d %d\n", mark_pause*1000, scanned_bytes/1024, saved_mark_sp, mark_sp, wb_activations, remset.len, max_msp, allocd_bytes/1024); + JL_PRINTF(JL_STDOUT, "GC mark pause %.2f ms | scanned %ld kB = %ld + %ld | stack %d -> %d (wb %d) | remset %d %d %d\n", mark_pause*1000, (scanned_bytes + perm_scanned_bytes)/1024, scanned_bytes/1024, 
perm_scanned_bytes/1024, saved_mark_sp, mark_sp, wb_activations, last_remset->len, perm_marked, allocd_bytes/1024); saved_mark_sp = mark_sp; #endif } - int64_t pct = -1, bonus = -1, SAVE = -1, SAVE2 = -1; + int64_t pct = -1, bonus = -1, SAVE = -1, SAVE2 = -1, est_fb = 0, SAVE3 = -1; double post_time = 0.0, finalize_time = 0.0; if(mark_sp == 0 || sweeping) { #if defined(GC_TIME) || defined(GC_FINAL_STATS) double sweep_t0 = clock_now(); #endif - int64_t actual_allocd = allocd_bytes_since_sweep; + int64_t actual_allocd = allocd_bytes_since_sweep, promo_bytes = 0; if (!sweeping) { #ifdef GC_TIME post_time = clock_now(); #endif - post_mark(GC_MARKED_NOESC); #ifdef GC_TIME post_time = clock_now() - post_time; #endif + post_mark(); + + est_fb = live_bytes - scanned_bytes - (sweep_mask == GC_MARKED_NOESC ? perm_scanned_bytes : perm_scanned_bytes) + actual_allocd; + promo_bytes = perm_scanned_bytes - last_perm_scanned; + int promo_pct = (actual_allocd - est_fb) ? (promo_bytes*100)/(actual_allocd - est_fb) : 100; #ifdef GC_VERIFY gc_verify(); #endif @@ -1988,28 +2266,47 @@ void jl_gc_collect(void) all_pool_stats(); big_obj_stats(); #endif - +#ifdef OBJPROFILE + print_obj_profiles(); + reset_obj_profile(); +#endif total_allocd_bytes += allocd_bytes_since_sweep; prepare_sweep(); + + bonus = est_fb - (7*(actual_allocd/10)); + // JL_PRINTF(JL_STDOUT, "GC choice %d kB live %d %% %d kB - %d kB + %d kB - %d kB\n", live_bytes/1024, promo_pct, est_fb/1024, (7*(actual_allocd/10))/1024, scanned_bytes/1024, residual/1024); + // if (bonus - residual < 0 && promo_pct < 90 && quick_count >= 3 || quick_count >= gc_quick_steps*2) { + if (/*prev_sweep_mask == GC_MARKED_NOESC && */(0 && quick_count >= long_collect_interval/default_collect_interval || quick_count >= 10 || 0 && collect_interval != default_collect_interval)) { + sweep_mask = GC_MARKED; // next collection is a full one + gc_steps = gc_inc_steps; + quick_count = 0; + residual = 0; + } + else { + sweep_mask = GC_MARKED_NOESC; // next collection is quick + gc_steps = 1;//gc_quick_steps; + } + if (sweep_mask == GC_MARKED) + perm_scanned_bytes = 0; + scanned_bytes = 0; + live_bytes2 = 0; gc_sweep_once(sweep_mask); sweeping = 1; - gc_steps = gc_sweep_steps; + // gc_steps = gc_sweep_steps; } - scanned_bytes = 0; if (gc_sweep_inc(sweep_mask)) { + // sweeping is over if (sweep_mask == GC_MARKED_NOESC) { - for (int i = 0; i < perm_marked; i++) { - gc_bits((mark_stack - perm_marked)[i]) = GC_QUEUED; - } - for (int i = 0; i < remset.len; i++) { - gc_queue_root(remset.items[i]); + for (int i = 0; i < remset->len; i++) { + gc_bits(((uintptr_t)remset->items[i] & ~(uintptr_t)1)) = GC_QUEUED; } } - remset.len = 0; + else { + remset->len = 0; + } - // sweeping is over - int tasks_end = 0; + /*int tasks_end = 0; for (int i = 0; i < tasks.len; i++) { jl_value_t* ta = (jl_value_t*)tasks.items[i]; if (gc_marked(ta)) { @@ -2017,46 +2314,54 @@ void jl_gc_collect(void) tasks_end++; } } - tasks.len = tasks_end; + tasks.len = tasks_end;*/ sweep_weak_refs(); - prev_sweep_mask = sweep_mask; sweeping = 0; if (sweep_mask == GC_MARKED) { tasks.len = 0; } - finalize_time = clock_now(); - run_finalizers(); - - finalize_time = clock_now() - finalize_time; - pct = actual_allocd ? (freed_bytes*100)/actual_allocd : -1; - bonus = freed_bytes - (7*(actual_allocd/10)); - SAVE = residual; SAVE2 = freed_bytes; - if (bonus - residual < 0 && sweep_mask == GC_MARKED_NOESC) { + SAVE = residual; + pct = actual_allocd ? 
(freed_bytes*100)/actual_allocd : -1; + // if (sweep_mask == GC_MARKED) { + if (sweep_mask == GC_MARKED_NOESC) { + collect_interval = default_collect_interval; + if (freed_bytes < actual_allocd/2) { + quick_count = 15; + collect_interval = 0; + } + } + else if (sweep_mask == GC_MARKED && freed_bytes < (7*(actual_allocd/10)) && n_pause > 1) { if (collect_interval <= 2*(max_collect_interval/5)) { collect_interval = 5*(collect_interval/2); + quick_count = 15; } - sweep_mask = GC_MARKED; // next collection is a full one - gc_steps = gc_inc_steps; - quick_count = 0; - residual = 0; } else { - if (sweep_mask == GC_MARKED) - residual = 0; - else - residual = actual_allocd - freed_bytes; collect_interval = default_collect_interval; - sweep_mask = GC_MARKED_NOESC; // next collection is quick - gc_steps = gc_quick_steps; } - + /* if (sweep_mask == GC_MARKED) + collect_interval = long_collect_interval; + else collect_interval = default_collect_interval/8;*/ + prev_sweep_mask = sweep_mask; + + allocd_bytes = -(int64_t)collect_interval/gc_steps; // jl_printf(JL_STDOUT, "ALLOCD %ld %ld %ld\n", allocd_bytes, collect_interval, default_collect_interval); inc_count = 0; live_bytes += -freed_bytes + allocd_bytes_since_sweep; + if (sweep_mask == GC_MARKED_NOESC && quick_count >= 3) { + int res = actual_allocd - freed_bytes - promo_bytes; + residual += res > 0 ? res : 0; + } + // jl_printf(JL_STDOUT, "LIVE %d | %d vs %d\n", live_bytes2 - live_bytes, live_bytes2, live_bytes); + SAVE3 = allocd_bytes_since_sweep; allocd_bytes_since_sweep = 0; freed_bytes = 0; + finalize_time = clock_now(); + run_finalizers(); + + finalize_time = clock_now() - finalize_time; } #if defined(GC_FINAL_STATS) || defined(GC_TIME) double sweep_pause = clock_now() - sweep_t0; @@ -2066,7 +2371,11 @@ void jl_gc_collect(void) total_fin_time += finalize_time + post_time; #endif #ifdef GC_TIME - JL_PRINTF(JL_STDOUT, "GC sweep pause %.2f ms (freed %d kB = %d%% of alloc b/r %d/%d) (%.2f ms in post_mark, %.2f ms in fin) (marked in %d inc) mask %d\n", sweep_pause*1000, SAVE2/1024, pct, bonus/1024, SAVE/1024, post_time*1000, finalize_time*1000, inc_count, sweep_mask); + JL_PRINTF(JL_STDOUT, "GC sweep pause %.2f ms live %ld kB (freed %d kB ~ %d kB = %d%% of allocd %d kB b/r %ld/%ld) (%.2f ms in post_mark, %.2f ms in %d fin) (marked in %d inc) mask %d | next in %d kB\n", sweep_pause*1000, live_bytes/1024, SAVE2/1024, est_fb/1024, pct, SAVE3/1024, bonus/1024, SAVE/1024, post_time*1000, finalize_time*1000, n_finalized, inc_count, sweep_mask, -allocd_bytes/1024); + int64_t diff = est_fb - SAVE2; + /*JL_PRINTF(JL_STDOUT, "relerr : %d %% (%ld)\n", SAVE2? 
100*diff/SAVE2 : -1, diff); + if (lr == 0) lr = diff; + else if (lr != diff && diff < 0) { abort(); }*/ #endif } n_pause++; @@ -2151,6 +2460,7 @@ void jl_gc_collect(void) void *allocb(size_t sz) { + // jl_printf(JL_STDOUT, "BUFF relerr: %d\n", sz); buff_t *b; sz += sizeof(void*); #ifdef MEMDEBUG @@ -2168,6 +2478,19 @@ void *allocb(size_t sz) return b->data; } +void *reallocb(void *b, size_t sz) +{ + buff_t *buff = gc_val_buf(b); + if (buff->pooled) { + void* b2 = allocb(sz); + memcpy(b2, b, GC_PAGE(buff)->osize); + return b2; + } else { + char* bv = (bigval_t*)realloc(bigval_header(buff), sz + (BVOFFS + 1)*sizeof(void*)); + return bv + (BVOFFS + 1)*sizeof(void*); + } +} + DLLEXPORT void *allocobj(size_t sz) { #ifdef MEMDEBUG @@ -2242,16 +2565,7 @@ void jl_print_gc_stats(JL_STREAM *s) void jl_gc_init(void) { - int szc[N_POOLS] = { 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, - 64, 72, 80, 88, 96, //#=18 - - 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, - - 288, 320, 352, 384, 416, 448, 480, 512, - - 640, 768, 896, 1024, - - 1536, 2048 }; + int* szc = sizeclasses; int i; for(i=0; i < N_POOLS; i++) { @@ -2260,7 +2574,6 @@ void jl_gc_init(void) norm_pools[i].pages = NULL; norm_pools[i].freelist = NULL; norm_pools[i].needsweep = NULL; - norm_pools[i].fl_linear = 1; norm_pools[i].end_offset = ((GC_PAGE_SZ/szc[i]) - 1)*szc[i]; ephe_pools[i].osize = szc[i]; @@ -2270,6 +2583,10 @@ void jl_gc_init(void) ephe_pools[i].end_offset = ((GC_PAGE_SZ/szc[i]) - 1)*szc[i]; } assert(offsetof(gcpages_t, data) == GC_PAGE_SZ); + + collect_interval = default_collect_interval; + long_collect_interval = default_collect_interval; + allocd_bytes = -default_collect_interval; #ifdef GC_INC gc_steps = gc_inc_steps; @@ -2287,12 +2604,15 @@ void jl_gc_init(void) #endif #ifdef GC_INC arraylist_new(&tasks, 0); - arraylist_new(&remset, 0); + arraylist_new(remset, 0); + arraylist_new(last_remset, 0); #endif #ifdef OBJPROFILE - htable_new(&obj_counts[0], 0); - htable_new(&obj_counts[1], 0); + for(int g=0; g<3; g++) { + htable_new(&obj_counts[g], 0); + htable_new(&obj_sizes[g], 0); + } #endif #ifdef GC_FINAL_STATS process_t0 = clock_now(); @@ -2334,7 +2654,7 @@ static size_t pool_stats(pool_t *p, size_t *pwaste, size_t *np, size_t *pnold) v = (gcval_t*)((char*)v + osize); i++; } - gcpage_t *nextpg = pg->next; + gcpage_t *nextpg = NULL; pg = nextpg; } *pwaste = npgs*GC_PAGE_SZ - (nused*p->osize); diff --git a/src/interpreter.c b/src/interpreter.c index 04f08d3e9c287..d6d6741a9ff00 100644 --- a/src/interpreter.c +++ b/src/interpreter.c @@ -251,6 +251,7 @@ static jl_value_t *eval(jl_value_t *e, jl_value_t **locals, size_t nl) else if (ex->head == method_sym) { jl_sym_t *fname = (jl_sym_t*)args[0]; jl_value_t **bp=NULL; + jl_value_t *bp_owner=NULL; jl_binding_t *b=NULL; jl_value_t *gf=NULL; int kw=0; @@ -266,8 +267,11 @@ static jl_value_t *eval(jl_value_t *e, jl_value_t **locals, size_t nl) fname = (jl_sym_t*)jl_fieldref(jl_exprarg(fname, 2), 0); if (!kw) bp = &gf; - else - bp = (jl_value_t**)&((jl_methtable_t*)((jl_function_t*)gf)->env)->kwsorter; + else { + jl_methtable_t *env = (jl_methtable_t*)((jl_function_t*)gf)->env; + bp = (jl_value_t**)&env->kwsorter; + bp_owner = env; + } assert(jl_is_symbol(fname)); } else { @@ -280,6 +284,7 @@ static jl_value_t *eval(jl_value_t *e, jl_value_t **locals, size_t nl) if (bp == NULL) { b = jl_get_binding_for_method_def(jl_current_module, fname); bp = &b->value; + bp_owner = jl_current_module; } } jl_value_t *atypes=NULL, *meth=NULL; @@ -289,7 +294,7 @@ static jl_value_t 
*eval(jl_value_t *e, jl_value_t **locals, size_t nl) jl_check_static_parameter_conflicts((jl_lambda_info_t*)args[2], (jl_tuple_t*)jl_t1(atypes), fname); } meth = eval(args[2], locals, nl); - jl_method_def(fname, bp, b, (jl_tuple_t*)atypes, (jl_function_t*)meth, args[3]); + jl_method_def(fname, bp, bp_owner, b, (jl_tuple_t*)atypes, (jl_function_t*)meth, args[3]); JL_GC_POP(); return *bp; } diff --git a/src/julia.h b/src/julia.h index c184572a9abd4..32e429cfe8c4b 100644 --- a/src/julia.h +++ b/src/julia.h @@ -463,7 +463,7 @@ extern jl_sym_t *arrow_sym; extern jl_sym_t *ldots_sym; #define jl_cellref(a,i) (((jl_value_t**)((jl_array_t*)a)->data)[(i)]) #define jl_cellset(a,i,x) do { \ jl_value_t *xx = (jl_value_t*)(x); \ - if (xx) gc_wb_back(a); \ + if (xx) gc_wb(a, xx); \ (((jl_value_t**)((jl_array_t*)a)->data)[(i)])=xx; \ } while(0); @@ -692,8 +692,9 @@ jl_expr_t *jl_exprn(jl_sym_t *head, size_t n); jl_function_t *jl_new_generic_function(jl_sym_t *name); void jl_add_method(jl_function_t *gf, jl_tuple_t *types, jl_function_t *meth, jl_tuple_t *tvars, int8_t isstaged); -DLLEXPORT jl_value_t *jl_method_def(jl_sym_t *name, jl_value_t **bp, jl_binding_t *bnd, - jl_tuple_t *argtypes, jl_function_t *f, jl_value_t *isstaged); +DLLEXPORT jl_value_t *jl_method_def(jl_sym_t *name, jl_value_t **bp, jl_value_t *bp_owner, + jl_binding_t *bnd, jl_tuple_t *argtypes, + jl_function_t *f, jl_value_t *isstaged); DLLEXPORT jl_value_t *jl_box_bool(int8_t x); DLLEXPORT jl_value_t *jl_box_int8(int32_t x); DLLEXPORT jl_value_t *jl_box_uint8(uint32_t x); @@ -1098,11 +1099,13 @@ void *jl_gc_managed_malloc(size_t sz); void *jl_gc_managed_realloc(void *d, size_t sz, size_t oldsz, int isaligned); void jl_gc_free_array(jl_array_t *a); void jl_gc_track_malloced_array(jl_array_t *a); +void jl_gc_count_allocd(size_t sz); void jl_gc_run_all_finalizers(void); DLLEXPORT void *alloc_2w(void); DLLEXPORT void *alloc_3w(void); DLLEXPORT void *alloc_4w(void); void *allocb(size_t sz); +void *reallocb(void*, size_t); DLLEXPORT void *allocobj(size_t sz); DLLEXPORT void jl_clear_malloc_data(void); @@ -1382,19 +1385,25 @@ static inline void gc_wb_fwd(void* parent, void* ptr) #ifdef GC_INC // if parent is marked and ptr is clean if(__unlikely((*((uintptr_t*)parent) & 3) == 1 && (*((uintptr_t*)ptr) & 3) == 0)) { - gc_queue_root(ptr); + gc_queue_root((void*)((uintptr_t)ptr | 1)); } #endif } -#define gc_wb(a,b) gc_wb_back(a) +static inline void gc_wb(void *parent, void *ptr) +{ + if (__unlikely((*((uintptr_t*)parent) & 3) == 1 && + (*((uintptr_t*)ptr) & 3) == 0)) + gc_queue_root(parent); +} static inline void gc_wb_buf(void *parent, void *bufptr) { #ifdef GC_INC - // if parent is marked - if((*((uintptr_t*)parent) & 3) == 1) - gc_setmark_buf(bufptr, *(uintptr_t*)parent & 3); + // if parent is marked and buf is not + if (__unlikely((*((uintptr_t*)parent) & 3) == 1)) + // (*((uintptr_t*)bufptr) & 3) != 1)) + gc_setmark_buf(bufptr, *(uintptr_t*)parent & 3); #endif } @@ -1402,8 +1411,8 @@ static inline void gc_wb_back(void *ptr) { #ifdef GC_INC // if ptr is marked - if((*((uintptr_t*)ptr) & 3) == 1) { - *((uintptr_t*)ptr) &= ~(uintptr_t)3; // clear the mark + if(__unlikely((*((uintptr_t*)ptr) & 3) == 1)) { + // *((uintptr_t*)ptr) &= ~(uintptr_t)3; // clear the mark gc_queue_root(ptr); } #endif diff --git a/src/julia_internal.h b/src/julia_internal.h index 157740f37f110..5913d0fbe6508 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -112,7 +112,7 @@ typedef CONTEXT *bt_context_t; #include typedef unw_context_t *bt_context_t; #endif 
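// Usage sketch for the write barriers defined in julia.h above (an
// illustrative pairing, not a normative contract; `parent`, `child` and
// `parent_data` are placeholder names -- compare the jl_cellset macro
// earlier, which applies the same discipline to array cells):
//
//     if (child != NULL)
//         gc_wb(parent, child);   // queue parent if it is marked old while
//                                 // child is still clean (young)
//     ((jl_value_t**)parent_data)[i] = child;  // the guarded store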
-#define MAX_BT_SIZE 80000 +#define MAX_BT_SIZE 80 extern ptrint_t bt_data[MAX_BT_SIZE+1]; extern size_t bt_size; DLLEXPORT size_t rec_backtrace(ptrint_t *data, size_t maxsize); diff --git a/src/options.h b/src/options.h index 805a5906b191f..80225e18bad2b 100644 --- a/src/options.h +++ b/src/options.h @@ -46,7 +46,7 @@ //#define MEMPROFILE // GCTIME prints time taken by each phase of GC -#define GC_TIME +//#define GC_TIME // OBJPROFILE counts objects by type //#define OBJPROFILE diff --git a/src/toplevel.c b/src/toplevel.c index bdb869a5a2f66..e73817a707561 100644 --- a/src/toplevel.c +++ b/src/toplevel.c @@ -650,8 +650,10 @@ static int type_contains(jl_value_t *ty, jl_value_t *x) void print_func_loc(JL_STREAM *s, jl_lambda_info_t *li); -DLLEXPORT jl_value_t *jl_method_def(jl_sym_t *name, jl_value_t **bp, jl_binding_t *bnd, - jl_tuple_t *argtypes, jl_function_t *f, jl_value_t *isstaged) +DLLEXPORT jl_value_t *jl_method_def(jl_sym_t *name, jl_value_t **bp, + jl_value_t *bp_owner, jl_binding_t *bnd, + jl_tuple_t *argtypes, jl_function_t *f, + jl_value_t *isstaged) { // argtypes is a tuple ((types...), (typevars...)) jl_tuple_t *t = (jl_tuple_t*)jl_t1(argtypes); @@ -705,11 +707,7 @@ DLLEXPORT jl_value_t *jl_method_def(jl_sym_t *name, jl_value_t **bp, jl_binding_ if (*bp == NULL) { gf = (jl_value_t*)jl_new_generic_function(name); *bp = gf; - #ifdef GC_INC - // this would be better as gc_wb(whatever_jlvalue_bp_points_into, *bp); but this function is used in several places so this will do for now - // (in case changing the sig of this function do not forget methodfunc in codegen) - gc_queue_root(gf); - #endif + if (bp_owner) gc_wb(bp_owner, gf); } JL_GC_PUSH1(&gf); assert(jl_is_function(f)); diff --git a/test/perf/perfcomp.jl b/test/perf/perfcomp.jl index 8bd06b1b51607..1899fef68a06e 100644 --- a/test/perf/perfcomp.jl +++ b/test/perf/perfcomp.jl @@ -11,7 +11,7 @@ end function main() baseline = readperf(open(ARGS[1])) torun = length(ARGS) > 1 ? 
ARGS[2] : "all" - io,p = readsfrom(`make -s $torun`) + io,p = open(`make -s $torun`, "r") newp = readperf(io) names = sort(intersect(keys(baseline),keys(newp))) From 755581c8186ebf9c7eed3907631fb4e09da4f9a4 Mon Sep 17 00:00:00 2001 From: Oscar Blumberg Date: Sun, 31 Aug 2014 00:51:59 +0200 Subject: [PATCH 04/17] fix darwin build & start to cleanup --- src/gc.c | 112 +++++++++------------------------------------- src/gf.c | 10 ++--- src/init.c | 3 +- src/interpreter.c | 4 +- 4 files changed, 29 insertions(+), 100 deletions(-) diff --git a/src/gc.c b/src/gc.c index 169b35db53aaa..9975d342952b8 100644 --- a/src/gc.c +++ b/src/gc.c @@ -19,6 +19,9 @@ #include "julia_internal.h" #ifndef _OS_WINDOWS_ #include +#ifdef _OS_DARWIN_ +#define MAP_ANONYMOUS MAP_ANON +#endif #endif #ifdef __cplusplus @@ -188,7 +191,8 @@ static int gc_steps; #define N_POOLS 42 static __attribute__((aligned (64))) pool_t norm_pools[N_POOLS]; static pool_t ephe_pools[N_POOLS]; -static const pool_t *pools = &norm_pools[0]; +//static pool_t *pools = &norm_pools[0]; +#define pools norm_pools static int64_t total_allocd_bytes = 0; static int64_t allocd_bytes_since_sweep = 0; @@ -327,13 +331,6 @@ static inline int gc_setmark_big(void *o, int mark_mode) #endif bigval_t* hdr = bigval_header(o); int bits = gc_bits(o); - /* if (hdr->age >= PROMOTE_AGE) { - mark_mode = GC_MARKED; - } - else { - if (!bits) - inc_sat(hdr->age, PROMOTE_AGE); - }*/ if (bits == GC_QUEUED || bits == GC_MARKED) mark_mode = GC_MARKED; if ((mark_mode == GC_MARKED) & (bits != GC_MARKED)) { @@ -347,13 +344,13 @@ static inline int gc_setmark_big(void *o, int mark_mode) big_objects_marked = hdr; } #ifdef OBJPROFILE - if (!bits) { + if (!bits) { if (mark_mode == GC_MARKED) perm_scanned_bytes += hdr->sz; else scanned_bytes += hdr->sz; objprofile_count(jl_typeof(o), mark_mode == GC_MARKED, hdr->sz); - } + } #endif _gc_setmark(o, mark_mode); verify_val(o); @@ -372,31 +369,16 @@ static inline int gc_setmark_pool(void *o, int mark_mode) int bits = gc_bits(o); if (bits == GC_QUEUED || bits == GC_MARKED) mark_mode = GC_MARKED; - /* int obj_i = ((uintptr_t)o - (uintptr_t)GC_PAGE_DATA(o))/8; - int sh = (obj_i % 4)*2; - char *ages = page->age; - int age = (ages[obj_i/4] >> sh) & 3; - if (age >= PROMOTE_AGE) { - mark_mode = GC_MARKED; - } - else { - if (!bits) { - inc_sat(age, PROMOTE_AGE); - ages[obj_i/4] &= ~(3 << sh); - ages[obj_i/4] |= age << sh; - } - }*/ #ifdef OBJPROFILE - if (!bits) { + if (!bits) { if (mark_mode == GC_MARKED) perm_scanned_bytes += page->osize; else scanned_bytes += page->osize; objprofile_count(jl_typeof(o), mark_mode == GC_MARKED, page->osize); - } + } #endif _gc_setmark(o, mark_mode); - // page->nmarked += (mark_mode == GC_MARKED); page->gc_bits |= mark_mode; verify_val(o); return mark_mode; @@ -412,16 +394,15 @@ static inline int gc_setmark(void *o, int sz, int mark_mode) } #define gc_typeof(v) ((jl_value_t*)(((uptrint_t)jl_typeof(v))&(~(uintptr_t)3))) -#define gc_val_buf(o) ((gcval_t*)(((void**)(o))-1)) +#define gc_val_buf(o) ((buff_t*)(((void**)(o))-1)) inline void gc_setmark_buf(void *o, int mark_mode) { - buff_t *buf = (buff_t*)gc_val_buf(o); + buff_t *buf = gc_val_buf(o); if (buf->pooled) gc_setmark_pool(buf, mark_mode); else gc_setmark_big(buf, mark_mode); - // objprofile_count(BUFFTY, gc_bits(buf) == GC_MARKED); } // malloc wrappers, aligned allocation @@ -517,7 +498,7 @@ static __attribute__((noinline)) void *malloc_page(void) static inline void free_page(void *p) { - int pg_idx; + int pg_idx = -1; int i; for(i = 0; i < HEAP_COUNT && 
heaps[i] != NULL; i++) { pg_idx = ((uintptr_t)p - (uintptr_t)heaps[i]->pages[0])/GC_PAGE_SZ; @@ -545,7 +526,6 @@ static inline void free_page(void *p) } #ifdef GC_INC -//#define maybe_collect() if (__unlikely(T.allocd_bytes/**gc_steps*/ > collect_interval)) jl_gc_collect() #define should_collect() (__unlikely(allocd_bytes > 0)) static inline int maybe_collect(void) { @@ -805,8 +785,6 @@ static jl_value_t** sweep_big_list(int sweep_mask, bigval_t** pv) bigval_t *nxt = v->next; if (gc_marked(&v->_data)) { pv = &v->next; - // objprofile_count(&v->_data, gc_bits(&v->_data) == GC_MARKED, v->sz); - live_bytes2 += v->sz; int age = v->age; int bits = gc_bits(&v->_data); if (age >= PROMOTE_AGE) { @@ -923,8 +901,6 @@ static void sweep_malloced_arrays(void) mallocarray_t *nxt = ma->next; if (gc_marked(ma->a)) { pma = &ma->next; - // objprofile_count(&MATY, MATY, array_nbytes(ma->a)); - live_bytes2 += array_nbytes(ma->a); } else { *pma = nxt; @@ -939,16 +915,6 @@ static void sweep_malloced_arrays(void) } } -int isinfl(gcval_t* v, void* needle) -{ - while(v != NULL) { - if (v == needle) - return 1; - v = v->next; - } - return 0; -} - // pool allocation #ifdef __SSE__ #include @@ -1007,36 +973,18 @@ static inline void _update_freelist(pool_t* p, gcval_t* next) static __attribute__((noinline)) void add_page(pool_t *p) { - //gcpage_t *pg = (gcpage_t*)malloc_a16(sizeof(gcpage_t)); char *data = malloc_page(); if (data == NULL) jl_throw(jl_memory_exception); gcpage_t *pg = GC_PAGE(data); - //jl_printf(JL_STDOUT, "add page [%d] : 0x%lx 0x%lx = 0x%lx hdr 0x%lx\n", GC_PAGE_IDX(data), pg, data, (uintptr_t)data - (uintptr_t)pg, GC_PAGES(data)); pg->data_offset = data - (char*)pg; pg->osize = p->osize; gcval_t *fl = reset_page(p, pg, p->freelist); - // these statements are ordered so that interrupting after any of them - // leaves the system in a valid state - // pg->next = p->pages; - // p->pages = pg; _update_freelist(p, fl); } -/*static inline void *_pool_alloc_fast(pool_t* p, int osize, int end_offset) -{ - gcval_t *v = p->freelist; - p->nfree--; - end = &(GC_PAGE_DATA(v)[end_offset]); - linear = (v != end) & p->fl_linear; - gcval_t *next_lin = (gcval_t*)((char*)v + osize); - allocd_bytes += osize; - p->freelist = next_lin; - }*/ - static inline void *__pool_alloc(pool_t* p, int osize, int end_offset) { - // jl_printf(JL_STDOUT, "POOL: %d\n", osize); gcval_t *v, *end; if (__unlikely((allocd_bytes += osize) >= 0)) { jl_gc_collect(); @@ -1055,8 +1003,6 @@ static inline void *__pool_alloc(pool_t* p, int osize, int end_offset) p->freelist = (char*)v + osize; } v->flags = 0; - // p->freelist = next; - // pg->nfree--; return v; } @@ -1070,7 +1016,7 @@ static inline void *pool_alloc(pool_t *p) return __pool_alloc(p, p->osize, p->end_offset); } -static int sizeclasses[N_POOLS] = { +static const int sizeclasses[N_POOLS] = { 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 64, 72, 80, 88, 96, //#=18 @@ -1118,8 +1064,6 @@ static int total_pages = 0; static int freed_pages = 0; static int lazy_freed_pages = 0; static int page_done = 0; -static int obj_old = 0; -static int obj_young = 0; static gcval_t** sweep_page(pool_t* p, gcpage_t* pg, gcval_t **pfl,int,int); static void _update_freelist(pool_t* p, gcval_t* next); static void sweep_pool_region(region_t* heap, int sweep_mask) @@ -1163,7 +1107,6 @@ static gcval_t** sweep_page(pool_t* p, gcpage_t* pg, gcval_t **pfl, int sweep_ma #endif gcval_t **prev_pfl = pfl; gcval_t *v; - // gcpage_t **ppg = &p->needsweep; size_t old_nfree = 0, nfree = 0; int pg_freedall = 0, 
pg_total = 0; int pg_skpd = 0, pg_wont_skip = 0; @@ -1820,7 +1763,7 @@ static void pre_mark(void) gc_push_root(jl_false, 0); } - +#ifdef GC_VERIFY static arraylist_t bits_save[4]; // set all mark bits to bits @@ -1869,7 +1812,7 @@ static void clear_mark(int bits) } } } -#ifdef GC_VERIFY + static void restore(void) { for(int b = 0; b < 4; b++) { @@ -2163,9 +2106,9 @@ void prepare_sweep(void) } #ifdef GC_INC -int64_t residual = 0; -static int lr = 0; +#ifdef GC_VERIFY static void clear_mark(int); +#endif void jl_gc_collect(void) { if (!is_gc_enabled) return; @@ -2274,14 +2217,10 @@ void jl_gc_collect(void) prepare_sweep(); - bonus = est_fb - (7*(actual_allocd/10)); - // JL_PRINTF(JL_STDOUT, "GC choice %d kB live %d %% %d kB - %d kB + %d kB - %d kB\n", live_bytes/1024, promo_pct, est_fb/1024, (7*(actual_allocd/10))/1024, scanned_bytes/1024, residual/1024); - // if (bonus - residual < 0 && promo_pct < 90 && quick_count >= 3 || quick_count >= gc_quick_steps*2) { - if (/*prev_sweep_mask == GC_MARKED_NOESC && */(0 && quick_count >= long_collect_interval/default_collect_interval || quick_count >= 10 || 0 && collect_interval != default_collect_interval)) { + if (quick_count >= 10) { sweep_mask = GC_MARKED; // next collection is a full one gc_steps = gc_inc_steps; quick_count = 0; - residual = 0; } else { sweep_mask = GC_MARKED_NOESC; // next collection is quick @@ -2321,9 +2260,8 @@ void jl_gc_collect(void) tasks.len = 0; } SAVE2 = freed_bytes; - SAVE = residual; pct = actual_allocd ? (freed_bytes*100)/actual_allocd : -1; - // if (sweep_mask == GC_MARKED) { + if (sweep_mask == GC_MARKED_NOESC) { collect_interval = default_collect_interval; if (freed_bytes < actual_allocd/2) { @@ -2340,24 +2278,16 @@ void jl_gc_collect(void) else { collect_interval = default_collect_interval; } - /* if (sweep_mask == GC_MARKED) - collect_interval = long_collect_interval; - else collect_interval = default_collect_interval/8;*/ prev_sweep_mask = sweep_mask; allocd_bytes = -(int64_t)collect_interval/gc_steps; - // jl_printf(JL_STDOUT, "ALLOCD %ld %ld %ld\n", allocd_bytes, collect_interval, default_collect_interval); inc_count = 0; live_bytes += -freed_bytes + allocd_bytes_since_sweep; - if (sweep_mask == GC_MARKED_NOESC && quick_count >= 3) { - int res = actual_allocd - freed_bytes - promo_bytes; - residual += res > 0 ? res : 0; - } - // jl_printf(JL_STDOUT, "LIVE %d | %d vs %d\n", live_bytes2 - live_bytes, live_bytes2, live_bytes); SAVE3 = allocd_bytes_since_sweep; allocd_bytes_since_sweep = 0; freed_bytes = 0; + finalize_time = clock_now(); run_finalizers(); @@ -2371,7 +2301,7 @@ void jl_gc_collect(void) total_fin_time += finalize_time + post_time; #endif #ifdef GC_TIME - JL_PRINTF(JL_STDOUT, "GC sweep pause %.2f ms live %ld kB (freed %d kB ~ %d kB = %d%% of allocd %d kB b/r %ld/%ld) (%.2f ms in post_mark, %.2f ms in %d fin) (marked in %d inc) mask %d | next in %d kB\n", sweep_pause*1000, live_bytes/1024, SAVE2/1024, est_fb/1024, pct, SAVE3/1024, bonus/1024, SAVE/1024, post_time*1000, finalize_time*1000, n_finalized, inc_count, sweep_mask, -allocd_bytes/1024); + JL_PRINTF(JL_STDOUT, "GC sweep pause %.2f ms live %ld kB (freed %d kB = %d%% of allocd %d kB b/r %ld/%ld) (%.2f ms in post_mark, %.2f ms in %d fin) (marked in %d inc) mask %d | next in %d kB\n", sweep_pause*1000, live_bytes/1024, SAVE2/1024, pct, SAVE3/1024, bonus/1024, SAVE/1024, post_time*1000, finalize_time*1000, n_finalized, inc_count, sweep_mask, -allocd_bytes/1024); int64_t diff = est_fb - SAVE2; /*JL_PRINTF(JL_STDOUT, "relerr : %d %% (%ld)\n", SAVE2? 
100*diff/SAVE2 : -1, diff); if (lr == 0) lr = diff; diff --git a/src/gf.c index 2b5c31e7cc3e3..2a154a7d3ec90 100644 --- a/src/gf.c +++ b/src/gf.c @@ -1272,17 +1272,17 @@ jl_methlist_t *jl_method_list_insert(jl_methlist_t **pml, jl_tuple_t *type, // if this contains Union types, methods after it might actually be // more specific than it. we need to re-sort them. if (has_unions(type)) { - jl_value_t* item_parent = newrec; - jl_value_t* next_parent = 0; + jl_value_t* item_parent = (jl_value_t*)newrec; + jl_value_t* next_parent = 0; jl_methlist_t *item = newrec->next, *next; jl_methlist_t **pitem = &newrec->next, **pnext; while (item != JL_NULL) { pl = pml; l = *pml; - pa = parent; + pa = parent; next = item->next; pnext = &item->next; - next_parent = item; + next_parent = (jl_value_t*)item; while (l != newrec->next) { if (jl_args_morespecific((jl_value_t*)item->sig, (jl_value_t*)l->sig)) { @@ -1294,7 +1294,7 @@ jl_methlist_t *jl_method_list_insert(jl_methlist_t **pml, jl_tuple_t *type, *pl = item; gc_wb(pa, item); pnext = pitem; - next_parent = item_parent; + next_parent = item_parent; break; } pl = &l->next; diff --git a/src/init.c index 4873961824934..02dee85ab1ab7 100644 --- a/src/init.c +++ b/src/init.c @@ -85,8 +85,7 @@ jl_compileropts_t jl_compileropts = { NULL, // build_path JL_COMPILEROPT_CHECK_BOUNDS_DEFAULT, JL_COMPILEROPT_DUMPBITCODE_OFF, 0, // int_literals - JL_COMPILEROPT_COMPILE_DEFAULT, - 0, // int32_literals + JL_COMPILEROPT_COMPILE_DEFAULT }; int jl_boot_file_loaded = 0; diff --git a/src/interpreter.c index d6d6741a9ff00..ff4572670ec88 100644 --- a/src/interpreter.c +++ b/src/interpreter.c @@ -270,7 +270,7 @@ static jl_value_t *eval(jl_value_t *e, jl_value_t **locals, size_t nl) else { jl_methtable_t *env = (jl_methtable_t*)((jl_function_t*)gf)->env; bp = (jl_value_t**)&env->kwsorter; - bp_owner = env; + bp_owner = (jl_value_t*)env; } assert(jl_is_symbol(fname)); } @@ -284,7 +284,7 @@ static jl_value_t *eval(jl_value_t *e, jl_value_t **locals, size_t nl) if (bp == NULL) { b = jl_get_binding_for_method_def(jl_current_module, fname); bp = &b->value; - bp_owner = (jl_value_t*)jl_current_module; + bp_owner = (jl_value_t*)jl_current_module; } } jl_value_t *atypes=NULL, *meth=NULL; From f0a78a7c445dc27d188445dd92506cd02c81af95 Mon Sep 17 00:00:00 2001 From: Oscar Blumberg Date: Sun, 31 Aug 2014 14:22:28 +0200 Subject: [PATCH 05/17] Fix a bug where a soon-to-be-promoted object would escape the write barrier. A bit more cleanup too. Also add some missing write barriers in new code.
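The failure mode: the barrier used to test both mark bits of the child, so an object already in the GC_QUEUED state (about to be promoted) looked "marked" and the edge was never recorded. Testing only the low bit fixes that. As a minimal standalone sketch of the corrected check, not the patched code itself (the helper name is illustrative; gc_queue_root is the real entry point, and the bit encoding is the one used throughout this series):

    #include <stdint.h>

    extern void gc_queue_root(void *root);   /* provided by gc.c */

    /* Mark bits live in the low two bits of the type-tag word:
       CLEAN=0, MARKED=1, QUEUED=2, MARKED_NOESC=3. Testing only the
       low bit of the child means a QUEUED (soon-to-be-promoted)
       object still trips the barrier instead of being mistaken for
       a marked one. */
    static inline void wb_sketch(void *parent, void *child)
    {
        if ((*(uintptr_t*)parent & 3) == 1 &&  /* parent is old (MARKED) */
            (*(uintptr_t*)child  & 1) == 0)    /* child is CLEAN or QUEUED */
            gc_queue_root(parent);
    }

The codegen change below emits exactly this single-bit test for the child.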
--- src/codegen.cpp | 4 ++-- src/gc.c | 30 ++++++++++++------------------ src/gf.c | 2 ++ src/julia.h | 4 ++-- 4 files changed, 18 insertions(+), 22 deletions(-) diff --git a/src/codegen.cpp b/src/codegen.cpp index f1cabd03e64c8..7d4d3404193ee 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -1378,8 +1378,8 @@ static void emit_write_barrier(jl_codectx_t* ctx, Value *parent, Value *ptr) builder.CreateCondBr(parent_marked, barrier_may_trigger, cont); builder.SetInsertPoint(barrier_may_trigger); - Value* ptr_mark_bits = builder.CreateAnd(builder.CreateLoad(builder.CreateBitCast(ptr, T_psize)), 3); - Value* ptr_not_marked = builder.CreateICmpEQ(ptr_mark_bits, ConstantInt::get(T_size, 0)); + Value* ptr_mark_bit = builder.CreateAnd(builder.CreateLoad(builder.CreateBitCast(ptr, T_psize)), 1); + Value* ptr_not_marked = builder.CreateICmpEQ(ptr_mark_bit, ConstantInt::get(T_size, 0)); builder.CreateCondBr(ptr_not_marked, barrier_trigger, cont); builder.SetInsertPoint(barrier_trigger); builder.CreateCall(prepare_call(queuerootfun), builder.CreateBitCast(parent, jl_pvalue_llvmt)); diff --git a/src/gc.c b/src/gc.c index 9975d342952b8..e57046da56d48 100644 --- a/src/gc.c +++ b/src/gc.c @@ -263,8 +263,11 @@ static void add_lostval_parent(jl_value_t* parent) #define verify_val(v) do { \ if(lostval == (jl_value_t*)(v) && (v) != 0) { \ - JL_PRINTF(JL_STDOUT, "Found lostval 0x%lx at %s:%d\n", \ + JL_PRINTF(JL_STDOUT, \ + "Found lostval 0x%lx at %s:%d oftype: ", \ (uintptr_t)(lostval), __FILE__, __LINE__); \ + jl_static_show(JL_STDOUT, jl_typeof(v)); \ + JL_PRINTF(JL_STDOUT, "\n"); \ } \ } while(0); @@ -276,6 +279,9 @@ static void add_lostval_parent(jl_value_t* parent) JL_PRINTF(JL_STDOUT, "\tloc 0x%lx : ", (uintptr_t)(slot)); \ JL_PRINTF(JL_STDOUT, args); \ JL_PRINTF(JL_STDOUT, "\n"); \ + JL_PRINTF(JL_STDOUT, "\ttype: "); \ + jl_static_show(JL_STDOUT, jl_typeof(obj)); \ + JL_PRINTF(JL_STDOUT, "\n"); \ add_lostval_parent((jl_value_t*)(obj)); \ } \ } while(0); @@ -1995,8 +2001,9 @@ static void gc_verify(void) jl_value_t* lostval_parent = NULL; for(int i = 0; i < lostval_parents.len; i++) { lostval_parent = (jl_value_t*)lostval_parents.items[i]; - for(int j = 0; j < bits_save[GC_CLEAN].len; j++) { - if (bits_save[GC_CLEAN].items[j] == lostval_parent) { + int clean_len = bits_save[GC_CLEAN].len; + for(int j = 0; j < clean_len + bits_save[GC_QUEUED].len; j++) { + if (bits_save[j >= clean_len ? GC_QUEUED : GC_CLEAN].items[j >= clean_len ? 
j - clean_len : j] == lostval_parent) { lostval = lostval_parent; lostval_parent = NULL; break; @@ -2140,9 +2147,6 @@ void jl_gc_collect(void) uintptr_t item = (uintptr_t)last_remset->items[i]; void* ptr = (void*)(item & ~(uintptr_t)1); objprofile_count(jl_typeof(ptr), 2, 0); - /* jl_printf(JL_STDOUT, "rem : "); - jl_(ptr); - jl_printf(JL_STDOUT, "\n");*/ if (item & 1) { arraylist_push(remset, item); } @@ -2150,16 +2154,6 @@ void jl_gc_collect(void) push_root(ptr, 0, gc_bits(ptr)); } perm_scanned_bytes = SA; - /* if (sweep_mask == GC_MARKED) - perm_marked = 0; - else { - for (int i = 0; i < perm_marked; i++) { - gc_bits((uintptr_t)scratch[i] & ~(uintptr_t)3) = GC_MARKED; - } - memcpy(mark_stack, scratch, perm_marked*sizeof(void*)); - free(scratch); - mark_stack += perm_marked; - }*/ pre_mark(); visit_mark_stack(GC_MARKED_NOESC); @@ -2416,8 +2410,8 @@ void *reallocb(void *b, size_t sz) memcpy(b2, b, GC_PAGE(buff)->osize); return b2; } else { - char* bv = (bigval_t*)realloc(bigval_header(buff), sz + (BVOFFS + 1)*sizeof(void*)); - return bv + (BVOFFS + 1)*sizeof(void*); + bigval_t* bv = (bigval_t*)realloc(bigval_header(buff), sz + (BVOFFS + 1)*sizeof(void*)); + return (char*)bv + (BVOFFS + 1)*sizeof(void*); } } diff --git a/src/gf.c b/src/gf.c index 2a154a7d3ec90..d1803e5bd4afe 100644 --- a/src/gf.c +++ b/src/gf.c @@ -1499,6 +1499,7 @@ static void all_p2c(jl_value_t *ast, jl_tuple_t *tvars) if (jl_is_lambda_info(ast)) { jl_lambda_info_t *li = (jl_lambda_info_t*)ast; li->ast = jl_prepare_ast(li, jl_null); + gc_wb(li, li->ast); parameters_to_closureenv(li->ast, tvars); } else if (jl_is_expr(ast)) { @@ -1516,6 +1517,7 @@ static void precompile_unspecialized(jl_function_t *func, jl_tuple_t *sig, jl_tu // assuming they are there. method cache will fill them in when // it constructs closures for new "specializations". 
func->linfo->ast = jl_prepare_ast(func->linfo, jl_null); + gc_wb(func->linfo, func->linfo->ast); parameters_to_closureenv(func->linfo->ast, tvars); all_p2c(func->linfo->ast, tvars); } diff --git a/src/julia.h b/src/julia.h index 32e429cfe8c4b..f86347d94c2ff 100644 --- a/src/julia.h +++ b/src/julia.h @@ -1384,7 +1384,7 @@ static inline void gc_wb_fwd(void* parent, void* ptr) { #ifdef GC_INC // if parent is marked and ptr is clean - if(__unlikely((*((uintptr_t*)parent) & 3) == 1 && (*((uintptr_t*)ptr) & 3) == 0)) { + if(__unlikely((*((uintptr_t*)parent) & 3) == 1 && (*((uintptr_t*)ptr) & 1) == 0)) { gc_queue_root((void*)((uintptr_t)ptr | 1)); } #endif @@ -1393,7 +1393,7 @@ static inline void gc_wb_fwd(void* parent, void* ptr) static inline void gc_wb(void *parent, void *ptr) { if (__unlikely((*((uintptr_t*)parent) & 3) == 1 && - (*((uintptr_t*)ptr) & 3) == 0)) + (*((uintptr_t*)ptr) & 1) == 0)) gc_queue_root(parent); } From 52520f4ef869e97a080dd355e6844b25adaf598a Mon Sep 17 00:00:00 2001 From: Oscar Blumberg Date: Wed, 3 Sep 2014 14:06:02 +0200 Subject: [PATCH 06/17] repair timing & memory stats --- base/util.jl | 11 ++++++--- src/gc.c | 64 +++++++++++++++++++++++++++------------------------- 2 files changed, 41 insertions(+), 34 deletions(-) diff --git a/base/util.jl b/base/util.jl index f593231ac73a0..bd7d7afa9f890 100644 --- a/base/util.jl +++ b/base/util.jl @@ -39,12 +39,17 @@ function toc() end # print elapsed time, return expression value - +const _units = ["bytes", "kB", "MB"] function time_print(t, b, g) + i = 1 + while b > 1024 && i < length(_units) + b = div(b, 1024) + i += 1 + end if 0 < g - @printf("elapsed time: %s seconds (%d bytes allocated, %.2f%% gc time)\n", t/1e9, b, 100*g/t) + @printf("elapsed time: %s seconds (%d %s allocated, %.2f%% gc time)\n", t/1e9, b, _units[i], 100*g/t) else - @printf("elapsed time: %s seconds (%d bytes allocated)\n", t/1e9, b) + @printf("elapsed time: %s seconds (%d %s allocated)\n", t/1e9, b, _units[i]) end end diff --git a/src/gc.c b/src/gc.c index e57046da56d48..a9dea3f29e671 100644 --- a/src/gc.c +++ b/src/gc.c @@ -198,6 +198,8 @@ static int64_t total_allocd_bytes = 0; static int64_t allocd_bytes_since_sweep = 0; static int64_t freed_bytes = 0; static uint64_t total_gc_time=0; +#define NS_TO_S(t) ((double)(t/1000)/(1000*1000)) +#define NS2MS(t) ((double)(t/1000)/1000) static int64_t live_bytes = 0; static int64_t live_bytes2 = 0; static size_t current_pg_count = 0; @@ -214,9 +216,9 @@ static htable_t obj_sizes[3]; static double page_alloc_time=0; static size_t total_freed_bytes=0; static double max_pause = 0.0; -static double total_sweep_time=0; -static double total_mark_time=0; -static double total_fin_time=0; +static uint64_t total_sweep_time=0; +static uint64_t total_mark_time=0; +static uint64_t total_fin_time=0; #endif static int n_pause = 0; @@ -1546,7 +1548,7 @@ static int push_root(jl_value_t *v, int d, int bits) } while (0) d++; - // __builtin_prefetch(&(GC_PAGE(v)->age[v - PAGE_DATA); + // some values have special representations if (vt == (jl_value_t*)jl_tuple_type) { size_t l = jl_tuple_len(v); @@ -2034,7 +2036,7 @@ DLLEXPORT void jl_gc_enable(void) { is_gc_enabled = 1; } DLLEXPORT void jl_gc_disable(void) { is_gc_enabled = 0; } DLLEXPORT int jl_gc_is_enabled(void) { return is_gc_enabled; } -DLLEXPORT int64_t jl_gc_total_bytes(void) { return total_allocd_bytes + allocd_bytes; } +DLLEXPORT int64_t jl_gc_total_bytes(void) { return total_allocd_bytes + allocd_bytes + collect_interval/gc_steps; } DLLEXPORT uint64_t 
jl_gc_total_hrtime(void) { return total_gc_time; } int64_t diff_gc_total_bytes(void) @@ -2122,7 +2124,7 @@ void jl_gc_collect(void) if (jl_in_gc) return; jl_in_gc = 1; JL_SIGATOMIC_BEGIN(); - double t0 = clock_now(); + uint64_t t0 = jl_hrtime(); #if defined(GC_TIME) || defined(GC_FINAL_STATS) int wb_activations = mark_sp - saved_mark_sp; #endif @@ -2166,31 +2168,30 @@ void jl_gc_collect(void) } allocd_bytes_since_sweep += allocd_bytes + (int64_t)collect_interval/gc_steps; // allocd_bytes = -(int64_t)collect_interval/gc_steps; - double mark_pause = (clock_now() - t0); + uint64_t mark_pause = jl_hrtime() - t0; #ifdef GC_FINAL_STATS total_mark_time += mark_pause; #endif #ifdef GC_TIME - JL_PRINTF(JL_STDOUT, "GC mark pause %.2f ms | scanned %ld kB = %ld + %ld | stack %d -> %d (wb %d) | remset %d %d %d\n", mark_pause*1000, (scanned_bytes + perm_scanned_bytes)/1024, scanned_bytes/1024, perm_scanned_bytes/1024, saved_mark_sp, mark_sp, wb_activations, last_remset->len, perm_marked, allocd_bytes/1024); + JL_PRINTF(JL_STDOUT, "GC mark pause %.2f ms | scanned %ld kB = %ld + %ld | stack %d -> %d (wb %d) | remset %d %d %d\n", NS2MS(mark_pause), (scanned_bytes + perm_scanned_bytes)/1024, scanned_bytes/1024, perm_scanned_bytes/1024, saved_mark_sp, mark_sp, wb_activations, last_remset->len, perm_marked, allocd_bytes/1024); saved_mark_sp = mark_sp; #endif } int64_t pct = -1, bonus = -1, SAVE = -1, SAVE2 = -1, est_fb = 0, SAVE3 = -1; - double post_time = 0.0, finalize_time = 0.0; + uint64_t post_time = 0, finalize_time = 0; if(mark_sp == 0 || sweeping) { #if defined(GC_TIME) || defined(GC_FINAL_STATS) - double sweep_t0 = clock_now(); + uint64_t sweep_t0 = jl_hrtime(); #endif int64_t actual_allocd = allocd_bytes_since_sweep, promo_bytes = 0; if (!sweeping) { #ifdef GC_TIME - post_time = clock_now(); -#endif - -#ifdef GC_TIME - post_time = clock_now() - post_time; + post_time = jl_hrtime(); #endif post_mark(); +#ifdef GC_TIME + post_time = jl_hrtime() - post_time; +#endif est_fb = live_bytes - scanned_bytes - (sweep_mask == GC_MARKED_NOESC ? perm_scanned_bytes : perm_scanned_bytes) + actual_allocd; promo_bytes = perm_scanned_bytes - last_perm_scanned; @@ -2257,10 +2258,15 @@ void jl_gc_collect(void) pct = actual_allocd ? 
(freed_bytes*100)/actual_allocd : -1; if (sweep_mask == GC_MARKED_NOESC) { - collect_interval = default_collect_interval; - if (freed_bytes < actual_allocd/2) { - quick_count = 15; - collect_interval = 0; + if (freed_bytes >= actual_allocd) { + quick_count--; + } + else { + collect_interval = default_collect_interval; + if (freed_bytes < actual_allocd/2) { + quick_count = 15; + // collect_interval = 0; + } } } else if (sweep_mask == GC_MARKED && freed_bytes < (7*(actual_allocd/10)) && n_pause > 1) { @@ -2282,33 +2288,29 @@ void jl_gc_collect(void) allocd_bytes_since_sweep = 0; freed_bytes = 0; - finalize_time = clock_now(); + finalize_time = jl_hrtime(); run_finalizers(); - finalize_time = clock_now() - finalize_time; + finalize_time = jl_hrtime() - finalize_time; } #if defined(GC_FINAL_STATS) || defined(GC_TIME) - double sweep_pause = clock_now() - sweep_t0; + uint64_t sweep_pause = jl_hrtime() - sweep_t0; #endif #ifdef GC_FINAL_STATS total_sweep_time += sweep_pause - finalize_time - post_time; total_fin_time += finalize_time + post_time; #endif #ifdef GC_TIME - JL_PRINTF(JL_STDOUT, "GC sweep pause %.2f ms live %ld kB (freed %d kB = %d%% of allocd %d kB b/r %ld/%ld) (%.2f ms in post_mark, %.2f ms in %d fin) (marked in %d inc) mask %d | next in %d kB\n", sweep_pause*1000, live_bytes/1024, SAVE2/1024, pct, SAVE3/1024, bonus/1024, SAVE/1024, post_time*1000, finalize_time*1000, n_finalized, inc_count, sweep_mask, -allocd_bytes/1024); + JL_PRINTF(JL_STDOUT, "GC sweep pause %.2f ms live %ld kB (freed %d kB = %d%% of allocd %d kB b/r %ld/%ld) (%.2f ms in post_mark, %.2f ms in %d fin) (marked in %d inc) mask %d | next in %d kB\n", NS2MS(sweep_pause), live_bytes/1024, SAVE2/1024, pct, SAVE3/1024, bonus/1024, SAVE/1024, NS2MS(post_time), NS2MS(finalize_time), n_finalized, inc_count, sweep_mask, -allocd_bytes/1024); int64_t diff = est_fb - SAVE2; - /*JL_PRINTF(JL_STDOUT, "relerr : %d %% (%ld)\n", SAVE2? 100*diff/SAVE2 : -1, diff); - if (lr == 0) lr = diff; - else if (lr != diff && diff < 0) { abort(); }*/ #endif } n_pause++; + double pause = jl_hrtime() - t0; + total_gc_time += pause; #ifdef GC_FINAL_STATS - double pause = clock_now() - t0; - total_gc_time += pause*1000*1000*1000; // i don't think ns precision is really relevant here - pause -= finalize_time; // do not count the first pause as it is always a full collection - max_pause = (max_pause < pause && n_pause > 1) ? pause : max_pause; + // max_pause = (max_pause < pause && n_pause > 1) ? 
pause : max_pause; #endif JL_SIGATOMIC_END(); jl_in_gc = 0; @@ -2462,7 +2464,7 @@ DLLEXPORT void *alloc_4w(void) return pool_alloc(&pools[2]); #endif } -#define NS_TO_S(t) ((double)(t/1000)/(1000*1000)) + #ifdef GC_FINAL_STATS static double process_t0; #include From 611a471ba95549385e980da2e8a9d41eaf82d22a Mon Sep 17 00:00:00 2001 From: Oscar Blumberg Date: Wed, 3 Sep 2014 15:36:05 +0200 Subject: [PATCH 07/17] count external memory alloc again --- src/gc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/gc.c index a9dea3f29e671..792eee4c2836e 100644 --- a/src/gc.c +++ b/src/gc.c @@ -550,7 +550,7 @@ static inline int maybe_collect(void) DLLEXPORT void *jl_gc_counted_malloc(size_t sz) { maybe_collect(); - // allocd_bytes += sz; + allocd_bytes += sz; void *b = malloc(sz); if (b == NULL) jl_throw(jl_memory_exception); @@ -560,13 +560,13 @@ DLLEXPORT void *jl_gc_counted_malloc(size_t sz) DLLEXPORT void jl_gc_counted_free(void *p, size_t sz) { free(p); - // freed_bytes += sz; + freed_bytes += sz; } DLLEXPORT void *jl_gc_counted_realloc(void *p, size_t sz) { maybe_collect(); - // allocd_bytes += ((sz+1)/2); // NOTE: wild guess at growth amount + allocd_bytes += ((sz+1)/2); // NOTE: wild guess at growth amount void *b = realloc(p, sz); if (b == NULL) jl_throw(jl_memory_exception); @@ -576,7 +576,7 @@ DLLEXPORT void *jl_gc_counted_realloc(void *p, size_t sz) DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size_t sz) { maybe_collect(); - // allocd_bytes += (sz-old); + allocd_bytes += (sz-old); void *b = realloc(p, sz); if (b == NULL) jl_throw(jl_memory_exception); From 10a32ffa7a2206e1d4b592d4468fd6504156ddf5 Mon Sep 17 00:00:00 2001 From: Oscar Blumberg Date: Sun, 7 Sep 2014 15:20:44 +0200 Subject: [PATCH 08/17] add peak resident memory to perf tests, slight heuristic adjustment, not there yet.
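The @maxrss macro added below ccalls getrusage into a raw Int64 buffer and reads the fifth 64-bit word, which on Linux is ru_maxrss (peak resident set size, in kilobytes). For reference, the equivalent plain C — a minimal sketch, not part of the patch — looks like this:

    #include <stdio.h>
    #include <sys/resource.h>

    /* ru_maxrss is the peak resident set size. Linux reports it in
       kilobytes (Darwin reports bytes, which is one reason the Julia
       macro below is gated on @linux?). */
    int main(void)
    {
        struct rusage ru;
        if (getrusage(RUSAGE_SELF, &ru) == 0)
            printf("peak RSS: %.2f MB\n", ru.ru_maxrss / 1024.0);
        return 0;
    }

Reading the struct as a flat Int64 array works because ru_utime and ru_stime occupy the first four words on 64-bit Linux; a proper struct layout would be needed on other platforms.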
--- src/gc.c | 5 ++--- test/perf/kernel/perf.jl | 1 + test/perf/micro/perf.jl | 2 ++ test/perf/perfutil.jl | 12 ++++++++++++ test/perf/shootout/perf.jl | 1 + 5 files changed, 18 insertions(+), 3 deletions(-) diff --git a/src/gc.c index 792eee4c2836e..513d0dd2d4e38 100644 --- a/src/gc.c +++ b/src/gc.c @@ -935,7 +935,7 @@ static inline void bzero_small_a16(char *p, size_t sz) #ifndef __SSE__ memset(p, 0, sz); #else - __m128i c = _mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0); + __m128i c = _mm_set1_epi8(0); for(int i=0; i < sz/16; i++) _mm_store_si128((__m128i*)p, c); #endif @@ -2262,11 +2262,10 @@ void jl_gc_collect(void) quick_count--; } else { - collect_interval = default_collect_interval; if (freed_bytes < actual_allocd/2) { quick_count = 15; // collect_interval = 0; - } + } else collect_interval = default_collect_interval; } } else if (sweep_mask == GC_MARKED && freed_bytes < (7*(actual_allocd/10)) && n_pause > 1) { diff --git a/test/perf/kernel/perf.jl index bfe1cdf94b791..f6773212cdd1e 100644 --- a/test/perf/kernel/perf.jl +++ b/test/perf/kernel/perf.jl @@ -217,3 +217,4 @@ logical_y = map(iseven, 1:length(x)) @timeit (for n=1:100 add1!(x,logical_y) end) "add1_logical" "Increment x_i if y_i is true" @timeit (for n=1:100 devec_add1_logical!(x,logical_y) end) "devec_add1_logical" "Devectorized increment x_i if y_i is true" +@maxrss "kernel" diff --git a/test/perf/micro/perf.jl index 943bc9992adfe..76e3822c5c812 100644 --- a/test/perf/micro/perf.jl +++ b/test/perf/micro/perf.jl @@ -148,3 +148,5 @@ end printfd(1) @timeit printfd(100000) "printfd" "Printing to a file descriptor" end + +@maxrss "micro" diff --git a/test/perf/perfutil.jl index 41a4c97c3deb0..6a56c9a35623c 100644 --- a/test/perf/perfutil.jl +++ b/test/perf/perfutil.jl @@ -89,6 +89,18 @@ macro timeit_init(ex,init,name,desc,group...) end end +@linux? macro maxrss(name) + quote + rus = Array(Int64, div(144,8)) + fill!(rus, 0x0) + res = ccall(:getrusage, Int32, (Int32, Ptr{Void}), 0, rus) + if res == 0 + mx = rus[5]/1024 + @printf "julia,%s.mem,%f,%f,%f,%f\n" $name mx mx mx 0 + end + end +end : macro maxrss(name) end + # seed rng for more consistent timings srand(1776) diff --git a/test/perf/shootout/perf.jl index ce62be5b35c2c..a74509b5c6233 100644 --- a/test/perf/shootout/perf.jl +++ b/test/perf/shootout/perf.jl @@ -41,3 +41,4 @@ include("revcomp.jl") include("spectralnorm.jl") @timeit spectralnorm() "spectralnorm" "Eigenvalue using the power method" +@maxrss "shootout" From 977fa6c7d5a01673ce0cabb42d364a29c3af6d56 Mon Sep 17 00:00:00 2001 From: Oscar Blumberg Date: Tue, 9 Sep 2014 18:54:00 +0200 Subject: [PATCH 09/17] Prevent module-level assignments from clobbering the remembered set. A bit more tweaking of the collection heuristics. We are now faster/less memory hungry on almost every benchmark of the micro, kernel & shootout suite.
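Previously every assignment to a global pushed the parent object through gc_queue_root, so module-level code filled the remset with whole modules. The new gc_wb_binding/gc_queue_binding pair records just the written binding cell in a dedicated rem_bindings list instead. A sketch of the idea (assuming, as in the patch, that the binding is a GC buffer whose mark word sits one pointer before the jl_binding_t — jl_checked_assignment passes exactly that header address):

    #include <stdint.h>

    extern void gc_queue_binding(void *bnd);  /* added below; pushes onto rem_bindings */

    /* Old, marked binding storing a pointer to a young/clean value:
       remember only this binding cell, not the enclosing module. */
    static inline void wb_binding_sketch(void *hdr, void *val)
    {
        if ((*(uintptr_t*)hdr & 3) == 1 && (*(uintptr_t*)val & 1) == 0)
            gc_queue_binding(hdr);
    }

At collection time only b->value of each remembered binding needs to be re-scanned, which is what the new rem_bindings loop in jl_gc_collect does.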
--- src/gc.c | 37 ++++++++++++++++++++++++++++--------- src/julia.h | 9 +++++++++ src/module.c | 2 +- test/perf/perfutil.jl | 13 +++++++++---- 4 files changed, 47 insertions(+), 14 deletions(-) diff --git a/src/gc.c b/src/gc.c index 513d0dd2d4e38..9cc95a60304dd 100644 --- a/src/gc.c +++ b/src/gc.c @@ -1359,6 +1359,7 @@ void grow_mark_stack(void) int max_msp = 0; #ifdef GC_INC static arraylist_t tasks; +static arraylist_t rem_bindings; static arraylist_t _remset[2]; static arraylist_t *remset = &_remset[0]; static arraylist_t *last_remset = &_remset[1]; @@ -1373,10 +1374,15 @@ void reset_remset(void) DLLEXPORT void gc_queue_root(void *p) { void *ptr = (void*)((uintptr_t)p & ~(uintptr_t)1); - if (gc_bits(ptr) == GC_QUEUED) return; + if (gc_bits(ptr) == GC_QUEUED) return; // TODO check if still needed gc_bits(ptr) = GC_QUEUED; arraylist_push(remset, p); } +void gc_queue_binding(void *bnd) +{ + gc_bits(bnd) = GC_QUEUED; + arraylist_push(&rem_bindings, (void*)((void**)bnd + 1)); +} static int push_root(jl_value_t *v, int d, int); static inline int gc_push_root(void *v, int d) @@ -2155,6 +2161,16 @@ void jl_gc_collect(void) gc_bits(ptr) = GC_MARKED; push_root(ptr, 0, gc_bits(ptr)); } + int n_bnd_refyoung = 0; + for (int i = 0; i < rem_bindings.len; i++) { + void *ptr = rem_bindings.items[i]; + gc_bits(gc_val_buf(ptr)) = GC_MARKED; + if (gc_push_root(((jl_binding_t*)ptr)->value, 0) == GC_MARKED_NOESC) { + rem_bindings.items[n_bnd_refyoung] = ptr; + n_bnd_refyoung++; + } + } + rem_bindings.len = n_bnd_refyoung; perm_scanned_bytes = SA; pre_mark(); @@ -2212,7 +2228,7 @@ void jl_gc_collect(void) prepare_sweep(); - if (quick_count >= 10) { + if (quick_count >= 30) { sweep_mask = GC_MARKED; // next collection is a full one gc_steps = gc_inc_steps; quick_count = 0; @@ -2258,21 +2274,23 @@ void jl_gc_collect(void) pct = actual_allocd ? 
(freed_bytes*100)/actual_allocd : -1; if (sweep_mask == GC_MARKED_NOESC) { + collect_interval = default_collect_interval/4; if (freed_bytes >= actual_allocd) { quick_count--; } else { - if (freed_bytes < actual_allocd/2) { - quick_count = 15; - // collect_interval = 0; - } else collect_interval = default_collect_interval; + if (freed_bytes/quick_count < actual_allocd/30) { + quick_count = 50; + collect_interval = default_collect_interval; + } } } - else if (sweep_mask == GC_MARKED && freed_bytes < (7*(actual_allocd/10)) && n_pause > 1) { + else if (freed_bytes < (7*(actual_allocd/10)) && n_pause > 1) { if (collect_interval <= 2*(max_collect_interval/5)) { - collect_interval = 5*(collect_interval/2); - quick_count = 15; + // if (prev_sweep_mask == GC_MARKED) + collect_interval = 5*(collect_interval/2); } + quick_count = 50; } else { collect_interval = default_collect_interval; @@ -2529,6 +2547,7 @@ void jl_gc_init(void) #endif #ifdef GC_INC arraylist_new(&tasks, 0); + arraylist_new(&rem_bindings, 0); arraylist_new(remset, 0); arraylist_new(last_remset, 0); #endif diff --git a/src/julia.h b/src/julia.h index f86347d94c2ff..1b61c9bd28ec1 100644 --- a/src/julia.h +++ b/src/julia.h @@ -1377,9 +1377,18 @@ extern DLLEXPORT jl_compileropts_t jl_compileropts; #define JL_COMPILEROPT_DUMPBITCODE_OFF 2 DLLEXPORT void gc_queue_root(void *root); +void gc_queue_binding(void *bnd); void gc_setmark_buf(void *buf, int); DLLEXPORT void gc_wb_slow(void* parent, void* ptr); +static inline void gc_wb_binding(void *bnd, void *val) +{ + #ifdef GC_INC + if (__unlikely((*(uintptr_t*)bnd & 3) == 1 && (*(uintptr_t*)val & 1) == 0)) + gc_queue_binding(bnd); + #endif +} + static inline void gc_wb_fwd(void* parent, void* ptr) { #ifdef GC_INC diff --git a/src/module.c b/src/module.c index 43bc764af62d8..fc71d5d60c7da 100644 --- a/src/module.c +++ b/src/module.c @@ -374,7 +374,7 @@ DLLEXPORT void jl_checked_assignment(jl_binding_t *b, jl_value_t *rhs) JL_PRINTF(JL_STDERR,"Warning: redefining constant %s\n",b->name->name); } } - gc_wb_fwd(((void**)b)-1, rhs); + gc_wb_binding(((void**)b)-1, rhs); b->value = rhs; } diff --git a/test/perf/perfutil.jl b/test/perf/perfutil.jl index 6a56c9a35623c..40d0dd2447793 100644 --- a/test/perf/perfutil.jl +++ b/test/perf/perfutil.jl @@ -1,4 +1,5 @@ -const ntrials = 5 +const mintrials = 5 +const mintime = 2000.0 print_output = isempty(ARGS) codespeed = length(ARGS) > 0 && ARGS[1] == "codespeed" @@ -62,13 +63,17 @@ end macro timeit(ex,name,desc,group...) quote - t = zeros(ntrials) - for i=0:ntrials + t = Float64[] + tot = 0.0 + i = 0 + while i < mintrials || tot < mintime e = 1000*(@elapsed $(esc(ex))) + tot += e if i > 0 # warm up on first iteration - t[i] = e + push!(t, e) end + i += 1 end @output_timings t $name $desc $group end From 5c9ec6cf7da715869410bc122ae4d71e6be87b2a Mon Sep 17 00:00:00 2001 From: Oscar Blumberg Date: Sun, 14 Sep 2014 12:00:41 +0200 Subject: [PATCH 10/17] Slight allocation optimizations. Also another cleanup round. 
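The main allocation win here is remembering, per region, the lowest freemap word that can still contain a free page (heaps_lb), so malloc_page no longer rescans words that are known to be exhausted; free_page lowers the bound again when a page comes back. A minimal sketch of that first-fit search under the same scheme (the function and parameter names here are illustrative, not the patch's):

    #include <stdint.h>

    /* One freemap bit per page, 32 pages per word. Words below *lb are
       known to be zero (fully allocated), so the scan starts there. */
    static int find_free_word(const uint32_t *freemap, int nwords, int *lb)
    {
        int i;
        for (i = *lb; i < nwords; i++)
            if (freemap[i])   /* at least one free page in this group of 32 */
                break;
        if (*lb < i)
            *lb = i;          /* everything below i is now known to be full */
        return i < nwords ? i : -1;
    }

The other change in the same spirit is newobj dispatching small objects straight to the fixed-size alloc_2w/alloc_3w/alloc_4w entry points instead of going through the generic size-class lookup.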
--- src/gc.c | 158 +++++++++---------------------------------- src/julia.h | 2 - src/julia_internal.h | 12 +++- 3 files changed, 44 insertions(+), 128 deletions(-) diff --git a/src/gc.c b/src/gc.c index 9cc95a60304dd..54f26503621bf 100644 --- a/src/gc.c +++ b/src/gc.c @@ -46,6 +46,7 @@ typedef struct { #define HEAP_COUNT 64 static region_t *heaps[HEAP_COUNT] = {NULL}; +static int heaps_lb[HEAP_COUNT] = {0}; typedef struct _bigval_t { struct _bigval_t *next; @@ -91,15 +92,11 @@ typedef struct _pool_t { gcval_t *freelist ; uint16_t end_offset; // avoid to compute this at each allocation uint16_t osize; - union { - struct _gcpage_t *pages; - struct { - uint16_t allocd : 1; - uint16_t linear : 1; - }; - }; - struct _gcpage_t *needsweep; uint16_t nfree; + struct { + uint16_t allocd : 1; + uint16_t linear : 1; + }; } pool_t; /*#ifdef _P64 @@ -190,8 +187,6 @@ static size_t long_collect_interval; static int gc_steps; #define N_POOLS 42 static __attribute__((aligned (64))) pool_t norm_pools[N_POOLS]; -static pool_t ephe_pools[N_POOLS]; -//static pool_t *pools = &norm_pools[0]; #define pools norm_pools static int64_t total_allocd_bytes = 0; @@ -469,12 +464,12 @@ static __attribute__((noinline)) void *malloc_page(void) #endif memset(heap->freemap, 0xff, REGION_PG_COUNT/8); } - heap_i++; - for(i = 0; i < REGION_PG_COUNT/32; i++) { + for(i = heaps_lb[heap_i]; i < REGION_PG_COUNT/32; i++) { if (heap->freemap[i]) break; } if (i == REGION_PG_COUNT/32) { // heap full + heap_i++; continue; } break; @@ -483,6 +478,8 @@ static __attribute__((noinline)) void *malloc_page(void) jl_printf(JL_STDERR, "increase HEAP_COUNT or allocate less memory\n"); abort(); } + if (heaps_lb[heap_i] < i) + heaps_lb[heap_i] = i; int j = (ffs(heap->freemap[i]) - 1); heap->freemap[i] &= ~(uint32_t)(1 << j); if (j == 0) { // reserve a page for metadata (every 31 data pages) @@ -530,6 +527,7 @@ static inline void free_page(void *p) madvise(&heap->pages[pg_idx], GC_PAGE_SZ, MADV_DONTNEED); #endif } + if (heaps_lb[i] > pg_idx/32) heaps_lb[i] = pg_idx/32; current_pg_count--; } @@ -1004,9 +1002,8 @@ static inline void *__pool_alloc(pool_t* p, int osize, int end_offset) p->nfree--; p->allocd = 1; end = &(GC_PAGE_DATA(v)[end_offset]); - if ((v == end) | (!p->linear)) { + if (__unlikely((v == end) | (!p->linear))) { _update_freelist(p, v->next); - p->freelist = v->next; } else { p->freelist = (char*)v + osize; } @@ -1129,24 +1126,18 @@ static gcval_t** sweep_page(pool_t* p, gcpage_t* pg, gcval_t **pfl, int sweep_ma prev_pfl = pfl; if (pg->gc_bits == GC_MARKED) { // skip - if (sweep_mask == GC_MARKED_NOESC && (!pg->allocd/* || pg->nmarked >= (8*obj_per_page)/10)*/)) { - // pg->allocd = 0; - if (!pg->allocd) { - pg_skpd++; - freedall = 0; - if (pg->fl_begin_offset != (uint16_t)-1) { - *pfl = (gcval_t*)PAGE_PFL_BEG(pg); - pfl = prev_pfl = PAGE_PFL_END(pg); - } - goto free_page; + if (sweep_mask == GC_MARKED_NOESC && !pg->allocd) { + pg_skpd++; + freedall = 0; + if (pg->fl_begin_offset != (uint16_t)-1) { + *pfl = (gcval_t*)PAGE_PFL_BEG(pg); + pfl = prev_pfl = PAGE_PFL_END(pg); } + goto free_page; } pg->allocd = 0; } else if(pg->gc_bits == GC_CLEAN) { - // if (whole_page) - // p->nfree += obj_per_page; // overestimation - // else pg->allocd = 0; goto free_page; } @@ -1178,11 +1169,9 @@ static gcval_t** sweep_page(pool_t* p, gcpage_t* pg, gcval_t **pfl, int sweep_ma } else if ((sweep_mask & bits) == sweep_mask) gc_bits(v) = GC_CLEAN; - // else { inc_sat(age, PROMOTE_AGE); ages[obj_i/4] &= ~(3 << sh); ages[obj_i/4] |= age << sh; - // } freedall = 
0; } v = (gcval_t*)((char*)v + osize); @@ -1194,8 +1183,6 @@ static gcval_t** sweep_page(pool_t* p, gcpage_t* pg, gcval_t **pfl, int sweep_ma pg->nfree = pg_nfree; page_done++; free_page: - // nfreed += this_page_nfree; - // pg->nfree = this_page_nfree; if (sweep_mask == GC_MARKED) pg->nmarked = 0; pg_freedall += freedall; @@ -1206,14 +1193,10 @@ static gcval_t** sweep_page(pool_t* p, gcpage_t* pg, gcval_t **pfl, int sweep_ma pg_total++; if (freedall) { if (prev_sweep_mask == GC_MARKED_NOESC && lazy_freed_pages <= default_collect_interval/4*4096) { - gcval_t *begin = reset_page(p, pg, 0x1234); + gcval_t *begin = reset_page(p, pg, 0); *prev_pfl = begin; pfl = (gcval_t**)((char*)begin + ((int)pg->nfree - 1)*osize); - begin->next = (gcval_t*)0xdeadbeef; - // jl_printf(JL_STDOUT, "SZ: 0x%lx 0x%lx 0x%lx\n", begin, prev_pfl, ((intptr_t)pfl - (intptr_t)begin)); - // if (!isinfl(p->freelist, begin)) - // abort(); - // ppg = &pg->next; + begin->next = (gcval_t*)0; lazy_freed_pages++; } else { @@ -1231,30 +1214,12 @@ static gcval_t** sweep_page(pool_t* p, gcpage_t* pg, gcval_t **pfl, int sweep_ma } else { pg->gc_bits = GC_MARKED; - // ppg = &pg->next; pg->linear = 0; nfree += pg->nfree; } - /* if (should_timeout() && nextpg) { - pg->next = NULL; - pg = nextpg; - break; - }*/ - // scanned_bytes += GC_PAGE_SZ; - // pg = nextpg; - //gcpage_t* pgs = p->pages; - // *ppg = p->pages; - /* p->pages = p->needsweep; - if (pg == NULL) { - p->needsweep = NULL; - } else { - p->needsweep = pg; - }*/ + skipped_pages += pg_skpd; total_pages += pg_total; - // *pfl = NULL; - /* if (stats[0] + stats[1] + stats[2] + stats[2] > 0) - jl_printf(JL_STDOUT, "Pool : %d %d %d %d\n", stats[0], stats[1], stats[2], stats[3]);*/ freed_bytes += (nfree - old_nfree)*osize; return pfl; } @@ -1310,13 +1275,8 @@ static int gc_sweep_inc(int sweep_mask) #endif for (int i = 0; i < HEAP_COUNT; i++) { if (heaps[i]) - sweep_pool_region(heaps[i], sweep_mask); + /*finished &= */sweep_pool_region(heaps[i], sweep_mask); } - /* for(i=0; i < N_POOLS; i++) { - sweep_pool(&norm_pools[i], sweep_mask); - finished &= !norm_pools[i].needsweep; - }*/ - finished = 1; #ifdef GC_INC check_timeout = ct; #endif @@ -1374,12 +1334,13 @@ void reset_remset(void) DLLEXPORT void gc_queue_root(void *p) { void *ptr = (void*)((uintptr_t)p & ~(uintptr_t)1); - if (gc_bits(ptr) == GC_QUEUED) return; // TODO check if still needed + assert(gc_bits(ptr) != GC_QUEUED); gc_bits(ptr) = GC_QUEUED; arraylist_push(remset, p); } void gc_queue_binding(void *bnd) { + assert(gc_bits(bnd) != GC_QUEUED); gc_bits(bnd) = GC_QUEUED; arraylist_push(&rem_bindings, (void*)((void**)bnd + 1)); } @@ -1473,7 +1434,6 @@ static void gc_mark_task_stack(jl_task_t *ta, int d) if (ta->stkbuf != NULL || ta == jl_current_task) { if (ta->stkbuf != NULL) { gc_setmark_buf(ta->stkbuf, gc_bits(ta)); - // scanned_bytes += ta->ssize + 2*4096 - 1; } #ifdef COPY_STACKS ptrint_t offset; @@ -1507,15 +1467,7 @@ __attribute__((noinline)) static void gc_mark_task(jl_task_t *ta, int d) gc_push_root(ta->exception, d); if (ta->start) gc_push_root(ta->start, d); if (ta->result) gc_push_root(ta->result, d); -#ifdef GC_INC - // if (1 || mark_mode == GC_MARKED_NOESC) { - gc_mark_task_stack(ta, d); - /* } else { - arraylist_push(&tasks, (void*)ta); - }*/ -#else gc_mark_task_stack(ta, d); -#endif } @@ -1526,12 +1478,11 @@ DLLEXPORT void jl_gc_lookfor(jl_value_t *v) { lookforme = v; } */ #define MAX_MARK_DEPTH 400 -// returns 1 if v is young after this marking +// returns the new gc_bits of v static int push_root(jl_value_t 
*v, int d, int bits) { assert(v != NULL); jl_value_t *vt = (jl_value_t*)gc_typeof(v); - // gc_setmark(v); int refyoung = 0; if (vt == (jl_value_t*)jl_weakref_type) { @@ -1541,7 +1492,6 @@ static int push_root(jl_value_t *v, int d, int bits) if ((jl_is_datatype(vt) && ((jl_datatype_t*)vt)->pointerfree)) { int sz = jl_datatype_size(vt); bits = gc_setmark(v, sz, GC_MARKED_NOESC); - // scanned_bytes += allocdsz(sz); goto ret; } int marked = 0; @@ -1598,7 +1548,7 @@ static int push_root(jl_value_t *v, int d, int bits) } if (a->ptrarray && a->data!=NULL) { size_t l = jl_array_len(a); - if (0 && l > 100000 && d > MAX_MARK_DEPTH-10) { + if (l > 100000 && d > MAX_MARK_DEPTH-10) { // don't mark long arrays at high depth, to try to avoid // copying the whole array into the mark queue goto queue_the_root; @@ -1608,7 +1558,6 @@ static int push_root(jl_value_t *v, int d, int bits) int has_young_elt = 0; for(size_t i=0; i < l; i++) { jl_value_t *elt = ((jl_value_t**)data)[i]; - // scanned_bytes += sizeof(void*); if (elt != NULL) { verify_parent("array", v, &((jl_value_t**)data)[i], "elem(%d)", i); refyoung |= gc_push_root(elt, d); @@ -1624,16 +1573,14 @@ static int push_root(jl_value_t *v, int d, int bits) else if (vt == (jl_value_t*)jl_module_type) { MARK(v, bits = gc_setmark(v, sizeof(jl_module_t), GC_MARKED_NOESC)); refyoung |= gc_mark_module((jl_module_t*)v, d); - // scanned_bytes += allocdsz(sizeof(jl_module_t)); } else if (vt == (jl_value_t*)jl_task_type) { MARK(v, bits = gc_setmark(v, sizeof(jl_task_t), GC_MARKED_NOESC)); gc_mark_task((jl_task_t*)v, d); refyoung = GC_MARKED_NOESC; - // scanned_bytes += allocdsz(sizeof(jl_task_t)); } else if(vt == (jl_value_t*)jl_symbol_type) { - gc_setmark_other(v, GC_MARKED); // symbols are not pooled + gc_setmark_other(v, GC_MARKED); // symbols have their own allocator } else if( #ifdef GC_VERIFY @@ -1652,7 +1599,6 @@ static int push_root(jl_value_t *v, int d, int bits) int ci = 0; for(int i=0; i < nf; i++) { if (fields[i].isptr) { - // scanned_bytes += sizeof(void*); jl_value_t **slot = (jl_value_t**)((char*)v + fields[i].offset + sizeof(void*)); jl_value_t *fld = *slot; if (fld) { @@ -1661,9 +1607,6 @@ static int push_root(jl_value_t *v, int d, int bits) refyoung |= gc_push_root(fld, d); } } - else { - // scanned_bytes += jl_field_size(dt, i); - } } // while(ci) // refyoung |= gc_push_root(children[--ci], d); @@ -1680,18 +1623,14 @@ static int push_root(jl_value_t *v, int d, int bits) #ifdef GC_VERIFY if (verifying) return; #endif - // objprofile_count(jl_typeof(v), gc_bits(v) == GC_MARKED ? 1 : 0, ); if ((bits == GC_MARKED) && (refyoung == GC_MARKED_NOESC)) { - /*for (int i = 0; i < remset.len; i++) { - if (remset.items[i] == v) - abort(); - }*/ arraylist_push(remset, v); } return bits; +#undef MARK + queue_the_root: - scanned_bytes += 0;//sizeof(void*); if(mark_sp >= mark_stack_size) grow_mark_stack(); mark_stack[mark_sp++] = (jl_value_t*)v; max_msp = max_msp > mark_sp ? 
max_msp : mark_sp; @@ -1702,7 +1641,7 @@ static void visit_mark_stack_inc(int mark_mode) { while(mark_sp > 0 && !should_timeout()) { gcval_t* v = (gcval_t*)mark_stack[--mark_sp]; - // assert(gc_bits(v) == GC_QUEUED || gc_bits(v) == GC_MARKED || gc_bits(v) == GC_MARKED_NOESC); + assert(gc_bits(v) == GC_QUEUED || gc_bits(v) == GC_MARKED || gc_bits(v) == GC_MARKED_NOESC); push_root(v, 0, gc_bits(v)); } } @@ -1714,7 +1653,7 @@ static void visit_mark_stack(int mark_mode) check_timeout = 0; #endif visit_mark_stack_inc(mark_mode); - // assert(!mark_sp); + assert(!mark_sp); #ifdef GC_INC check_timeout = ct; #endif @@ -1763,8 +1702,6 @@ static void pre_mark(void) gc_push_root(to_finalize.items[i], 0); } - //if (inc_count > 1 || quick_count > 1) return; // the following roots are constant and will stay marked in between increments - // if (prev_sweep_mask == GC_MARKED) jl_mark_box_caches(); gc_push_root(jl_unprotect_stack_func, 0); gc_push_root(jl_bottom_func, 0); @@ -1850,9 +1787,6 @@ static void post_mark(void) if (!gc_marked(v)) { jl_value_t *fin = finalizer_table.table[i+1]; if (gc_typeof(fin) == (jl_value_t*)jl_voidpointer_type) { - /* jl_printf(JL_STDOUT, "CFINA: "); - jl_static_show(JL_STDOUT, v); - jl_printf(JL_STDOUT, "\n");*/ void *p = jl_unbox_voidpointer(fin); if (p) ((void (*)(void*))p)(jl_data_ptr(v)); @@ -1861,9 +1795,6 @@ static void post_mark(void) } gc_push_root(v, 0); schedule_finalization(v); - //jl_printf(JL_STDOUT, "FINA: "); - //jl_static_show(JL_STDOUT, v); - //jl_printf(JL_STDOUT, "\n"); n_finalized++; } gc_push_root(finalizer_table.table[i+1], 0); @@ -2054,8 +1985,8 @@ int64_t diff_gc_total_bytes(void) } void sync_gc_total_bytes(void) {last_gc_total_bytes = jl_gc_total_bytes();} -void jl_gc_ephemeral_on(void) { }//pools = &ephe_pools[0]; } -void jl_gc_ephemeral_off(void) { }//pools = &norm_pools[0]; } +void jl_gc_ephemeral_on(void) { } +void jl_gc_ephemeral_off(void) { } #if defined(MEMPROFILE) static void all_pool_stats(void); @@ -2110,14 +2041,6 @@ static void gc_mark_task_stack(jl_task_t*,int); void prepare_sweep(void) { - for(int i = 0; i < 2*N_POOLS; i++) { - pool_t *p = i < N_POOLS ? 
&norm_pools[i] : &ephe_pools[i - N_POOLS]; - /* if (p->pages) { - p->needsweep = p->pages; - p->pages = NULL; - p->freelist = NULL; - }*/ - } } #ifdef GC_INC @@ -2403,7 +2326,6 @@ void jl_gc_collect(void) void *allocb(size_t sz) { - // jl_printf(JL_STDOUT, "BUFF relerr: %d\n", sz); buff_t *b; sz += sizeof(void*); #ifdef MEMDEBUG @@ -2514,16 +2436,8 @@ void jl_gc_init(void) for(i=0; i < N_POOLS; i++) { assert(szc[i] % 4 == 0); norm_pools[i].osize = szc[i]; - norm_pools[i].pages = NULL; norm_pools[i].freelist = NULL; - norm_pools[i].needsweep = NULL; norm_pools[i].end_offset = ((GC_PAGE_SZ/szc[i]) - 1)*szc[i]; - - ephe_pools[i].osize = szc[i]; - ephe_pools[i].pages = NULL; - ephe_pools[i].freelist = NULL; - ephe_pools[i].needsweep = NULL; - ephe_pools[i].end_offset = ((GC_PAGE_SZ/szc[i]) - 1)*szc[i]; } assert(offsetof(gcpages_t, data) == GC_PAGE_SZ); @@ -2630,12 +2544,6 @@ static void all_pool_stats(void) tp += np; nold += nol; noldbytes += nol*norm_pools[i].osize; - /* - b = pool_stats(&ephe_pools[i], &w, &np); - nb += b; - no += (b/ephe_pools[i].osize); - tw += w; - tp += np;*/ } JL_PRINTF(JL_STDOUT, "%d objects (%d%% old), %d kB (%d%% old) total allocated, %d total fragments (%d%% overhead), in %d pages\n", diff --git a/src/julia.h b/src/julia.h index 1b61c9bd28ec1..a889980099ab9 100644 --- a/src/julia.h +++ b/src/julia.h @@ -1087,8 +1087,6 @@ DLLEXPORT int64_t jl_gc_total_bytes(void); DLLEXPORT uint64_t jl_gc_total_hrtime(void); int64_t diff_gc_total_bytes(void); void sync_gc_total_bytes(void); -void jl_gc_ephemeral_on(void); -void jl_gc_ephemeral_off(void); DLLEXPORT void jl_gc_collect(void); void jl_gc_preserve(jl_value_t *v); void jl_gc_unpreserve(void); diff --git a/src/julia_internal.h b/src/julia_internal.h index 5913d0fbe6508..72f57087328b3 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -9,7 +9,17 @@ extern "C" { STATIC_INLINE jl_value_t *newobj(jl_value_t *type, size_t nfields) { - jl_value_t *jv = (jl_value_t*)allocobj((1+nfields) * sizeof(void*)); + jl_value_t *jv = NULL; + switch (nfields) { + case 1: + jv = (jl_value_t*)alloc_2w(); break; + case 2: + jv = (jl_value_t*)alloc_3w(); break; + case 3: + jv = (jl_value_t*)alloc_4w(); break; + default: + jv = (jl_value_t*)allocobj((1+nfields) * sizeof(void*)); + } jv->type = type; return jv; } From 9d4568d3582eab03f9ef46e740cc3e8e4aac3870 Mon Sep 17 00:00:00 2001 From: Oscar Blumberg Date: Sun, 14 Sep 2014 12:29:38 +0200 Subject: [PATCH 11/17] More cleanup --- src/dump.c | 2 - src/gc.c | 165 +++++++++++++++++++---------------------------------- 2 files changed, 58 insertions(+), 109 deletions(-) diff --git a/src/dump.c b/src/dump.c index bd1535a569cd9..b0dcbdbaaf3e1 100644 --- a/src/dump.c +++ b/src/dump.c @@ -1211,10 +1211,8 @@ jl_value_t *jl_uncompress_ast(jl_lambda_info_t *li, jl_value_t *data) src.size = jl_array_len(bytes); int en = jl_gc_is_enabled(); jl_gc_disable(); - jl_gc_ephemeral_on(); (void)jl_deserialize_value(&src); // skip ret type jl_value_t *v = jl_deserialize_value(&src); - jl_gc_ephemeral_off(); if (en) jl_gc_enable(); tree_literal_values = NULL; diff --git a/src/gc.c b/src/gc.c index 54f26503621bf..378e22e15e2b5 100644 --- a/src/gc.c +++ b/src/gc.c @@ -36,11 +36,7 @@ extern "C" { #define REGION_PG_COUNT 8*4096 typedef struct { - // union { - // uint32_t freemap[REGION_PG_COUNT/32]; - uint32_t freemap[SYS_PAGE_SZ/4]; - // char _pad[SYS_PAGE_SZ]; - // }; + uint32_t freemap[SYS_PAGE_SZ/4]; char pages[REGION_PG_COUNT][GC_PAGE_SZ]; } region_t; @@ -99,13 +95,6 @@ typedef struct _pool_t { }; } pool_t; 
-/*#ifdef _P64 -#define GC_PAGE_SZ (1536*sizeof(void*))//bytes -#else*/ - -// the cookie field must be before the page data -// becaue we will be doing GC_PAGE(v)->cookie for -// some v not in a page and it must not segfault typedef struct _gcpage_t { struct { uint16_t pool_n : 8; @@ -113,7 +102,7 @@ typedef struct _gcpage_t { // this is a bitwise | of all gc_bits in this page uint16_t gc_bits : 2; // if this is 1, the freelist in this page contains only 2 cells. - // one is the first free cell, it points to the last cell of the page + // the first free cell and the last cell of the page // every cell in between is free uint16_t linear : 1; }; @@ -122,7 +111,6 @@ typedef struct _gcpage_t { uint16_t osize; uint16_t fl_begin_offset; uint16_t fl_end_offset; - // struct _gcpage_t **prev; // point to the next field of the previous page uint32_t data_offset; // this is not strictly necessary char age[2*GC_PAGE_SZ/(8*8)]; // two bits per object } gcpage_t; @@ -154,27 +142,16 @@ typedef struct { // GC knobs and self-measurement variables static int64_t last_gc_total_bytes = 0; -/*static size_t allocd_bytes = 0; -static int64_t total_allocd_bytes = 0; -static size_t allocd_bytes_since_sweep = 0; -static size_t freed_bytes = 0; -static uint64_t total_gc_time=0; -static size_t live_bytes = 0; -static size_t scanned_bytes = 0; -static size_t scanned_bytes_goal; -static size_t current_pg_count = 0; -static size_t max_pg_count = 0;*/ #ifdef GC_INC static int gc_inc_steps = 1; -static int gc_quick_steps = 16; -static int gc_sweep_steps = 1; +static int gc_quick_steps = 32; +//static int gc_sweep_steps = 1; #else static const int gc_inc_steps = 1; #endif #ifdef _P64 #define default_collect_interval (5600*1024*sizeof(void*)) -//#define default_collect_interval (560*1024*sizeof(void*)) static size_t max_collect_interval = 1250000000UL; #else #define default_collect_interval (3200*1024*sizeof(void*)) @@ -183,7 +160,7 @@ static size_t max_collect_interval = 500000000UL; // keep those 3 together static int64_t allocd_bytes; static size_t collect_interval; -static size_t long_collect_interval; + static int gc_steps; #define N_POOLS 42 static __attribute__((aligned (64))) pool_t norm_pools[N_POOLS]; @@ -290,8 +267,10 @@ static void add_lostval_parent(jl_value_t* parent) static bigval_t *big_objects = NULL; static bigval_t *big_objects_marked = NULL; -const void *BUFFTY = (void*)0xdeadb00f; -const void *MATY = (void*)0xdeadaa01; +#ifdef OBJPROFILE +static void *BUFFTY = (void*)0xdeadb00f; +#endif +static void *MATY = (void*)0xdeadaa01; static size_t array_nbytes(jl_array_t*); static inline void objprofile_count(void* ty, int old, int sz) @@ -390,6 +369,9 @@ static inline int gc_setmark_pool(void *o, int mark_mode) static inline int gc_setmark(void *o, int sz, int mark_mode) { +#ifdef MEMDEBUG + return gc_setmark_big(o, mark_mode); +#endif if (sz <= 2048) return gc_setmark_pool(o, mark_mode); else @@ -438,9 +420,6 @@ static __attribute__((noinline)) void *malloc_page(void) { void *ptr = (void*)0; int i; -#ifdef GC_FINAL_STATS - double t0 = clock_now(); -#endif region_t* heap; int heap_i = 0; while(heap_i < HEAP_COUNT) { @@ -784,7 +763,7 @@ static int big_total; static int big_freed; static int big_reset; -static jl_value_t** sweep_big_list(int sweep_mask, bigval_t** pv) +static bigval_t** sweep_big_list(int sweep_mask, bigval_t** pv) { bigval_t *v = *pv; while (v != NULL) { @@ -832,7 +811,7 @@ static void sweep_big(int sweep_mask) { sweep_big_list(sweep_mask, &big_objects); if (sweep_mask == GC_MARKED) { - 
jl_value_t** last_next = sweep_big_list(sweep_mask, &big_objects_marked); + bigval_t** last_next = sweep_big_list(sweep_mask, &big_objects_marked); if (big_objects) big_objects->prev = last_next; *last_next = big_objects; @@ -1001,11 +980,11 @@ static inline void *__pool_alloc(pool_t* p, int osize, int end_offset) v = p->freelist; p->nfree--; p->allocd = 1; - end = &(GC_PAGE_DATA(v)[end_offset]); + end = (gcval_t*)&(GC_PAGE_DATA(v)[end_offset]); if (__unlikely((v == end) | (!p->linear))) { _update_freelist(p, v->next); } else { - p->freelist = (char*)v + osize; + p->freelist = (gcval_t*)((char*)v + osize); } v->flags = 0; return v; @@ -1049,12 +1028,6 @@ static int szclass(size_t sz) return 41; } -static int allocdsz(size_t sz) -{ - if (sz > 2048) return sz; - return sizeclasses[szclass(sz)]; -} - #ifdef GC_INC int check_timeout = 0; //#define should_timeout() (check_timeout && scanned_bytes >= scanned_bytes_goal) @@ -1113,10 +1086,8 @@ static gcval_t** sweep_page(pool_t* p, gcpage_t* pg, gcval_t **pfl, int sweep_ma gcval_t **prev_pfl = pfl; gcval_t *v; size_t old_nfree = 0, nfree = 0; - int pg_freedall = 0, pg_total = 0; - int pg_skpd = 0, pg_wont_skip = 0; + int pg_freedall = 0, pg_total = 0, pg_skpd = 0; int obj_per_page = GC_PAGE_SZ/osize; - int whole_page = 0; char *data = PAGE_DATA_PRE(pg); char *ages = pg->age; v = (gcval_t*)data; @@ -1260,13 +1231,11 @@ static void gc_sweep_once(int sweep_mask) // returns 0 if not finished static int gc_sweep_inc(int sweep_mask) { - double t0 = clock_now(); skipped_pages = 0; total_pages = 0; freed_pages = 0; lazy_freed_pages = 0; page_done = 0; - int i; int finished = 1; #ifdef GC_INC int ct = check_timeout; @@ -1286,14 +1255,6 @@ static int gc_sweep_inc(int sweep_mask) return finished; } -static void gc_sweep(int sweep_mask) -{ - gc_sweep_once(sweep_mask); - while (!gc_sweep_inc(sweep_mask)); -} - - - // mark phase jl_value_t **mark_stack = NULL; @@ -1403,9 +1364,10 @@ __attribute__((noinline)) static int gc_mark_module(jl_module_t *m, int d) if (table[i] != HT_NOTFOUND) { jl_binding_t *b = (jl_binding_t*)table[i]; gc_setmark_buf(b, gc_bits(m)); +#ifdef GC_VERIFY void* vb = gc_val_buf(b); verify_parent("module", m, &vb, "binding_buff"); - // scanned_bytes += allocdsz(sizeof(jl_binding_t) + sizeof(void*)); +#endif if (b->value != NULL) { verify_parent("module", m, &b->value, "binding(%s)", b->name->name); refyoung |= gc_push_root(b->value, d); @@ -1451,11 +1413,13 @@ static void gc_mark_task_stack(jl_task_t *ta, int d) } } +#if 0 static void mark_task_stacks(void) { for (int i = 0; i < tasks.len; i++) { gc_mark_task_stack(tasks.items[i], 0); } } +#endif __attribute__((noinline)) static void gc_mark_task(jl_task_t *ta, int d) { @@ -1494,7 +1458,6 @@ static int push_root(jl_value_t *v, int d, int bits) bits = gc_setmark(v, sz, GC_MARKED_NOESC); goto ret; } - int marked = 0; #define MARK(v, s) do { \ s; \ if (d >= MAX_MARK_DEPTH) \ @@ -1542,8 +1505,10 @@ static int push_root(jl_value_t *v, int d, int bits) goto ret; } else if (a->how == 1) { +#ifdef GC_VERIFY void* val_buf = gc_val_buf((char*)a->data - a->offset*a->elsize); verify_parent("array", v, &val_buf, "buffer ('loc' addr is meaningless)"); +#endif gc_setmark_buf((char*)a->data - a->offset*a->elsize, gc_bits(v)); } if (a->ptrarray && a->data!=NULL) { @@ -1555,7 +1520,6 @@ static int push_root(jl_value_t *v, int d, int bits) } else { void *data = a->data; - int has_young_elt = 0; for(size_t i=0; i < l; i++) { jl_value_t *elt = ((jl_value_t**)data)[i]; if (elt != NULL) { @@ -1593,10 +1557,10 @@ 
static int push_root(jl_value_t *v, int d, int bits) jl_datatype_t *dt = (jl_datatype_t*)vt; MARK(v, bits = gc_setmark(v, jl_datatype_size(dt), GC_MARKED_NOESC)); int nf = (int)jl_tuple_len(dt->names); - int fdsz = sizeof(void*)*nf; - // void** children = alloca(fdsz); + // int fdsz = sizeof(void*)*nf; + // void** children = alloca(fdsz); + // int ci = 0; jl_fielddesc_t* fields = dt->fields; - int ci = 0; for(int i=0; i < nf; i++) { if (fields[i].isptr) { jl_value_t **slot = (jl_value_t**)((char*)v + fields[i].offset + sizeof(void*)); @@ -1640,7 +1604,7 @@ static int push_root(jl_value_t *v, int d, int bits) static void visit_mark_stack_inc(int mark_mode) { while(mark_sp > 0 && !should_timeout()) { - gcval_t* v = (gcval_t*)mark_stack[--mark_sp]; + jl_value_t* v = mark_stack[--mark_sp]; assert(gc_bits(v) == GC_QUEUED || gc_bits(v) == GC_MARKED || gc_bits(v) == GC_MARKED_NOESC); push_root(v, 0, gc_bits(v)); } @@ -1803,6 +1767,7 @@ static void post_mark(void) visit_mark_stack(GC_MARKED_NOESC); } +#ifdef GC_VERIFY static void gc_mark(int finalize) { // mark all roots @@ -1871,9 +1836,8 @@ static void gc_mark(int finalize) } } visit_mark_stack(GC_MARKED_NOESC); - mark_task_stacks(); - visit_mark_stack(GC_MARKED_NOESC); } +#endif /* @@ -1985,9 +1949,6 @@ int64_t diff_gc_total_bytes(void) } void sync_gc_total_bytes(void) {last_gc_total_bytes = jl_gc_total_bytes();} -void jl_gc_ephemeral_on(void) { } -void jl_gc_ephemeral_off(void) { } - #if defined(MEMPROFILE) static void all_pool_stats(void); static void big_obj_stats(void); @@ -2036,7 +1997,7 @@ int saved_mark_sp = 0; int sweep_mask = GC_MARKED; #define MIN_SCAN_BYTES 1024*1024 -static void mark_task_stacks(); +//static void mark_task_stacks(); static void gc_mark_task_stack(jl_task_t*,int); void prepare_sweep(void) @@ -2057,7 +2018,6 @@ void jl_gc_collect(void) #if defined(GC_TIME) || defined(GC_FINAL_STATS) int wb_activations = mark_sp - saved_mark_sp; #endif - int64_t last_perm_scanned = perm_scanned_bytes; if (!sweeping) { inc_count++; @@ -2067,19 +2027,15 @@ void jl_gc_collect(void) scanned_bytes_goal = scanned_bytes_goal < MIN_SCAN_BYTES ? 
MIN_SCAN_BYTES : scanned_bytes_goal; if (gc_inc_steps > 1) check_timeout = 1; - double t = clock_now(); assert(mark_sp == 0); - /*if (live_bytes && gc_inc_steps > 1) visit_mark_stack_inc(GC_MARKED_NOESC); - else visit_mark_stack(GC_MARKED_NOESC);*/ reset_remset(); - // jl_printf(JL_STDOUT, "remset : %d %d\n", last_remset->len, sweep_mask); int SA = perm_scanned_bytes; for(int i = 0; i < last_remset->len; i++) { uintptr_t item = (uintptr_t)last_remset->items[i]; void* ptr = (void*)(item & ~(uintptr_t)1); objprofile_count(jl_typeof(ptr), 2, 0); if (item & 1) { - arraylist_push(remset, item); + arraylist_push(remset, (void*)item); } gc_bits(ptr) = GC_MARKED; push_root(ptr, 0, gc_bits(ptr)); @@ -2099,30 +2055,32 @@ void jl_gc_collect(void) pre_mark(); visit_mark_stack(GC_MARKED_NOESC); - if (mark_sp == 0 || inc_count > gc_inc_steps) { // mark current stack last to avoid temporaries + /*if (mark_sp == 0 || inc_count > gc_inc_steps) { // mark current stack last to avoid temporaries visit_mark_stack(GC_MARKED_NOESC); // in case inc_count > inc_steps, we finish the marking in one go - /* mark_task_stacks(GC_MARKED_NOESC); - visit_mark_stack(GC_MARKED_NOESC);*/ - } + mark_task_stacks(GC_MARKED_NOESC); + visit_mark_stack(GC_MARKED_NOESC); + }*/ allocd_bytes_since_sweep += allocd_bytes + (int64_t)collect_interval/gc_steps; - // allocd_bytes = -(int64_t)collect_interval/gc_steps; - uint64_t mark_pause = jl_hrtime() - t0; + #ifdef GC_FINAL_STATS total_mark_time += mark_pause; #endif #ifdef GC_TIME + uint64_t mark_pause = jl_hrtime() - t0; JL_PRINTF(JL_STDOUT, "GC mark pause %.2f ms | scanned %ld kB = %ld + %ld | stack %d -> %d (wb %d) | remset %d %d %d\n", NS2MS(mark_pause), (scanned_bytes + perm_scanned_bytes)/1024, scanned_bytes/1024, perm_scanned_bytes/1024, saved_mark_sp, mark_sp, wb_activations, last_remset->len, perm_marked, allocd_bytes/1024); saved_mark_sp = mark_sp; #endif } - int64_t pct = -1, bonus = -1, SAVE = -1, SAVE2 = -1, est_fb = 0, SAVE3 = -1; +#ifdef GC_TIME + int64_t bonus = -1, SAVE = -1, SAVE2 = -1, SAVE3 = -1, pct = -1; uint64_t post_time = 0, finalize_time = 0; +#endif if(mark_sp == 0 || sweeping) { #if defined(GC_TIME) || defined(GC_FINAL_STATS) uint64_t sweep_t0 = jl_hrtime(); #endif - int64_t actual_allocd = allocd_bytes_since_sweep, promo_bytes = 0; + int64_t actual_allocd = allocd_bytes_since_sweep; if (!sweeping) { #ifdef GC_TIME post_time = jl_hrtime(); @@ -2131,10 +2089,10 @@ void jl_gc_collect(void) #ifdef GC_TIME post_time = jl_hrtime() - post_time; #endif - + /* est_fb = live_bytes - scanned_bytes - (sweep_mask == GC_MARKED_NOESC ? perm_scanned_bytes : perm_scanned_bytes) + actual_allocd; promo_bytes = perm_scanned_bytes - last_perm_scanned; - int promo_pct = (actual_allocd - est_fb) ? (promo_bytes*100)/(actual_allocd - est_fb) : 100; + int promo_pct = (actual_allocd - est_fb) ? 
(promo_bytes*100)/(actual_allocd - est_fb) : 100;*/ #ifdef GC_VERIFY gc_verify(); #endif @@ -2151,9 +2109,9 @@ void jl_gc_collect(void) prepare_sweep(); - if (quick_count >= 30) { + if (quick_count >= gc_quick_steps) { sweep_mask = GC_MARKED; // next collection is a full one - gc_steps = gc_inc_steps; + gc_steps = 1;//gc_inc_steps; quick_count = 0; } else { @@ -2179,22 +2137,16 @@ void jl_gc_collect(void) remset->len = 0; } - /*int tasks_end = 0; - for (int i = 0; i < tasks.len; i++) { - jl_value_t* ta = (jl_value_t*)tasks.items[i]; - if (gc_marked(ta)) { - tasks.items[tasks_end] = tasks.items[i]; - tasks_end++; - } - } - tasks.len = tasks_end;*/ sweep_weak_refs(); sweeping = 0; if (sweep_mask == GC_MARKED) { tasks.len = 0; } - SAVE2 = freed_bytes; +#ifdef GC_TIME + SAVE2 = freed_bytes + SAVE3 = allocd_bytes_since_sweep; pct = actual_allocd ? (freed_bytes*100)/actual_allocd : -1; +#endif if (sweep_mask == GC_MARKED_NOESC) { collect_interval = default_collect_interval/4; @@ -2202,18 +2154,17 @@ void jl_gc_collect(void) quick_count--; } else { - if (freed_bytes/quick_count < actual_allocd/30) { - quick_count = 50; + if (freed_bytes/quick_count < actual_allocd/gc_quick_steps) { + quick_count = gc_quick_steps; collect_interval = default_collect_interval; } } } else if (freed_bytes < (7*(actual_allocd/10)) && n_pause > 1) { if (collect_interval <= 2*(max_collect_interval/5)) { - // if (prev_sweep_mask == GC_MARKED) - collect_interval = 5*(collect_interval/2); + collect_interval = 5*(collect_interval/2); } - quick_count = 50; + quick_count = gc_quick_steps; } else { collect_interval = default_collect_interval; @@ -2224,14 +2175,16 @@ void jl_gc_collect(void) allocd_bytes = -(int64_t)collect_interval/gc_steps; inc_count = 0; live_bytes += -freed_bytes + allocd_bytes_since_sweep; - SAVE3 = allocd_bytes_since_sweep; allocd_bytes_since_sweep = 0; freed_bytes = 0; +#ifdef GC_TIME finalize_time = jl_hrtime(); +#endif run_finalizers(); - +#ifdef GC_TIME finalize_time = jl_hrtime() - finalize_time; +#endif } #if defined(GC_FINAL_STATS) || defined(GC_TIME) uint64_t sweep_pause = jl_hrtime() - sweep_t0; @@ -2242,7 +2195,6 @@ void jl_gc_collect(void) #endif #ifdef GC_TIME JL_PRINTF(JL_STDOUT, "GC sweep pause %.2f ms live %ld kB (freed %d kB = %d%% of allocd %d kB b/r %ld/%ld) (%.2f ms in post_mark, %.2f ms in %d fin) (marked in %d inc) mask %d | next in %d kB\n", NS2MS(sweep_pause), live_bytes/1024, SAVE2/1024, pct, SAVE3/1024, bonus/1024, SAVE/1024, NS2MS(post_time), NS2MS(finalize_time), n_finalized, inc_count, sweep_mask, -allocd_bytes/1024); - int64_t diff = est_fb - SAVE2; #endif } n_pause++; @@ -2430,7 +2382,7 @@ void jl_print_gc_stats(JL_STREAM *s) void jl_gc_init(void) { - int* szc = sizeclasses; + const int* szc = sizeclasses; int i; for(i=0; i < N_POOLS; i++) { @@ -2442,7 +2394,6 @@ void jl_gc_init(void) assert(offsetof(gcpages_t, data) == GC_PAGE_SZ); collect_interval = default_collect_interval; - long_collect_interval = default_collect_interval; allocd_bytes = -default_collect_interval; #ifdef GC_INC From 8dd9c91efe3275735e13c5e915e3eeb47f55a670 Mon Sep 17 00:00:00 2001 From: Oscar Blumberg Date: Thu, 18 Sep 2014 18:30:15 +0200 Subject: [PATCH 12/17] Avoid a silly 1-cycle latency to old object remarking after a full collection. Slight cleanup. Address some of Jeff's comments. 
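The "1-cycle latency" above refers to promotion: before this patch, a full sweep reset aged objects to GC_CLEAN, so a surviving old object had to be rediscovered and re-promoted over one extra collection before the write barrier would leave it alone again; the patch tags such objects GC_QUEUED directly during the sweep. The stand-alone sketch below shows the invariant the barrier maintains. It is illustrative only, not code from the patch: obj_t, remset_push and the array sizes are invented, and the real runtime keeps the mark bits in the low bits of the type pointer rather than in a separate field.

#include <stddef.h>
#include <stdint.h>

#define GC_CLEAN        0                        /* freshly allocated */
#define GC_MARKED       1                        /* reachable and old */
#define GC_QUEUED       2                        /* old; already queued, barrier skipped */
#define GC_MARKED_NOESC (GC_MARKED | GC_QUEUED)  /* reachable and young */

typedef struct { uintptr_t gc_bits; } obj_t;     /* hypothetical object header */

static obj_t *remset[128];                       /* toy remembered set */
static size_t remset_len = 0;

static void remset_push(obj_t *o) { remset[remset_len++] = o; }

/* When an old (mark bit set) parent is made to point at a young
   (mark bit clear) child, remember the parent so the next quick
   collection rescans it, and move it to GC_QUEUED so the barrier
   does not fire for it again. */
static void write_barrier(obj_t *parent, obj_t *child)
{
    if ((parent->gc_bits & 1) == 1 && (child->gc_bits & 1) == 0) {
        parent->gc_bits = GC_QUEUED;
        remset_push(parent);
    }
}

With this encoding the sweep can promote in one step: objects past PROMOTE_AGE leave a full sweep as GC_QUEUED instead of GC_CLEAN, so they are already exempt from the barrier when the next cycle begins.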
--- src/gc.c | 49 ++++++++++++++++---------------------- src/gf.c | 2 +- src/julia_internal.h | 2 +- 3 files changed, 20 insertions(+), 33 deletions(-) diff --git a/src/gc.c b/src/gc.c index 378e22e15e2b5..1a45cc0dafc4c 100644 --- a/src/gc.c +++ b/src/gc.c @@ -185,7 +185,6 @@ static htable_t obj_sizes[3]; #endif #ifdef GC_FINAL_STATS -static double page_alloc_time=0; static size_t total_freed_bytes=0; static double max_pause = 0.0; static uint64_t total_sweep_time=0; @@ -326,11 +325,7 @@ static inline int gc_setmark_big(void *o, int mark_mode) big_objects_marked = hdr; } #ifdef OBJPROFILE - if (!bits) { - if (mark_mode == GC_MARKED) - perm_scanned_bytes += hdr->sz; - else - scanned_bytes += hdr->sz; + if (!(bits & GC_MARKED)) { objprofile_count(jl_typeof(o), mark_mode == GC_MARKED, hdr->sz); } #endif @@ -352,11 +347,7 @@ static inline int gc_setmark_pool(void *o, int mark_mode) if (bits == GC_QUEUED || bits == GC_MARKED) mark_mode = GC_MARKED; #ifdef OBJPROFILE - if (!bits) { - if (mark_mode == GC_MARKED) - perm_scanned_bytes += page->osize; - else - scanned_bytes += page->osize; + if (!(bits & GC_MARKED)) { objprofile_count(jl_typeof(o), mark_mode == GC_MARKED, page->osize); } #endif @@ -474,9 +465,6 @@ static __attribute__((noinline)) void *malloc_page(void) #endif current_pg_count++; max_pg_count = max_pg_count < current_pg_count ? current_pg_count : max_pg_count; -#ifdef GC_FINAL_STATS - page_alloc_time += clock_now() - t0; -#endif return ptr; } @@ -773,11 +761,7 @@ static bigval_t** sweep_big_list(int sweep_mask, bigval_t** pv) int age = v->age; int bits = gc_bits(&v->_data); if (age >= PROMOTE_AGE) { - if (sweep_mask == GC_MARKED) { - bits = GC_CLEAN; - big_reset++; - } - else if (bits == GC_MARKED_NOESC) + if (sweep_mask == GC_MARKED || bits == GC_MARKED_NOESC) bits = GC_QUEUED; } else { @@ -1120,11 +1104,11 @@ static gcval_t** sweep_page(pool_t* p, gcpage_t* pg, gcval_t **pfl, int sweep_ma int obj_i = ((uintptr_t)v - (uintptr_t)data)/8; // we can encounter a queued value at this point // if a write barrier was moved back between two - // sweeping increments + // sweeping increments TODO int bits = gc_bits(v); int sh = (obj_i % 4)*2; int age = (ages[obj_i/4] >> sh) & 3; - if (!bits) { + if (!(bits & GC_MARKED)) { *pfl = v; pfl = &v->next; pfl_begin = pfl_begin ?
pfl_begin : pfl; @@ -1133,16 +1117,16 @@ static gcval_t** sweep_page(pool_t* p, gcpage_t* pg, gcval_t **pfl, int sweep_ma } else { if (age >= PROMOTE_AGE) { - if (sweep_mask == GC_MARKED) - gc_bits(v) = GC_CLEAN; - else if (bits == GC_MARKED_NOESC) + if (sweep_mask == GC_MARKED || bits == GC_MARKED_NOESC) gc_bits(v) = GC_QUEUED; - } else if ((sweep_mask & bits) == sweep_mask) + } + else if ((sweep_mask & bits) == sweep_mask) { gc_bits(v) = GC_CLEAN; + } - inc_sat(age, PROMOTE_AGE); - ages[obj_i/4] &= ~(3 << sh); - ages[obj_i/4] |= age << sh; + inc_sat(age, PROMOTE_AGE); + ages[obj_i/4] &= ~(3 << sh); + ages[obj_i/4] |= age << sh; freedall = 0; } v = (gcval_t*)((char*)v + osize); @@ -1231,6 +1215,9 @@ static void gc_sweep_once(int sweep_mask) // returns 0 if not finished static int gc_sweep_inc(int sweep_mask) { +#ifdef GC_TIME + double t0 = clock_now(); +#endif skipped_pages = 0; total_pages = 0; freed_pages = 0; @@ -2049,6 +2036,7 @@ void jl_gc_collect(void) n_bnd_refyoung++; } } + int rem_bindings_len = rem_bindings.len; rem_bindings.len = n_bnd_refyoung; perm_scanned_bytes = SA; @@ -2068,7 +2056,7 @@ void jl_gc_collect(void) #endif #ifdef GC_TIME uint64_t mark_pause = jl_hrtime() - t0; - JL_PRINTF(JL_STDOUT, "GC mark pause %.2f ms | scanned %ld kB = %ld + %ld | stack %d -> %d (wb %d) | remset %d %d %d\n", NS2MS(mark_pause), (scanned_bytes + perm_scanned_bytes)/1024, scanned_bytes/1024, perm_scanned_bytes/1024, saved_mark_sp, mark_sp, wb_activations, last_remset->len, perm_marked, allocd_bytes/1024); + JL_PRINTF(JL_STDOUT, "GC mark pause %.2f ms | scanned %ld kB = %ld + %ld | stack %d -> %d (wb %d) | remset %d %d %d\n", NS2MS(mark_pause), (scanned_bytes + perm_scanned_bytes)/1024, scanned_bytes/1024, perm_scanned_bytes/1024, saved_mark_sp, mark_sp, wb_activations, last_remset->len, rem_bindings_len, allocd_bytes/1024); saved_mark_sp = mark_sp; #endif } @@ -2143,7 +2131,7 @@ void jl_gc_collect(void) tasks.len = 0; } #ifdef GC_TIME - SAVE2 = freed_bytes + SAVE2 = freed_bytes; SAVE3 = allocd_bytes_since_sweep; pct = actual_allocd ? (freed_bytes*100)/actual_allocd : -1; #endif @@ -2369,7 +2357,6 @@ void jl_print_gc_stats(JL_STREAM *s) (NS_TO_S(total_gc_time)/ptime)*100); jl_printf(s, "gc pause \t%.2f ms avg\n\t\t%.2f ms max\n", (NS_TO_S(total_gc_time)/n_pause)*1000, max_pause*1000); jl_printf(s, "\t\t(%2.1f%% mark, %2.1f%% sweep, %2.1f%% finalizers)\n", (total_mark_time/NS_TO_S(total_gc_time))*100, (total_sweep_time/NS_TO_S(total_gc_time))*100, (total_fin_time/NS_TO_S(total_gc_time))*100); - jl_printf(s, "alloc pause\t%.2f ms\n", page_alloc_time); struct mallinfo mi = mallinfo(); jl_printf(s, "malloc size\t%d MB\n", mi.uordblks/1024/1024); jl_printf(s, "max page alloc\t%ld MB\n", max_pg_count*GC_PAGE_SZ/1024/1024); diff --git a/src/gf.c b/src/gf.c index d1803e5bd4afe..f55b8856e2a1d 100644 --- a/src/gf.c +++ b/src/gf.c @@ -1,5 +1,5 @@ /* - GENERIC Functions + Generic Functions . method table and lookup . GF constructor, add_method . 
dispatch diff --git a/src/julia_internal.h b/src/julia_internal.h index 72f57087328b3..4f9ba17590ac2 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -122,7 +122,7 @@ typedef CONTEXT *bt_context_t; #include typedef unw_context_t *bt_context_t; #endif -#define MAX_BT_SIZE 80 +#define MAX_BT_SIZE 80000 extern ptrint_t bt_data[MAX_BT_SIZE+1]; extern size_t bt_size; DLLEXPORT size_t rec_backtrace(ptrint_t *data, size_t maxsize); From 0419d5a3f5aaa937d9dd74562d561b6bcd56bd94 Mon Sep 17 00:00:00 2001 From: Oscar Blumberg Date: Mon, 29 Sep 2014 00:32:02 +0200 Subject: [PATCH 13/17] cleanup some defines. remove useless geptr instruction. repair GC_FINAL_STATS. --- src/codegen.cpp | 9 +++--- src/gc.c | 76 ++++++++++++++++++++++++------------------------- 2 files changed, 43 insertions(+), 42 deletions(-) diff --git a/src/codegen.cpp b/src/codegen.cpp index 7d4d3404193ee..a776183eb4392 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -2649,9 +2649,11 @@ static void emit_assignment(jl_value_t *l, jl_value_t *r, jl_codectx_t *ctx) rval = emit_unbox(vt->getContainedType(0), emit_unboxed(r, ctx), vi.declType); } else { - rval = boxed(emit_expr(r, ctx, true), ctx); - Value* box = builder.CreateGEP(bp, ConstantInt::get(T_size, -1)); - if (!is_stack(bp)) emit_write_barrier(ctx, box, rval); + rval = boxed(emit_expr(r, ctx, true), ctx, rt); + if (!is_stack(bp)) { + Value* box = builder.CreateGEP(bp, ConstantInt::get(T_size, -1)); + emit_write_barrier(ctx, box, rval); + } } if (builder.GetInsertBlock()->getTerminator() == NULL) { builder.CreateStore(rval, bp, vi.isVolatile); @@ -4757,7 +4759,6 @@ extern "C" void jl_init_codegen(void) #endif options.NoFramePointerElim = true; #ifndef LLVM34 - options.JITExceptionHandling = 1; options.NoFramePointerElimNonLeaf = true; #endif #if defined(_OS_WINDOWS_) && !defined(_CPU_X86_64_) diff --git a/src/gc.c b/src/gc.c index 1a45cc0dafc4c..039b577d7dd3a 100644 --- a/src/gc.c +++ b/src/gc.c @@ -30,20 +30,26 @@ extern "C" { #pragma pack(push, 1) -#define GC_PG_LG2 14 -#define GC_PAGE_SZ (4*4096) // ((1 << GC_PAGE_W) - 16) -#define SYS_PAGE_SZ 4096 +#define GC_PAGE_LG2 14 #define REGION_PG_COUNT 8*4096 - +#define GC_PAGE_SZ (1 << GC_PAGE_LG2) // 16k +// contiguous storage for up to REGION_PG_COUNT naturally aligned GC_PAGE_SZ blocks +// uses a very naive allocator (see malloc_page & free_page) typedef struct { - uint32_t freemap[SYS_PAGE_SZ/4]; + uint32_t freemap[REGION_PG_COUNT/32]; char pages[REGION_PG_COUNT][GC_PAGE_SZ]; } region_t; - #define HEAP_COUNT 64 static region_t *heaps[HEAP_COUNT] = {NULL}; +// store a lower bound of the first free block in each region static int heaps_lb[HEAP_COUNT] = {0}; +// every 2^PAGE_GROUP_COUNT_LG2 gc page is reserved for the following pages' metadata +// this requires : +// sizeof(gcpage_t)*2^PAGE_GROUP_COUNT_LG2 <= GC_PAGE_SZ +#define PAGE_GROUP_COUNT_LG2 5 // log2(32) +#define PAGE_GROUP_LG2 (GC_PAGE_LG2 + PAGE_GROUP_COUNT_LG2) + typedef struct _bigval_t { struct _bigval_t *next; struct _bigval_t **prev; // pointer to the next field of the prev entry @@ -95,6 +101,7 @@ typedef struct _pool_t { }; } pool_t; +// pool page metadata typedef struct _gcpage_t { struct { uint16_t pool_n : 8; @@ -114,31 +121,20 @@ typedef struct _gcpage_t { uint32_t data_offset; // this is not strictly necessary char age[2*GC_PAGE_SZ/(8*8)]; // two bits per object } gcpage_t; +// access page data given a pointer to its metadata #define PAGE_DATA_PRE(p) ((char*)(p) + (p)->data_offset) #define PAGE_DATA(p) ((char*)GC_PAGES(p) + 
GC_PAGE_SZ*(((char*)(p) - (char*)GC_PAGES(p))/sizeof(gcpage_t) + 1)) #define PAGE_PFL_BEG(p) ((gcval_t**)(PAGE_DATA(p) + (p)->fl_begin_offset)) #define PAGE_PFL_END(p) ((gcval_t**)(PAGE_DATA(p) + (p)->fl_end_offset)) -#define PAGE_GROUP_COUNT 31 -// We pack pages by groups of 31 which means a little less than 512k = 32*4 vm pages -#define PAGE_GROUP_LG2 19 -#define PAGE_GROUP_SZ 1 << PAGE_GROUP_LG2 - -typedef struct { - union { - gcpage_t pages[PAGE_GROUP_COUNT]; - char _pad[GC_PAGE_SZ]; - }; - char data[PAGE_GROUP_COUNT][GC_PAGE_SZ]; -} gcpages_t; - +// access page data given a pointer to somewhere inside its data #define GC_PAGES(x) ((gcpage_t*)(((uintptr_t)x) >> PAGE_GROUP_LG2 << PAGE_GROUP_LG2)) #define GC_PAGE_IDX(x) (((uintptr_t)(x) - (uintptr_t)GC_PAGES(x) - GC_PAGE_SZ)/GC_PAGE_SZ) #define GC_PAGE(x) ((gcpage_t*)(&(GC_PAGES(x)[GC_PAGE_IDX(x)]))) -#define GC_PAGE_DATA(x) ((char*)((uintptr_t)(x) >> GC_PG_LG2 << GC_PG_LG2)) +#define GC_PAGE_DATA(x) ((char*)((uintptr_t)(x) >> GC_PAGE_LG2 << GC_PAGE_LG2)) + #define GC_POOL_END_OFS(osize) (((GC_PAGE_SZ/osize) - 1)*osize) - //static int free_lb = 0; // GC knobs and self-measurement variables static int64_t last_gc_total_bytes = 0; @@ -186,7 +182,7 @@ static htable_t obj_sizes[3]; #ifdef GC_FINAL_STATS static size_t total_freed_bytes=0; -static double max_pause = 0.0; +static uint64_t max_pause = 0; static uint64_t total_sweep_time=0; static uint64_t total_mark_time=0; static uint64_t total_fin_time=0; @@ -344,8 +340,9 @@ static inline int gc_setmark_pool(void *o, int mark_mode) #endif gcpage_t* page = GC_PAGE(o); int bits = gc_bits(o); - if (bits == GC_QUEUED || bits == GC_MARKED) + if (bits == GC_QUEUED || bits == GC_MARKED) { mark_mode = GC_MARKED; + } #ifdef OBJPROFILE if (!(bits & GC_MARKED)) { objprofile_count(jl_typeof(o), mark_mode == GC_MARKED, page->osize); @@ -419,7 +416,7 @@ static __attribute__((noinline)) void *malloc_page(void) #ifdef _OS_WINDOWS_ char* mem = VirtualAlloc(NULL, sizeof(region_t) + GC_PAGE_SZ*32, MEM_RESERVE, PAGE_READWRITE); #else - char* mem = mmap(NULL, sizeof(region_t) + GC_PAGE_SZ*32, PROT_READ | PROT_WRITE, MAP_NORESERVE | MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + char* mem = mmap(0, sizeof(region_t) + GC_PAGE_SZ*32, PROT_READ | PROT_WRITE, MAP_NORESERVE | MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); mem = mem == MAP_FAILED ? 
NULL : mem; #endif if (mem == NULL) { @@ -427,7 +424,7 @@ static __attribute__((noinline)) void *malloc_page(void) abort(); } // we may waste up to around 500k of virtual address space for alignment but those pages are never committed - heap = (region_t*)((char*)GC_PAGES(mem + SYS_PAGE_SZ + GC_PAGE_SZ*32 - 1) - SYS_PAGE_SZ); + heap = (region_t*)((char*)GC_PAGES(mem + REGION_PG_COUNT/8 + GC_PAGE_SZ*32 - 1) - REGION_PG_COUNT/8); heaps[heap_i] = heap; #ifdef _OS_WINDOWS_ VirtualAlloc(heap->freemap, REGION_PG_COUNT/8, MEM_COMMIT, PAGE_READWRITE); @@ -474,7 +471,7 @@ static inline void free_page(void *p) int i; for(i = 0; i < HEAP_COUNT && heaps[i] != NULL; i++) { pg_idx = ((uintptr_t)p - (uintptr_t)heaps[i]->pages[0])/GC_PAGE_SZ; - if (pg_idx >= 0 && pg_idx < 8*SYS_PAGE_SZ) break; + if (pg_idx >= 0 && pg_idx < REGION_PG_COUNT) break; } assert(i < HEAP_COUNT && heaps[i] != NULL); region_t *heap = heaps[i]; @@ -2051,16 +2048,18 @@ void jl_gc_collect(void) }*/ allocd_bytes_since_sweep += allocd_bytes + (int64_t)collect_interval/gc_steps; -#ifdef GC_FINAL_STATS - total_mark_time += mark_pause; +#if defined(GC_TIME) || defined(GC_FINAL_STATS) + uint64_t mark_pause = jl_hrtime() - t0; #endif #ifdef GC_TIME - uint64_t mark_pause = jl_hrtime() - t0; JL_PRINTF(JL_STDOUT, "GC mark pause %.2f ms | scanned %ld kB = %ld + %ld | stack %d -> %d (wb %d) | remset %d %d %d\n", NS2MS(mark_pause), (scanned_bytes + perm_scanned_bytes)/1024, scanned_bytes/1024, perm_scanned_bytes/1024, saved_mark_sp, mark_sp, wb_activations, last_remset->len, rem_bindings_len, allocd_bytes/1024); saved_mark_sp = mark_sp; +#endif +#ifdef GC_FINAL_STATS + total_mark_time += mark_pause; #endif } -#ifdef GC_TIME +#if defined(GC_TIME) || defined(GC_FINAL_STATS) int64_t bonus = -1, SAVE = -1, SAVE2 = -1, SAVE3 = -1, pct = -1; uint64_t post_time = 0, finalize_time = 0; #endif @@ -2186,11 +2185,10 @@ void jl_gc_collect(void) #endif } n_pause++; - double pause = jl_hrtime() - t0; + uint64_t pause = jl_hrtime() - t0; total_gc_time += pause; #ifdef GC_FINAL_STATS - // do not count the first pause as it is always a full collection - // max_pause = (max_pause < pause && n_pause > 1) ? pause : max_pause; + max_pause = max_pause < pause ? 
pause : max_pause; #endif JL_SIGATOMIC_END(); jl_in_gc = 0; @@ -2353,10 +2351,13 @@ void jl_print_gc_stats(JL_STREAM *s) malloc_stats(); double ptime = clock_now()-process_t0; jl_printf(s, "exec time\t%.5f sec\n", ptime); - jl_printf(s, "gc time \t%.5f sec (%2.1f%%)\n", NS_TO_S(total_gc_time), - (NS_TO_S(total_gc_time)/ptime)*100); - jl_printf(s, "gc pause \t%.2f ms avg\n\t\t%.2f ms max\n", (NS_TO_S(total_gc_time)/n_pause)*1000, max_pause*1000); - jl_printf(s, "\t\t(%2.1f%% mark, %2.1f%% sweep, %2.1f%% finalizers)\n", (total_mark_time/NS_TO_S(total_gc_time))*100, (total_sweep_time/NS_TO_S(total_gc_time))*100, (total_fin_time/NS_TO_S(total_gc_time))*100); + jl_printf(s, "gc time \t%.5f sec (%2.1f%%) in %d collections\n", + NS_TO_S(total_gc_time), (NS_TO_S(total_gc_time)/ptime)*100, n_pause); + jl_printf(s, "gc pause \t%.2f ms avg\n\t\t%2.0f ms max\n", + NS2MS(total_gc_time)/n_pause, NS2MS(max_pause)); + jl_printf(s, "\t\t(%2d%% mark, %2d%% sweep, %2d%% finalizers)\n", + (total_mark_time*100)/total_gc_time, (total_sweep_time*100)/total_gc_time, + (total_fin_time*100)/total_gc_time); struct mallinfo mi = mallinfo(); jl_printf(s, "malloc size\t%d MB\n", mi.uordblks/1024/1024); jl_printf(s, "max page alloc\t%ld MB\n", max_pg_count*GC_PAGE_SZ/1024/1024); @@ -2378,7 +2379,6 @@ void jl_gc_init(void) norm_pools[i].freelist = NULL; norm_pools[i].end_offset = ((GC_PAGE_SZ/szc[i]) - 1)*szc[i]; } - assert(offsetof(gcpages_t, data) == GC_PAGE_SZ); collect_interval = default_collect_interval; allocd_bytes = -default_collect_interval; From c2387059f1b45055461a2463fb292ea629849d03 Mon Sep 17 00:00:00 2001 From: Oscar Blumberg Date: Thu, 2 Oct 2014 16:36:21 +0200 Subject: [PATCH 14/17] yet another round of cleanups + some additional comments. --- src/codegen.cpp | 2 +- src/gc.c | 72 ++++++++++++++++++++++++++++++++----------------- src/julia.h | 10 +++---- 3 files changed, 53 insertions(+), 31 deletions(-) diff --git a/src/codegen.cpp b/src/codegen.cpp index a776183eb4392..864241d354dc6 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -1366,7 +1366,7 @@ static void emit_write_barrier(jl_codectx_t* ctx, Value *parent, Value *ptr) return;*/ parent = builder.CreateBitCast(parent, T_psize); Value* parent_type = builder.CreateLoad(parent); - Value* parent_mark_bits = builder.CreateAnd(parent_type, 3); + Value* parent_mark_bits = builder.CreateAnd(parent_type, 1); // the branch hint does not seem to make it to the generated code //builder.CreateCall2(expect_func, parent_marked, ConstantInt::get(T_int1, 0)); diff --git a/src/gc.c b/src/gc.c index 039b577d7dd3a..ad7c5c86083dc 100644 --- a/src/gc.c +++ b/src/gc.c @@ -30,19 +30,26 @@ extern "C" { #pragma pack(push, 1) -#define GC_PAGE_LG2 14 -#define REGION_PG_COUNT 8*4096 +#define GC_PAGE_LG2 14 // log2(size of a page) #define GC_PAGE_SZ (1 << GC_PAGE_LG2) // 16k + // contiguous storage for up to REGION_PG_COUNT naturally aligned GC_PAGE_SZ blocks // uses a very naive allocator (see malloc_page & free_page) +#ifdef _P64 +#define REGION_PG_COUNT 16*8*4096 // 8G because virtual memory is cheap +#else +#define REGION_PG_COUNT 8*4096 // 512M +#endif +#define HEAP_COUNT 8 typedef struct { uint32_t freemap[REGION_PG_COUNT/32]; char pages[REGION_PG_COUNT][GC_PAGE_SZ]; } region_t; -#define HEAP_COUNT 64 static region_t *heaps[HEAP_COUNT] = {NULL}; // store a lower bound of the first free block in each region static int heaps_lb[HEAP_COUNT] = {0}; +// same with an upper bound +static int heaps_ub[HEAP_COUNT] = {0}; // every 2^PAGE_GROUP_COUNT_LG2 gc page is reserved for 
the following pages' metadata // this requires : @@ -188,14 +195,23 @@ static uint64_t total_mark_time=0; static uint64_t total_fin_time=0; #endif static int n_pause = 0; +static int n_full_sweep = 0; +int sweeping = 0; // manipulating mark bits -#define GC_CLEAN 0 -#define GC_MARKED 1 -#define GC_QUEUED 2 -#define GC_MARKED_NOESC (GC_MARKED | GC_QUEUED) -int sweeping = 0; +#define GC_CLEAN 0 // freshly allocated +#define GC_MARKED 1 // reachable and old +#define GC_QUEUED 2 // if it is reachable it will be marked as old +#define GC_MARKED_NOESC (GC_MARKED | GC_QUEUED) // reachable and young +// When a reachable object has survived more than PROMOTE_AGE+1 collections +// it is tagged with GC_QUEUED during sweep and will be promoted on next mark +// because at that point we can know easily if it references young objects. +// Marked old objects that reference young ones are kept in the remset. +// When a write barrier triggers, the offending marked object is both queued, +// so as not to trigger the barrier again, and put in the remset. +// Old objects are put back in clean state only on major collection +// (or more precisely, while sweeping at the previous collection) #ifdef GC_INC static int64_t scanned_bytes; @@ -296,7 +312,7 @@ static inline void gc_setmark_other(void *o, int mark_mode) } #define inc_sat(v,s) v = (v) >= s ? s : (v)+1; -#define PROMOTE_AGE 2 +#define PROMOTE_AGE 1 static inline int gc_setmark_big(void *o, int mark_mode) { @@ -309,7 +325,7 @@ static inline int gc_setmark_big(void *o, int mark_mode) bigval_t* hdr = bigval_header(o); int bits = gc_bits(o); if (bits == GC_QUEUED || bits == GC_MARKED) - mark_mode = GC_MARKED; + mark_mode = GC_MARKED; if ((mark_mode == GC_MARKED) & (bits != GC_MARKED)) { *hdr->prev = hdr->next; if (hdr->next) @@ -335,7 +351,7 @@ static inline int gc_setmark_pool(void *o, int mark_mode) #ifdef GC_VERIFY if (verifying) { _gc_setmark(o, mark_mode); - return; + return mark_mode; } #endif gcpage_t* page = GC_PAGE(o); @@ -447,6 +463,8 @@ static __attribute__((noinline)) void *malloc_page(void) } if (heaps_lb[heap_i] < i) heaps_lb[heap_i] = i; + if (heaps_ub[heap_i] < i) + heaps_ub[heap_i] = i; int j = (ffs(heap->freemap[i]) - 1); heap->freemap[i] &= ~(uint32_t)(1 << j); if (j == 0) { // reserve a page for metadata (every 31 data pages) @@ -467,10 +485,10 @@ static __attribute__((noinline)) void *malloc_page(void) static inline void free_page(void *p) { - int pg_idx = -1; + size_t pg_idx = -1; int i; for(i = 0; i < HEAP_COUNT && heaps[i] != NULL; i++) { - pg_idx = ((uintptr_t)p - (uintptr_t)heaps[i]->pages[0])/GC_PAGE_SZ; + pg_idx = ((char*)p - (char*)&heaps[i]->pages[0])/GC_PAGE_SZ; if (pg_idx >= 0 && pg_idx < REGION_PG_COUNT) break; } assert(i < HEAP_COUNT && heaps[i] != NULL); @@ -720,7 +738,6 @@ void jl_gc_add_finalizer(jl_value_t *v, jl_function_t *f) static __attribute__((noinline)) void *alloc_big(size_t sz) { - // jl_printf(JL_STDOUT, "BIG: %d\n", sz); maybe_collect(); size_t offs = BVOFFS*sizeof(void*); if (sz+offs+15 < offs+15) // overflow in adding offs, size was "negative" @@ -1569,7 +1586,7 @@ static int push_root(jl_value_t *v, int d, int bits) ret: #ifdef GC_VERIFY - if (verifying) return; + if (verifying) return bits; #endif if ((bits == GC_MARKED) && (refyoung == GC_MARKED_NOESC)) { arraylist_push(remset, v); @@ -2033,7 +2050,6 @@ void jl_gc_collect(void) n_bnd_refyoung++; } } - int rem_bindings_len = rem_bindings.len; rem_bindings.len = n_bnd_refyoung; perm_scanned_bytes = SA; @@ -2052,7 +2068,7 @@ void jl_gc_collect(void) uint64_t 
mark_pause = jl_hrtime() - t0; #endif #ifdef GC_TIME - JL_PRINTF(JL_STDOUT, "GC mark pause %.2f ms | scanned %ld kB = %ld + %ld | stack %d -> %d (wb %d) | remset %d %d %d\n", NS2MS(mark_pause), (scanned_bytes + perm_scanned_bytes)/1024, scanned_bytes/1024, perm_scanned_bytes/1024, saved_mark_sp, mark_sp, wb_activations, last_remset->len, rem_bindings_len, allocd_bytes/1024); + JL_PRINTF(JL_STDOUT, "GC mark pause %.2f ms | scanned %ld kB = %ld + %ld | stack %d -> %d (wb %d) | remset %d %d\n", NS2MS(mark_pause), (scanned_bytes + perm_scanned_bytes)/1024, scanned_bytes/1024, perm_scanned_bytes/1024, saved_mark_sp, mark_sp, wb_activations, last_remset->len, allocd_bytes/1024); saved_mark_sp = mark_sp; #endif #ifdef GC_FINAL_STATS @@ -2122,6 +2138,7 @@ void jl_gc_collect(void) } else { remset->len = 0; + n_full_sweep++; } sweep_weak_refs(); @@ -2136,7 +2153,7 @@ void jl_gc_collect(void) #endif if (sweep_mask == GC_MARKED_NOESC) { - collect_interval = default_collect_interval/4; + collect_interval = default_collect_interval; if (freed_bytes >= actual_allocd) { quick_count--; } @@ -2351,13 +2368,18 @@ void jl_print_gc_stats(JL_STREAM *s) malloc_stats(); double ptime = clock_now()-process_t0; jl_printf(s, "exec time\t%.5f sec\n", ptime); - jl_printf(s, "gc time \t%.5f sec (%2.1f%%) in %d collections\n", - NS_TO_S(total_gc_time), (NS_TO_S(total_gc_time)/ptime)*100, n_pause); - jl_printf(s, "gc pause \t%.2f ms avg\n\t\t%2.0f ms max\n", - NS2MS(total_gc_time)/n_pause, NS2MS(max_pause)); - jl_printf(s, "\t\t(%2d%% mark, %2d%% sweep, %2d%% finalizers)\n", - (total_mark_time*100)/total_gc_time, (total_sweep_time*100)/total_gc_time, - (total_fin_time*100)/total_gc_time); + if (n_pause > 0) { + jl_printf(s, "gc time \t%.5f sec (%2.1f%%) in %d (%d full) collections\n", + NS_TO_S(total_gc_time), (NS_TO_S(total_gc_time)/ptime)*100, n_pause, n_full_sweep); + jl_printf(s, "gc pause \t%.2f ms avg\n\t\t%2.0f ms max\n", + NS2MS(total_gc_time)/n_pause, NS2MS(max_pause)); + jl_printf(s, "\t\t(%2d%% mark, %2d%% sweep, %2d%% finalizers)\n", + (total_mark_time*100)/total_gc_time, (total_sweep_time*100)/total_gc_time, + (total_fin_time*100)/total_gc_time); + } + int i = 0; + while (i < HEAP_COUNT && heaps[i]) i++; + jl_printf(s, "max allocated regions : %d\n", i); struct mallinfo mi = mallinfo(); jl_printf(s, "malloc size\t%d MB\n", mi.uordblks/1024/1024); jl_printf(s, "max page alloc\t%ld MB\n", max_pg_count*GC_PAGE_SZ/1024/1024); diff --git a/src/julia.h b/src/julia.h index a889980099ab9..28037aee7dce7 100644 --- a/src/julia.h +++ b/src/julia.h @@ -1382,7 +1382,7 @@ DLLEXPORT void gc_wb_slow(void* parent, void* ptr); static inline void gc_wb_binding(void *bnd, void *val) { #ifdef GC_INC - if (__unlikely((*(uintptr_t*)bnd & 3) == 1 && (*(uintptr_t*)val & 1) == 0)) + if (__unlikely((*(uintptr_t*)bnd & 1) == 1 && (*(uintptr_t*)val & 1) == 0)) gc_queue_binding(bnd); #endif } @@ -1391,7 +1391,7 @@ static inline void gc_wb_fwd(void* parent, void* ptr) { #ifdef GC_INC // if parent is marked and ptr is clean - if(__unlikely((*((uintptr_t*)parent) & 3) == 1 && (*((uintptr_t*)ptr) & 1) == 0)) { + if(__unlikely((*((uintptr_t*)parent) & 1) == 1 && (*((uintptr_t*)ptr) & 1) == 0)) { gc_queue_root((void*)((uintptr_t)ptr | 1)); } #endif @@ -1399,7 +1399,7 @@ static inline void gc_wb_fwd(void* parent, void* ptr) static inline void gc_wb(void *parent, void *ptr) { - if (__unlikely((*((uintptr_t*)parent) & 3) == 1 && + if (__unlikely((*((uintptr_t*)parent) & 1) == 1 && (*((uintptr_t*)ptr) & 1) == 0)) gc_queue_root(parent); } @@ -1408,7 
+1408,7 @@ static inline void gc_wb_buf(void *parent, void *bufptr) { #ifdef GC_INC // if parent is marked and buf is not - if (__unlikely((*((uintptr_t*)parent) & 3) == 1)) + if (__unlikely((*((uintptr_t*)parent) & 1) == 1)) // (*((uintptr_t*)bufptr) & 3) != 1)) gc_setmark_buf(bufptr, *(uintptr_t*)parent & 3); #endif @@ -1418,7 +1418,7 @@ static inline void gc_wb_back(void *ptr) { #ifdef GC_INC // if ptr is marked - if(__unlikely((*((uintptr_t*)ptr) & 3) == 1)) { + if(__unlikely((*((uintptr_t*)ptr) & 1) == 1)) { // *((uintptr_t*)ptr) &= ~(uintptr_t)3; // clear the mark gc_queue_root(ptr); } From e47f5e16e6888be588e6fe757cdd214db3480b19 Mon Sep 17 00:00:00 2001 From: Oscar Blumberg Date: Mon, 6 Oct 2014 17:35:39 +0200 Subject: [PATCH 15/17] add some more timing output. improve a bit the page allocator. --- base/util.jl | 13 ++++++++++--- src/gc.c | 32 +++++++++++++++++++++----------- src/julia.h | 2 ++ 3 files changed, 33 insertions(+), 14 deletions(-) diff --git a/base/util.jl b/base/util.jl index bd7d7afa9f890..9e3c63677da0b 100644 --- a/base/util.jl +++ b/base/util.jl @@ -12,6 +12,9 @@ gc_time_ns() = ccall(:jl_gc_total_hrtime, Uint64, ()) # total number of bytes allocated so far gc_bytes() = ccall(:jl_gc_total_bytes, Int64, ()) +gc_num_pause() = ccall(:jl_gc_num_pause, Int64, ()) +gc_num_full_sweep() = ccall(:jl_gc_num_full_sweep, Int64, ()) + # reset the malloc log. Used to avoid counting memory allocated during compilation. clear_malloc_data() = ccall(:jl_clear_malloc_data, Void, ()) @@ -40,14 +43,14 @@ end # print elapsed time, return expression value const _units = ["bytes", "kB", "MB"] -function time_print(t, b, g) +function time_print(t, b, g, np, nfs) i = 1 while b > 1024 && i < length(_units) b = div(b, 1024) i += 1 end if 0 < g - @printf("elapsed time: %s seconds (%d %s allocated, %.2f%% gc time)\n", t/1e9, b, _units[i], 100*g/t) + @printf("elapsed time: %s seconds (%d %s allocated, %.2f%% gc time in %d pauses with %d full sweep)\n", t/1e9, b, _units[i], 100*g/t, np, nfs) else @printf("elapsed time: %s seconds (%d %s allocated)\n", t/1e9, b, _units[i]) end @@ -58,11 +61,15 @@ macro time(ex) local b0 = gc_bytes() local t0 = time_ns() local g0 = gc_time_ns() + local n0 = gc_num_pause() + local nfs0 = gc_num_full_sweep() local val = $(esc(ex)) + local nfs1 = gc_num_full_sweep() + local n1 = gc_num_pause() local g1 = gc_time_ns() local t1 = time_ns() local b1 = gc_bytes() - time_print(t1-t0, b1-b0, g1-g0) + time_print(t1-t0, b1-b0, g1-g0, n1-n0, nfs1-nfs0) val end end diff --git a/src/gc.c b/src/gc.c index ad7c5c86083dc..47a8ca400813e 100644 --- a/src/gc.c +++ b/src/gc.c @@ -48,8 +48,8 @@ typedef struct { static region_t *heaps[HEAP_COUNT] = {NULL}; // store a lower bound of the first free block in each region static int heaps_lb[HEAP_COUNT] = {0}; -// same with an upper bound -static int heaps_ub[HEAP_COUNT] = {0}; +// an upper bound of the last non-free block +static int heaps_ub[HEAP_COUNT] = {REGION_PG_COUNT/32-1}; // every 2^PAGE_GROUP_COUNT_LG2 gc page is reserved for the following pages' metadata // this requires : @@ -507,7 +507,7 @@ static inline void free_page(void *p) VirtualFree(&heap->pages[pg_idx], GC_PAGE_SZ, MEM_DECOMMIT); #else madvise(&heap->pages[pg_idx], GC_PAGE_SZ, MADV_DONTNEED); -#endif +#endif } if (heaps_lb[i] > pg_idx/32) heaps_lb[i] = pg_idx/32; current_pg_count--; @@ -1042,16 +1042,20 @@ static int lazy_freed_pages = 0; static int page_done = 0; static gcval_t** sweep_page(pool_t* p, gcpage_t* pg, gcval_t **pfl,int,int); static void 
_update_freelist(pool_t* p, gcval_t* next); -static void sweep_pool_region(region_t* heap, int sweep_mask) +static void sweep_pool_region(int heap_i, int sweep_mask) { + region_t* heap = heaps[heap_i]; gcval_t **pfl[N_POOLS]; for (int i = 0; i < N_POOLS; i++) { _update_freelist(&norm_pools[i], NULL); pfl[i] = &norm_pools[i].freelist; } - for (int pg_i = 0; pg_i < REGION_PG_COUNT/32; pg_i++) { + int ub = 0; + int lb = heaps_lb[heap_i]; + for (int pg_i = 0; pg_i <= heaps_ub[heap_i]; pg_i++) { uint32_t line = heap->freemap[pg_i]; if (!!~line) { + ub = pg_i; for (int j = 1; j < 32; j++) { if (!((line >> j) & 1)) { gcpage_t *pg = GC_PAGE(heap->pages[pg_i*32 + j]); @@ -1061,8 +1065,10 @@ static void sweep_pool_region(region_t* heap, int sweep_mask) pfl[p_n] = sweep_page(p, pg, pfl[p_n], sweep_mask, osize); } } - } + } else if (pg_i < lb) lb = pg_i; } + heaps_ub[heap_i] = ub; + heaps_lb[heap_i] = lb; int i = 0; for (pool_t* p = norm_pools; p < norm_pools + N_POOLS; p++) { *pfl[i++] = NULL; @@ -1161,7 +1167,7 @@ static gcval_t** sweep_page(pool_t* p, gcpage_t* pg, gcval_t **pfl, int sweep_ma // the eager one uses less memory. pg_total++; if (freedall) { - if (prev_sweep_mask == GC_MARKED_NOESC && lazy_freed_pages <= default_collect_interval/4*4096) { + if (sweep_mask == GC_MARKED_NOESC && lazy_freed_pages <= default_collect_interval/(4*4096)) { gcval_t *begin = reset_page(p, pg, 0); *prev_pfl = begin; pfl = (gcval_t**)((char*)begin + ((int)pg->nfree - 1)*osize); @@ -1245,13 +1251,15 @@ static int gc_sweep_inc(int sweep_mask) #endif for (int i = 0; i < HEAP_COUNT; i++) { if (heaps[i]) - /*finished &= */sweep_pool_region(heaps[i], sweep_mask); + /*finished &= */sweep_pool_region(i, sweep_mask); } #ifdef GC_INC check_timeout = ct; #endif #ifdef GC_TIME - JL_PRINTF(JL_STDOUT, "GC sweep pools %s %.2f (skipped %d%% of %d, done %d pgs, %d freed with %d lazily) mask %d\n", finished ? "end" : "inc", (clock_now() - t0)*1000, total_pages ? (skipped_pages*100)/total_pages : 0, total_pages, page_done, freed_pages, lazy_freed_pages, sweep_mask); + double sweep_pool_sec = clock_now() - t0; + double sweep_speed = (((double)total_pages)*GC_PAGE_SZ)/(1024*1024); + JL_PRINTF(JL_STDOUT, "GC sweep pools %s %.2f at %.1f MB/s (skipped %d%% of %d, done %d pgs, %d freed with %d lazily) mask %d\n", finished ? "end" : "inc", sweep_pool_sec*1000, sweep_speed, total_pages ? 
(skipped_pages*100)/total_pages : 0, total_pages, page_done, freed_pages, lazy_freed_pages, sweep_mask); #endif return finished; } @@ -1940,6 +1948,8 @@ DLLEXPORT int jl_gc_is_enabled(void) { return is_gc_enabled; } DLLEXPORT int64_t jl_gc_total_bytes(void) { return total_allocd_bytes + allocd_bytes + collect_interval/gc_steps; } DLLEXPORT uint64_t jl_gc_total_hrtime(void) { return total_gc_time; } +DLLEXPORT int64_t jl_gc_num_pause(void) { return n_pause; } +DLLEXPORT int64_t jl_gc_num_full_sweep(void) { return n_full_sweep; } int64_t diff_gc_total_bytes(void) { @@ -2182,11 +2192,11 @@ void jl_gc_collect(void) allocd_bytes_since_sweep = 0; freed_bytes = 0; -#ifdef GC_TIME +#if defined(GC_FINAL_STATS) || defined(GC_TIME) finalize_time = jl_hrtime(); #endif run_finalizers(); -#ifdef GC_TIME +#if defined(GC_FINAL_STATS) || defined(GC_TIME) finalize_time = jl_hrtime() - finalize_time; #endif } diff --git a/src/julia.h b/src/julia.h index 28037aee7dce7..4857fa9b171e2 100644 --- a/src/julia.h +++ b/src/julia.h @@ -1107,6 +1107,8 @@ void *reallocb(void*, size_t); DLLEXPORT void *allocobj(size_t sz); DLLEXPORT void jl_clear_malloc_data(void); +DLLEXPORT int64_t jl_gc_num_pause(void); +DLLEXPORT int64_t jl_gc_num_full_sweep(void); #else From 1a0b7069068375c288190bdc933aa86d2a72692e Mon Sep 17 00:00:00 2001 From: Oscar Blumberg Date: Mon, 6 Oct 2014 18:26:30 +0200 Subject: [PATCH 16/17] oops --- src/codegen.cpp | 2 +- src/gc.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/codegen.cpp b/src/codegen.cpp index 864241d354dc6..507fab9eddf5c 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -2891,7 +2891,7 @@ static Value *emit_expr(jl_value_t *expr, jl_codectx_t *ctx, bool isboxed, make_gcroot(a1, ctx); Value *a2 = boxed(emit_expr(args[2], ctx),ctx); make_gcroot(a2, ctx); - Value *mdargs[6] = { name, bp, bp_owner, literal_pointer_val(bnd), a1, a2, literal_pointer_val(args[3]) }; + Value *mdargs[7] = { name, bp, bp_owner, literal_pointer_val(bnd), a1, a2, literal_pointer_val(args[3]) }; ctx->argDepth = last_depth; return builder.CreateCall(prepare_call(jlmethod_func), ArrayRef(&mdargs[0], 6)); } diff --git a/src/gc.c b/src/gc.c index 47a8ca400813e..4130ba7698bee 100644 --- a/src/gc.c +++ b/src/gc.c @@ -1258,8 +1258,8 @@ static int gc_sweep_inc(int sweep_mask) #endif #ifdef GC_TIME double sweep_pool_sec = clock_now() - t0; - double sweep_speed = (((double)total_pages)*GC_PAGE_SZ)/(1024*1024); - JL_PRINTF(JL_STDOUT, "GC sweep pools %s %.2f at %.1f MB/s (skipped %d%% of %d, done %d pgs, %d freed with %d lazily) mask %d\n", finished ? "end" : "inc", sweep_pool_sec*1000, sweep_speed, total_pages ? (skipped_pages*100)/total_pages : 0, total_pages, page_done, freed_pages, lazy_freed_pages, sweep_mask); + double sweep_speed = ((((double)total_pages)*GC_PAGE_SZ)/(1024*1024*1024))/sweep_pool_sec; + JL_PRINTF(JL_STDOUT, "GC sweep pools %s %.2f at %.1f GB/s (skipped %d%% of %d, done %d pgs, %d freed with %d lazily) mask %d\n", finished ? "end" : "inc", sweep_pool_sec*1000, sweep_speed, total_pages ?
(skipped_pages*100)/total_pages : 0, total_pages, page_done, freed_pages, lazy_freed_pages, sweep_mask); #endif return finished; } From 7447ccb9be8d5f72a7bafc8cfaa43b25f87c1534 Mon Sep 17 00:00:00 2001 From: Oscar Blumberg Date: Thu, 16 Oct 2014 15:26:04 +0200 Subject: [PATCH 17/17] remove gc_inc, add more comments --- src/codegen.cpp | 4 - src/gc.c | 228 ++++++++++++++++++++++-------------------------- src/julia.h | 15 +--- src/options.h | 6 +- 4 files changed, 111 insertions(+), 142 deletions(-) diff --git a/src/codegen.cpp b/src/codegen.cpp index 507fab9eddf5c..c65326fc5d50f 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -1361,7 +1361,6 @@ static Value *emit_boxed_rooted(jl_value_t *e, jl_codectx_t *ctx) // if ptr is NULL this emits a write barrier _back_ static void emit_write_barrier(jl_codectx_t* ctx, Value *parent, Value *ptr) { - #ifdef GC_INC /* builder.CreateCall2(wbfunc, builder.CreateBitCast(parent, jl_pvalue_llvmt), builder.CreateBitCast(ptr, jl_pvalue_llvmt)); return;*/ parent = builder.CreateBitCast(parent, T_psize); @@ -1386,12 +1385,10 @@ static void emit_write_barrier(jl_codectx_t* ctx, Value *parent, Value *ptr) builder.CreateBr(cont); ctx->f->getBasicBlockList().push_back(cont); builder.SetInsertPoint(cont); - #endif } static void emit_checked_write_barrier(jl_codectx_t *ctx, Value *parent, Value *ptr) { -#ifdef GC_INC BasicBlock *cont; Value *not_null = builder.CreateICmpNE(ptr, V_null); BasicBlock *if_not_null = BasicBlock::Create(getGlobalContext(), "wb_not_null", ctx->f); @@ -1402,7 +1399,6 @@ static void emit_checked_write_barrier(jl_codectx_t *ctx, Value *parent, Value * builder.CreateBr(cont); ctx->f->getBasicBlockList().push_back(cont); builder.SetInsertPoint(cont); -#endif } diff --git a/src/gc.c b/src/gc.c index 4130ba7698bee..52ec6cc763668 100644 --- a/src/gc.c +++ b/src/gc.c @@ -146,13 +146,9 @@ typedef struct _gcpage_t { // GC knobs and self-measurement variables static int64_t last_gc_total_bytes = 0; -#ifdef GC_INC static int gc_inc_steps = 1; static int gc_quick_steps = 32; -//static int gc_sweep_steps = 1; -#else -static const int gc_inc_steps = 1; -#endif + #ifdef _P64 #define default_collect_interval (5600*1024*sizeof(void*)) static size_t max_collect_interval = 1250000000UL; @@ -213,14 +209,10 @@ int sweeping = 0; // Old objects are put back in clean state only on major collection // (or more precisely, while sweeping at the previous collection) -#ifdef GC_INC static int64_t scanned_bytes; static int64_t perm_scanned_bytes; static int prev_sweep_mask = GC_MARKED; static size_t scanned_bytes_goal; -#else -const int prev_sweep_mask = GC_MARKED; -#endif #define gc_bits(o) (((gcval_t*)(o))->gc_bits) #define gc_marked(o) (((gcval_t*)(o))->gc_bits & GC_MARKED) @@ -387,6 +379,9 @@ static inline int gc_setmark(void *o, int sz, int mark_mode) inline void gc_setmark_buf(void *o, int mark_mode) { +#ifdef MEMDEBUG + return gc_setmark_big(o, mark_mode); +#endif buff_t *buf = gc_val_buf(o); if (buf->pooled) gc_setmark_pool(buf, mark_mode); @@ -513,8 +508,7 @@ static inline void free_page(void *p) current_pg_count--; } -#ifdef GC_INC -#define should_collect() (__unlikely(allocd_bytes > 0)) +#define should_collect() (__unlikely(allocd_bytes>0)) static inline int maybe_collect(void) { if (should_collect()) { @@ -523,9 +517,6 @@ static inline int maybe_collect(void) } return 0; } -#else -#define maybe_collect() if (__unlikely(allocd_bytes > collect_interval)) jl_gc_collect() -#endif DLLEXPORT void *jl_gc_counted_malloc(size_t sz) { @@ -933,7 +924,9 @@ static 
inline gcval_t *reset_page(pool_t *p, gcpage_t *pg, gcval_t *fl) return beg; } - +// assign p->freelist = next +// takes care of flushing the page metadata cache +// if page(p->freelist) != page(next) static inline void _update_freelist(pool_t* p, gcval_t* next) { gcval_t *cur = p->freelist; @@ -1026,13 +1019,8 @@ static int szclass(size_t sz) return 41; } -#ifdef GC_INC int check_timeout = 0; -//#define should_timeout() (check_timeout && scanned_bytes >= scanned_bytes_goal) -#define should_timeout() 0 -#else #define should_timeout() 0 -#endif static int skipped_pages = 0; @@ -1203,7 +1191,6 @@ static gcval_t** sweep_page(pool_t* p, gcpage_t* pg, gcval_t **pfl, int sweep_ma extern void jl_unmark_symbols(void); - // if mark_bits & sweep_mask == sweep_mask we reset the mark while sweeping the heap static void gc_sweep_once(int sweep_mask) { @@ -1244,18 +1231,12 @@ static int gc_sweep_inc(int sweep_mask) lazy_freed_pages = 0; page_done = 0; int finished = 1; -#ifdef GC_INC - int ct = check_timeout; - if (sweep_mask == GC_MARKED_NOESC || gc_steps == 1) check_timeout = 0; - check_timeout = 0; -#endif + for (int i = 0; i < HEAP_COUNT; i++) { if (heaps[i]) /*finished &= */sweep_pool_region(i, sweep_mask); } -#ifdef GC_INC - check_timeout = ct; -#endif + #ifdef GC_TIME double sweep_pool_sec = clock_now() - t0; double sweep_speed = ((((double)total_pages)*GC_PAGE_SZ)/(1024*1024*1024))/sweep_pool_sec; @@ -1287,7 +1268,7 @@ void grow_mark_stack(void) } int max_msp = 0; -#ifdef GC_INC + static arraylist_t tasks; static arraylist_t rem_bindings; static arraylist_t _remset[2]; @@ -1300,7 +1281,7 @@ void reset_remset(void) last_remset = tmp; remset->len = 0; } -#endif + DLLEXPORT void gc_queue_root(void *p) { void *ptr = (void*)((uintptr_t)p & ~(uintptr_t)1); @@ -1346,14 +1327,12 @@ static void gc_mark_stack(jl_value_t* ta, jl_gcframe_t *s, ptrint_t offset, int if (s->nroots & 1) { for(size_t i=0; i < nr; i++) { jl_value_t **ptr = (jl_value_t**)((char*)rts[i] + offset); - // scanned_bytes += sizeof(void*); if (*ptr != NULL) gc_push_root(*ptr, d); } } else { for(size_t i=0; i < nr; i++) { - // scanned_bytes += sizeof(void*); if (rts[i] != NULL) { verify_parent("task", ta, &rts[i], "stack(%d)", i); gc_push_root(rts[i], d); @@ -1536,7 +1515,7 @@ static int push_root(jl_value_t *v, int d, int bits) refyoung |= gc_push_root(elt, d); } // try to split large array marking - // if (should_timeout() && l > 1000) goto queue_the_root; + // if (should_timeout() && l > 1000) goto queue_the_root; } } } @@ -1550,6 +1529,8 @@ static int push_root(jl_value_t *v, int d, int bits) else if (vt == (jl_value_t*)jl_task_type) { MARK(v, bits = gc_setmark(v, sizeof(jl_task_t), GC_MARKED_NOESC)); gc_mark_task((jl_task_t*)v, d); + // tasks should always be remarked since we do not trigger the write barrier + // for stores to stack slots refyoung = GC_MARKED_NOESC; } else if(vt == (jl_value_t*)jl_symbol_type) { @@ -1566,6 +1547,7 @@ static int push_root(jl_value_t *v, int d, int bits) jl_datatype_t *dt = (jl_datatype_t*)vt; MARK(v, bits = gc_setmark(v, jl_datatype_size(dt), GC_MARKED_NOESC)); int nf = (int)jl_tuple_len(dt->names); + // TODO check if there is a perf improvement for objects with a lot of fields // int fdsz = sizeof(void*)*nf; // void** children = alloca(fdsz); // int ci = 0; @@ -1576,13 +1558,13 @@ static int push_root(jl_value_t *v, int d, int bits) jl_value_t *fld = *slot; if (fld) { verify_parent("object", v, slot, "field(%d)", i); - // children[ci++] = fld; + //children[ci++] = fld; refyoung |= gc_push_root(fld, 
d); } } } - // while(ci) - // refyoung |= gc_push_root(children[--ci], d); + //while(ci) + // refyoung |= gc_push_root(children[--ci], d); } #ifdef GC_VERIFY else { @@ -1597,6 +1579,7 @@ static int push_root(jl_value_t *v, int d, int bits) if (verifying) return bits; #endif if ((bits == GC_MARKED) && (refyoung == GC_MARKED_NOESC)) { + // v is an old object referencing young objects arraylist_push(remset, v); } return bits; @@ -1621,15 +1604,11 @@ static void visit_mark_stack_inc(int mark_mode) static void visit_mark_stack(int mark_mode) { -#ifdef GC_INC int ct = check_timeout; check_timeout = 0; -#endif visit_mark_stack_inc(mark_mode); assert(!mark_sp); -#ifdef GC_INC check_timeout = ct; -#endif } void jl_mark_box_caches(void); @@ -1687,66 +1666,6 @@ static void pre_mark(void) gc_push_root(jl_false, 0); } -#ifdef GC_VERIFY -static arraylist_t bits_save[4]; - -// set all mark bits to bits -// record the state of the heap and can replay it in restore() -// restore _must_ be called as this will overwrite parts of the -// freelist in pools -static void clear_mark(int bits) -{ - size_t i; - pool_t* pool; - gcpage_t* pg; - gcval_t* pv; - for(int i = 0; i < 4; i++) - bits_save[i].len = 0; - - bigval_t *bigs[] = { big_objects, big_objects_marked }; - for (int i = 0; i < 2; i++) { - bigval_t *v = bigs[i]; - while (v != NULL) { - void* gcv = &v->_data; - arraylist_push(&bits_save[gc_bits(gcv)], gcv); - gc_bits(gcv) = bits; - v = v->next; - } - } - for (int h = 0; h < HEAP_COUNT; h++) { - region_t* heap = heaps[h]; - if (!heap) break; - for (int pg_i = 0; pg_i < REGION_PG_COUNT/32; pg_i++) { - uint32_t line = heap->freemap[pg_i]; - if (!!~line) { - for (int j = 1; j < 32; j++) { - if (!((line >> j) & 1)) { - gcpage_t *pg = GC_PAGE(heap->pages[pg_i*32 + j]); - pool_t *pool = &norm_pools[pg->pool_n]; - pv = (gcval_t*)PAGE_DATA(pg); - char *lim = (char*)pv + GC_PAGE_SZ - pool->osize; - while ((char*)pv <= lim) { - arraylist_push(&bits_save[gc_bits(pv)], pv); - gc_bits(pv) = bits; - pv = (gcval_t*)((char*)pv + pool->osize); - } - } - } - } - } - } -} - -static void restore(void) -{ - for(int b = 0; b < 4; b++) { - for(int i = 0; i < bits_save[b].len; i++) { - gc_bits(bits_save[b].items[i]) = b; - } - } -} -#endif - static int n_finalized; static void post_mark(void) @@ -1873,6 +1792,64 @@ static void gc_mark(int finalize) // this does not yet detect missing writes from marked to marked_noesc // the error is caught at the first long collection #ifdef GC_VERIFY +static arraylist_t bits_save[4]; + +// set all mark bits to bits +// record the state of the heap and can replay it in restore() +// restore _must_ be called as this will overwrite parts of the +// freelist in pools +static void clear_mark(int bits) +{ + size_t i; + pool_t* pool; + gcpage_t* pg; + gcval_t* pv; + for(int i = 0; i < 4; i++) + bits_save[i].len = 0; + + bigval_t *bigs[] = { big_objects, big_objects_marked }; + for (int i = 0; i < 2; i++) { + bigval_t *v = bigs[i]; + while (v != NULL) { + void* gcv = &v->_data; + arraylist_push(&bits_save[gc_bits(gcv)], gcv); + gc_bits(gcv) = bits; + v = v->next; + } + } + for (int h = 0; h < HEAP_COUNT; h++) { + region_t* heap = heaps[h]; + if (!heap) break; + for (int pg_i = 0; pg_i < REGION_PG_COUNT/32; pg_i++) { + uint32_t line = heap->freemap[pg_i]; + if (!!~line) { + for (int j = 1; j < 32; j++) { + if (!((line >> j) & 1)) { + gcpage_t *pg = GC_PAGE(heap->pages[pg_i*32 + j]); + pool_t *pool = &norm_pools[pg->pool_n]; + pv = (gcval_t*)PAGE_DATA(pg); + char *lim = (char*)pv + GC_PAGE_SZ - 
pool->osize; + while ((char*)pv <= lim) { + arraylist_push(&bits_save[gc_bits(pv)], pv); + gc_bits(pv) = bits; + pv = (gcval_t*)((char*)pv + pool->osize); + } + } + } + } + } +} + +static void restore(void) +{ + for(int b = 0; b < 4; b++) { + for(int i = 0; i < bits_save[b].len; i++) { + gc_bits(bits_save[b].items[i]) = b; + } + } +} + static void gc_verify(void) { verifying = 1; @@ -2008,14 +1985,12 @@ int saved_mark_sp = 0; int sweep_mask = GC_MARKED; #define MIN_SCAN_BYTES 1024*1024 -//static void mark_task_stacks(); static void gc_mark_task_stack(jl_task_t*,int); void prepare_sweep(void) { } -#ifdef GC_INC #ifdef GC_VERIFY static void clear_mark(int); #endif @@ -2039,18 +2014,22 @@ void jl_gc_collect(void) if (gc_inc_steps > 1) check_timeout = 1; assert(mark_sp == 0); + + // 1. mark every object in the remset reset_remset(); - int SA = perm_scanned_bytes; for(int i = 0; i < last_remset->len; i++) { uintptr_t item = (uintptr_t)last_remset->items[i]; void* ptr = (void*)(item & ~(uintptr_t)1); objprofile_count(jl_typeof(ptr), 2, 0); if (item & 1) { + // some objects are required to stay in the remset between quick collections arraylist_push(remset, (void*)item); } gc_bits(ptr) = GC_MARKED; push_root(ptr, 0, gc_bits(ptr)); } + + // 2. mark every object in a remembered binding int n_bnd_refyoung = 0; for (int i = 0; i < rem_bindings.len; i++) { void *ptr = rem_bindings.items[i]; @@ -2061,8 +2040,8 @@ } } rem_bindings.len = n_bnd_refyoung; - perm_scanned_bytes = SA; + // 3. walk roots pre_mark(); visit_mark_stack(GC_MARKED_NOESC); @@ -2085,8 +2064,10 @@ total_mark_time += mark_pause; #endif } -#if defined(GC_TIME) || defined(GC_FINAL_STATS) +#ifdef GC_TIME int64_t bonus = -1, SAVE = -1, SAVE2 = -1, SAVE3 = -1, pct = -1; +#endif +#if defined(GC_TIME) || defined(GC_FINAL_STATS) uint64_t post_time = 0, finalize_time = 0; #endif if(mark_sp == 0 || sweeping) { @@ -2095,17 +2076,21 @@ #endif int64_t actual_allocd = allocd_bytes_since_sweep; if (!sweeping) { + // marking is over #ifdef GC_TIME post_time = jl_hrtime(); #endif - post_mark(); + // 4. check for objects to finalize + post_mark(); #ifdef GC_TIME post_time = jl_hrtime() - post_time; #endif - /* - est_fb = live_bytes - scanned_bytes - (sweep_mask == GC_MARKED_NOESC ? perm_scanned_bytes : perm_scanned_bytes) + actual_allocd; - promo_bytes = perm_scanned_bytes - last_perm_scanned; - int promo_pct = (actual_allocd - est_fb) ? (promo_bytes*100)/(actual_allocd - est_fb) : 100;*/ + /* We should be able to know at this point how much memory we are going to + free by running a sweep. This would allow us to make a better decision + about the next collection, instead of waiting another cycle. + However it has proven a bit annoying to get this number right given the many ways + of allocating memory (shared arrays, ...). TODO */ + //est_fb = live_bytes - scanned_bytes - perm_scanned_bytes + actual_allocd; #ifdef GC_VERIFY gc_verify(); #endif @@ -2120,27 +2105,28 @@ #endif total_allocd_bytes += allocd_bytes_since_sweep; - prepare_sweep(); - + // 5. next collection decision if (quick_count >= gc_quick_steps) { sweep_mask = GC_MARKED; // next collection is a full one - gc_steps = 1;//gc_inc_steps; + gc_steps = 1; quick_count = 0; } else { sweep_mask = GC_MARKED_NOESC; // next collection is quick - gc_steps = 1;//gc_quick_steps; + gc_steps = 1; } if (sweep_mask == GC_MARKED) perm_scanned_bytes = 0; scanned_bytes = 0; live_bytes2 = 0; + // 6.
start sweeping gc_sweep_once(sweep_mask); sweeping = 1; - // gc_steps = gc_sweep_steps; } if (gc_sweep_inc(sweep_mask)) { // sweeping is over + // 7. if it is a quick sweep, put back the remembered objects in queued state + // so that we don't trigger the barrier again on them. if (sweep_mask == GC_MARKED_NOESC) { for (int i = 0; i < remset->len; i++) { gc_bits(((uintptr_t)remset->items[i] & ~(uintptr_t)1)) = GC_QUEUED; @@ -2161,7 +2147,9 @@ SAVE3 = allocd_bytes_since_sweep; pct = actual_allocd ? (freed_bytes*100)/actual_allocd : -1; #endif - + // 8. try to avoid doing unnecessary work and/or wasting too much memory + // as noted above this should really be decided _before_ sweeping. + // in any case this is still very ad hoc and could be improved a lot. if (sweep_mask == GC_MARKED_NOESC) { collect_interval = default_collect_interval; if (freed_bytes >= actual_allocd) { quick_count--; } @@ -2221,7 +2209,7 @@ jl_in_gc = 0; } -#else +#if 0 void jl_gc_collect(void) { @@ -2415,9 +2403,7 @@ void jl_gc_init(void) collect_interval = default_collect_interval; allocd_bytes = -default_collect_interval; -#ifdef GC_INC gc_steps = gc_inc_steps; -#endif htable_new(&finalizer_table, 0); arraylist_new(&to_finalize, 0); @@ -2429,12 +2415,10 @@ arraylist_new(&lostval_parents, 0); arraylist_new(&lostval_parents_done, 0); #endif -#ifdef GC_INC arraylist_new(&tasks, 0); arraylist_new(&rem_bindings, 0); arraylist_new(remset, 0); arraylist_new(last_remset, 0); -#endif #ifdef OBJPROFILE for(int g=0; g<3; g++) { diff --git a/src/julia.h b/src/julia.h index 4857fa9b171e2..4e0a1af0520f1 100644 --- a/src/julia.h +++ b/src/julia.h @@ -432,11 +432,7 @@ extern jl_sym_t *arrow_sym; extern jl_sym_t *ldots_sym; #ifdef OVERLAP_TUPLE_LEN #define jl_typeof(v) ((jl_value_t*)((uptrint_t)((jl_value_t*)(v))->type & 0x000ffffffffffffeULL)) #else -#ifdef GC_INC #define jl_typeof(v) ((jl_value_t*)((uptrint_t)((jl_value_t*)(v))->type & ((uintptr_t)~3))) -#else -#define jl_typeof(v) (((jl_value_t*)(v))->type) -#endif #endif #define jl_typeis(v,t) (jl_typeof(v)==(jl_value_t*)(t)) @@ -1383,20 +1379,18 @@ DLLEXPORT void gc_wb_slow(void* parent, void* ptr); static inline void gc_wb_binding(void *bnd, void *val) { - #ifdef GC_INC if (__unlikely((*(uintptr_t*)bnd & 1) == 1 && (*(uintptr_t*)val & 1) == 0)) gc_queue_binding(bnd); - #endif } static inline void gc_wb_fwd(void* parent, void* ptr) { - #ifdef GC_INC // if parent is marked and ptr is clean if(__unlikely((*((uintptr_t*)parent) & 1) == 1 && (*((uintptr_t*)ptr) & 1) == 0)) { + // the set lsb indicates this object must stay in the remset until the next + // long collection gc_queue_root((void*)((uintptr_t)ptr | 1)); } - #endif } static inline void gc_wb(void *parent, void *ptr) @@ -1408,23 +1402,18 @@ static inline void gc_wb(void *parent, void *ptr) static inline void gc_wb_buf(void *parent, void *bufptr) { - #ifdef GC_INC // if parent is marked and buf is not if (__unlikely((*((uintptr_t*)parent) & 1) == 1)) // (*((uintptr_t*)bufptr) & 3) != 1)) gc_setmark_buf(bufptr, *(uintptr_t*)parent & 3); - #endif } static inline void gc_wb_back(void *ptr) { - #ifdef GC_INC // if ptr is marked if(__unlikely((*((uintptr_t*)ptr) & 1) == 1)) { gc_queue_root(ptr); } - #endif } #ifdef __cplusplus diff --git a/src/options.h b/src/options.h index 80225e18bad2b..cf98971d18468 100644 --- a/src/options.h +++ b/src/options.h @@ -25,9 +25,7 @@ // only one GC is supported at this time #define
JL_GC_MARKSWEEP -#ifndef GC_NO_INC -#define GC_INC -#endif + // debugging options // with MEMDEBUG, every object is allocated explicitly with malloc, to help // catch invalid accesses. //#define MEMDEBUG +// GC_VERIFY forces a full verification GC along with every quick GC to ensure that no +// reachable memory is freed //#define GC_VERIFY // profiling options
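Taken together, the series leaves the pool allocator with fixed address arithmetic, as set up in patch 13: 16 kB pages grouped 32 to a naturally aligned 512 kB block, with the first page of each group holding the gcpage_t metadata for the 31 data pages after it, so page metadata is reachable from any interior pointer with shifts and masks alone. The toy program below works one example through; group_base and page_idx are illustrative stand-ins for what the GC_PAGES and GC_PAGE_IDX macros compute, and the pointer value is made up.

#include <stdint.h>
#include <stdio.h>

#define GC_PAGE_LG2          14                  /* 16 kB gc pages */
#define GC_PAGE_SZ           (1 << GC_PAGE_LG2)
#define PAGE_GROUP_COUNT_LG2 5                   /* 32 pages per group */
#define PAGE_GROUP_LG2       (GC_PAGE_LG2 + PAGE_GROUP_COUNT_LG2) /* 2^19 = 512 kB */

/* Round an interior pointer down to its naturally aligned 512 kB group;
   page 0 of the group is the reserved metadata page. */
static uintptr_t group_base(uintptr_t p)
{
    return p >> PAGE_GROUP_LG2 << PAGE_GROUP_LG2;
}

/* 0-based index of a data page within its group (0..30): skip the group
   base and the leading metadata page, then divide by the page size. */
static uintptr_t page_idx(uintptr_t p)
{
    return (p - group_base(p) - GC_PAGE_SZ) / GC_PAGE_SZ;
}

int main(void)
{
    uintptr_t base = (uintptr_t)1 << 40;      /* pretend group base */
    uintptr_t p = base + 3*GC_PAGE_SZ + 100;  /* object in the 3rd data page */
    printf("group %#llx, metadata slot %llu\n",
           (unsigned long long)group_base(p), (unsigned long long)page_idx(p));
    /* prints: group 0x10000000000, metadata slot 2 */
    return 0;
}

The constraint stated in the patch, sizeof(gcpage_t)*2^PAGE_GROUP_COUNT_LG2 <= GC_PAGE_SZ, is what guarantees that all of a group's gcpage_t entries fit in the reserved page.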