diff --git a/src/Makefile b/src/Makefile
index 75fb3265b88df4..1ccbee2c1afa32 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -28,7 +28,7 @@ SRCS := \
     jltypes gf typemap ast builtins module interpreter \
     alloc dlload sys init task array dump toplevel jl_uv jlapi signal-handling \
     simplevector APInt-C runtime_intrinsics runtime_ccall \
-    threadgroup threading stackwalk gc gc-debug safepoint
+    threadgroup threading stackwalk gc gc-debug gc-pages safepoint
 
 ifeq ($(JULIACODEGEN),LLVM)
 SRCS += codegen disasm debuginfo llvm-simdloop llvm-gcroot
@@ -119,6 +119,7 @@ $(BUILDDIR)/disasm.o $(BUILDDIR)/disasm.dbg.obj: $(SRCDIR)/codegen_internal.h
 $(BUILDDIR)/builtins.o $(BUILDDIR)/builtins.dbg.obj: $(SRCDIR)/table.c
 $(BUILDDIR)/gc.o $(BUILDDIR)/gc.dbg.obj: $(SRCDIR)/gc.h
 $(BUILDDIR)/gc-debug.o $(BUILDDIR)/gc-debug.dbg.obj: $(SRCDIR)/gc.h
+$(BUILDDIR)/gc-pages.o $(BUILDDIR)/gc-pages.dbg.obj: $(SRCDIR)/gc.h
 $(BUILDDIR)/signal-handling.o $(BUILDDIR)/signal-handling.dbg.obj: $(addprefix $(SRCDIR)/,signals-*.c)
 $(BUILDDIR)/dump.o $(BUILDDIR)/dump.dbg.obj: $(addprefix $(SRCDIR)/,common_symbols1.inc common_symbols2.inc)
 $(addprefix $(BUILDDIR)/,threading.o threading.dbg.obj gc.o gc.dbg.obj init.c init.dbg.obj task.o task.dbg.obj): $(addprefix $(SRCDIR)/,threading.h threadgroup.h)
diff --git a/src/gc-pages.c b/src/gc-pages.c
new file mode 100644
index 00000000000000..1289cf6792d97f
--- /dev/null
+++ b/src/gc-pages.c
@@ -0,0 +1,155 @@
+// This file is a part of Julia. License is MIT: http://julialang.org/license
+
+#include "gc.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// A region is contiguous storage for up to REGION_PG_COUNT naturally aligned GC_PAGE_SZ pages
+// It uses a very naive allocator (see jl_gc_alloc_page & jl_gc_free_page)
+#if defined(_P64) && !defined(_COMPILER_MICROSOFT_)
+#define REGION_PG_COUNT 16*8*4096 // 8G because virtual memory is cheap
+#else
+#define REGION_PG_COUNT 8*4096 // 512M
+#endif
+
+static jl_mutex_t pagealloc_lock;
+static size_t current_pg_count = 0;
+
+NOINLINE void *jl_gc_alloc_page(void)
+{
+    void *ptr = NULL;
+    int i;
+    region_t *region;
+    int region_i = 0;
+    JL_LOCK_NOGC(&pagealloc_lock);
+    while(region_i < REGION_COUNT) {
+        region = &regions[region_i];
+        if (region->pages == NULL) {
+            int pg_cnt = REGION_PG_COUNT;
+            const size_t pages_sz = sizeof(jl_gc_page_t) * pg_cnt;
+            const size_t freemap_sz = sizeof(uint32_t) * pg_cnt / 32;
+            const size_t meta_sz = sizeof(jl_gc_pagemeta_t) * pg_cnt;
+            size_t alloc_size = pages_sz + freemap_sz + meta_sz;
+#ifdef _OS_WINDOWS_
+            char *mem = (char*)VirtualAlloc(NULL, alloc_size + GC_PAGE_SZ,
+                                            MEM_RESERVE, PAGE_READWRITE);
+#else
+            if (GC_PAGE_SZ > jl_page_size)
+                alloc_size += GC_PAGE_SZ;
+            char *mem = (char*)mmap(0, alloc_size, PROT_READ | PROT_WRITE, MAP_NORESERVE | MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+            mem = mem == MAP_FAILED ? NULL : mem;
+#endif
+            if (mem == NULL) {
+                jl_printf(JL_STDERR, "could not allocate pools\n");
+                gc_debug_critical_error();
+                abort();
+            }
+            if (GC_PAGE_SZ > jl_page_size) {
+                // round data pointer up to the nearest gc_page_data-aligned
+                // boundary if mmap didn't already do so.
+                mem = (char*)gc_page_data(mem + GC_PAGE_SZ - 1);
+            }
+            region->pages = (jl_gc_page_t*)mem;
+            region->freemap = (uint32_t*)(mem + pages_sz);
+            region->meta = (jl_gc_pagemeta_t*)(mem + pages_sz + freemap_sz);
+            region->lb = 0;
+            region->ub = 0;
+            region->pg_cnt = pg_cnt;
+#ifdef _OS_WINDOWS_
+            VirtualAlloc(region->freemap, region->pg_cnt / 8,
+                         MEM_COMMIT, PAGE_READWRITE);
+            VirtualAlloc(region->meta, region->pg_cnt * sizeof(jl_gc_pagemeta_t),
+                         MEM_COMMIT, PAGE_READWRITE);
+#endif
+            memset(region->freemap, 0xff, region->pg_cnt / 8);
+        }
+        for (i = region->lb; i < region->pg_cnt / 32; i++) {
+            if (region->freemap[i])
+                break;
+        }
+        if (i == region->pg_cnt / 32) {
+            // region full
+            region_i++;
+            continue;
+        }
+        break;
+    }
+    if (region_i >= REGION_COUNT) {
+        jl_printf(JL_STDERR, "increase REGION_COUNT or allocate less memory\n");
+        gc_debug_critical_error();
+        abort();
+    }
+    if (region->lb < i)
+        region->lb = i;
+    if (region->ub < i)
+        region->ub = i;
+
+#if defined(_COMPILER_MINGW_)
+    int j = __builtin_ffs(region->freemap[i]) - 1;
+#elif defined(_COMPILER_MICROSOFT_)
+    unsigned long j;
+    _BitScanForward(&j, region->freemap[i]);
+#else
+    int j = ffs(region->freemap[i]) - 1;
+#endif
+
+    region->freemap[i] &= ~(uint32_t)(1 << j);
+    ptr = region->pages[i*32 + j].data;
+#ifdef _OS_WINDOWS_
+    VirtualAlloc(ptr, GC_PAGE_SZ, MEM_COMMIT, PAGE_READWRITE);
+#endif
+    current_pg_count++;
+#ifdef GC_FINAL_STATS
+    max_pg_count = max_pg_count < current_pg_count ? current_pg_count : max_pg_count;
+#endif
+    JL_UNLOCK_NOGC(&pagealloc_lock);
+    return ptr;
+}
+
+void jl_gc_free_page(void *p)
+{
+    int pg_idx = -1;
+    int i;
+    region_t *region = regions;
+    for (i = 0; i < REGION_COUNT && regions[i].pages != NULL; i++) {
+        region = &regions[i];
+        pg_idx = page_index(region, p);
+        if (pg_idx >= 0 && pg_idx < region->pg_cnt) {
+            break;
+        }
+    }
+    assert(i < REGION_COUNT && region->pages != NULL);
+    uint32_t msk = (uint32_t)(1 << (pg_idx % 32));
+    assert(!(region->freemap[pg_idx/32] & msk));
+    region->freemap[pg_idx/32] ^= msk;
+    free(region->meta[pg_idx].ages);
+    // tell the OS we don't need these pages right now
+    size_t decommit_size = GC_PAGE_SZ;
+    if (GC_PAGE_SZ < jl_page_size) {
+        // make sure we don't release more memory than intended
+        size_t n_pages = (GC_PAGE_SZ + jl_page_size - 1) / GC_PAGE_SZ;
+        decommit_size = jl_page_size;
+        p = (void*)((uintptr_t)region->pages[pg_idx].data & ~(jl_page_size - 1)); // round down to the nearest page
+        pg_idx = page_index(region, p);
+        if (pg_idx + n_pages > region->pg_cnt) goto no_decommit;
+        for (; n_pages--; pg_idx++) {
+            msk = (uint32_t)(1 << (pg_idx % 32));
+            if (!(region->freemap[pg_idx/32] & msk)) goto no_decommit;
+        }
+    }
+#ifdef _OS_WINDOWS_
+    VirtualFree(p, decommit_size, MEM_DECOMMIT);
+#else
+    madvise(p, decommit_size, MADV_DONTNEED);
+#endif
+no_decommit:
+    if (region->lb > pg_idx / 32)
+        region->lb = pg_idx / 32;
+    current_pg_count--;
+}
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/gc.c b/src/gc.c
index 49843a69cb499e..ca8542f43a2bb6 100644
--- a/src/gc.c
+++ b/src/gc.c
@@ -6,10 +6,9 @@
 extern "C" {
 #endif
 
-jl_mutex_t pagealloc_lock;
 // Protect all access to `finalizer_list`, `finalizer_list_marked` and
 // `to_finalize`.
-jl_mutex_t finalizers_lock;
+static jl_mutex_t finalizers_lock;
 
 /**
  * Note about GC synchronization:
@@ -273,8 +272,6 @@ static size_t max_collect_interval = 500000000UL;
 #define NS2MS(t) ((double)(t/1000)/1000)
 static int64_t live_bytes = 0;
 static int64_t promoted_bytes = 0;
-static size_t current_pg_count = 0;
-static size_t max_pg_count = 0;
 JL_DLLEXPORT size_t jl_gc_total_freed_bytes=0;
 
 #ifdef GC_FINAL_STATS
@@ -424,136 +421,6 @@ inline void gc_setmark_buf(void *o, int mark_mode)
     gc_setmark_big(buf, mark_mode);
 }
 
-static NOINLINE void *malloc_page(void)
-{
-    void *ptr = (void*)0;
-    int i;
-    region_t *region;
-    int region_i = 0;
-    JL_LOCK_NOGC(&pagealloc_lock);
-    while(region_i < REGION_COUNT) {
-        region = &regions[region_i];
-        if (region->pages == NULL) {
-            const size_t pages_sz = sizeof(jl_gc_page_t) * REGION_PG_COUNT;
-            const size_t freemap_sz = sizeof(uint32_t) * REGION_PG_COUNT / 32;
-            const size_t meta_sz = sizeof(jl_gc_pagemeta_t) * REGION_PG_COUNT;
-            size_t alloc_size = pages_sz + freemap_sz + meta_sz;
-#ifdef _OS_WINDOWS_
-            char *mem = (char*)VirtualAlloc(NULL, alloc_size + GC_PAGE_SZ,
-                                            MEM_RESERVE, PAGE_READWRITE);
-#else
-            if (GC_PAGE_SZ > jl_page_size)
-                alloc_size += GC_PAGE_SZ;
-            char *mem = (char*)mmap(0, alloc_size, PROT_READ | PROT_WRITE, MAP_NORESERVE | MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-            mem = mem == MAP_FAILED ? NULL : mem;
-#endif
-            if (mem == NULL) {
-                jl_printf(JL_STDERR, "could not allocate pools\n");
-                gc_debug_critical_error();
-                abort();
-            }
-            if (GC_PAGE_SZ > jl_page_size) {
-                // round data pointer up to the nearest gc_page_data-aligned
-                // boundary if mmap didn't already do so.
-                mem = (char*)gc_page_data(mem + GC_PAGE_SZ - 1);
-            }
-            region->pages = (jl_gc_page_t*)mem;
-            region->freemap = (uint32_t*)(mem + pages_sz);
-            region->meta = (jl_gc_pagemeta_t*)(mem + pages_sz +freemap_sz);
-            region->lb = 0;
-            region->ub = 0;
-            region->pg_cnt = REGION_PG_COUNT;
-#ifdef _OS_WINDOWS_
-            VirtualAlloc(region->freemap, region->pg_cnt / 8,
-                         MEM_COMMIT, PAGE_READWRITE);
-            VirtualAlloc(region->meta, region->pg_cnt * sizeof(jl_gc_pagemeta_t),
-                         MEM_COMMIT, PAGE_READWRITE);
-#endif
-            memset(region->freemap, 0xff, region->pg_cnt / 8);
-        }
-        for (i = region->lb; i < region->pg_cnt / 32; i++) {
-            if (region->freemap[i])
-                break;
-        }
-        if (i == region->pg_cnt / 32) {
-            // region full
-            region_i++;
-            continue;
-        }
-        break;
-    }
-    if (region_i >= REGION_COUNT) {
-        jl_printf(JL_STDERR, "increase REGION_COUNT or allocate less memory\n");
-        gc_debug_critical_error();
-        abort();
-    }
-    if (region->lb < i)
-        region->lb = i;
-    if (region->ub < i)
-        region->ub = i;
-
-#if defined(_COMPILER_MINGW_)
-    int j = __builtin_ffs(region->freemap[i]) - 1;
-#elif defined(_COMPILER_MICROSOFT_)
-    unsigned long j;
-    _BitScanForward(&j, region->freemap[i]);
-#else
-    int j = ffs(region->freemap[i]) - 1;
-#endif
-
-    region->freemap[i] &= ~(uint32_t)(1 << j);
-    ptr = region->pages[i*32 + j].data;
-#ifdef _OS_WINDOWS_
-    VirtualAlloc(ptr, GC_PAGE_SZ, MEM_COMMIT, PAGE_READWRITE);
-#endif
-    current_pg_count++;
-    max_pg_count = max_pg_count < current_pg_count ?
-        current_pg_count : max_pg_count;
-    JL_UNLOCK_NOGC(&pagealloc_lock);
-    return ptr;
-}
-
-static void free_page(void *p)
-{
-    int pg_idx = -1;
-    int i;
-    region_t *region = regions;
-    for (i = 0; i < REGION_COUNT && regions[i].pages != NULL; i++) {
-        region = &regions[i];
-        pg_idx = page_index(region, p);
-        if (pg_idx >= 0 && pg_idx < region->pg_cnt) {
-            break;
-        }
-    }
-    assert(i < REGION_COUNT && region->pages != NULL);
-    uint32_t msk = (uint32_t)(1 << (pg_idx % 32));
-    assert(!(region->freemap[pg_idx/32] & msk));
-    region->freemap[pg_idx/32] ^= msk;
-    free(region->meta[pg_idx].ages);
-    // tell the OS we don't need these pages right now
-    size_t decommit_size = GC_PAGE_SZ;
-    if (GC_PAGE_SZ < jl_page_size) {
-        // ensure so we don't release more memory than intended
-        size_t n_pages = (GC_PAGE_SZ + jl_page_size - 1) / GC_PAGE_SZ;
-        decommit_size = jl_page_size;
-        p = (void*)((uintptr_t)region->pages[pg_idx].data & ~(jl_page_size - 1)); // round down to the nearest page
-        pg_idx = page_index(region, p);
-        if (pg_idx + n_pages > region->pg_cnt) goto no_decommit;
-        for (; n_pages--; pg_idx++) {
-            msk = (uint32_t)(1 << ((pg_idx % 32)));
-            if (!(region->freemap[pg_idx/32] & msk)) goto no_decommit;
-        }
-    }
-#ifdef _OS_WINDOWS_
-    VirtualFree(p, decommit_size, MEM_DECOMMIT);
-#else
-    madvise(p, decommit_size, MADV_DONTNEED);
-#endif
-no_decommit:
-    if (region->lb > pg_idx / 32)
-        region->lb = pg_idx / 32;
-    current_pg_count--;
-}
-
 #define should_collect() (__unlikely(gc_num.allocd>0))
 
 static inline int maybe_collect(void)
@@ -795,7 +662,7 @@ static inline gcval_t *reset_page(pool_t *p, jl_gc_pagemeta_t *pg, gcval_t *fl)
 
 static NOINLINE void add_page(pool_t *p)
 {
-    char *data = (char*)malloc_page();
+    char *data = (char*)jl_gc_alloc_page();
     if (data == NULL)
         jl_throw(jl_memory_exception);
     jl_gc_pagemeta_t *pg = page_metadata(data + GC_PAGE_OFFSET);
@@ -1070,7 +937,7 @@ static gcval_t **sweep_page(pool_t *p, jl_gc_pagemeta_t *pg, gcval_t **pfl, int
 #ifdef MEMDEBUG
     memset(pg->data, 0xbb, GC_PAGE_SZ);
 #endif
-    free_page(data);
+    jl_gc_free_page(data);
 #ifdef MEMDEBUG
     memset(pg, 0xbb, sizeof(jl_gc_pagemeta_t));
 #endif
@@ -2109,6 +1976,7 @@ JL_DLLEXPORT jl_value_t *jl_gc_alloc_3w(void)
 
 #ifdef GC_FINAL_STATS
 static double process_t0;
+size_t max_pg_count = 0;
 #include <malloc.h>
 void jl_print_gc_stats(JL_STREAM *s)
 {
diff --git a/src/gc.h b/src/gc.h
index cb8bd5d858ca0f..7186c15dfcd5e3 100644
--- a/src/gc.h
+++ b/src/gc.h
@@ -41,13 +41,6 @@ extern "C" {
 #define GC_PAGE_SZ (1 << GC_PAGE_LG2) // 16k
 #define GC_PAGE_OFFSET (JL_SMALL_BYTE_ALIGNMENT - (sizeof_jl_taggedvalue_t % JL_SMALL_BYTE_ALIGNMENT))
 
-// A region is contiguous storage for up to REGION_PG_COUNT naturally aligned GC_PAGE_SZ pages
-// It uses a very naive allocator (see malloc_page & free_page)
-#if defined(_P64) && !defined(_COMPILER_MICROSOFT_)
-#define REGION_PG_COUNT 16*8*4096 // 8G because virtual memory is cheap
-#else
-#define REGION_PG_COUNT 8*4096 // 512M
-#endif
 // 8G * 32768 = 2^48
 // It's really unlikely that we'll actually allocate that much though...
 #define REGION_COUNT 32768
@@ -241,8 +234,6 @@ typedef struct {
     jl_thread_heap_t *heap;
 } jl_single_heap_index_t;
 
-extern jl_mutex_t pagealloc_lock;
-extern jl_mutex_t finalizers_lock;
 extern jl_gc_num_t gc_num;
 extern region_t regions[REGION_COUNT];
 extern bigval_t *big_objects_marked;
@@ -250,6 +241,10 @@ extern arraylist_t finalizer_list;
 extern arraylist_t finalizer_list_marked;
 extern arraylist_t to_finalize;
 
+// Counters
+// GC_FINAL_STATS only
+extern size_t max_pg_count;
+
 #define bigval_header(data) container_of((data), bigval_t, header)
 
 // round an address inside a gcpage's data to its beginning
@@ -343,6 +338,13 @@ extern jl_thread_heap_t *const jl_thread_heap;
 #define FOR_HEAP(t_n) _FOR_SINGLE_HEAP(jl_thread_heap)
 #endif
 
+// GC pages
+
+NOINLINE void *jl_gc_alloc_page(void);
+void jl_gc_free_page(void *p);
+
+// GC debug
+
 #ifdef GC_VERIFY
 extern jl_value_t *lostval;
 void gc_verify(void);
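
The "very naive allocator" that this patch moves into gc-pages.c boils down to simple bitmap bookkeeping: each region keeps a uint32_t freemap with one bit per page (1 == free), jl_gc_alloc_page scans for the first nonzero word starting from the lb hint and claims the lowest set bit via ffs(), and jl_gc_free_page flips the bit back and moves lb down if the freed page sits below the hint. The following stand-alone C sketch illustrates only that freemap discipline; the toy_* names and TOY_PG_COUNT are invented for illustration and do not exist in the Julia sources, and the locking, region array, page metadata, and OS commit/decommit logic from the patch are deliberately omitted.

#include <assert.h>
#include <stdint.h>
#include <string.h>  // memset
#include <strings.h> // ffs (POSIX)

#define TOY_PG_COUNT 1024 // pages per region; illustrative value only

typedef struct {
    // one bit per page, 1 == free (same idea as region_t's freemap)
    uint32_t freemap[TOY_PG_COUNT / 32];
    // search hint: every word below index lb is known to be zero (full)
    int lb;
} toy_region_t;

static void toy_region_init(toy_region_t *r)
{
    memset(r->freemap, 0xff, sizeof r->freemap); // all pages start free
    r->lb = 0;
}

// Returns a page index, or -1 when the region is full.
static int toy_alloc_page(toy_region_t *r)
{
    for (int i = r->lb; i < TOY_PG_COUNT / 32; i++) {
        if (r->freemap[i]) {
            int j = ffs((int)r->freemap[i]) - 1;   // lowest set bit
            r->freemap[i] &= ~(uint32_t)(1u << j); // mark page allocated
            r->lb = i; // words below i are all zero, so start here next time
            return i * 32 + j;
        }
    }
    return -1;
}

static void toy_free_page(toy_region_t *r, int pg_idx)
{
    uint32_t msk = (uint32_t)(1u << (pg_idx % 32));
    assert(!(r->freemap[pg_idx / 32] & msk)); // page must be allocated
    r->freemap[pg_idx / 32] ^= msk;           // flip the bit back to free
    if (r->lb > pg_idx / 32)                  // freed below the hint:
        r->lb = pg_idx / 32;                  // move the hint back down
}

toy_region_init mirrors the memset(region->freemap, 0xff, ...) in jl_gc_alloc_page, and the lb updates mirror the "if (region->lb < i)" / "if (region->lb > pg_idx / 32)" pair in the patch; the hint makes repeated allocations skip fully used words instead of rescanning the bitmap from the start.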