Skip to content

Commit 6f831a1

Browse files
committed
reduce contention on page metadata lists during the sweeping phase
1 parent 1f111e1 commit 6f831a1

File tree

3 files changed

+156
-18
lines changed

3 files changed

+156
-18
lines changed

src/gc.c

Lines changed: 108 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@ int jl_n_sweepthreads;
2121
_Atomic(int) gc_n_threads_marking;
2222
// Number of threads sweeping
2323
_Atomic(int) gc_n_threads_sweeping;
24-
// Temporary for the `ptls->page_metadata_allocd` used during parallel sweeping
25-
_Atomic(jl_gc_page_stack_t *) gc_allocd_scratch;
24+
// Temporary for the `ptls->page_metadata_allocd` used during parallel sweeping (padded to avoid false sharing)
25+
_Atomic(jl_gc_padded_page_stack_t *) gc_allocd_scratch;
2626
// `tid` of mutator thread that triggered GC
2727
_Atomic(int) gc_master_tid;
2828
// `tid` of first GC thread
@@ -1596,8 +1596,72 @@ static void gc_pool_sync_nfree(jl_gc_pagemeta_t *pg, jl_taggedvalue_t *last) JL_
15961596
pg->nfree = nfree;
15971597
}
15981598

1599-
void gc_sweep_wake_all(void)
1599+
// pre-scan pages to check whether there are enough pages so that's worth parallelizing
1600+
// also sweeps pages that don't need to be linearly scanned
1601+
int gc_sweep_prescan(jl_ptls_t ptls, jl_gc_padded_page_stack_t *new_gc_allocd_scratch)
16001602
{
1603+
// 4MB worth of pages is worth parallelizing
1604+
const int n_pages_worth_parallel_sweep = (int)(4 * (1 << 20) / GC_PAGE_SZ);
1605+
int n_pages_to_scan = 0;
1606+
gc_page_profiler_serializer_t serializer = gc_page_serializer_create();
1607+
for (int t_i = 0; t_i < gc_n_threads; t_i++) {
1608+
jl_ptls_t ptls2 = gc_all_tls_states[t_i];
1609+
if (ptls2 == NULL) {
1610+
continue;
1611+
}
1612+
jl_gc_page_stack_t *dest = &new_gc_allocd_scratch[ptls2->tid].stack;
1613+
jl_gc_page_stack_t tmp;
1614+
jl_gc_pagemeta_t *tail = NULL;
1615+
memset(&tmp, 0, sizeof(tmp));
1616+
while (1) {
1617+
jl_gc_pagemeta_t *pg = pop_lf_back_nosync(&ptls2->page_metadata_allocd);
1618+
if (pg == NULL) {
1619+
break;
1620+
}
1621+
int should_scan = 1;
1622+
if (!pg->has_marked) {
1623+
should_scan = 0;
1624+
}
1625+
if (!current_sweep_full && !pg->has_young) {
1626+
assert(!prev_sweep_full || pg->prev_nold >= pg->nold);
1627+
if (!prev_sweep_full || pg->prev_nold == pg->nold) {
1628+
should_scan = 0;
1629+
}
1630+
}
1631+
if (should_scan) {
1632+
if (tail == NULL) {
1633+
tail = pg;
1634+
}
1635+
n_pages_to_scan++;
1636+
push_lf_back_nosync(&tmp, pg);
1637+
}
1638+
else {
1639+
gc_sweep_pool_page(&serializer, dest, &ptls2->page_metadata_buffered, pg);
1640+
}
1641+
if (n_pages_to_scan >= n_pages_worth_parallel_sweep) {
1642+
break;
1643+
}
1644+
}
1645+
if (tail != NULL) {
1646+
tail->next = jl_atomic_load_relaxed(&ptls2->page_metadata_allocd.bottom);
1647+
}
1648+
ptls2->page_metadata_allocd = tmp;
1649+
if (n_pages_to_scan >= n_pages_worth_parallel_sweep) {
1650+
break;
1651+
}
1652+
}
1653+
gc_page_serializer_destroy(&serializer);
1654+
return n_pages_to_scan >= n_pages_worth_parallel_sweep;
1655+
}
1656+
1657+
// wake up all threads to sweep the pages
1658+
void gc_sweep_wake_all(jl_ptls_t ptls, jl_gc_padded_page_stack_t *new_gc_allocd_scratch)
1659+
{
1660+
int parallel_sweep_worthwhile = gc_sweep_prescan(ptls, new_gc_allocd_scratch);
1661+
jl_atomic_store(&gc_allocd_scratch, new_gc_allocd_scratch);
1662+
if (!parallel_sweep_worthwhile) {
1663+
return;
1664+
}
16011665
uv_mutex_lock(&gc_threads_lock);
16021666
for (int i = gc_first_tid; i < gc_first_tid + jl_n_markthreads; i++) {
16031667
jl_ptls_t ptls2 = gc_all_tls_states[i];
@@ -1608,6 +1672,7 @@ void gc_sweep_wake_all(void)
16081672
uv_mutex_unlock(&gc_threads_lock);
16091673
}
16101674

1675+
// wait for all threads to finish sweeping
16111676
void gc_sweep_wait_for_all(void)
16121677
{
16131678
jl_atomic_store(&gc_allocd_scratch, NULL);
@@ -1616,36 +1681,58 @@ void gc_sweep_wait_for_all(void)
16161681
}
16171682
}
16181683

1619-
void gc_sweep_pool_parallel(void)
1684+
// sweep all pools
1685+
void gc_sweep_pool_parallel(jl_ptls_t ptls)
16201686
{
16211687
jl_atomic_fetch_add(&gc_n_threads_sweeping, 1);
1622-
jl_gc_page_stack_t *allocd_scratch = jl_atomic_load(&gc_allocd_scratch);
1688+
jl_gc_padded_page_stack_t *allocd_scratch = jl_atomic_load(&gc_allocd_scratch);
16231689
if (allocd_scratch != NULL) {
16241690
gc_page_profiler_serializer_t serializer = gc_page_serializer_create();
16251691
while (1) {
16261692
int found_pg = 0;
1693+
// sequentially walk the threads and sweep the pages
16271694
for (int t_i = 0; t_i < gc_n_threads; t_i++) {
16281695
jl_ptls_t ptls2 = gc_all_tls_states[t_i];
1696+
// skip foreign threads that already exited
16291697
if (ptls2 == NULL) {
16301698
continue;
16311699
}
1632-
jl_gc_page_stack_t *allocd = &allocd_scratch[t_i];
1633-
jl_gc_pagemeta_t *pg = pop_lf_back(&ptls2->page_metadata_allocd);
1700+
jl_gc_page_stack_t *dest = &allocd_scratch[ptls2->tid].stack;
1701+
jl_gc_pagemeta_t *pg = try_pop_lf_back(&ptls2->page_metadata_allocd);
1702+
// failed steal attempt
16341703
if (pg == NULL) {
16351704
continue;
16361705
}
1637-
gc_sweep_pool_page(&serializer, allocd, &ptls2->page_metadata_buffered, pg);
1706+
gc_sweep_pool_page(&serializer, dest, &ptls2->page_metadata_buffered, pg);
16381707
found_pg = 1;
16391708
}
16401709
if (!found_pg) {
1641-
break;
1710+
// check for termination
1711+
int no_more_work = 1;
1712+
for (int t_i = 0; t_i < gc_n_threads; t_i++) {
1713+
jl_ptls_t ptls2 = gc_all_tls_states[t_i];
1714+
// skip foreign threads that already exited
1715+
if (ptls2 == NULL) {
1716+
continue;
1717+
}
1718+
jl_gc_pagemeta_t *pg = jl_atomic_load_relaxed(&ptls2->page_metadata_allocd.bottom);
1719+
if (pg != NULL) {
1720+
no_more_work = 0;
1721+
break;
1722+
}
1723+
}
1724+
if (no_more_work) {
1725+
break;
1726+
}
16421727
}
1728+
jl_cpu_pause();
16431729
}
16441730
gc_page_serializer_destroy(&serializer);
16451731
}
16461732
jl_atomic_fetch_add(&gc_n_threads_sweeping, -1);
16471733
}
16481734

1735+
// free all pages (i.e. through `madvise` on Linux) that were lazily freed
16491736
void gc_free_pages(void)
16501737
{
16511738
while (1) {
@@ -1670,7 +1757,7 @@ static void gc_sweep_pool(void)
16701757

16711758
// allocate enough space to hold the end of the free list chain
16721759
// for every thread and pool size
1673-
jl_taggedvalue_t ***pfl = (jl_taggedvalue_t ***) alloca(n_threads * JL_GC_N_POOLS * sizeof(jl_taggedvalue_t**));
1760+
jl_taggedvalue_t ***pfl = (jl_taggedvalue_t ***) malloc_s(n_threads * JL_GC_N_POOLS * sizeof(jl_taggedvalue_t**));
16741761

16751762
// update metadata of pages that were pointed to by freelist or newpages from a pool
16761763
// i.e. pages being the current allocation target
@@ -1712,17 +1799,18 @@ static void gc_sweep_pool(void)
17121799
}
17131800

17141801
// the actual sweeping
1715-
jl_gc_page_stack_t *tmp = (jl_gc_page_stack_t *)alloca(n_threads * sizeof(jl_gc_page_stack_t));
1716-
memset(tmp, 0, n_threads * sizeof(jl_gc_page_stack_t));
1717-
jl_atomic_store(&gc_allocd_scratch, tmp);
1718-
gc_sweep_wake_all();
1719-
gc_sweep_pool_parallel();
1802+
jl_gc_padded_page_stack_t *new_gc_allocd_scratch = (jl_gc_padded_page_stack_t *) malloc_s(n_threads * sizeof(jl_gc_padded_page_stack_t));
1803+
memset(new_gc_allocd_scratch, 0, n_threads * sizeof(jl_gc_padded_page_stack_t));
1804+
jl_ptls_t ptls = jl_current_task->ptls;
1805+
gc_sweep_wake_all(ptls, new_gc_allocd_scratch);
1806+
gc_sweep_pool_parallel(ptls);
17201807
gc_sweep_wait_for_all();
17211808

1809+
// reset half-pages pointers
17221810
for (int t_i = 0; t_i < n_threads; t_i++) {
17231811
jl_ptls_t ptls2 = gc_all_tls_states[t_i];
17241812
if (ptls2 != NULL) {
1725-
ptls2->page_metadata_allocd = tmp[t_i];
1813+
ptls2->page_metadata_allocd = new_gc_allocd_scratch[t_i].stack;
17261814
for (int i = 0; i < JL_GC_N_POOLS; i++) {
17271815
jl_gc_pool_t *p = &ptls2->heap.norm_pools[i];
17281816
p->newpages = NULL;
@@ -1760,6 +1848,10 @@ static void gc_sweep_pool(void)
17601848
}
17611849
}
17621850

1851+
// cleanup
1852+
free(pfl);
1853+
free(new_gc_allocd_scratch);
1854+
17631855
#ifdef _P64 // only enable concurrent sweeping on 64bit
17641856
// wake thread up to sweep concurrently
17651857
if (jl_n_sweepthreads > 0) {

src/gc.h

Lines changed: 47 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,25 @@ extern jl_gc_page_stack_t global_page_pool_freed;
199199
// in the sweeping phase, which also doesn't push a node into the
200200
// same stack after it's popped
201201

202+
// Push `elt` onto `pool` using only relaxed loads/stores (no compare-and-swap):
// `elt` is linked in front of the current bottom and then becomes the new bottom.
STATIC_INLINE void push_lf_back_nosync(jl_gc_page_stack_t *pool, jl_gc_pagemeta_t *elt) JL_NOTSAFEPOINT
{
    elt->next = jl_atomic_load_relaxed(&pool->bottom);
    jl_atomic_store_relaxed(&pool->bottom, elt);
}
208+
209+
// Pop the bottom element of `pool` using only relaxed loads/stores, mirroring
// `push_lf_back_nosync`.
// Returns the popped element, or NULL only when the stack is empty; the caller
// in `gc_sweep_prescan` treats NULL as "no more pages".
// NOTE(review): presumably only called while no other thread can touch `pool`
// (as the `_nosync` name suggests) -- confirm against callers.
STATIC_INLINE jl_gc_pagemeta_t *pop_lf_back_nosync(jl_gc_page_stack_t *pool) JL_NOTSAFEPOINT
{
    jl_gc_pagemeta_t *old_back = jl_atomic_load_relaxed(&pool->bottom);
    if (old_back == NULL) {
        return NULL;
    }
    // A plain store is used instead of a compare-and-swap: without concurrent
    // access a CAS cannot fail, and a failed CAS reported as NULL would be
    // indistinguishable from an empty stack.
    jl_atomic_store_relaxed(&pool->bottom, old_back->next);
    return old_back;
}
220+
202221
STATIC_INLINE void push_lf_back(jl_gc_page_stack_t *pool, jl_gc_pagemeta_t *elt) JL_NOTSAFEPOINT
203222
{
204223
while (1) {
@@ -211,6 +230,23 @@ STATIC_INLINE void push_lf_back(jl_gc_page_stack_t *pool, jl_gc_pagemeta_t *elt)
211230
}
212231
}
213232

233+
#define MAX_POP_ATTEMPTS (1 << 10)
234+
235+
STATIC_INLINE jl_gc_pagemeta_t *try_pop_lf_back(jl_gc_page_stack_t *pool) JL_NOTSAFEPOINT
236+
{
237+
for (int i = 0; i < MAX_POP_ATTEMPTS; i++) {
238+
jl_gc_pagemeta_t *old_back = jl_atomic_load_relaxed(&pool->bottom);
239+
if (old_back == NULL) {
240+
return NULL;
241+
}
242+
if (jl_atomic_cmpswap(&pool->bottom, &old_back, old_back->next)) {
243+
return old_back;
244+
}
245+
jl_cpu_pause();
246+
}
247+
return NULL;
248+
}
249+
214250
STATIC_INLINE jl_gc_pagemeta_t *pop_lf_back(jl_gc_page_stack_t *pool) JL_NOTSAFEPOINT
215251
{
216252
while (1) {
@@ -224,6 +260,16 @@ STATIC_INLINE jl_gc_pagemeta_t *pop_lf_back(jl_gc_page_stack_t *pool) JL_NOTSAFE
224260
jl_cpu_pause();
225261
}
226262
}
263+
typedef struct {
264+
jl_gc_page_stack_t stack;
265+
// pad to 128 bytes to avoid false-sharing
266+
#ifdef _P64
267+
void *_pad[15];
268+
#else
269+
void *_pad[31];
270+
#endif
271+
} jl_gc_padded_page_stack_t;
272+
static_assert(sizeof(jl_gc_padded_page_stack_t) == 128, "jl_gc_padded_page_stack_t is not 128 bytes");
227273

228274
typedef struct {
229275
_Atomic(size_t) n_freed_objs;
@@ -473,7 +519,7 @@ void gc_mark_finlist(jl_gc_markqueue_t *mq, arraylist_t *list, size_t start) JL_
473519
void gc_mark_loop_serial_(jl_ptls_t ptls, jl_gc_markqueue_t *mq);
474520
void gc_mark_loop_serial(jl_ptls_t ptls);
475521
void gc_mark_loop_parallel(jl_ptls_t ptls, int master);
476-
void gc_sweep_pool_parallel(void);
522+
void gc_sweep_pool_parallel(jl_ptls_t ptls);
477523
void gc_free_pages(void);
478524
void sweep_stack_pools(void);
479525
void jl_gc_debug_init(void);

src/scheduler.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ void jl_parallel_gc_threadfun(void *arg)
147147
gc_mark_loop_parallel(ptls, 0);
148148
}
149149
if (may_sweep(ptls)) { // not an else!
150-
gc_sweep_pool_parallel();
150+
gc_sweep_pool_parallel(ptls);
151151
jl_atomic_fetch_add(&ptls->gc_sweeps_requested, -1);
152152
}
153153
}

0 commit comments

Comments
 (0)