Skip to content

Commit 569d190

Browse files
committed
reduce contention on page metadata lists during the sweeping phase
1 parent fb2d946 commit 569d190

File tree

4 files changed

+162
-14
lines changed

4 files changed

+162
-14
lines changed

src/gc.c

Lines changed: 119 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1541,7 +1541,8 @@ static void gc_sweep_page(gc_page_profiler_serializer_t *s, jl_gc_pool_t *p, jl_
15411541

15421542
done:
15431543
if (re_use_page) {
1544-
push_lf_back(allocd, pg);
1544+
// we're pushing into a local page stack to reduce contention
1545+
push_lf_back_nosync(allocd, pg);
15451546
}
15461547
else {
15471548
gc_alloc_map_set(pg->data, GC_PAGE_LAZILY_FREED);
@@ -1596,8 +1597,68 @@ static void gc_pool_sync_nfree(jl_gc_pagemeta_t *pg, jl_taggedvalue_t *last) JL_
15961597
pg->nfree = nfree;
15971598
}
15981599

1599-
void gc_sweep_wake_all(void)
1600+
// Sequentially pre-scan the per-thread `page_metadata_allocd` lists to decide
// whether a parallel sweep is worthwhile.
//
// Pages that do not need scanning (nothing marked, or an unchanged old page in
// a non-full sweep) are handed to `gc_sweep_pool_page` right away, with the
// calling thread's slot of `gc_allocd_scratch` as destination. Pages that do
// need scanning are counted and kept on the owning thread's
// `page_metadata_allocd` list. As soon as 4MB worth of such pages has been
// found the scan stops and the not-yet-visited pages are left on that list too.
//
// `ptls` is the state of the calling thread; `ptls->tid` selects the scratch
// stack that receives pages swept here.
//
// Returns non-zero if at least 4MB worth of pages still need scanning, and
// zero otherwise.
int gc_sweep_prescan(jl_ptls_t ptls)
{
    // 4MB worth of pages is worth parallelizing
    const int n_pages_worth_parallel_sweep = (int)(4 * (1 << 20) / GC_PAGE_SZ);
    int n_pages_to_scan = 0;
    gc_page_profiler_serializer_t serializer = gc_page_serializer_create();
    // push into local page stack. we'll merge them later...
    jl_gc_page_stack_t *allocd_scratch = jl_atomic_load(&gc_allocd_scratch);
    assert(allocd_scratch != NULL);
    jl_gc_page_stack_t *dest = &allocd_scratch[ptls->tid];
    for (int t_i = 0; t_i < gc_n_threads; t_i++) {
        jl_ptls_t ptls2 = gc_all_tls_states[t_i];
        if (ptls2 == NULL) {
            continue;
        }
        // pages of this thread that still need scanning
        jl_gc_page_stack_t tmp;
        // first page pushed onto `tmp`, i.e. the end of its `next` chain
        jl_gc_pagemeta_t *tail = NULL;
        memset(&tmp, 0, sizeof(tmp));
        while (1) {
            jl_gc_pagemeta_t *pg = try_pop_lf_back(&ptls2->page_metadata_allocd);
            if (pg == NULL) {
                break;
            }
            int should_scan = 1;
            if (!pg->has_marked) {
                should_scan = 0;
            }
            if (!current_sweep_full && !pg->has_young) {
                assert(!prev_sweep_full || pg->prev_nold >= pg->nold);
                if (!prev_sweep_full || pg->prev_nold == pg->nold) {
                    should_scan = 0;
                }
            }
            if (should_scan) {
                // only a page that really lives in `tmp` may become its tail:
                // a page handed to `gc_sweep_pool_page` is linked into another
                // list and must not have its `next` overwritten afterwards
                if (tail == NULL) {
                    tail = pg;
                }
                n_pages_to_scan++;
                push_lf_back_nosync(&tmp, pg);
            }
            else {
                gc_sweep_pool_page(&serializer, dest, &ptls2->page_metadata_buffered, pg);
            }
            if (n_pages_to_scan >= n_pages_worth_parallel_sweep) {
                // enough work found: leave the rest for the parallel sweep
                break;
            }
        }
        // splice whatever was not visited behind the pages collected in `tmp`
        // (this stores NULL when the original list was fully drained)
        if (tail != NULL) {
            tail->next = jl_atomic_load_relaxed(&ptls2->page_metadata_allocd.bottom);
        }
        ptls2->page_metadata_allocd = tmp;
        if (n_pages_to_scan >= n_pages_worth_parallel_sweep) {
            break;
        }
    }
    gc_page_serializer_destroy(&serializer);
    return n_pages_to_scan >= n_pages_worth_parallel_sweep;
}
1655+
1656+
void gc_sweep_wake_all(jl_ptls_t ptls)
1657+
{
1658+
int parallel_sweep_worthwhile = gc_sweep_prescan(ptls);
1659+
if (!parallel_sweep_worthwhile) {
1660+
return;
1661+
}
16011662
uv_mutex_lock(&gc_threads_lock);
16021663
for (int i = gc_first_tid; i < gc_first_tid + jl_n_markthreads; i++) {
16031664
jl_ptls_t ptls2 = gc_all_tls_states[i];
@@ -1615,30 +1676,52 @@ void gc_sweep_wait_for_all(void)
16151676
}
16161677
}
16171678

1618-
void gc_sweep_pool_parallel(void)
1679+
void gc_sweep_pool_parallel(jl_ptls_t ptls)
16191680
{
16201681
jl_atomic_fetch_add(&gc_n_threads_sweeping, 1);
16211682
jl_gc_page_stack_t *allocd_scratch = jl_atomic_load(&gc_allocd_scratch);
16221683
if (allocd_scratch != NULL) {
1684+
// push into local page stack to reduce contention
1685+
// we'll merge them later...
1686+
jl_gc_page_stack_t *dest = &allocd_scratch[ptls->tid];
16231687
gc_page_profiler_serializer_t serializer = gc_page_serializer_create();
16241688
while (1) {
16251689
int found_pg = 0;
1690+
// sequentially walk the threads and sweep the pages
16261691
for (int t_i = 0; t_i < gc_n_threads; t_i++) {
16271692
jl_ptls_t ptls2 = gc_all_tls_states[t_i];
1693+
// skip foreign threads that already exited
16281694
if (ptls2 == NULL) {
16291695
continue;
16301696
}
1631-
jl_gc_page_stack_t *allocd = &allocd_scratch[t_i];
1632-
jl_gc_pagemeta_t *pg = pop_lf_back(&ptls2->page_metadata_allocd);
1697+
jl_gc_pagemeta_t *pg = try_pop_lf_back(&ptls2->page_metadata_allocd);
1698+
// failed steal attempt
16331699
if (pg == NULL) {
16341700
continue;
16351701
}
1636-
gc_sweep_pool_page(&serializer, allocd, &ptls2->page_metadata_buffered, pg);
1702+
gc_sweep_pool_page(&serializer, dest, &ptls2->page_metadata_buffered, pg);
16371703
found_pg = 1;
16381704
}
16391705
if (!found_pg) {
1640-
break;
1706+
// check for termination
1707+
int no_more_work = 1;
1708+
for (int t_i = 0; t_i < gc_n_threads; t_i++) {
1709+
jl_ptls_t ptls2 = gc_all_tls_states[t_i];
1710+
// skip foreign threads that already exited
1711+
if (ptls2 == NULL) {
1712+
continue;
1713+
}
1714+
jl_gc_pagemeta_t *pg = jl_atomic_load_relaxed(&ptls2->page_metadata_allocd.bottom);
1715+
if (pg != NULL) {
1716+
no_more_work = 0;
1717+
break;
1718+
}
1719+
}
1720+
if (no_more_work) {
1721+
break;
1722+
}
16411723
}
1724+
jl_cpu_pause();
16421725
}
16431726
gc_page_serializer_destroy(&serializer);
16441727
}
@@ -1669,7 +1752,7 @@ static void gc_sweep_pool(void)
16691752

16701753
// allocate enough space to hold the end of the free list chain
16711754
// for every thread and pool size
1672-
jl_taggedvalue_t ***pfl = (jl_taggedvalue_t ***) alloca(n_threads * JL_GC_N_POOLS * sizeof(jl_taggedvalue_t**));
1755+
jl_taggedvalue_t ***pfl = (jl_taggedvalue_t ***) malloc_s(n_threads * JL_GC_N_POOLS * sizeof(jl_taggedvalue_t**));
16731756

16741757
// update metadata of pages that were pointed to by freelist or newpages from a pool
16751758
// i.e. pages being the current allocation target
@@ -1711,17 +1794,37 @@ static void gc_sweep_pool(void)
17111794
}
17121795

17131796
// the actual sweeping
1714-
jl_gc_page_stack_t *tmp = (jl_gc_page_stack_t *)alloca(n_threads * sizeof(jl_gc_page_stack_t));
1797+
jl_gc_page_stack_t *tmp = (jl_gc_page_stack_t *)jl_malloc_aligned(n_threads * sizeof(jl_gc_page_stack_t), 128);
1798+
if (tmp == NULL) {
1799+
abort();
1800+
}
17151801
memset(tmp, 0, n_threads * sizeof(jl_gc_page_stack_t));
17161802
jl_atomic_store(&gc_allocd_scratch, tmp);
1717-
gc_sweep_wake_all();
1718-
gc_sweep_pool_parallel();
1803+
jl_ptls_t ptls = jl_current_task->ptls;
1804+
gc_sweep_wake_all(ptls);
1805+
gc_sweep_pool_parallel(ptls);
17191806
gc_sweep_wait_for_all();
17201807

1808+
// merge the page metadata lists
1809+
for (int t_i = 0; t_i < n_threads; t_i++) {
1810+
jl_ptls_t ptls2 = gc_all_tls_states[t_i];
1811+
if (ptls2 == NULL) {
1812+
continue;
1813+
}
1814+
while (1) {
1815+
jl_gc_pagemeta_t *pg = pop_lf_back_nosync(&tmp[t_i]);
1816+
if (pg == NULL) {
1817+
break;
1818+
}
1819+
jl_ptls_t ptls3 = gc_all_tls_states[pg->thread_n];
1820+
push_lf_back_nosync(&ptls3->page_metadata_allocd, pg);
1821+
}
1822+
}
1823+
1824+
// reset half-pages pointers
17211825
for (int t_i = 0; t_i < n_threads; t_i++) {
17221826
jl_ptls_t ptls2 = gc_all_tls_states[t_i];
17231827
if (ptls2 != NULL) {
1724-
ptls2->page_metadata_allocd = tmp[t_i];
17251828
for (int i = 0; i < JL_GC_N_POOLS; i++) {
17261829
jl_gc_pool_t *p = &ptls2->heap.norm_pools[i];
17271830
p->newpages = NULL;
@@ -1759,6 +1862,10 @@ static void gc_sweep_pool(void)
17591862
}
17601863
}
17611864

1865+
// cleanup
1866+
free(pfl);
1867+
free(tmp);
1868+
17621869
#ifdef _P64 // only enable concurrent sweeping on 64bit
17631870
// wake thread up to sweep concurrently
17641871
if (jl_n_sweepthreads > 0) {

src/gc.h

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,24 @@ extern jl_gc_page_stack_t global_page_pool_freed;
199199
// in the sweeping phase, which also doesn't push a node into the
200200
// same stack after it's popped
201201

202+
// Unsynchronized push: link `elt` in front of the current `pool->bottom` and
// publish it as the new bottom. Only relaxed atomic accesses are used and there
// is no compare-and-swap retry, so this is meant for stacks that a single
// thread accesses at a time.
STATIC_INLINE void push_lf_back_nosync(jl_gc_page_stack_t *pool, jl_gc_pagemeta_t *elt) JL_NOTSAFEPOINT
{
    elt->next = jl_atomic_load_relaxed(&pool->bottom);
    jl_atomic_store_relaxed(&pool->bottom, elt);
}
208+
209+
// Unsynchronized pop: detach and return the page currently at `pool->bottom`,
// or return NULL when the stack is empty. Only relaxed atomic accesses are used
// and there is no compare-and-swap retry, so this is meant for stacks that a
// single thread accesses at a time.
STATIC_INLINE jl_gc_pagemeta_t *pop_lf_back_nosync(jl_gc_page_stack_t *pool) JL_NOTSAFEPOINT
{
    jl_gc_pagemeta_t *popped = jl_atomic_load_relaxed(&pool->bottom);
    if (popped != NULL) {
        // the successor of the popped page becomes the new bottom
        jl_atomic_store_relaxed(&pool->bottom, popped->next);
    }
    return popped;
}
219+
202220
STATIC_INLINE void push_lf_back(jl_gc_page_stack_t *pool, jl_gc_pagemeta_t *elt) JL_NOTSAFEPOINT
203221
{
204222
while (1) {
@@ -211,6 +229,23 @@ STATIC_INLINE void push_lf_back(jl_gc_page_stack_t *pool, jl_gc_pagemeta_t *elt)
211229
}
212230
}
213231

232+
#define MAX_POP_ATTEMPTS (1 << 10)
233+
234+
// Bounded pop: try up to MAX_POP_ATTEMPTS times to detach the page at
// `pool->bottom` with a compare-and-swap. Returns the popped page, or NULL
// either when the stack is observed empty or when every attempt lost the race,
// so a NULL result does not by itself prove that the stack is empty.
STATIC_INLINE jl_gc_pagemeta_t *try_pop_lf_back(jl_gc_page_stack_t *pool) JL_NOTSAFEPOINT
{
    int attempts_left = MAX_POP_ATTEMPTS;
    while (attempts_left-- > 0) {
        jl_gc_pagemeta_t *candidate = jl_atomic_load_relaxed(&pool->bottom);
        if (candidate == NULL) {
            return NULL;
        }
        if (jl_atomic_cmpswap(&pool->bottom, &candidate, candidate->next)) {
            return candidate;
        }
        // lost the race against another thread: back off briefly and retry
        jl_cpu_pause();
    }
    return NULL;
}
248+
214249
STATIC_INLINE jl_gc_pagemeta_t *pop_lf_back(jl_gc_page_stack_t *pool) JL_NOTSAFEPOINT
215250
{
216251
while (1) {
@@ -473,7 +508,7 @@ void gc_mark_finlist(jl_gc_markqueue_t *mq, arraylist_t *list, size_t start) JL_
473508
void gc_mark_loop_serial_(jl_ptls_t ptls, jl_gc_markqueue_t *mq);
474509
void gc_mark_loop_serial(jl_ptls_t ptls);
475510
void gc_mark_loop_parallel(jl_ptls_t ptls, int master);
476-
void gc_sweep_pool_parallel(void);
511+
void gc_sweep_pool_parallel(jl_ptls_t ptls);
477512
void gc_free_pages(void);
478513
void sweep_stack_pools(void);
479514
void jl_gc_debug_init(void);

src/julia_threads.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,12 @@ struct _jl_gc_pagemeta_t;
197197

198198
// Stack of page metadata nodes chained through their `next` field.
typedef struct {
199199
// most recently pushed node, i.e. the one the next pop returns (NULL if empty)
_Atomic(struct _jl_gc_pagemeta_t *) bottom;
200+
// pad to 128 bytes to avoid false-sharing: `bottom` plus 15 (or 31)
// pointer-sized slots; assumes 8-byte pointers under _P64 and 4-byte
// pointers otherwise
201+
#ifdef _P64
202+
void *_pad[15];
203+
#else
204+
void *_pad[31];
205+
#endif
200206
} jl_gc_page_stack_t;
201207

202208
// This includes all the thread local states we care about for a thread.

src/scheduler.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ void jl_parallel_gc_threadfun(void *arg)
147147
gc_mark_loop_parallel(ptls, 0);
148148
}
149149
if (may_sweep(ptls)) { // not an else!
150-
gc_sweep_pool_parallel();
150+
gc_sweep_pool_parallel(ptls);
151151
jl_atomic_fetch_add(&ptls->gc_sweeps_requested, -1);
152152
}
153153
}

0 commit comments

Comments
 (0)