Skip to content

Commit a1c395d

Browse files
gbaraldi authored and d-netto committed
parallel sweeping of stack pools
1 parent 7843330 commit a1c395d

File tree

5 files changed

+101
-26
lines changed

5 files changed

+101
-26
lines changed

src/gc-stacks.c

Lines changed: 35 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -190,7 +190,7 @@ JL_DLLEXPORT void *jl_malloc_stack(size_t *bufsz, jl_task_t *owner) JL_NOTSAFEPO
190190
return stk;
191191
}
192192

193-
void sweep_stack_pools(void)
193+
void sweep_stack_pool_loop(void) JL_NOTSAFEPOINT
194194
{
195195
// Stack sweeping algorithm:
196196
// // deallocate stacks if we have too many sitting around unused
@@ -203,27 +203,43 @@ void sweep_stack_pools(void)
203203
// bufsz = t->bufsz
204204
// if (stkbuf)
205205
// push(free_stacks[sz], stkbuf)
206-
assert(gc_n_threads);
207-
for (int i = 0; i < gc_n_threads; i++) {
206+
jl_atomic_fetch_add(&gc_n_threads_sweeping_stacks, 1);
207+
while (1) {
208+
int i = jl_atomic_fetch_add_relaxed(&gc_ptls_sweep_idx, -1);
209+
if (i < 0)
210+
break;
208211
jl_ptls_t ptls2 = gc_all_tls_states[i];
209-
212+
if (ptls2 == NULL)
213+
continue;
214+
assert(gc_n_threads);
210215
// free half of stacks that remain unused since last sweep
211-
for (int p = 0; p < JL_N_STACK_POOLS; p++) {
212-
small_arraylist_t *al = &ptls2->gc_tls.heap.free_stacks[p];
213-
size_t n_to_free;
214-
if (al->len > MIN_STACK_MAPPINGS_PER_POOL) {
215-
n_to_free = al->len / 2;
216-
if (n_to_free > (al->len - MIN_STACK_MAPPINGS_PER_POOL))
217-
n_to_free = al->len - MIN_STACK_MAPPINGS_PER_POOL;
218-
}
219-
else {
220-
n_to_free = 0;
221-
}
222-
for (int n = 0; n < n_to_free; n++) {
223-
void *stk = small_arraylist_pop(al);
224-
free_stack(stk, pool_sizes[p]);
216+
if (i == jl_atomic_load_relaxed(&gc_stack_free_idx)) {
217+
for (int p = 0; p < JL_N_STACK_POOLS; p++) {
218+
small_arraylist_t *al = &ptls2->gc_tls.heap.free_stacks[p];
219+
size_t n_to_free;
220+
if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) {
221+
n_to_free = al->len; // not alive yet or dead, so it does not need these anymore
222+
}
223+
else if (al->len > MIN_STACK_MAPPINGS_PER_POOL) {
224+
n_to_free = al->len / 2;
225+
if (n_to_free > (al->len - MIN_STACK_MAPPINGS_PER_POOL))
226+
n_to_free = al->len - MIN_STACK_MAPPINGS_PER_POOL;
227+
}
228+
else {
229+
n_to_free = 0;
230+
}
231+
for (int n = 0; n < n_to_free; n++) {
232+
void *stk = small_arraylist_pop(al);
233+
free_stack(stk, pool_sizes[p]);
234+
}
235+
if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) {
236+
small_arraylist_free(al);
237+
}
225238
}
226239
}
240+
if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) {
241+
small_arraylist_free(ptls2->gc_tls.heap.free_stacks);
242+
}
227243

228244
small_arraylist_t *live_tasks = &ptls2->gc_tls.heap.live_tasks;
229245
size_t n = 0;
@@ -264,6 +280,7 @@ void sweep_stack_pools(void)
264280
}
265281
live_tasks->len -= ndel;
266282
}
283+
jl_atomic_fetch_add(&gc_n_threads_sweeping_stacks, -1);
267284
}
268285

269286
JL_DLLEXPORT jl_array_t *jl_live_tasks(void)

src/gc-tls.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ typedef struct {
8282
jl_gc_markqueue_t mark_queue;
8383
jl_gc_mark_cache_t gc_cache;
8484
_Atomic(size_t) gc_sweeps_requested;
85+
_Atomic(size_t) gc_stack_sweep_requested;
8586
arraylist_t sweep_objs;
8687
} jl_gc_tls_states_t;
8788

src/gc.c

Lines changed: 49 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,17 @@ int jl_n_sweepthreads;
2121
// Number of threads currently running the GC mark-loop
2222
_Atomic(int) gc_n_threads_marking;
2323
// Number of threads sweeping
24-
_Atomic(int) gc_n_threads_sweeping;
24+
_Atomic(int) gc_n_threads_sweeping_pools;
25+
// Number of threads sweeping stacks
26+
_Atomic(int) gc_n_threads_sweeping_stacks;
2527
// Temporary for the `ptls->gc_tls.page_metadata_allocd` used during parallel sweeping (padded to avoid false sharing)
2628
_Atomic(jl_gc_padded_page_stack_t *) gc_allocd_scratch;
2729
// `tid` of mutator thread that triggered GC
2830
_Atomic(int) gc_master_tid;
31+
// counter for sharing work when sweeping stacks
32+
_Atomic(int) gc_ptls_sweep_idx;
33+
// counter for round robin of giving back stack pages to the OS
34+
_Atomic(int) gc_stack_free_idx;
2935
// `tid` of first GC thread
3036
int gc_first_tid;
3137
// Mutex/cond used to synchronize wakeup of GC threads on parallel marking
@@ -1525,6 +1531,44 @@ static void gc_sweep_other(jl_ptls_t ptls, int sweep_full) JL_NOTSAFEPOINT
15251531
gc_num.total_sweep_free_mallocd_memory_time += t_free_mallocd_memory_end - t_free_mallocd_memory_start;
15261532
}
15271533

1534+
// wake up all threads to sweep the stacks
1535+
void gc_sweep_wake_all_stacks(jl_ptls_t ptls) JL_NOTSAFEPOINT
1536+
{
1537+
uv_mutex_lock(&gc_threads_lock);
1538+
int first = gc_first_parallel_collector_thread_id();
1539+
int last = gc_last_parallel_collector_thread_id();
1540+
for (int i = first; i <= last; i++) {
1541+
jl_ptls_t ptls2 = gc_all_tls_states[i];
1542+
gc_check_ptls_of_parallel_collector_thread(ptls2);
1543+
jl_atomic_fetch_add(&ptls2->gc_tls.gc_stack_sweep_requested, 1);
1544+
}
1545+
uv_cond_broadcast(&gc_threads_cond);
1546+
uv_mutex_unlock(&gc_threads_lock);
1547+
return;
1548+
}
1549+
1550+
// Spin until the parallel stack sweep is finished: every per-thread work
// index has been claimed (`gc_ptls_sweep_idx` has dropped below zero) AND
// no thread is still executing inside `sweep_stack_pool_loop`
// (`gc_n_threads_sweeping_stacks` is back to zero).
void gc_sweep_wait_for_all_stacks(void) JL_NOTSAFEPOINT
{
    for (;;) {
        int work_remaining = jl_atomic_load_acquire(&gc_ptls_sweep_idx) >= 0;
        int sweepers_active = jl_atomic_load_acquire(&gc_n_threads_sweeping_stacks) != 0;
        if (!work_remaining && !sweepers_active)
            break;
        jl_cpu_pause();
    }
}
1556+
1557+
// Entry point for the parallel sweep of the per-thread stack pools.
// Publishes the shared work indices, wakes the parallel GC threads,
// participates in the sweep itself, then blocks until every sweeper
// has finished.
void sweep_stack_pools(jl_ptls_t ptls) JL_NOTSAFEPOINT
{
    // initialize ptls index for parallel sweeping of stack pools
    assert(gc_n_threads);
    // Advance the round-robin index selecting which thread's free stacks
    // are given back to the OS this cycle, wrapping at gc_n_threads.
    int free_idx = jl_atomic_load_relaxed(&gc_stack_free_idx);
    int next_free_idx = (free_idx + 1 == gc_n_threads) ? 0 : free_idx + 1;
    jl_atomic_store_relaxed(&gc_stack_free_idx, next_free_idx);
    // idx == gc_n_threads = release stacks to the OS so it's serial
    jl_atomic_store_release(&gc_ptls_sweep_idx, gc_n_threads - 1);
    gc_sweep_wake_all_stacks(ptls);
    sweep_stack_pool_loop();
    gc_sweep_wait_for_all_stacks();
}
1571+
15281572
static void gc_pool_sync_nfree(jl_gc_pagemeta_t *pg, jl_taggedvalue_t *last) JL_NOTSAFEPOINT
15291573
{
15301574
assert(pg->fl_begin_offset != UINT16_MAX);
@@ -1639,15 +1683,15 @@ void gc_sweep_wake_all(jl_ptls_t ptls, jl_gc_padded_page_stack_t *new_gc_allocd_
16391683
void gc_sweep_wait_for_all(void)
16401684
{
16411685
jl_atomic_store(&gc_allocd_scratch, NULL);
1642-
while (jl_atomic_load_relaxed(&gc_n_threads_sweeping) != 0) {
1686+
while (jl_atomic_load_acquire(&gc_n_threads_sweeping_pools) != 0) {
16431687
jl_cpu_pause();
16441688
}
16451689
}
16461690

16471691
// sweep all pools
16481692
void gc_sweep_pool_parallel(jl_ptls_t ptls)
16491693
{
1650-
jl_atomic_fetch_add(&gc_n_threads_sweeping, 1);
1694+
jl_atomic_fetch_add(&gc_n_threads_sweeping_pools, 1);
16511695
jl_gc_padded_page_stack_t *allocd_scratch = jl_atomic_load(&gc_allocd_scratch);
16521696
if (allocd_scratch != NULL) {
16531697
gc_page_profiler_serializer_t serializer = gc_page_serializer_create();
@@ -1692,7 +1736,7 @@ void gc_sweep_pool_parallel(jl_ptls_t ptls)
16921736
}
16931737
gc_page_serializer_destroy(&serializer);
16941738
}
1695-
jl_atomic_fetch_add(&gc_n_threads_sweeping, -1);
1739+
jl_atomic_fetch_add(&gc_n_threads_sweeping_pools, -1);
16961740
}
16971741

16981742
// free all pages (i.e. through `madvise` on Linux) that were lazily freed
@@ -3604,7 +3648,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection)
36043648
#endif
36053649
current_sweep_full = sweep_full;
36063650
sweep_weak_refs();
3607-
sweep_stack_pools();
3651+
sweep_stack_pools(ptls);
36083652
gc_sweep_foreign_objs();
36093653
gc_sweep_other(ptls, sweep_full);
36103654
gc_scrub();

src/gc.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -564,7 +564,10 @@ extern uv_mutex_t gc_threads_lock;
564564
extern uv_cond_t gc_threads_cond;
565565
extern uv_sem_t gc_sweep_assists_needed;
566566
extern _Atomic(int) gc_n_threads_marking;
567-
extern _Atomic(int) gc_n_threads_sweeping;
567+
extern _Atomic(int) gc_n_threads_sweeping_pools;
568+
extern _Atomic(int) gc_n_threads_sweeping_stacks;
569+
extern _Atomic(int) gc_ptls_sweep_idx;
570+
extern _Atomic(int) gc_stack_free_idx;
568571
extern uv_barrier_t thread_init_done;
569572
void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_markqueue_t *mq);
570573
void gc_mark_finlist_(jl_gc_markqueue_t *mq, jl_value_t *fl_parent, jl_value_t **fl_begin, jl_value_t **fl_end) JL_NOTSAFEPOINT;
@@ -574,7 +577,7 @@ void gc_mark_loop_serial(jl_ptls_t ptls);
574577
void gc_mark_loop_parallel(jl_ptls_t ptls, int master);
575578
void gc_sweep_pool_parallel(jl_ptls_t ptls);
576579
void gc_free_pages(void);
577-
void sweep_stack_pools(void);
580+
void sweep_stack_pool_loop(void) JL_NOTSAFEPOINT;
578581
void jl_gc_debug_init(void);
579582

580583
// GC pages

src/partr.c

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,11 @@ static inline int may_sweep(jl_ptls_t ptls) JL_NOTSAFEPOINT
118118
return (jl_atomic_load(&ptls->gc_tls.gc_sweeps_requested) > 0);
119119
}
120120

121+
// Returns nonzero when a stack sweep has been requested for this GC
// thread, i.e. its gc_stack_sweep_requested counter is positive.
static inline int may_sweep_stack(jl_ptls_t ptls) JL_NOTSAFEPOINT
{
    size_t requested = jl_atomic_load(&ptls->gc_tls.gc_stack_sweep_requested);
    return requested > 0;
}
125+
121126
// parallel gc thread function
122127
void jl_parallel_gc_threadfun(void *arg)
123128
{
@@ -139,12 +144,17 @@ void jl_parallel_gc_threadfun(void *arg)
139144

140145
while (1) {
141146
uv_mutex_lock(&gc_threads_lock);
142-
while (!may_mark() && !may_sweep(ptls)) {
147+
while (!may_mark() && !may_sweep(ptls) && !may_sweep_stack(ptls)) {
143148
uv_cond_wait(&gc_threads_cond, &gc_threads_lock);
144149
}
145150
uv_mutex_unlock(&gc_threads_lock);
146151
assert(jl_atomic_load_relaxed(&ptls->gc_state) == JL_GC_PARALLEL_COLLECTOR_THREAD);
147152
gc_mark_loop_parallel(ptls, 0);
153+
if (may_sweep_stack(ptls)) {
154+
assert(jl_atomic_load_relaxed(&ptls->gc_state) == JL_GC_PARALLEL_COLLECTOR_THREAD);
155+
sweep_stack_pool_loop();
156+
jl_atomic_fetch_add(&ptls->gc_tls.gc_stack_sweep_requested, -1);
157+
}
148158
if (may_sweep(ptls)) {
149159
assert(jl_atomic_load_relaxed(&ptls->gc_state) == JL_GC_PARALLEL_COLLECTOR_THREAD);
150160
gc_sweep_pool_parallel(ptls);

0 commit comments

Comments
 (0)