@@ -21,8 +21,8 @@ int jl_n_sweepthreads;
2121_Atomic(int ) gc_n_threads_marking ;
2222// Number of threads sweeping
2323_Atomic(int ) gc_n_threads_sweeping ;
24- // Temporary for the `ptls->page_metadata_allocd` used during parallel sweeping
25- _Atomic(jl_gc_page_stack_t * ) gc_allocd_scratch ;
24+ // Temporary for the `ptls->page_metadata_allocd` used during parallel sweeping (padded to avoid false sharing)
25+ _Atomic(jl_gc_padded_page_stack_t * ) gc_allocd_scratch ;
2626// `tid` of mutator thread that triggered GC
2727_Atomic(int ) gc_master_tid ;
2828// `tid` of first GC thread
@@ -1596,8 +1596,72 @@ static void gc_pool_sync_nfree(jl_gc_pagemeta_t *pg, jl_taggedvalue_t *last) JL_
15961596 pg -> nfree = nfree ;
15971597}
15981598
1599- void gc_sweep_wake_all (void )
1599+ // pre-scan pages to check whether there are enough pages so that's worth parallelizing
1600+ // also sweeps pages that don't need to be linearly scanned
1601+ int gc_sweep_prescan (jl_ptls_t ptls , jl_gc_padded_page_stack_t * new_gc_allocd_scratch )
16001602{
1603+ // 4MB worth of pages is worth parallelizing
1604+ const int n_pages_worth_parallel_sweep = (int )(4 * (1 << 20 ) / GC_PAGE_SZ );
1605+ int n_pages_to_scan = 0 ;
1606+ gc_page_profiler_serializer_t serializer = gc_page_serializer_create ();
1607+ for (int t_i = 0 ; t_i < gc_n_threads ; t_i ++ ) {
1608+ jl_ptls_t ptls2 = gc_all_tls_states [t_i ];
1609+ if (ptls2 == NULL ) {
1610+ continue ;
1611+ }
1612+ jl_gc_page_stack_t * dest = & new_gc_allocd_scratch [ptls2 -> tid ].stack ;
1613+ jl_gc_page_stack_t tmp ;
1614+ jl_gc_pagemeta_t * tail = NULL ;
1615+ memset (& tmp , 0 , sizeof (tmp ));
1616+ while (1 ) {
1617+ jl_gc_pagemeta_t * pg = pop_lf_back_nosync (& ptls2 -> page_metadata_allocd );
1618+ if (pg == NULL ) {
1619+ break ;
1620+ }
1621+ int should_scan = 1 ;
1622+ if (!pg -> has_marked ) {
1623+ should_scan = 0 ;
1624+ }
1625+ if (!current_sweep_full && !pg -> has_young ) {
1626+ assert (!prev_sweep_full || pg -> prev_nold >= pg -> nold );
1627+ if (!prev_sweep_full || pg -> prev_nold == pg -> nold ) {
1628+ should_scan = 0 ;
1629+ }
1630+ }
1631+ if (should_scan ) {
1632+ if (tail == NULL ) {
1633+ tail = pg ;
1634+ }
1635+ n_pages_to_scan ++ ;
1636+ push_lf_back_nosync (& tmp , pg );
1637+ }
1638+ else {
1639+ gc_sweep_pool_page (& serializer , dest , & ptls2 -> page_metadata_buffered , pg );
1640+ }
1641+ if (n_pages_to_scan >= n_pages_worth_parallel_sweep ) {
1642+ break ;
1643+ }
1644+ }
1645+ if (tail != NULL ) {
1646+ tail -> next = jl_atomic_load_relaxed (& ptls2 -> page_metadata_allocd .bottom );
1647+ }
1648+ ptls2 -> page_metadata_allocd = tmp ;
1649+ if (n_pages_to_scan >= n_pages_worth_parallel_sweep ) {
1650+ break ;
1651+ }
1652+ }
1653+ gc_page_serializer_destroy (& serializer );
1654+ return n_pages_to_scan >= n_pages_worth_parallel_sweep ;
1655+ }
1656+
1657+ // wake up all threads to sweep the pages
1658+ void gc_sweep_wake_all (jl_ptls_t ptls , jl_gc_padded_page_stack_t * new_gc_allocd_scratch )
1659+ {
1660+ int parallel_sweep_worthwhile = gc_sweep_prescan (ptls , new_gc_allocd_scratch );
1661+ jl_atomic_store (& gc_allocd_scratch , new_gc_allocd_scratch );
1662+ if (!parallel_sweep_worthwhile ) {
1663+ return ;
1664+ }
16011665 uv_mutex_lock (& gc_threads_lock );
16021666 for (int i = gc_first_tid ; i < gc_first_tid + jl_n_markthreads ; i ++ ) {
16031667 jl_ptls_t ptls2 = gc_all_tls_states [i ];
@@ -1608,6 +1672,7 @@ void gc_sweep_wake_all(void)
16081672 uv_mutex_unlock (& gc_threads_lock );
16091673}
16101674
1675+ // wait for all threads to finish sweeping
16111676void gc_sweep_wait_for_all (void )
16121677{
16131678 jl_atomic_store (& gc_allocd_scratch , NULL );
@@ -1616,36 +1681,58 @@ void gc_sweep_wait_for_all(void)
16161681 }
16171682}
16181683
1619- void gc_sweep_pool_parallel (void )
1684+ // sweep all pools
1685+ void gc_sweep_pool_parallel (jl_ptls_t ptls )
16201686{
16211687 jl_atomic_fetch_add (& gc_n_threads_sweeping , 1 );
1622- jl_gc_page_stack_t * allocd_scratch = jl_atomic_load (& gc_allocd_scratch );
1688+ jl_gc_padded_page_stack_t * allocd_scratch = jl_atomic_load (& gc_allocd_scratch );
16231689 if (allocd_scratch != NULL ) {
16241690 gc_page_profiler_serializer_t serializer = gc_page_serializer_create ();
16251691 while (1 ) {
16261692 int found_pg = 0 ;
1693+ // sequentially walk the threads and sweep the pages
16271694 for (int t_i = 0 ; t_i < gc_n_threads ; t_i ++ ) {
16281695 jl_ptls_t ptls2 = gc_all_tls_states [t_i ];
1696+ // skip foreign threads that already exited
16291697 if (ptls2 == NULL ) {
16301698 continue ;
16311699 }
1632- jl_gc_page_stack_t * allocd = & allocd_scratch [t_i ];
1633- jl_gc_pagemeta_t * pg = pop_lf_back (& ptls2 -> page_metadata_allocd );
1700+ jl_gc_page_stack_t * dest = & allocd_scratch [ptls2 -> tid ].stack ;
1701+ jl_gc_pagemeta_t * pg = try_pop_lf_back (& ptls2 -> page_metadata_allocd );
1702+ // failed steal attempt
16341703 if (pg == NULL ) {
16351704 continue ;
16361705 }
1637- gc_sweep_pool_page (& serializer , allocd , & ptls2 -> page_metadata_buffered , pg );
1706+ gc_sweep_pool_page (& serializer , dest , & ptls2 -> page_metadata_buffered , pg );
16381707 found_pg = 1 ;
16391708 }
16401709 if (!found_pg ) {
1641- break ;
1710+ // check for termination
1711+ int no_more_work = 1 ;
1712+ for (int t_i = 0 ; t_i < gc_n_threads ; t_i ++ ) {
1713+ jl_ptls_t ptls2 = gc_all_tls_states [t_i ];
1714+ // skip foreign threads that already exited
1715+ if (ptls2 == NULL ) {
1716+ continue ;
1717+ }
1718+ jl_gc_pagemeta_t * pg = jl_atomic_load_relaxed (& ptls2 -> page_metadata_allocd .bottom );
1719+ if (pg != NULL ) {
1720+ no_more_work = 0 ;
1721+ break ;
1722+ }
1723+ }
1724+ if (no_more_work ) {
1725+ break ;
1726+ }
16421727 }
1728+ jl_cpu_pause ();
16431729 }
16441730 gc_page_serializer_destroy (& serializer );
16451731 }
16461732 jl_atomic_fetch_add (& gc_n_threads_sweeping , -1 );
16471733}
16481734
1735+ // free all pages (i.e. through `madvise` on Linux) that were lazily freed
16491736void gc_free_pages (void )
16501737{
16511738 while (1 ) {
@@ -1670,7 +1757,7 @@ static void gc_sweep_pool(void)
16701757
16711758 // allocate enough space to hold the end of the free list chain
16721759 // for every thread and pool size
1673- jl_taggedvalue_t * * * pfl = (jl_taggedvalue_t * * * ) alloca (n_threads * JL_GC_N_POOLS * sizeof (jl_taggedvalue_t * * ));
1760+ jl_taggedvalue_t * * * pfl = (jl_taggedvalue_t * * * ) malloc_s (n_threads * JL_GC_N_POOLS * sizeof (jl_taggedvalue_t * * ));
16741761
16751762 // update metadata of pages that were pointed to by freelist or newpages from a pool
16761763 // i.e. pages being the current allocation target
@@ -1712,17 +1799,18 @@ static void gc_sweep_pool(void)
17121799 }
17131800
17141801 // the actual sweeping
1715- jl_gc_page_stack_t * tmp = (jl_gc_page_stack_t * )alloca (n_threads * sizeof (jl_gc_page_stack_t ));
1716- memset (tmp , 0 , n_threads * sizeof (jl_gc_page_stack_t ));
1717- jl_atomic_store ( & gc_allocd_scratch , tmp ) ;
1718- gc_sweep_wake_all ();
1719- gc_sweep_pool_parallel ();
1802+ jl_gc_padded_page_stack_t * new_gc_allocd_scratch = (jl_gc_padded_page_stack_t * ) malloc_s (n_threads * sizeof (jl_gc_padded_page_stack_t ));
1803+ memset (new_gc_allocd_scratch , 0 , n_threads * sizeof (jl_gc_padded_page_stack_t ));
1804+ jl_ptls_t ptls = jl_current_task -> ptls ;
1805+ gc_sweep_wake_all (ptls , new_gc_allocd_scratch );
1806+ gc_sweep_pool_parallel (ptls );
17201807 gc_sweep_wait_for_all ();
17211808
1809+ // reset half-pages pointers
17221810 for (int t_i = 0 ; t_i < n_threads ; t_i ++ ) {
17231811 jl_ptls_t ptls2 = gc_all_tls_states [t_i ];
17241812 if (ptls2 != NULL ) {
1725- ptls2 -> page_metadata_allocd = tmp [t_i ];
1813+ ptls2 -> page_metadata_allocd = new_gc_allocd_scratch [t_i ]. stack ;
17261814 for (int i = 0 ; i < JL_GC_N_POOLS ; i ++ ) {
17271815 jl_gc_pool_t * p = & ptls2 -> heap .norm_pools [i ];
17281816 p -> newpages = NULL ;
@@ -1760,6 +1848,10 @@ static void gc_sweep_pool(void)
17601848 }
17611849 }
17621850
1851+ // cleanup
1852+ free (pfl );
1853+ free (new_gc_allocd_scratch );
1854+
17631855#ifdef _P64 // only enable concurrent sweeping on 64bit
17641856 // wake thread up to sweep concurrently
17651857 if (jl_n_sweepthreads > 0 ) {
0 commit comments