@@ -1541,7 +1541,8 @@ static void gc_sweep_page(gc_page_profiler_serializer_t *s, jl_gc_pool_t *p, jl_
15411541
15421542done :
15431543 if (re_use_page ) {
1544- push_lf_back (allocd , pg );
1544+ // we're pushing into a local page stack to reduce contention
1545+ push_lf_back_nosync (allocd , pg );
15451546 }
15461547 else {
15471548 gc_alloc_map_set (pg -> data , GC_PAGE_LAZILY_FREED );
@@ -1596,8 +1597,49 @@ static void gc_pool_sync_nfree(jl_gc_pagemeta_t *pg, jl_taggedvalue_t *last) JL_
15961597 pg -> nfree = nfree ;
15971598}
15981599
1600+ int gc_sweep_prescan (void )
1601+ {
1602+ int n_pages_to_scan = 0 ;
1603+ // 4MB worth of pages is worth parallelizing
1604+ const int n_pages_worth_parallel_sweep = (int )(4 * (1 << 20 ) / GC_PAGE_SZ );
1605+ for (int t_i = 0 ; t_i < gc_n_threads ; t_i ++ ) {
1606+ jl_ptls_t ptls2 = gc_all_tls_states [t_i ];
1607+ if (ptls2 == NULL ) {
1608+ continue ;
1609+ }
1610+ jl_gc_pagemeta_t * pg = jl_atomic_load_relaxed (& ptls2 -> page_metadata_allocd .bottom );
1611+ while (pg != NULL ) {
1612+ int should_scan = 1 ;
1613+ if (!pg -> has_marked ) {
1614+ should_scan = 0 ;
1615+ }
1616+ if (!current_sweep_full && !pg -> has_young ) {
1617+ assert (!prev_sweep_full || pg -> prev_nold >= pg -> nold );
1618+ if (!prev_sweep_full || pg -> prev_nold == pg -> nold ) {
1619+ should_scan = 0 ;
1620+ }
1621+ }
1622+ if (should_scan ) {
1623+ n_pages_to_scan ++ ;
1624+ }
1625+ if (n_pages_to_scan >= n_pages_worth_parallel_sweep ) {
1626+ break ;
1627+ }
1628+ pg = pg -> next ;
1629+ }
1630+ if (n_pages_to_scan >= n_pages_worth_parallel_sweep ) {
1631+ break ;
1632+ }
1633+ }
1634+ return n_pages_to_scan >= n_pages_worth_parallel_sweep ;
1635+ }
1636+
15991637void gc_sweep_wake_all (void )
16001638{
1639+ int parallel_sweep_worthwhile = gc_sweep_prescan ();
1640+ if (!parallel_sweep_worthwhile ) {
1641+ return ;
1642+ }
16011643 uv_mutex_lock (& gc_threads_lock );
16021644 for (int i = gc_first_tid ; i < gc_first_tid + jl_n_markthreads ; i ++ ) {
16031645 jl_ptls_t ptls2 = gc_all_tls_states [i ];
@@ -1615,30 +1657,52 @@ void gc_sweep_wait_for_all(void)
16151657 }
16161658}
16171659
1618- void gc_sweep_pool_parallel (void )
1660+ void gc_sweep_pool_parallel (jl_ptls_t ptls )
16191661{
16201662 jl_atomic_fetch_add (& gc_n_threads_sweeping , 1 );
16211663 jl_gc_page_stack_t * allocd_scratch = jl_atomic_load (& gc_allocd_scratch );
16221664 if (allocd_scratch != NULL ) {
1665+ // push into local page stack to reduce contention
1666+ // we'll merge them later...
1667+ jl_gc_page_stack_t * dest = & allocd_scratch [ptls -> tid ];
16231668 gc_page_profiler_serializer_t serializer = gc_page_serializer_create ();
16241669 while (1 ) {
16251670 int found_pg = 0 ;
1671+ // sequentially walk the threads and sweep the pages
16261672 for (int t_i = 0 ; t_i < gc_n_threads ; t_i ++ ) {
16271673 jl_ptls_t ptls2 = gc_all_tls_states [t_i ];
1674+ // skip foreign threads that already exited
16281675 if (ptls2 == NULL ) {
16291676 continue ;
16301677 }
1631- jl_gc_page_stack_t * allocd = & allocd_scratch [ t_i ] ;
1632- jl_gc_pagemeta_t * pg = pop_lf_back ( & ptls2 -> page_metadata_allocd );
1678+ jl_gc_pagemeta_t * pg = try_pop_lf_back ( & ptls2 -> page_metadata_allocd ) ;
1679+ // failed steal attempt
16331680 if (pg == NULL ) {
16341681 continue ;
16351682 }
1636- gc_sweep_pool_page (& serializer , allocd , & ptls2 -> page_metadata_buffered , pg );
1683+ gc_sweep_pool_page (& serializer , dest , & ptls2 -> page_metadata_buffered , pg );
16371684 found_pg = 1 ;
16381685 }
16391686 if (!found_pg ) {
1640- break ;
1687+ // check for termination
1688+ int no_more_work = 1 ;
1689+ for (int t_i = 0 ; t_i < gc_n_threads ; t_i ++ ) {
1690+ jl_ptls_t ptls2 = gc_all_tls_states [t_i ];
1691+ // skip foreign threads that already exited
1692+ if (ptls2 == NULL ) {
1693+ continue ;
1694+ }
1695+ jl_gc_pagemeta_t * pg = jl_atomic_load_relaxed (& ptls2 -> page_metadata_allocd .bottom );
1696+ if (pg != NULL ) {
1697+ no_more_work = 0 ;
1698+ break ;
1699+ }
1700+ }
1701+ if (no_more_work ) {
1702+ break ;
1703+ }
16411704 }
1705+ jl_cpu_pause ();
16421706 }
16431707 gc_page_serializer_destroy (& serializer );
16441708 }
@@ -1669,7 +1733,7 @@ static void gc_sweep_pool(void)
16691733
16701734 // allocate enough space to hold the end of the free list chain
16711735 // for every thread and pool size
1672- jl_taggedvalue_t * * * pfl = (jl_taggedvalue_t * * * ) alloca (n_threads * JL_GC_N_POOLS * sizeof (jl_taggedvalue_t * * ));
1736+ jl_taggedvalue_t * * * pfl = (jl_taggedvalue_t * * * ) malloc_s (n_threads * JL_GC_N_POOLS * sizeof (jl_taggedvalue_t * * ));
16731737
16741738 // update metadata of pages that were pointed to by freelist or newpages from a pool
16751739 // i.e. pages being the current allocation target
@@ -1711,17 +1775,36 @@ static void gc_sweep_pool(void)
17111775 }
17121776
17131777 // the actual sweeping
1714- jl_gc_page_stack_t * tmp = (jl_gc_page_stack_t * )alloca (n_threads * sizeof (jl_gc_page_stack_t ));
1778+ jl_gc_page_stack_t * tmp = (jl_gc_page_stack_t * )jl_malloc_aligned (n_threads * sizeof (jl_gc_page_stack_t ), 128 );
1779+ if (tmp == NULL ) {
1780+ abort ();
1781+ }
17151782 memset (tmp , 0 , n_threads * sizeof (jl_gc_page_stack_t ));
17161783 jl_atomic_store (& gc_allocd_scratch , tmp );
17171784 gc_sweep_wake_all ();
1718- gc_sweep_pool_parallel ();
1785+ gc_sweep_pool_parallel (jl_current_task -> ptls );
17191786 gc_sweep_wait_for_all ();
17201787
1788+ // merge the page metadata lists
1789+ for (int t_i = 0 ; t_i < n_threads ; t_i ++ ) {
1790+ jl_ptls_t ptls2 = gc_all_tls_states [t_i ];
1791+ if (ptls2 == NULL ) {
1792+ continue ;
1793+ }
1794+ while (1 ) {
1795+ jl_gc_pagemeta_t * pg = pop_lf_back_nosync (& tmp [t_i ]);
1796+ if (pg == NULL ) {
1797+ break ;
1798+ }
1799+ jl_ptls_t ptls3 = gc_all_tls_states [pg -> thread_n ];
1800+ push_lf_back_nosync (& ptls3 -> page_metadata_allocd , pg );
1801+ }
1802+ }
1803+
1804+ // reset half-pages pointers
17211805 for (int t_i = 0 ; t_i < n_threads ; t_i ++ ) {
17221806 jl_ptls_t ptls2 = gc_all_tls_states [t_i ];
17231807 if (ptls2 != NULL ) {
1724- ptls2 -> page_metadata_allocd = tmp [t_i ];
17251808 for (int i = 0 ; i < JL_GC_N_POOLS ; i ++ ) {
17261809 jl_gc_pool_t * p = & ptls2 -> heap .norm_pools [i ];
17271810 p -> newpages = NULL ;
@@ -1759,6 +1842,10 @@ static void gc_sweep_pool(void)
17591842 }
17601843 }
17611844
1845+ // cleanup
1846+ free (pfl );
1847+ free (tmp );
1848+
17621849#ifdef _P64 // only enable concurrent sweeping on 64bit
17631850 // wake thread up to sweep concurrently
17641851 if (jl_n_sweepthreads > 0 ) {
0 commit comments