@@ -1541,7 +1541,8 @@ static void gc_sweep_page(gc_page_profiler_serializer_t *s, jl_gc_pool_t *p, jl_
15411541
15421542done :
15431543 if (re_use_page ) {
1544- push_lf_back (allocd , pg );
1544+ // we're pushing into a local page stack to reduce contention
1545+ push_lf_back_nosync (allocd , pg );
15451546 }
15461547 else {
15471548 gc_alloc_map_set (pg -> data , GC_PAGE_LAZILY_FREED );
@@ -1596,8 +1597,71 @@ static void gc_pool_sync_nfree(jl_gc_pagemeta_t *pg, jl_taggedvalue_t *last) JL_
15961597 pg -> nfree = nfree ;
15971598}
15981599
1599- void gc_sweep_wake_all ( void )
1600+ int gc_sweep_prescan ( jl_ptls_t ptls ) // Pre-pass over every thread's allocated-page list; returns nonzero once at least n_pages_worth_parallel_sweep pages have been deferred for a later scan.
16001601{ // ptls: the calling thread; ptls->tid selects the gc_allocd_scratch slot handed to gc_sweep_pool_page below.
1602+ // 4MB worth of pages is worth parallelizing
1603+ const int n_pages_worth_parallel_sweep = (int )(4 * (1 << 20 ) / GC_PAGE_SZ ); // threshold expressed as a page count (4 MiB / GC_PAGE_SZ)
1604+ int n_pages_to_scan = 0 ; // pages deferred so far, accumulated across all threads visited
1605+ gc_page_profiler_serializer_t serializer = gc_page_serializer_create (); // destroyed just before returning
1606+ // push into local page stack. we'll merge them later...
1607+ jl_gc_page_stack_t * allocd_scratch = jl_atomic_load (& gc_allocd_scratch );
1608+ assert (allocd_scratch != NULL ); // the scratch array must already be published by the caller
1609+ jl_gc_page_stack_t * dest = & allocd_scratch [ptls -> tid ];
1610+ for (int t_i = 0 ; t_i < gc_n_threads ; t_i ++ ) {
1611+ jl_ptls_t ptls2 = gc_all_tls_states [t_i ];
1612+ if (ptls2 == NULL ) { // no thread state in this slot: nothing to examine
1613+ continue ;
1614+ }
1615+ jl_gc_page_stack_t tmp ; // holds the pages of this thread that are deferred for a later scan
1616+ jl_gc_pagemeta_t * tail = NULL ; // first page pushed onto tmp for this thread
1617+ memset (& tmp , 0 , sizeof (tmp ));
1618+ while (1 ) {
1619+ jl_gc_pagemeta_t * pg = pop_lf_back_nosync (& ptls2 -> page_metadata_allocd );
1620+ if (pg == NULL ) { // this thread's list is exhausted
1621+ break ;
1622+ }
1623+ int should_scan = 1 ;
1624+ if (!pg -> has_marked ) { // page has no marked objects: handle it right away instead of deferring
1625+ should_scan = 0 ;
1626+ }
1627+ if (!current_sweep_full && !pg -> has_young ) { // non-full sweep and the page has no young objects
1628+ assert (!prev_sweep_full || pg -> prev_nold >= pg -> nold );
1629+ if (!prev_sweep_full || pg -> prev_nold == pg -> nold ) { // previous sweep was not full, or the old-object count is unchanged
1630+ should_scan = 0 ;
1631+ }
1632+ }
1633+ if (should_scan ) {
1634+ if (tail == NULL ) {
1635+ tail = pg ; // remember the first deferred page so the remainder can be linked after it
1636+ }
1637+ n_pages_to_scan ++ ;
1638+ push_lf_back_nosync (& tmp , pg );
1639+ }
1640+ else {
1641+ gc_sweep_pool_page (& serializer , dest , & ptls2 -> page_metadata_buffered , pg ); // processed immediately on the calling thread
1642+ }
1643+ if (n_pages_to_scan >= n_pages_worth_parallel_sweep ) { // enough deferred work found: stop popping
1644+ break ;
1645+ }
1646+ }
1647+ if (tail != NULL ) {
1648+ tail -> next = jl_atomic_load_relaxed (& ptls2 -> page_metadata_allocd .bottom ); // link whatever was left unpopped after the first deferred page; presumably tail is the deepest entry of tmp -- TODO confirm against push_lf_back_nosync
1649+ }
1650+ ptls2 -> page_metadata_allocd = tmp ; // the thread's list now consists of the deferred pages (plus the linked remainder, if any)
1651+ if (n_pages_to_scan >= n_pages_worth_parallel_sweep ) { // threshold reached: remaining threads are left untouched
1652+ break ;
1653+ }
1654+ }
1655+ gc_page_serializer_destroy (& serializer );
1656+ return n_pages_to_scan >= n_pages_worth_parallel_sweep ; // 1 when the threshold was reached, 0 otherwise
1657+ }
1658+
1659+ void gc_sweep_wake_all (jl_ptls_t ptls )
1660+ {
1661+ int parallel_sweep_worthwhile = gc_sweep_prescan (ptls );
1662+ if (!parallel_sweep_worthwhile ) {
1663+ return ;
1664+ }
16011665 uv_mutex_lock (& gc_threads_lock );
16021666 for (int i = gc_first_tid ; i < gc_first_tid + jl_n_markthreads ; i ++ ) {
16031667 jl_ptls_t ptls2 = gc_all_tls_states [i ];
@@ -1615,30 +1679,52 @@ void gc_sweep_wait_for_all(void)
16151679 }
16161680}
16171681
1618- void gc_sweep_pool_parallel (void )
1682+ void gc_sweep_pool_parallel (jl_ptls_t ptls )
16191683{
16201684 jl_atomic_fetch_add (& gc_n_threads_sweeping , 1 );
16211685 jl_gc_page_stack_t * allocd_scratch = jl_atomic_load (& gc_allocd_scratch );
16221686 if (allocd_scratch != NULL ) {
1687+ // push into local page stack to reduce contention
1688+ // we'll merge them later...
1689+ jl_gc_page_stack_t * dest = & allocd_scratch [ptls -> tid ];
16231690 gc_page_profiler_serializer_t serializer = gc_page_serializer_create ();
16241691 while (1 ) {
16251692 int found_pg = 0 ;
1693+ // sequentially walk the threads and sweep the pages
16261694 for (int t_i = 0 ; t_i < gc_n_threads ; t_i ++ ) {
16271695 jl_ptls_t ptls2 = gc_all_tls_states [t_i ];
1696+ // skip foreign threads that already exited
16281697 if (ptls2 == NULL ) {
16291698 continue ;
16301699 }
1631- jl_gc_page_stack_t * allocd = & allocd_scratch [ t_i ] ;
1632- jl_gc_pagemeta_t * pg = pop_lf_back ( & ptls2 -> page_metadata_allocd );
1700+ jl_gc_pagemeta_t * pg = try_pop_lf_back ( & ptls2 -> page_metadata_allocd ) ;
1701+ // failed steal attempt
16331702 if (pg == NULL ) {
16341703 continue ;
16351704 }
1636- gc_sweep_pool_page (& serializer , allocd , & ptls2 -> page_metadata_buffered , pg );
1705+ gc_sweep_pool_page (& serializer , dest , & ptls2 -> page_metadata_buffered , pg );
16371706 found_pg = 1 ;
16381707 }
16391708 if (!found_pg ) {
1640- break ;
1709+ // check for termination
1710+ int no_more_work = 1 ;
1711+ for (int t_i = 0 ; t_i < gc_n_threads ; t_i ++ ) {
1712+ jl_ptls_t ptls2 = gc_all_tls_states [t_i ];
1713+ // skip foreign threads that already exited
1714+ if (ptls2 == NULL ) {
1715+ continue ;
1716+ }
1717+ jl_gc_pagemeta_t * pg = jl_atomic_load_relaxed (& ptls2 -> page_metadata_allocd .bottom );
1718+ if (pg != NULL ) {
1719+ no_more_work = 0 ;
1720+ break ;
1721+ }
1722+ }
1723+ if (no_more_work ) {
1724+ break ;
1725+ }
16411726 }
1727+ jl_cpu_pause ();
16421728 }
16431729 gc_page_serializer_destroy (& serializer );
16441730 }
@@ -1669,7 +1755,7 @@ static void gc_sweep_pool(void)
16691755
16701756 // allocate enough space to hold the end of the free list chain
16711757 // for every thread and pool size
1672- jl_taggedvalue_t * * * pfl = (jl_taggedvalue_t * * * ) alloca (n_threads * JL_GC_N_POOLS * sizeof (jl_taggedvalue_t * * ));
1758+ jl_taggedvalue_t * * * pfl = (jl_taggedvalue_t * * * ) malloc_s (n_threads * JL_GC_N_POOLS * sizeof (jl_taggedvalue_t * * ));
16731759
16741760 // update metadata of pages that were pointed to by freelist or newpages from a pool
16751761 // i.e. pages being the current allocation target
@@ -1711,17 +1797,37 @@ static void gc_sweep_pool(void)
17111797 }
17121798
17131799 // the actual sweeping
1714- jl_gc_page_stack_t * tmp = (jl_gc_page_stack_t * )alloca (n_threads * sizeof (jl_gc_page_stack_t ));
1800+ jl_gc_page_stack_t * tmp = (jl_gc_page_stack_t * )jl_malloc_aligned (n_threads * sizeof (jl_gc_page_stack_t ), 128 );
1801+ if (tmp == NULL ) {
1802+ abort ();
1803+ }
17151804 memset (tmp , 0 , n_threads * sizeof (jl_gc_page_stack_t ));
17161805 jl_atomic_store (& gc_allocd_scratch , tmp );
1717- gc_sweep_wake_all ();
1718- gc_sweep_pool_parallel ();
1806+ jl_ptls_t ptls = jl_current_task -> ptls ;
1807+ gc_sweep_wake_all (ptls );
1808+ gc_sweep_pool_parallel (ptls );
17191809 gc_sweep_wait_for_all ();
17201810
1811+ // merge the page metadata lists
1812+ for (int t_i = 0 ; t_i < n_threads ; t_i ++ ) {
1813+ jl_ptls_t ptls2 = gc_all_tls_states [t_i ];
1814+ if (ptls2 == NULL ) {
1815+ continue ;
1816+ }
1817+ while (1 ) {
1818+ jl_gc_pagemeta_t * pg = pop_lf_back_nosync (& tmp [t_i ]);
1819+ if (pg == NULL ) {
1820+ break ;
1821+ }
1822+ jl_ptls_t ptls3 = gc_all_tls_states [pg -> thread_n ];
1823+ push_lf_back_nosync (& ptls3 -> page_metadata_allocd , pg );
1824+ }
1825+ }
1826+
1827+ // reset half-pages pointers
17211828 for (int t_i = 0 ; t_i < n_threads ; t_i ++ ) {
17221829 jl_ptls_t ptls2 = gc_all_tls_states [t_i ];
17231830 if (ptls2 != NULL ) {
1724- ptls2 -> page_metadata_allocd = tmp [t_i ];
17251831 for (int i = 0 ; i < JL_GC_N_POOLS ; i ++ ) {
17261832 jl_gc_pool_t * p = & ptls2 -> heap .norm_pools [i ];
17271833 p -> newpages = NULL ;
@@ -1759,6 +1865,10 @@ static void gc_sweep_pool(void)
17591865 }
17601866 }
17611867
1868+ // cleanup: release the two heap buffers allocated earlier in this function
1869+ free (pfl ); // pfl was allocated with malloc_s
1870+ free (tmp ); // NOTE(review): tmp was allocated with jl_malloc_aligned; confirm plain free() is a valid deallocator for it on every supported platform (an aligned-free counterpart may be required). Also confirm gc_allocd_scratch no longer refers to tmp at this point.
1871+
17621872#ifdef _P64 // only enable concurrent sweeping on 64bit
17631873 // wake thread up to sweep concurrently
17641874 if (jl_n_sweepthreads > 0 ) {
0 commit comments