@@ -1541,7 +1541,8 @@ static void gc_sweep_page(gc_page_profiler_serializer_t *s, jl_gc_pool_t *p, jl_
15411541
15421542done :
15431543 if (re_use_page ) {
1544- push_lf_back (allocd , pg );
1544+ // we're pushing into a local page stack to reduce contention
1545+ push_lf_back_nosync (allocd , pg );
15451546 }
15461547 else {
15471548 gc_alloc_map_set (pg -> data , GC_PAGE_LAZILY_FREED );
@@ -1596,8 +1597,68 @@ static void gc_pool_sync_nfree(jl_gc_pagemeta_t *pg, jl_taggedvalue_t *last) JL_
15961597 pg -> nfree = nfree ;
15971598}
15981599
1599- void gc_sweep_wake_all ( void )
1600+ int gc_sweep_prescan ( jl_ptls_t ptls )
16001601{
1602+ // 4MB worth of pages is worth parallelizing
1603+ const int n_pages_worth_parallel_sweep = (int )(4 * (1 << 20 ) / GC_PAGE_SZ );
1604+ int n_pages_to_scan = 0 ;
1605+ gc_page_profiler_serializer_t serializer = gc_page_serializer_create ();
1606+ // push into local page stack. we'll merge them later...
1607+ jl_gc_page_stack_t * allocd_scratch = jl_atomic_load (& gc_allocd_scratch );
1608+ assert (allocd_scratch != NULL );
1609+ jl_gc_page_stack_t * dest = & allocd_scratch [ptls -> tid ];
1610+ for (int t_i = 0 ; t_i < gc_n_threads ; t_i ++ ) {
1611+ jl_ptls_t ptls2 = gc_all_tls_states [t_i ];
1612+ if (ptls2 == NULL ) {
1613+ continue ;
1614+ }
1615+ jl_gc_page_stack_t tmp ;
1616+ jl_gc_pagemeta_t * tail = NULL ;
1617+ memset (& tmp , 0 , sizeof (tmp ));
1618+ while (1 ) {
1619+ jl_gc_pagemeta_t * pg = try_pop_lf_back (& ptls2 -> page_metadata_allocd );
1620+ if (pg == NULL ) {
1621+ break ;
1622+ }
1623+ if (tail == NULL ) {
1624+ tail = pg ;
1625+ }
1626+ int should_scan = 1 ;
1627+ if (!pg -> has_marked ) {
1628+ should_scan = 0 ;
1629+ }
1630+ if (!current_sweep_full && !pg -> has_young ) {
1631+ assert (!prev_sweep_full || pg -> prev_nold >= pg -> nold );
1632+ if (!prev_sweep_full || pg -> prev_nold == pg -> nold ) {
1633+ should_scan = 0 ;
1634+ }
1635+ }
1636+ if (should_scan ) {
1637+ n_pages_to_scan ++ ;
1638+ push_lf_back_nosync (& tmp , pg );
1639+ }
1640+ else {
1641+ gc_sweep_pool_page (& serializer , dest , & ptls2 -> page_metadata_buffered , pg );
1642+ }
1643+ if (n_pages_to_scan >= n_pages_worth_parallel_sweep ) {
1644+ tail -> next = jl_atomic_load_relaxed (& ptls2 -> page_metadata_allocd .bottom );
1645+ }
1646+ }
1647+ ptls2 -> page_metadata_allocd = tmp ;
1648+ if (n_pages_to_scan >= n_pages_worth_parallel_sweep ) {
1649+ break ;
1650+ }
1651+ }
1652+ gc_page_serializer_destroy (& serializer );
1653+ return n_pages_to_scan >= n_pages_worth_parallel_sweep ;
1654+ }
1655+
1656+ void gc_sweep_wake_all (jl_ptls_t ptls )
1657+ {
1658+ int parallel_sweep_worthwhile = gc_sweep_prescan (ptls );
1659+ if (!parallel_sweep_worthwhile ) {
1660+ return ;
1661+ }
16011662 uv_mutex_lock (& gc_threads_lock );
16021663 for (int i = gc_first_tid ; i < gc_first_tid + jl_n_markthreads ; i ++ ) {
16031664 jl_ptls_t ptls2 = gc_all_tls_states [i ];
@@ -1615,30 +1676,52 @@ void gc_sweep_wait_for_all(void)
16151676 }
16161677}
16171678
1618- void gc_sweep_pool_parallel (void )
1679+ void gc_sweep_pool_parallel (jl_ptls_t ptls )
16191680{
16201681 jl_atomic_fetch_add (& gc_n_threads_sweeping , 1 );
16211682 jl_gc_page_stack_t * allocd_scratch = jl_atomic_load (& gc_allocd_scratch );
16221683 if (allocd_scratch != NULL ) {
1684+ // push into local page stack to reduce contention
1685+ // we'll merge them later...
1686+ jl_gc_page_stack_t * dest = & allocd_scratch [ptls -> tid ];
16231687 gc_page_profiler_serializer_t serializer = gc_page_serializer_create ();
16241688 while (1 ) {
16251689 int found_pg = 0 ;
1690+ // sequentially walk the threads and sweep the pages
16261691 for (int t_i = 0 ; t_i < gc_n_threads ; t_i ++ ) {
16271692 jl_ptls_t ptls2 = gc_all_tls_states [t_i ];
1693+ // skip foreign threads that already exited
16281694 if (ptls2 == NULL ) {
16291695 continue ;
16301696 }
1631- jl_gc_page_stack_t * allocd = & allocd_scratch [ t_i ] ;
1632- jl_gc_pagemeta_t * pg = pop_lf_back ( & ptls2 -> page_metadata_allocd );
1697+ jl_gc_pagemeta_t * pg = try_pop_lf_back ( & ptls2 -> page_metadata_allocd ) ;
1698+ // failed steal attempt
16331699 if (pg == NULL ) {
16341700 continue ;
16351701 }
1636- gc_sweep_pool_page (& serializer , allocd , & ptls2 -> page_metadata_buffered , pg );
1702+ gc_sweep_pool_page (& serializer , dest , & ptls2 -> page_metadata_buffered , pg );
16371703 found_pg = 1 ;
16381704 }
16391705 if (!found_pg ) {
1640- break ;
1706+ // check for termination
1707+ int no_more_work = 1 ;
1708+ for (int t_i = 0 ; t_i < gc_n_threads ; t_i ++ ) {
1709+ jl_ptls_t ptls2 = gc_all_tls_states [t_i ];
1710+ // skip foreign threads that already exited
1711+ if (ptls2 == NULL ) {
1712+ continue ;
1713+ }
1714+ jl_gc_pagemeta_t * pg = jl_atomic_load_relaxed (& ptls2 -> page_metadata_allocd .bottom );
1715+ if (pg != NULL ) {
1716+ no_more_work = 0 ;
1717+ break ;
1718+ }
1719+ }
1720+ if (no_more_work ) {
1721+ break ;
1722+ }
16411723 }
1724+ jl_cpu_pause ();
16421725 }
16431726 gc_page_serializer_destroy (& serializer );
16441727 }
@@ -1669,7 +1752,7 @@ static void gc_sweep_pool(void)
16691752
16701753 // allocate enough space to hold the end of the free list chain
16711754 // for every thread and pool size
1672- jl_taggedvalue_t * * * pfl = (jl_taggedvalue_t * * * ) alloca (n_threads * JL_GC_N_POOLS * sizeof (jl_taggedvalue_t * * ));
1755+ jl_taggedvalue_t * * * pfl = (jl_taggedvalue_t * * * ) malloc_s (n_threads * JL_GC_N_POOLS * sizeof (jl_taggedvalue_t * * ));
16731756
16741757 // update metadata of pages that were pointed to by freelist or newpages from a pool
16751758 // i.e. pages being the current allocation target
@@ -1711,17 +1794,37 @@ static void gc_sweep_pool(void)
17111794 }
17121795
17131796 // the actual sweeping
1714- jl_gc_page_stack_t * tmp = (jl_gc_page_stack_t * )alloca (n_threads * sizeof (jl_gc_page_stack_t ));
1797+ jl_gc_page_stack_t * tmp = (jl_gc_page_stack_t * )jl_malloc_aligned (n_threads * sizeof (jl_gc_page_stack_t ), 128 );
1798+ if (tmp == NULL ) {
1799+ abort ();
1800+ }
17151801 memset (tmp , 0 , n_threads * sizeof (jl_gc_page_stack_t ));
17161802 jl_atomic_store (& gc_allocd_scratch , tmp );
1717- gc_sweep_wake_all ();
1718- gc_sweep_pool_parallel ();
1803+ jl_ptls_t ptls = jl_current_task -> ptls ;
1804+ gc_sweep_wake_all (ptls );
1805+ gc_sweep_pool_parallel (ptls );
17191806 gc_sweep_wait_for_all ();
17201807
1808+ // merge the page metadata lists
1809+ for (int t_i = 0 ; t_i < n_threads ; t_i ++ ) {
1810+ jl_ptls_t ptls2 = gc_all_tls_states [t_i ];
1811+ if (ptls2 == NULL ) {
1812+ continue ;
1813+ }
1814+ while (1 ) {
1815+ jl_gc_pagemeta_t * pg = pop_lf_back_nosync (& tmp [t_i ]);
1816+ if (pg == NULL ) {
1817+ break ;
1818+ }
1819+ jl_ptls_t ptls3 = gc_all_tls_states [pg -> thread_n ];
1820+ push_lf_back_nosync (& ptls3 -> page_metadata_allocd , pg );
1821+ }
1822+ }
1823+
1824+ // reset half-pages pointers
17211825 for (int t_i = 0 ; t_i < n_threads ; t_i ++ ) {
17221826 jl_ptls_t ptls2 = gc_all_tls_states [t_i ];
17231827 if (ptls2 != NULL ) {
1724- ptls2 -> page_metadata_allocd = tmp [t_i ];
17251828 for (int i = 0 ; i < JL_GC_N_POOLS ; i ++ ) {
17261829 jl_gc_pool_t * p = & ptls2 -> heap .norm_pools [i ];
17271830 p -> newpages = NULL ;
@@ -1759,6 +1862,10 @@ static void gc_sweep_pool(void)
17591862 }
17601863 }
17611864
1865+ // cleanup
1866+ free (pfl );
1867+ free (tmp );
1868+
17621869#ifdef _P64 // only enable concurrent sweeping on 64bit
17631870 // wake thread up to sweep concurrently
17641871 if (jl_n_sweepthreads > 0 ) {
0 commit comments