From 45709118111b01fb6cf0ada103487e8185d195d2 Mon Sep 17 00:00:00 2001 From: Peter Sollich Date: Tue, 9 Aug 2022 17:51:17 +0200 Subject: [PATCH] Mark phase prefetching. (#73375) This adds prefetching to the mark phase. The idea is that once we have established that an object is in one of the generations we want to collect, we prefetch its memory before we determine whether we have marked it already. This is because the mark bit is in the object itself, and thus requires accessing the object's memory. As the prefetching will take some time to take effect, we park the object in a queue (see type mark_queue_t below). We then retrieve an older object from the queue, and test whether it has been marked. This should be faster, because we have issued a prefetch for this older object's memory a while back. In quite a few places we now need to drain the queue to ensure correctness - see calls to drain_mark_queue(). --- src/coreclr/gc/gc.cpp | 215 ++++++++++++++++++++++++++++++++++++---- src/coreclr/gc/gcpriv.h | 24 +++++ 2 files changed, 221 insertions(+), 18 deletions(-) diff --git a/src/coreclr/gc/gc.cpp b/src/coreclr/gc/gc.cpp index 377092f16d269..69465bc59b0a8 100644 --- a/src/coreclr/gc/gc.cpp +++ b/src/coreclr/gc/gc.cpp @@ -2854,6 +2854,8 @@ size_t gc_heap::expand_mechanisms_per_heap[max_expand_mechanisms_count]; size_t gc_heap::interesting_mechanism_bits_per_heap[max_gc_mechanism_bits_count]; +mark_queue_t gc_heap::mark_queue; + #endif // MULTIPLE_HEAPS /* end of per heap static initialization */ @@ -23453,7 +23455,7 @@ BOOL gc_heap::gc_mark (uint8_t* o, uint8_t* low, uint8_t* high, int condemned_ge { return FALSE; } - set_marked(o); + set_marked (o); return TRUE; } return FALSE; @@ -23777,14 +23779,31 @@ void gc_heap::save_post_plug_info (uint8_t* last_pinned_plug, uint8_t* last_obje } } -//#define PREFETCH +// enable on processors known to have a useful prefetch instruction +#if defined(TARGET_AMD64) || defined(TARGET_X86) || defined(TARGET_ARM64) +#define PREFETCH +#endif + #ifdef PREFETCH -__declspec(naked) void __fastcall Prefetch(void* addr) +inline void Prefetch(void* addr) { - __asm { - PREFETCHT0 [ECX] - ret - }; +#ifdef TARGET_WINDOWS + +#if defined(TARGET_AMD64) || defined(TARGET_X86) + +#ifndef _MM_HINT_T0 +#define _MM_HINT_T0 1 +#endif + _mm_prefetch((const char*)addr, _MM_HINT_T0); +#elif defined(TARGET_ARM64) + __prefetch((const char*)addr); +#endif //defined(TARGET_AMD64) || defined(TARGET_X86) + +#elif defined(TARGET_UNIX) || defined(TARGET_OSX) + __builtin_prefetch(addr); +#else //!(TARGET_WINDOWS || TARGET_UNIX || TARGET_OSX) + UNREFERENCED_PARAMETER(addr); +#endif //TARGET_WINDOWS } #else //PREFETCH inline void Prefetch (void* addr) @@ -23840,6 +23859,114 @@ BOOL ref_p (uint8_t* r) return (straight_ref_p (r) || partial_object_p (r)); } +mark_queue_t::mark_queue_t() : curr_slot_index(0) +{ + for (size_t i = 0; i < slot_count; i++) + { + slot_table[i] = nullptr; + } +} + +// place an object in the mark queue +// returns a *different* object or nullptr +// if a non-null object is returned, that object is newly marked +// object o *must* be in a condemned generation +FORCEINLINE +uint8_t *mark_queue_t::queue_mark(uint8_t *o) +{ + Prefetch (o); + + // while the prefetch is taking effect, park our object in the queue + // and fetch an object that has been sitting in the queue for a while + // and where (hopefully) the memory is already in the cache + size_t slot_index = curr_slot_index; + uint8_t* old_o = slot_table[slot_index]; + slot_table[slot_index] = o; + + curr_slot_index 
= (slot_index + 1) % slot_count; + if (old_o == nullptr) + return nullptr; + + // this causes us to access the method table pointer of the old object + BOOL already_marked = marked (old_o); + if (already_marked) + { + return nullptr; + } + set_marked (old_o); + return old_o; +} + +// place an object in the mark queue +// returns a *different* object or nullptr +// if a non-null object is returned, that object is newly marked +// check first whether the object o is indeed in a condemned generation +FORCEINLINE +uint8_t *mark_queue_t::queue_mark(uint8_t *o, int condemned_gen) +{ +#ifdef USE_REGIONS + if (!is_in_heap_range (o)) + { + return nullptr; + } + if (condemned_gen != max_generation && gc_heap::get_region_gen_num (o) > condemned_gen) + { + return nullptr; + } + return queue_mark(o); +#else //USE_REGIONS + assert (condemned_gen == -1); + +#ifdef MULTIPLE_HEAPS + if (o) + { + gc_heap* hp = gc_heap::heap_of_gc (o); + assert (hp); + if ((o >= hp->gc_low) && (o < hp->gc_high)) + return queue_mark (o); + } +#else //MULTIPLE_HEAPS + if ((o >= gc_heap::gc_low) && (o < gc_heap::gc_high)) + return queue_mark (o); +#endif //MULTIPLE_HEAPS + return nullptr; +#endif //USE_REGIONS +} + +// retrieve a newly marked object from the queue +// returns nullptr if there is no such object +uint8_t* mark_queue_t::get_next_marked() +{ + size_t slot_index = curr_slot_index; + size_t empty_slot_count = 0; + while (empty_slot_count < slot_count) + { + uint8_t* o = slot_table[slot_index]; + slot_table[slot_index] = nullptr; + slot_index = (slot_index + 1) % slot_count; + if (o != nullptr) + { + BOOL already_marked = marked (o); + if (!already_marked) + { + set_marked (o); + curr_slot_index = slot_index; + return o; + } + } + empty_slot_count++; + } + return nullptr; +} + +mark_queue_t::~mark_queue_t() +{ + for (size_t slot_index = 0; slot_index < slot_count; slot_index++) + { + assert(slot_table[slot_index] == nullptr); + } +} + void gc_heap::mark_object_simple1 (uint8_t* oo, uint8_t* start THREAD_NUMBER_DCL) { SERVER_SC_MARK_VOLATILE(uint8_t*)* mark_stack_tos = (SERVER_SC_MARK_VOLATILE(uint8_t*)*)mark_stack_array; @@ -23899,9 +24026,8 @@ void gc_heap::mark_object_simple1 (uint8_t* oo, uint8_t* start THREAD_NUMBER_DCL go_through_object_cl (method_table(oo), oo, s, ppslot, { - uint8_t* o = *ppslot; - Prefetch(o); - if (gc_mark (o, gc_low, gc_high, condemned_gen)) + uint8_t* o = mark_queue.queue_mark(*ppslot, condemned_gen); + if (o != nullptr) { if (full_p) { @@ -23997,9 +24123,8 @@ void gc_heap::mark_object_simple1 (uint8_t* oo, uint8_t* start THREAD_NUMBER_DCL go_through_object (method_table(oo), oo, s, ppslot, start, use_start, (oo + s), { - uint8_t* o = *ppslot; - Prefetch(o); - if (gc_mark (o, gc_low, gc_high,condemned_gen)) + uint8_t* o = mark_queue.queue_mark(*ppslot, condemned_gen); + if (o != nullptr) { if (full_p) { @@ -24438,7 +24563,8 @@ gc_heap::mark_object_simple (uint8_t** po THREAD_NUMBER_DCL) snoop_stat.objects_checked_count++; #endif //SNOOP_STATS - if (gc_mark1 (o)) + o = mark_queue.queue_mark (o); + if (o != nullptr) { m_boundary (o); size_t s = size (o); @@ -24446,8 +24572,8 @@ gc_heap::mark_object_simple (uint8_t** po THREAD_NUMBER_DCL) { go_through_object_cl (method_table(o), o, s, poo, { - uint8_t* oo = *poo; - if (gc_mark (oo, gc_low, gc_high, condemned_gen)) + uint8_t* oo = mark_queue.queue_mark(*poo, condemned_gen); + if (oo != nullptr) { m_boundary (oo); add_to_promoted_bytes (oo, thread); @@ -24484,6 +24610,45 @@ void gc_heap::mark_object (uint8_t* o THREAD_NUMBER_DCL) #endif 
//USE_REGIONS } +void gc_heap::drain_mark_queue () +{ + int condemned_gen = +#ifdef USE_REGIONS + settings.condemned_generation; +#else + -1; +#endif //USE_REGIONS + +#ifdef MULTIPLE_HEAPS + THREAD_FROM_HEAP; +#else + const int thread = 0; +#endif //MULTIPLE_HEAPS + + uint8_t* o; + while ((o = mark_queue.get_next_marked()) != nullptr) + { + m_boundary (o); + size_t s = size (o); + add_to_promoted_bytes (o, s, thread); + if (contain_pointers_or_collectible (o)) + { + go_through_object_cl (method_table(o), o, s, poo, + { + uint8_t* oo = mark_queue.queue_mark(*poo, condemned_gen); + if (oo != nullptr) + { + m_boundary (oo); + add_to_promoted_bytes (oo, thread); + if (contain_pointers_or_collectible (oo)) + mark_object_simple1 (oo, oo THREAD_NUMBER_ARG); + } + } + ); + } + } +} + #ifdef BACKGROUND_GC #ifdef USE_REGIONS @@ -25660,6 +25825,8 @@ void gc_heap::scan_dependent_handles (int condemned_gen_number, ScanContext *sc, if (GCScan::GcDhUnpromotedHandlesExist(sc)) s_fUnpromotedHandles = TRUE; + drain_mark_queue(); + // Synchronize all the threads so we can read our state variables safely. The shared variable // s_fScanRequired, indicating whether we should scan the tables or terminate the loop, will be set by // a single thread inside the join. @@ -25761,6 +25928,8 @@ void gc_heap::scan_dependent_handles (int condemned_gen_number, ScanContext *sc, if (process_mark_overflow(condemned_gen_number)) fUnscannedPromotions = true; + drain_mark_queue(); + // Perform the scan and set the flag if any promotions resulted. if (GCScan::GcDhReScan(sc)) fUnscannedPromotions = true; @@ -26187,6 +26356,7 @@ void gc_heap::mark_phase (int condemned_gen_number, BOOL mark_only_p) if ((condemned_gen_number == max_generation) && (num_sizedrefs > 0)) { GCScan::GcScanSizedRefs(GCHeap::Promote, condemned_gen_number, max_generation, &sc); + drain_mark_queue(); fire_mark_event (ETW::GC_ROOT_SIZEDREF, current_promoted_bytes, last_promoted_bytes); #ifdef MULTIPLE_HEAPS @@ -26210,12 +26380,14 @@ void gc_heap::mark_phase (int condemned_gen_number, BOOL mark_only_p) GCScan::GcScanRoots(GCHeap::Promote, condemned_gen_number, max_generation, &sc); + drain_mark_queue(); fire_mark_event (ETW::GC_ROOT_STACK, current_promoted_bytes, last_promoted_bytes); #ifdef BACKGROUND_GC if (gc_heap::background_running_p()) { scan_background_roots (GCHeap::Promote, heap_number, &sc); + drain_mark_queue(); fire_mark_event (ETW::GC_ROOT_BGC, current_promoted_bytes, last_promoted_bytes); } #endif //BACKGROUND_GC @@ -26223,6 +26395,7 @@ void gc_heap::mark_phase (int condemned_gen_number, BOOL mark_only_p) #ifdef FEATURE_PREMORTEM_FINALIZATION dprintf(3, ("Marking finalization data")); finalize_queue->GcScanRoots(GCHeap::Promote, heap_number, 0); + drain_mark_queue(); fire_mark_event (ETW::GC_ROOT_FQ, current_promoted_bytes, last_promoted_bytes); #endif // FEATURE_PREMORTEM_FINALIZATION @@ -26230,6 +26403,7 @@ void gc_heap::mark_phase (int condemned_gen_number, BOOL mark_only_p) GCScan::GcScanHandles(GCHeap::Promote, condemned_gen_number, max_generation, &sc); + drain_mark_queue(); fire_mark_event (ETW::GC_ROOT_HANDLES, current_promoted_bytes, last_promoted_bytes); if (!full_p) @@ -26341,6 +26515,7 @@ void gc_heap::mark_phase (int condemned_gen_number, BOOL mark_only_p) update_old_card_survived(); #endif //USE_REGIONS + drain_mark_queue(); fire_mark_event (ETW::GC_ROOT_OLDER, current_promoted_bytes, last_promoted_bytes); } } @@ -26349,6 +26524,7 @@ void gc_heap::mark_phase (int condemned_gen_number, BOOL mark_only_p) if (do_mark_steal_p) { 
mark_steal(); + drain_mark_queue(); fire_mark_event (ETW::GC_ROOT_STEAL, current_promoted_bytes, last_promoted_bytes); } #endif //MH_SC_MARK @@ -26362,6 +26538,7 @@ void gc_heap::mark_phase (int condemned_gen_number, BOOL mark_only_p) // handle table has been fully promoted. GCScan::GcDhInitialScan(GCHeap::Promote, condemned_gen_number, max_generation, &sc); scan_dependent_handles(condemned_gen_number, &sc, true); + drain_mark_queue(); fire_mark_event (ETW::GC_ROOT_DH_HANDLES, current_promoted_bytes, last_promoted_bytes); #ifdef MULTIPLE_HEAPS @@ -26444,12 +26621,14 @@ void gc_heap::mark_phase (int condemned_gen_number, BOOL mark_only_p) #ifdef FEATURE_PREMORTEM_FINALIZATION dprintf (3, ("Finalize marking")); finalize_queue->ScanForFinalization (GCHeap::Promote, condemned_gen_number, mark_only_p, __this); + drain_mark_queue(); fire_mark_event (ETW::GC_ROOT_NEW_FQ, current_promoted_bytes, last_promoted_bytes); GCToEEInterface::DiagWalkFReachableObjects(__this); // Scan dependent handles again to promote any secondaries associated with primaries that were promoted // for finalization. As before scan_dependent_handles will also process any mark stack overflow. scan_dependent_handles(condemned_gen_number, &sc, false); + drain_mark_queue(); fire_mark_event (ETW::GC_ROOT_DH_HANDLES, current_promoted_bytes, last_promoted_bytes); #endif //FEATURE_PREMORTEM_FINALIZATION @@ -31927,7 +32106,7 @@ uint8_t* tree_search (uint8_t* tree, uint8_t* old_address) assert (candidate < tree); candidate = tree; tree = tree + cn; - Prefetch (tree - 8); + Prefetch (&((plug_and_pair*)tree)[-1].m_pair.left); continue; } else @@ -31938,7 +32117,7 @@ uint8_t* tree_search (uint8_t* tree, uint8_t* old_address) if ((cn = node_left_child (tree)) != 0) { tree = tree + cn; - Prefetch (tree - 8); + Prefetch (&((plug_and_pair*)tree)[-1].m_pair.left); continue; } else diff --git a/src/coreclr/gc/gcpriv.h b/src/coreclr/gc/gcpriv.h index 72ff309ffb076..e0640a860400a 100644 --- a/src/coreclr/gc/gcpriv.h +++ b/src/coreclr/gc/gcpriv.h @@ -1216,6 +1216,23 @@ enum bookkeeping_element total_bookkeeping_elements }; +class mark_queue_t +{ + static const size_t slot_count = 16; + uint8_t* slot_table[slot_count]; + size_t curr_slot_index; + +public: + mark_queue_t(); + + uint8_t *queue_mark(uint8_t *o); + uint8_t *queue_mark(uint8_t *o, int condemned_gen); + + uint8_t* get_next_marked(); + + ~mark_queue_t(); +}; + //class definition of the internal class class gc_heap { @@ -1242,6 +1259,8 @@ class gc_heap friend void PopulateDacVars(GcDacVars *gcDacVars); + friend class mark_queue_t; + #ifdef MULTIPLE_HEAPS typedef void (gc_heap::* card_fn) (uint8_t**, int); #define call_fn(this_arg,fn) (this_arg->*fn) @@ -2411,6 +2430,9 @@ class gc_heap PER_HEAP void mark_object_simple1 (uint8_t* o, uint8_t* start THREAD_NUMBER_DCL); + PER_HEAP + void drain_mark_queue(); + #ifdef MH_SC_MARK PER_HEAP void mark_steal (); @@ -5131,6 +5153,8 @@ class gc_heap PER_HEAP_ISOLATED size_t bookkeeping_sizes[total_bookkeeping_elements]; #endif //USE_REGIONS + PER_HEAP + mark_queue_t mark_queue; }; // class gc_heap #ifdef FEATURE_PREMORTEM_FINALIZATION
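
Note (not part of the patch): below is a minimal, self-contained sketch of the prefetch-queue idea described in the commit message, for readers who want to experiment with the technique in isolation. All names here (Object, mark_queue, the bool `marked` field) are invented for illustration; in the real GC the mark bit lives in the object's method table pointer, and the authoritative logic is the mark_queue_t code in the diff above.

#include <cstddef>
#include <cstdio>
#include <vector>

struct Object
{
    bool marked = false;   // stand-in for the mark bit kept inside the real object
    int  payload = 0;
};

class mark_queue
{
    static const size_t slot_count = 16;   // same depth as mark_queue_t in the diff
    Object* slots[slot_count] = {};
    size_t  curr = 0;

public:
    // Prefetch o and park it in the ring buffer; return an older, newly
    // marked object whose memory is (hopefully) in cache by now, or nullptr.
    Object* queue_mark(Object* o)
    {
#if defined(__GNUC__) || defined(__clang__)
        __builtin_prefetch(o);
#endif
        Object* old_o = slots[curr];
        slots[curr] = o;
        curr = (curr + 1) % slot_count;
        if (old_o == nullptr || old_o->marked)
            return nullptr;
        old_o->marked = true;   // touching the mark bit is the access the prefetch hid
        return old_o;
    }

    // Drain helper: hand back any parked, not-yet-marked objects. This is
    // the role drain_mark_queue() plays in the patch - it must run before
    // anything else inspects mark bits.
    Object* get_next_marked()
    {
        for (size_t n = 0; n < slot_count; n++)
        {
            Object* o = slots[curr];
            slots[curr] = nullptr;
            curr = (curr + 1) % slot_count;
            if (o != nullptr && !o->marked)
            {
                o->marked = true;
                return o;
            }
        }
        return nullptr;
    }
};

int main()
{
    std::vector<Object> heap(100);
    mark_queue q;
    size_t marked = 0;
    for (Object& o : heap)
        if (q.queue_mark(&o) != nullptr) marked++;
    while (q.get_next_marked() != nullptr) marked++;  // drain the last 16 parked objects
    printf("marked %zu of %zu objects\n", marked, heap.size());
    return 0;
}

The queue depth of 16 is a latency/overhead trade-off: it must be deep enough that the prefetch has taken effect by the time a slot comes around again, yet small enough that draining at the many synchronization points in mark_phase stays cheap.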