-
Notifications
You must be signed in to change notification settings - Fork 5.2k
Instrument stackalloc for PGO #119041
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Instrument stackalloc for PGO #119041
Changes from all commits
671dae2
583bf0d
5439d1f
24ed612
253bcd7
d7bd244
e68b573
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1943,6 +1943,10 @@ class ValueHistogramProbeVisitor final : public GenTreeVisitor<ValueHistogramPro | |
| m_functor(m_compiler, node); | ||
| } | ||
| } | ||
| if (node->OperIs(GT_LCLHEAP)) | ||
| { | ||
| m_functor(m_compiler, node); | ||
| } | ||
| return Compiler::WALK_CONTINUE; | ||
| } | ||
| }; | ||
|
|
@@ -2026,14 +2030,27 @@ class BuildValueHistogramProbeSchemaGen | |
| { | ||
| } | ||
|
|
||
| void operator()(Compiler* compiler, GenTree* call) | ||
| void operator()(Compiler* compiler, GenTree* tree) | ||
| { | ||
| ICorJitInfo::PgoInstrumentationSchema schemaElem = {}; | ||
| schemaElem.Count = 1; | ||
| schemaElem.InstrumentationKind = compiler->opts.compCollect64BitCounts | ||
| ? ICorJitInfo::PgoInstrumentationKind::ValueHistogramLongCount | ||
| : ICorJitInfo::PgoInstrumentationKind::ValueHistogramIntCount; | ||
| schemaElem.ILOffset = (int32_t)call->AsCall()->gtHandleHistogramProfileCandidateInfo->ilOffset; | ||
|
|
||
| if (tree->OperIs(GT_LCLHEAP)) | ||
| { | ||
| schemaElem.ILOffset = static_cast<int32_t>(tree->AsOpWithILOffset()->GetILOffset()); | ||
| } | ||
| else if (tree->OperIs(GT_CALL)) | ||
| { | ||
| schemaElem.ILOffset = static_cast<int32_t>(tree->AsCall()->gtHandleHistogramProfileCandidateInfo->ilOffset); | ||
| } | ||
| else | ||
| { | ||
| unreached(); | ||
| } | ||
|
|
||
| m_schema.push_back(schemaElem); | ||
| m_schemaCount++; | ||
|
|
||
|
|
@@ -2267,12 +2284,24 @@ class ValueHistogramProbeInserter | |
| return; | ||
| } | ||
|
|
||
| assert(node->AsCall()->IsSpecialIntrinsic(compiler, NI_System_SpanHelpers_Memmove) || | ||
| node->AsCall()->IsSpecialIntrinsic(compiler, NI_System_SpanHelpers_SequenceEqual)); | ||
| int32_t ilOffset; | ||
| if (node->OperIs(GT_LCLHEAP)) | ||
| { | ||
| ilOffset = node->AsOpWithILOffset()->GetILOffset(); | ||
| } | ||
| else if (node->OperIs(GT_CALL)) | ||
| { | ||
| assert(node->AsCall()->IsSpecialIntrinsic(compiler, NI_System_SpanHelpers_Memmove) || | ||
| node->AsCall()->IsSpecialIntrinsic(compiler, NI_System_SpanHelpers_SequenceEqual)); | ||
| ilOffset = static_cast<int32_t>(node->AsCall()->gtHandleHistogramProfileCandidateInfo->ilOffset); | ||
| } | ||
| else | ||
| { | ||
| unreached(); | ||
| } | ||
|
|
||
| const ICorJitInfo::PgoInstrumentationSchema& countEntry = m_schema[*m_currentSchemaIndex]; | ||
| if (countEntry.ILOffset != | ||
| static_cast<int32_t>(node->AsCall()->gtHandleHistogramProfileCandidateInfo->ilOffset)) | ||
| if (countEntry.ILOffset != ilOffset) | ||
| { | ||
| return; | ||
| } | ||
|
|
@@ -2292,9 +2321,22 @@ class ValueHistogramProbeInserter | |
|
|
||
| *m_currentSchemaIndex += 2; | ||
|
|
||
| GenTree** lenArgRef = &node->AsCall()->gtArgs.GetUserArgByIndex(2)->EarlyNodeRef(); | ||
| GenTree** lenArgRef; | ||
| if (node->OperIs(GT_LCLHEAP)) | ||
| { | ||
| lenArgRef = &node->AsOp()->gtOp1; | ||
| } | ||
| else if (node->OperIs(GT_CALL)) | ||
| { | ||
| lenArgRef = &node->AsCall()->gtArgs.GetUserArgByIndex(2)->EarlyNodeRef(); | ||
| } | ||
| else | ||
| { | ||
| unreached(); | ||
| } | ||
|
|
||
| // We have Memmove(dst, src, len) and we want to insert a call to CORINFO_HELP_VALUEPROFILE for the len: | ||
| // We have Memmove(dst, src, len) or LCLHEAP(len) and we want to insert a call to CORINFO_HELP_VALUEPROFILE for | ||
| // the len: | ||
| // | ||
| // \--* COMMA long | ||
| // +--* CALL help void CORINFO_HELP_VALUEPROFILE | ||
|
|
@@ -2305,13 +2347,18 @@ class ValueHistogramProbeInserter | |
| // | \--* CNS_INT long <hist> | ||
| // \--* LCL_VAR long tmp | ||
| // | ||
|
|
||
| const unsigned lenTmpNum = compiler->lvaGrabTemp(true DEBUGARG("length histogram profile tmp")); | ||
| GenTree* storeLenToTemp = compiler->gtNewTempStore(lenTmpNum, *lenArgRef); | ||
| GenTree* lengthLocal = compiler->gtNewLclvNode(lenTmpNum, genActualType(*lenArgRef)); | ||
| GenTreeOp* lengthNode = compiler->gtNewOperNode(GT_COMMA, lengthLocal->TypeGet(), storeLenToTemp, lengthLocal); | ||
| GenTree* histNode = compiler->gtNewIconNode(reinterpret_cast<ssize_t>(hist), TYP_I_IMPL); | ||
| unsigned helper = is32 ? CORINFO_HELP_VALUEPROFILE32 : CORINFO_HELP_VALUEPROFILE64; | ||
|
|
||
| if (!lengthNode->TypeIs(TYP_I_IMPL)) | ||
| { | ||
| lengthNode = compiler->gtNewCastNode(TYP_I_IMPL, lengthNode, /* isUnsigned */ false, TYP_I_IMPL); | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Previously, all memset/memcpy primitives used There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The ecma spec is weird here: at any rate, seems like representing it as TYP_I_IMPL would be ok |
||
| } | ||
|
|
||
| GenTreeCall* helperCallNode = compiler->gtNewHelperCallNode(helper, TYP_VOID, lengthNode, histNode); | ||
|
|
||
| *lenArgRef = compiler->gtNewOperNode(GT_COMMA, lengthLocal->TypeGet(), helperCallNode, | ||
|
|
@@ -2456,7 +2503,7 @@ void ValueInstrumentor::Prepare(bool isPreImport) | |
| // | ||
| for (BasicBlock* const block : m_comp->Blocks()) | ||
| { | ||
| block->bbCountSchemaIndex = -1; | ||
| block->bbValueSchemaIndex = -1; | ||
| } | ||
| #endif | ||
| } | ||
|
|
@@ -2468,7 +2515,7 @@ void ValueInstrumentor::BuildSchemaElements(BasicBlock* block, Schema& schema) | |
| return; | ||
| } | ||
|
|
||
| block->bbHistogramSchemaIndex = (int)schema.size(); | ||
| block->bbValueSchemaIndex = (int)schema.size(); | ||
|
|
||
| BuildValueHistogramProbeSchemaGen schemaGen(schema, m_schemaCount); | ||
| ValueHistogramProbeVisitor<BuildValueHistogramProbeSchemaGen> visitor(m_comp, schemaGen); | ||
|
|
@@ -2485,10 +2532,10 @@ void ValueInstrumentor::Instrument(BasicBlock* block, Schema& schema, uint8_t* p | |
| return; | ||
| } | ||
|
|
||
| int histogramSchemaIndex = block->bbHistogramSchemaIndex; | ||
| assert((histogramSchemaIndex >= 0) && (histogramSchemaIndex < (int)schema.size())); | ||
| int valueSchemaIndex = block->bbValueSchemaIndex; | ||
| assert((valueSchemaIndex >= 0) && (valueSchemaIndex < (int)schema.size())); | ||
|
|
||
| ValueHistogramProbeInserter insertProbes(schema, profileMemory, &histogramSchemaIndex, m_instrCount); | ||
| ValueHistogramProbeInserter insertProbes(schema, profileMemory, &valueSchemaIndex, m_instrCount); | ||
| ValueHistogramProbeVisitor<ValueHistogramProbeInserter> visitor(m_comp, insertProbes); | ||
| for (Statement* const stmt : block->Statements()) | ||
| { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -10098,9 +10098,68 @@ void Compiler::impImportBlockCode(BasicBlock* block) | |
| return; | ||
| } | ||
|
|
||
| op1 = gtNewOperNode(GT_LCLHEAP, TYP_I_IMPL, op2); | ||
| // May throw a stack overflow exception. Obviously, we don't want locallocs to be CSE'd. | ||
| op1->gtFlags |= (GTF_EXCEPT | GTF_DONT_CSE); | ||
| op1 = gtNewLclHeapNode(op2, opcodeOffs); | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'll move the entire CEE_LOCALLOC importation to a separate function in a separate PR in order to simplify code-review |
||
|
|
||
| // Do we have a profile for this non-constant localloc? | ||
| // if so, emit "size == profiledValue ? LCLHEAP(profiledValue) : LCLHEAP(size)" tree. | ||
| // | ||
| // We don't need that optimization if SkipInitLocals is set as even non-constant locallocs | ||
| // are cheap in that case, we only want to speed up zeroing. | ||
| // | ||
| if (info.compInitMem && !op2->IsIntegralConst()) | ||
| { | ||
| // Consuming the existing profile (optimizing) | ||
| if (opts.IsOptimizedWithProfile()) | ||
| { | ||
| ssize_t profiledValue = 0; | ||
| uint32_t likelihood = 0; | ||
| if (pickProfiledValue(opcodeOffs, &likelihood, &profiledValue) && (likelihood >= 50) && | ||
| ((uint32_t)profiledValue <= INT_MAX)) | ||
| { | ||
| assert(FitsIn<int>(profiledValue)); | ||
|
|
||
| GenTree* sizeNode = op2; | ||
| GenTree* clonedSizeNode = impCloneExpr(sizeNode, &sizeNode, CHECK_SPILL_ALL, | ||
| nullptr DEBUGARG("spilling sizeNode")); | ||
| GenTree* profiledValueNode = gtNewIconNode(profiledValue, op2->TypeGet()); | ||
| GenTree* fallback = gtNewLclHeapNode(clonedSizeNode, opcodeOffs); | ||
| fallback->gtFlags |= (GTF_EXCEPT | GTF_DONT_CSE); | ||
|
|
||
| GenTree* fastpath; | ||
| if (profiledValue == 0) | ||
| { | ||
| // Just nullptr | ||
| fastpath = gtNewIconNode(0, TYP_I_IMPL); | ||
| } | ||
| else | ||
| { | ||
| // NOTE: we don't want to convert the fastpath stackalloc to a local like we | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If this block is executed frequently enough maybe we should convert to a local? You can compare the block's weight to that of the method entry, if the value is close to 1 then consider the conversion. |
||
| // normally do, because it will be an additional overhead for the fallback path | ||
| // (a redundant local to clear). | ||
| fastpath = gtNewLclHeapNode(profiledValueNode, opcodeOffs); | ||
| } | ||
|
|
||
| // TODO: Specify weights for the branches in the Qmark node. | ||
| GenTreeColon* colon = | ||
| new (this, GT_COLON) GenTreeColon(TYP_I_IMPL, fastpath, fallback); | ||
| GenTreeOp* cond = | ||
| gtNewOperNode(GT_EQ, TYP_INT, sizeNode, gtCloneExpr(profiledValueNode)); | ||
| GenTreeQmark* qmark = gtNewQmarkNode(TYP_I_IMPL, cond, colon); | ||
|
|
||
| // QMARK has to be a root node | ||
| unsigned tmp = lvaGrabTemp(true DEBUGARG("Grabbing temp for Qmark")); | ||
| impStoreToTemp(tmp, qmark, CHECK_SPILL_ALL); | ||
| op1 = gtNewLclvNode(tmp, qmark->TypeGet()); | ||
| } | ||
| } | ||
| // Instrumenting LCLHEAP for value profile | ||
| else if (opts.IsInstrumented() && !compIsForInlining()) | ||
| { | ||
| JITDUMP("\n ... marking [%06u] in " FMT_BB " for value profile instrumentation\n", | ||
| dspTreeID(op1), compCurBB->bbNum); | ||
| compCurBB->SetFlags(BBF_HAS_VALUE_PROFILE); | ||
| } | ||
| } | ||
|
|
||
| // Ensure we have stack security for this method. | ||
| setNeedsGSSecurityCookie(); | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Count and HandleHistogram each have their own index fields, Value probing used to use Handle's one and it could lead to asserts. This field doesn't increase BasicBlock's layout (it had paddings) - still same
272bytes on Release-64bit.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How does this work if there are multiple value probes in a block?