Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/coreclr/jit/block.h
Original file line number Diff line number Diff line change
Expand Up @@ -1419,6 +1419,8 @@ struct BasicBlock : private LIR::Range
int bbHistogramSchemaIndex; // schema index for histogram instrumentation
};

int bbValueSchemaIndex; // schema index for value instrumentation
Copy link
Member Author

@EgorBo EgorBo Sep 3, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Count and HandleHistogram each have their own index field. Value probing used to share the HandleHistogram one, which could lead to asserts. This new field doesn't increase BasicBlock's size (it fits into existing padding) - it is still the same 272 bytes on Release-64bit.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How does this work if there are multiple value probes in a block?


#define MAX_XCPTN_INDEX (USHRT_MAX - 1)

// It would be nice to make bbTryIndex and bbHndIndex private, but there is still code that uses them directly,
Expand Down
4 changes: 4 additions & 0 deletions src/coreclr/jit/compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -2983,6 +2983,8 @@ class Compiler
GenTreeColon* gtNewColonNode(var_types type, GenTree* thenNode, GenTree* elseNode);
GenTreeQmark* gtNewQmarkNode(var_types type, GenTree* cond, GenTreeColon* colon);

GenTreeOpWithILOffset* gtNewLclHeapNode(GenTree* size, IL_OFFSET ilOffset = 0);

GenTree* gtNewLargeOperNode(genTreeOps oper,
var_types type = TYP_I_IMPL,
GenTree* op1 = nullptr,
Expand Down Expand Up @@ -7493,6 +7495,8 @@ class Compiler
optMethodFlags |= OMF_HAS_STACK_ARRAY;
}

bool pickProfiledValue(IL_OFFSET ilOffset, uint32_t* pLikelihood, ssize_t* pValue);

void pickGDV(GenTreeCall* call,
IL_OFFSET ilOffset,
bool isInterface,
Expand Down
75 changes: 61 additions & 14 deletions src/coreclr/jit/fgprofile.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1943,6 +1943,10 @@ class ValueHistogramProbeVisitor final : public GenTreeVisitor<ValueHistogramPro
m_functor(m_compiler, node);
}
}
if (node->OperIs(GT_LCLHEAP))
{
m_functor(m_compiler, node);
}
return Compiler::WALK_CONTINUE;
}
};
Expand Down Expand Up @@ -2026,14 +2030,27 @@ class BuildValueHistogramProbeSchemaGen
{
}

void operator()(Compiler* compiler, GenTree* call)
void operator()(Compiler* compiler, GenTree* tree)
{
ICorJitInfo::PgoInstrumentationSchema schemaElem = {};
schemaElem.Count = 1;
schemaElem.InstrumentationKind = compiler->opts.compCollect64BitCounts
? ICorJitInfo::PgoInstrumentationKind::ValueHistogramLongCount
: ICorJitInfo::PgoInstrumentationKind::ValueHistogramIntCount;
schemaElem.ILOffset = (int32_t)call->AsCall()->gtHandleHistogramProfileCandidateInfo->ilOffset;

if (tree->OperIs(GT_LCLHEAP))
{
schemaElem.ILOffset = static_cast<int32_t>(tree->AsOpWithILOffset()->GetILOffset());
}
else if (tree->OperIs(GT_CALL))
{
schemaElem.ILOffset = static_cast<int32_t>(tree->AsCall()->gtHandleHistogramProfileCandidateInfo->ilOffset);
}
else
{
unreached();
}

m_schema.push_back(schemaElem);
m_schemaCount++;

Expand Down Expand Up @@ -2267,12 +2284,24 @@ class ValueHistogramProbeInserter
return;
}

assert(node->AsCall()->IsSpecialIntrinsic(compiler, NI_System_SpanHelpers_Memmove) ||
node->AsCall()->IsSpecialIntrinsic(compiler, NI_System_SpanHelpers_SequenceEqual));
int32_t ilOffset;
if (node->OperIs(GT_LCLHEAP))
{
ilOffset = node->AsOpWithILOffset()->GetILOffset();
}
else if (node->OperIs(GT_CALL))
{
assert(node->AsCall()->IsSpecialIntrinsic(compiler, NI_System_SpanHelpers_Memmove) ||
node->AsCall()->IsSpecialIntrinsic(compiler, NI_System_SpanHelpers_SequenceEqual));
ilOffset = static_cast<int32_t>(node->AsCall()->gtHandleHistogramProfileCandidateInfo->ilOffset);
}
else
{
unreached();
}

const ICorJitInfo::PgoInstrumentationSchema& countEntry = m_schema[*m_currentSchemaIndex];
if (countEntry.ILOffset !=
static_cast<int32_t>(node->AsCall()->gtHandleHistogramProfileCandidateInfo->ilOffset))
if (countEntry.ILOffset != ilOffset)
{
return;
}
Expand All @@ -2292,9 +2321,22 @@ class ValueHistogramProbeInserter

*m_currentSchemaIndex += 2;

GenTree** lenArgRef = &node->AsCall()->gtArgs.GetUserArgByIndex(2)->EarlyNodeRef();
GenTree** lenArgRef;
if (node->OperIs(GT_LCLHEAP))
{
lenArgRef = &node->AsOp()->gtOp1;
}
else if (node->OperIs(GT_CALL))
{
lenArgRef = &node->AsCall()->gtArgs.GetUserArgByIndex(2)->EarlyNodeRef();
}
else
{
unreached();
}

// We have Memmove(dst, src, len) and we want to insert a call to CORINFO_HELP_VALUEPROFILE for the len:
// We have Memmove(dst, src, len) or LCLHEAP(len) and we want to insert a call to CORINFO_HELP_VALUEPROFILE for
// the len:
//
// \--* COMMA long
// +--* CALL help void CORINFO_HELP_VALUEPROFILE
Expand All @@ -2305,13 +2347,18 @@ class ValueHistogramProbeInserter
// | \--* CNS_INT long <hist>
// \--* LCL_VAR long tmp
//

const unsigned lenTmpNum = compiler->lvaGrabTemp(true DEBUGARG("length histogram profile tmp"));
GenTree* storeLenToTemp = compiler->gtNewTempStore(lenTmpNum, *lenArgRef);
GenTree* lengthLocal = compiler->gtNewLclvNode(lenTmpNum, genActualType(*lenArgRef));
GenTreeOp* lengthNode = compiler->gtNewOperNode(GT_COMMA, lengthLocal->TypeGet(), storeLenToTemp, lengthLocal);
GenTree* histNode = compiler->gtNewIconNode(reinterpret_cast<ssize_t>(hist), TYP_I_IMPL);
unsigned helper = is32 ? CORINFO_HELP_VALUEPROFILE32 : CORINFO_HELP_VALUEPROFILE64;

if (!lengthNode->TypeIs(TYP_I_IMPL))
{
lengthNode = compiler->gtNewCastNode(TYP_I_IMPL, lengthNode, /* isUnsigned */ false, TYP_I_IMPL);
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Previously, all memset/memcpy primitives used TYP_I_IMPL length. GT_LCLHEAP uses TYP_INT

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The ecma spec is weird here:

III.3.47 
...
The localloc instruction allocates size (type native unsigned int or U4) bytes from the local

at any rate, seems like representing it as TYP_I_IMPL would be ok

}

GenTreeCall* helperCallNode = compiler->gtNewHelperCallNode(helper, TYP_VOID, lengthNode, histNode);

*lenArgRef = compiler->gtNewOperNode(GT_COMMA, lengthLocal->TypeGet(), helperCallNode,
Expand Down Expand Up @@ -2456,7 +2503,7 @@ void ValueInstrumentor::Prepare(bool isPreImport)
//
for (BasicBlock* const block : m_comp->Blocks())
{
block->bbCountSchemaIndex = -1;
block->bbValueSchemaIndex = -1;
}
#endif
}
Expand All @@ -2468,7 +2515,7 @@ void ValueInstrumentor::BuildSchemaElements(BasicBlock* block, Schema& schema)
return;
}

block->bbHistogramSchemaIndex = (int)schema.size();
block->bbValueSchemaIndex = (int)schema.size();

BuildValueHistogramProbeSchemaGen schemaGen(schema, m_schemaCount);
ValueHistogramProbeVisitor<BuildValueHistogramProbeSchemaGen> visitor(m_comp, schemaGen);
Expand All @@ -2485,10 +2532,10 @@ void ValueInstrumentor::Instrument(BasicBlock* block, Schema& schema, uint8_t* p
return;
}

int histogramSchemaIndex = block->bbHistogramSchemaIndex;
assert((histogramSchemaIndex >= 0) && (histogramSchemaIndex < (int)schema.size()));
int valueSchemaIndex = block->bbValueSchemaIndex;
assert((valueSchemaIndex >= 0) && (valueSchemaIndex < (int)schema.size()));

ValueHistogramProbeInserter insertProbes(schema, profileMemory, &histogramSchemaIndex, m_instrCount);
ValueHistogramProbeInserter insertProbes(schema, profileMemory, &valueSchemaIndex, m_instrCount);
ValueHistogramProbeVisitor<ValueHistogramProbeInserter> visitor(m_comp, insertProbes);
for (Statement* const stmt : block->Statements())
{
Expand Down
17 changes: 17 additions & 0 deletions src/coreclr/jit/gentree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2825,6 +2825,7 @@ bool GenTree::Compare(GenTree* op1, GenTree* op2, bool swapOK)
break;

// For the ones below no extra argument matters for comparison.
case GT_LCLHEAP:
case GT_BOX:
case GT_RUNTIMELOOKUP:
case GT_ARR_ADDR:
Expand Down Expand Up @@ -3398,6 +3399,7 @@ unsigned Compiler::gtHashValue(GenTree* tree)
break;

// For the ones below no extra argument matters for comparison.
case GT_LCLHEAP:
case GT_BOX:
case GT_ARR_ADDR:
break;
Expand Down Expand Up @@ -3444,6 +3446,7 @@ unsigned Compiler::gtHashValue(GenTree* tree)
break;

// For the ones below no extra argument matters for comparison.
case GT_LCLHEAP:
case GT_QMARK:
case GT_INDEX_ADDR:
break;
Expand Down Expand Up @@ -7637,6 +7640,15 @@ GenTreeQmark* Compiler::gtNewQmarkNode(var_types type, GenTree* cond, GenTreeCol
return result;
}

//------------------------------------------------------------------------
// gtNewLclHeapNode: Create a GT_LCLHEAP node of type TYP_I_IMPL.
//
// Arguments:
//    size     - tree producing the number of bytes to allocate; must not be null
//    ilOffset - IL offset to record on the new node
//
// Return Value:
//    The new node, with GTF_EXCEPT and GTF_DONT_CSE already set.
//
GenTreeOpWithILOffset* Compiler::gtNewLclHeapNode(GenTree* size, IL_OFFSET ilOffset)
{
    assert(size != nullptr);

    GenTreeOpWithILOffset* const lclHeap =
        new (this, GT_LCLHEAP) GenTreeOpWithILOffset(GT_LCLHEAP, TYP_I_IMPL, size, /* op2 */ nullptr, ilOffset);

    // May throw a stack overflow exception.
    lclHeap->gtFlags |= GTF_EXCEPT;
    // We don't want locallocs to be CSE'd.
    lclHeap->gtFlags |= GTF_DONT_CSE;

    return lclHeap;
}

GenTreeIntCon* Compiler::gtNewIconNode(ssize_t value, var_types type)
{
assert(genActualType(type) == type);
Expand Down Expand Up @@ -9706,6 +9718,11 @@ GenTree* Compiler::gtCloneExpr(GenTree* tree)
copy = new (this, oper) GenTreeOp(oper, tree->TypeGet(), tree->gtGetOp1(), tree->gtGetOp2());
break;

case GT_LCLHEAP:
    // Carry the original node's type and IL offset over to the clone. The IL offset is what the
    // value-profile schema entry for this node is keyed on, so resetting it to 0 would attribute
    // the clone to the wrong IL location.
    copy = new (this, GT_LCLHEAP) GenTreeOpWithILOffset(GT_LCLHEAP, tree->TypeGet(), tree->gtGetOp1(), nullptr,
                                                        tree->AsOpWithILOffset()->GetILOffset());
    break;

default:
assert(!GenTree::IsExOp(tree->OperKind()) && tree->OperIsSimple());
// We're in the SimpleOp case, so it's always unary or binary.
Expand Down
35 changes: 35 additions & 0 deletions src/coreclr/jit/gentree.h
Original file line number Diff line number Diff line change
Expand Up @@ -3178,6 +3178,41 @@ struct GenTreeOp : public GenTreeUnOp
#endif
};

// GenTreeOpWithILOffset: a GenTreeOp that additionally stores an IL offset,
// readable and writable through GetILOffset / SetILOffset.
struct GenTreeOpWithILOffset : public GenTreeOp
{
    // Builds the underlying GenTreeOp from (oper, type, op1, op2) and records 'ilOffset'.
    GenTreeOpWithILOffset(genTreeOps oper,
                          var_types  type,
                          GenTree*   op1,
                          GenTree*   op2,
                          IL_OFFSET  ilOffset DEBUGARG(bool largeNode = false))
        : GenTreeOp(oper, type, op1, op2 DEBUGARG(largeNode))
        , gtILOffset(ilOffset)
    {
    }

#if DEBUGGABLE_GENTREE
    // Default constructor, only compiled when DEBUGGABLE_GENTREE is enabled; the IL offset starts at 0.
    GenTreeOpWithILOffset()
        : GenTreeOp()
        , gtILOffset(0)
    {
    }
#endif

    // Returns the IL offset stored on this node.
    IL_OFFSET GetILOffset() const
    {
        return gtILOffset;
    }

    // Replaces the IL offset stored on this node.
    void SetILOffset(IL_OFFSET ilOffset)
    {
        gtILOffset = ilOffset;
    }

private:
    IL_OFFSET gtILOffset;
};

struct GenTreeVal : public GenTree
{
size_t gtVal1;
Expand Down
2 changes: 1 addition & 1 deletion src/coreclr/jit/gtlist.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ GTNODE(KEEPALIVE , GenTree ,0,0,GTK_UNOP|GTK_NOVALUE) // kee
GTNODE(CAST , GenTreeCast ,0,0,GTK_UNOP|GTK_EXOP) // conversion to another type
GTNODE(BITCAST , GenTreeOp ,0,1,GTK_UNOP) // reinterpretation of bits as another type
GTNODE(CKFINITE , GenTreeOp ,0,1,GTK_UNOP|DBK_NOCONTAIN) // Check for NaN
GTNODE(LCLHEAP , GenTreeOp ,0,1,GTK_UNOP|DBK_NOCONTAIN) // alloca()
GTNODE(LCLHEAP , GenTreeOpWithILOffset, 0,1,GTK_UNOP|GTK_EXOP|DBK_NOCONTAIN) // alloca()

GTNODE(BOUNDS_CHECK , GenTreeBoundsChk ,0,1,GTK_BINOP|GTK_EXOP|GTK_NOVALUE) // a bounds check - for arrays/spans/SIMDs/HWINTRINSICs

Expand Down
1 change: 1 addition & 0 deletions src/coreclr/jit/gtstructs.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ GTSTRUCT_1(Colon , GT_COLON)
GTSTRUCT_1(FptrVal , GT_FTN_ADDR)
GTSTRUCT_1(Intrinsic , GT_INTRINSIC)
GTSTRUCT_1(IndexAddr , GT_INDEX_ADDR)
GTSTRUCT_1(OpWithILOffset, GT_LCLHEAP)
#if defined(FEATURE_HW_INTRINSICS)
GTSTRUCT_N(MultiOp , GT_HWINTRINSIC)
#endif // FEATURE_HW_INTRINSICS
Expand Down
65 changes: 62 additions & 3 deletions src/coreclr/jit/importer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10098,9 +10098,68 @@ void Compiler::impImportBlockCode(BasicBlock* block)
return;
}

op1 = gtNewOperNode(GT_LCLHEAP, TYP_I_IMPL, op2);
// May throw a stack overflow exception. Obviously, we don't want locallocs to be CSE'd.
op1->gtFlags |= (GTF_EXCEPT | GTF_DONT_CSE);
op1 = gtNewLclHeapNode(op2, opcodeOffs);
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll move the entire CEE_LOCALLOC importation to a separate function in a separate PR in order to simplify code-review


// Do we have a profile for this non-constant localloc?
// if so, emit "size == profiledValue ? LCLHEAP(profiledValue) : LCLHEAP(size)" tree.
//
// We don't need that optimization if SkipInitLocals is set as even non-constant locallocs
// are cheap in that case, we only want to speed up zeroing.
//
if (info.compInitMem && !op2->IsIntegralConst())
{
    if (opts.IsOptimizedWithProfile())
    {
        // Consuming the existing profile (optimizing): if a single size value dominates, emit
        // "size == profiledValue ? LCLHEAP(profiledValue) : LCLHEAP(size)".
        ssize_t  profiledValue = 0;
        uint32_t likelihood    = 0;

        // Range-check the full-width profiled value. Casting it to uint32_t before the comparison
        // would drop the upper bits on 64-bit targets and let an out-of-range size through as a
        // small constant.
        if (pickProfiledValue(opcodeOffs, &likelihood, &profiledValue) && (likelihood >= 50) &&
            (profiledValue >= 0) && FitsIn<int>(profiledValue))
        {
            GenTree* sizeNode = op2;
            GenTree* clonedSizeNode =
                impCloneExpr(sizeNode, &sizeNode, CHECK_SPILL_ALL, nullptr DEBUGARG("spilling sizeNode"));
            GenTree* profiledValueNode = gtNewIconNode(profiledValue, op2->TypeGet());

            // gtNewLclHeapNode already sets GTF_EXCEPT | GTF_DONT_CSE on the node it returns.
            GenTree* fallback = gtNewLclHeapNode(clonedSizeNode, opcodeOffs);

            GenTree* fastpath;
            if (profiledValue == 0)
            {
                // Just nullptr
                fastpath = gtNewIconNode(0, TYP_I_IMPL);
            }
            else
            {
                // NOTE: we don't want to convert the fastpath stackalloc to a local like we
                // normally do, because it will be an additional overhead for the fallback path
                // (a redundant local to clear).
                // NOTE(review): if this block is executed frequently enough (block weight close to
                // that of the method entry), converting to a local may still be worth considering.
                fastpath = gtNewLclHeapNode(profiledValueNode, opcodeOffs);
            }

            // TODO: Specify weights for the branches in the Qmark node.
            GenTreeColon* colon = new (this, GT_COLON) GenTreeColon(TYP_I_IMPL, fastpath, fallback);
            GenTreeOp*    cond  = gtNewOperNode(GT_EQ, TYP_INT, sizeNode, gtCloneExpr(profiledValueNode));
            GenTreeQmark* qmark = gtNewQmarkNode(TYP_I_IMPL, cond, colon);

            // QMARK has to be a root node
            unsigned tmp = lvaGrabTemp(true DEBUGARG("Grabbing temp for Qmark"));
            impStoreToTemp(tmp, qmark, CHECK_SPILL_ALL);
            op1 = gtNewLclvNode(tmp, qmark->TypeGet());
        }
    }
    else if (opts.IsInstrumented() && !compIsForInlining())
    {
        // Instrumenting LCLHEAP for value profile
        JITDUMP("\n ... marking [%06u] in " FMT_BB " for value profile instrumentation\n",
                dspTreeID(op1), compCurBB->bbNum);
        compCurBB->SetFlags(BBF_HAS_VALUE_PROFILE);
    }
}

// Ensure we have stack security for this method.
setNeedsGSSecurityCookie();
Expand Down
Loading
Loading