Skip to content
4 changes: 1 addition & 3 deletions src/coreclr/jit/fgbasic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -888,12 +888,10 @@ void Compiler::fgFindJumpTargets(const BYTE* codeAddr, IL_OFFSET codeSize, Fixed
const bool isForceInline = (info.compFlags & CORINFO_FLG_FORCEINLINE) != 0;
const bool makeInlineObservations = (compInlineResult != nullptr);
const bool isInlining = compIsForInlining();
const bool isPreJit = opts.jitFlags->IsSet(JitFlags::JIT_FLAG_PREJIT);
const bool isTier1 = opts.jitFlags->IsSet(JitFlags::JIT_FLAG_TIER1);
unsigned retBlocks = 0;
int prefixFlags = 0;
bool preciseScan = makeInlineObservations && compInlineResult->GetPolicy()->RequiresPreciseScan();
const bool resolveTokens = preciseScan && (isPreJit || isTier1);
const bool resolveTokens = preciseScan;

// Track offsets where IL instructions begin in DEBUG builds. Used to
// validate debug info generated by the JIT.
Expand Down
60 changes: 43 additions & 17 deletions src/coreclr/jit/gentree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2620,29 +2620,55 @@ unsigned Compiler::gtSetMultiOpOrder(GenTreeMultiOp* multiOp)
unsigned level = 0;
unsigned lvl2 = 0;

#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_XARCH)
if (multiOp->OperIs(GT_HWINTRINSIC) && (multiOp->GetOperandCount() == 1) &&
multiOp->AsHWIntrinsic()->OperIsMemoryLoadOrStore())
#if defined(FEATURE_HW_INTRINSICS)
if (multiOp->OperIs(GT_HWINTRINSIC))
{
costEx = IND_COST_EX;
costSz = 2;
GenTreeHWIntrinsic* hwTree = multiOp->AsHWIntrinsic();
#if defined(TARGET_XARCH)
if ((hwTree->GetOperandCount() == 1) && hwTree->OperIsMemoryLoadOrStore())
{
costEx = IND_COST_EX;
costSz = 2;

GenTree* addr = multiOp->Op(1)->gtEffectiveVal();
level = gtSetEvalOrder(addr);
GenTree* addr = hwTree->Op(1)->gtEffectiveVal();
level = gtSetEvalOrder(addr);

// See if we can form a complex addressing mode.
if (addr->OperIs(GT_ADD) && gtMarkAddrMode(addr, &costEx, &costSz, multiOp->TypeGet()))
{
// Nothing to do, costs have been set.
// See if we can form a complex addressing mode.
if (addr->OperIs(GT_ADD) && gtMarkAddrMode(addr, &costEx, &costSz, hwTree->TypeGet()))
{
// Nothing to do, costs have been set.
}
else
{
costEx += addr->GetCostEx();
costSz += addr->GetCostSz();
}

hwTree->SetCosts(costEx, costSz);
return level;
}
else
#endif
switch (hwTree->gtHWIntrinsicId)
{
costEx += addr->GetCostEx();
costSz += addr->GetCostSz();
#if defined(TARGET_XARCH)
case NI_Vector128_Create:
case NI_Vector256_Create:
#elif defined(TARGET_ARM64)
case NI_Vector64_Create:
case NI_Vector128_Create:
#endif
{
if (hwTree->gtGetOp1()->OperIsConst() && (hwTree->gtGetOp2() == nullptr))
{
// Vector.Create(cns) is cheap but not that cheap to be (1,1)
costEx = 2;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be IND_COST_EX

Copy link
Member

@tannergooding tannergooding Dec 1, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, it probably needs to be a "bit" more complex.

If all operands are constant and the value doesn't represent all bits zero or all bits set, then it's IND_COST_EX.

If part of the value isn't constant then the cost increases as the number of operands increases. We don't currently, but could eventually, handle "partial constants".

If the value represents all bits zero or all bits set, then it's cheaper: it's just an xor or the relevant cmp SIMD instruction, which is special-cased by hardware.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@tannergooding yes, I had the "all zeros/ones" cases in mind, but the problem is that they complicate the code a lot (especially the AllBitsSet case for different types) for a very rare case, since the get_Zero/get_AllBitsSet intrinsics are usually used instead. I think it won't hurt if we do CSE more often for these cases; alternatively, we'd be better off moving the logic that recognizes get_Zero/get_AllBitsSet earlier, into morph/importer, so that it works as expected and the IR is simplified earlier.

costSz = 2;
}
break;
}
default:
break;
}

multiOp->SetCosts(costEx, costSz);
return level;
}
#endif // defined(FEATURE_SIMD) || defined(FEATURE_HW_INTRINSICS)

Expand Down