Skip to content
9 changes: 3 additions & 6 deletions src/coreclr/jit/codegenarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -708,16 +708,13 @@ void CodeGen::genBuildRegPairsStack(regMaskTP regsMask, ArrayStack<RegPair>* reg

while (regsMask != RBM_NONE)
{
regMaskTP reg1Mask = genFindLowestBit(regsMask);
regNumber reg1 = genRegNumFromMask(reg1Mask);
regsMask &= ~reg1Mask;
regNumber reg1 = genFirstRegNumFromMaskAndToggle(regsMask);
regsCount -= 1;

bool isPairSave = false;
if (regsCount > 0)
{
regMaskTP reg2Mask = genFindLowestBit(regsMask);
regNumber reg2 = genRegNumFromMask(reg2Mask);
regNumber reg2 = genFirstRegNumFromMask(regsMask);
if (reg2 == REG_NEXT(reg1))
{
// The JIT doesn't allow saving pair (R28,FP), even though the
Expand All @@ -733,7 +730,7 @@ void CodeGen::genBuildRegPairsStack(regMaskTP regsMask, ArrayStack<RegPair>* reg
{
isPairSave = true;

regsMask &= ~reg2Mask;
regsMask ^= genRegMask(reg2);
regsCount -= 1;

regStack->Push(RegPair(reg1, reg2));
Expand Down
44 changes: 44 additions & 0 deletions src/coreclr/jit/compiler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -706,6 +706,50 @@ inline regNumber genRegNumFromMask(regMaskTP mask)
return regNum;
}

//------------------------------------------------------------------------------
// genFirstRegNumFromMaskAndToggle : Maps the lowest set bit in the register mask
//     to a register number and clears (toggles off) that bit in `mask`.
//
// Arguments:
//    mask - the register mask; updated in place with the returned register's
//           bit cleared, so callers can use this to iterate a mask.
//
// Return Value:
//    The number of the first (lowest-numbered) register contained in the mask.
//

inline regNumber genFirstRegNumFromMaskAndToggle(regMaskTP& mask)
{
    assert(mask != 0); // a zero mask has no first register

    // The index of the lowest set bit is the register number.
    regNumber firstReg = (regNumber)BitOperations::BitScanForward(mask);

    // Clear that register's bit; XOR is equivalent to &~ here since the bit is known set.
    mask ^= genRegMask(firstReg);

    return firstReg;
}

//------------------------------------------------------------------------------
// genFirstRegNumFromMask : Maps the lowest set bit in the register mask to a
//     register number, without modifying the mask.
//
// Arguments:
//    mask - the register mask
//
// Return Value:
//    The number of the first (lowest-numbered) register contained in the mask.
//

inline regNumber genFirstRegNumFromMask(regMaskTP mask)
{
    assert(mask != 0); // a zero mask has no first register

    // The index of the lowest set bit is the register number.
    return (regNumber)BitOperations::BitScanForward(mask);
}

/*****************************************************************************
*
* Return the size in bytes of the given type.
Expand Down
6 changes: 3 additions & 3 deletions src/coreclr/jit/gentree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25802,9 +25802,9 @@ regNumber GenTree::ExtractTempReg(regMaskTP mask /* = (regMaskTP)-1 */)
{
regMaskTP availableSet = gtRsvdRegs & mask;
assert(genCountBits(availableSet) >= 1);
regMaskTP tempRegMask = genFindLowestBit(availableSet);
gtRsvdRegs &= ~tempRegMask;
return genRegNumFromMask(tempRegMask);
regNumber tempReg = genFirstRegNumFromMask(availableSet);
gtRsvdRegs ^= genRegMask(tempReg);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this actually faster than the previous code? Since it needs to do either a left shift (on amd64) or memory lookup (non-amd64). The same question applies to all the places where you introduced genRegMask.

It seems like you're saying b = genRegMask(...) + a ^= b is faster than a &= ~b?

The genFirstRegNumFromMaskAndToggle cases seem like a clear win, but I'm not as sure about these.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

a ^= (1 << …) is specially recognized and transformed into btc on xarch. There is sometimes special optimizations possible on Arm64 as well, but it’s worst case the same number of instructions and execution cost (but often slightly shorter)

return tempReg;
}

//------------------------------------------------------------------------
Expand Down
156 changes: 94 additions & 62 deletions src/coreclr/jit/lsra.cpp

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions src/coreclr/jit/lsra.h
Original file line number Diff line number Diff line change
Expand Up @@ -1661,8 +1661,8 @@ class LinearScan : public LinearScanInterface
VarToRegMap* outVarToRegMaps;

// A temporary VarToRegMap used during the resolution of critical edges.
VarToRegMap sharedCriticalVarToRegMap;

VarToRegMap sharedCriticalVarToRegMap;
PhasedVar<regMaskTP> actualRegistersMask;
PhasedVar<regMaskTP> availableIntRegs;
PhasedVar<regMaskTP> availableFloatRegs;
PhasedVar<regMaskTP> availableDoubleRegs;
Expand Down
59 changes: 33 additions & 26 deletions src/coreclr/jit/lsrabuild.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -727,33 +727,30 @@ bool LinearScan::isContainableMemoryOp(GenTree* node)
//
void LinearScan::addRefsForPhysRegMask(regMaskTP mask, LsraLocation currentLoc, RefType refType, bool isLastUse)
{
if (refType == RefTypeKill)
{
// The mask identifies a set of registers that will be used during
// codegen. Mark these as modified here, so when we do final frame
// layout, we'll know about all these registers. This is especially
// important if mask contains callee-saved registers, which affect the
// frame size since we need to save/restore them. In the case where we
have a copyBlk with GC pointers, we may need to call the
// CORINFO_HELP_ASSIGN_BYREF helper, which kills callee-saved RSI and
// RDI, if LSRA doesn't assign RSI/RDI, they wouldn't get marked as
// modified until codegen, which is too late.
compiler->codeGen->regSet.rsSetRegsModified(mask DEBUGARG(true));
}

for (regNumber reg = REG_FIRST; mask; reg = REG_NEXT(reg), mask >>= 1)
{
if (mask & 1)
assert(refType == RefTypeKill);

// The mask identifies a set of registers that will be used during
// codegen. Mark these as modified here, so when we do final frame
// layout, we'll know about all these registers. This is especially
// important if mask contains callee-saved registers, which affect the
// frame size since we need to save/restore them. In the case where we
have a copyBlk with GC pointers, we may need to call the
// CORINFO_HELP_ASSIGN_BYREF helper, which kills callee-saved RSI and
// RDI, if LSRA doesn't assign RSI/RDI, they wouldn't get marked as
// modified until codegen, which is too late.
compiler->codeGen->regSet.rsSetRegsModified(mask DEBUGARG(true));

for (regMaskTP candidates = mask; candidates != RBM_NONE;)
{
regNumber reg = genFirstRegNumFromMaskAndToggle(candidates);
// This assumes that these are all "special" RefTypes that
// don't need to be recorded on the tree (hence treeNode is nullptr)
RefPosition* pos = newRefPosition(reg, currentLoc, refType, nullptr,
genRegMask(reg)); // This MUST occupy the physical register (obviously)

if (isLastUse)
{
// This assumes that these are all "special" RefTypes that
// don't need to be recorded on the tree (hence treeNode is nullptr)
RefPosition* pos = newRefPosition(reg, currentLoc, refType, nullptr,
genRegMask(reg)); // This MUST occupy the physical register (obviously)

if (isLastUse)
{
pos->lastUse = true;
}
pos->lastUse = true;
}
}
}
Expand Down Expand Up @@ -2756,6 +2753,16 @@ void LinearScan::buildIntervals()
availableRegCount = REG_INT_COUNT;
}

if (availableRegCount < (sizeof(regMaskTP) * 8))
{
// Mask out the bits at positions availableRegCount through 63
actualRegistersMask = (1ULL << availableRegCount) - 1;
}
else
{
actualRegistersMask = ~RBM_NONE;
}
Comment on lines +2756 to +2764
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not have this always be actualRegisterMask = (1ULL << availableRegCount) - 1?

That way its always exactly the bitmask of actual registers available. No more, no less.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, that's ideally how it should be, but for arm64, availableRegCount == 65 (including the REG_STK, etc.). So (1ULL << 65) returns 0x2 and with - 1, we get actualRegisterMask becomes 1. Debugger shows correct value.

image

I am little confused on why that happens.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

regMaskTP is unsigned __int64 for Arm64 and so we can represent at most 64 registers and therefore 1 << 63 is the highest shift we can safely do, because 1 << 64 is overshifting and therefore undefined behavior.

Some compilers are going to do overshifting as if we had infinite bits and then truncated. This would make it (1 << 65) == 0, then 0 - 1 == -1, which is AllBitsSet. Other compilers are going to instead treat this as C# and x86/x64 do, which is to mask the shift count mod 64, giving (1 << (65 % 64)) == (1 << 1) == 2 and then 2 - 1 == 1, and others still as something completely different.

It looks like this isn't an "issue" today because the register allocator cannot allocate REG_SP itself. It's only manually used by codegenarm64 instead and so it doesn't need to be included in actualRegistersMask. That makes working around this "simpler" since its effectively a "special" register like REG_STK.

Short term we probably want to add an assert to validate the tracked registers don't exceed 64-bits (that is ACTUAL_REG_CNT <= 64) and to special case when it is exactly 64-bits.

Long term, I imagine we want to consider better ways to represent this so we can avoid the problem altogether. Having distinct register files for each category (SIMD/FP vs General/Integer vs Special/Other) is one way. That may also help in other areas where some Integer registers are actually Special registers and cannot be used "generally" (i.e. REG_ZR is effectively reserved and cannot be assigned, just consumed). It would also reduce the cost for various operations in the case where only one register type is being used.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Some compilers are going to do overshifting as if we had infinite bits and then truncated. This would make it (1 << 65) == 0, then 0 - 1 == -1, which is AllBitsSet. Other compilers are going to instead treat this as C# and x86/x64 do, which is to mask the shift count mod 64, giving (1 << (65 % 64)) == (1 << 1) == 2 and then 2 - 1 == 1, and others still as something completely different.

That's exactly what I understand it. What confuses me is the compiler decides different behavior during execution vs. "watch windows" in debugging.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

While I agree with your suggestion, for this PR, I will keep the code that I have currently to handle the arm64 case.


#ifdef DEBUG
// Make sure we don't have any blocks that were not visited
for (BasicBlock* const block : compiler->Blocks())
Expand Down
60 changes: 0 additions & 60 deletions src/coreclr/jit/utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2556,66 +2556,6 @@ double FloatingPointUtils::normalize(double value)
#endif
}

//------------------------------------------------------------------------
// BitOperations::BitScanForward: Search the mask data from least significant bit (LSB) to the most significant bit
// (MSB) for a set bit (1)
//
// Arguments:
//    value - the value to scan; must be nonzero (asserted)
//
// Return Value:
//    The zero-based bit position of the least significant set bit in `value`.
//
uint32_t BitOperations::BitScanForward(uint32_t value)
{
assert(value != 0);

#if defined(_MSC_VER)
// MSVC: use the _BitScanForward compiler intrinsic.
unsigned long result;
::_BitScanForward(&result, value);
return static_cast<uint32_t>(result);
#else
// GCC/Clang: count trailing zeros, which equals the lowest set bit's index.
int32_t result = __builtin_ctz(value);
return static_cast<uint32_t>(result);
#endif
}

//------------------------------------------------------------------------
// BitOperations::BitScanForward: Search the mask data from least significant bit (LSB) to the most significant bit
// (MSB) for a set bit (1)
//
// Arguments:
//    value - the value to scan; must be nonzero (asserted)
//
// Return Value:
//    The zero-based bit position of the least significant set bit in `value`.
//
uint32_t BitOperations::BitScanForward(uint64_t value)
{
assert(value != 0);

#if defined(_MSC_VER)
#if defined(HOST_64BIT)
unsigned long result;
::_BitScanForward64(&result, value);
return static_cast<uint32_t>(result);
#else
// 32-bit host: there is no 64-bit intrinsic, so scan the lower 32 bits
// first and fall back to the upper 32 bits only when the lower half is
// all zero (adding 32 to rebase the index).
uint32_t lower = static_cast<uint32_t>(value);

if (lower == 0)
{
uint32_t upper = static_cast<uint32_t>(value >> 32);
return 32 + BitScanForward(upper);
}

return BitScanForward(lower);
#endif // HOST_64BIT
#else
int32_t result = __builtin_ctzll(value);
return static_cast<uint32_t>(result);
#endif
}

//------------------------------------------------------------------------
// BitOperations::BitScanReverse: Search the mask data from most significant bit (MSB) to least significant bit
// (LSB) for a set bit (1).
Expand Down
60 changes: 58 additions & 2 deletions src/coreclr/jit/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -775,9 +775,65 @@ class FloatingPointUtils
class BitOperations
{
public:
static uint32_t BitScanForward(uint32_t value);
//------------------------------------------------------------------------
// BitOperations::BitScanForward: Search the mask data from least significant bit (LSB) to the most significant bit
// (MSB) for a set bit (1)
//
// Arguments:
//    value - the value to scan; must be nonzero (asserted)
//
// Return Value:
//    The zero-based bit position of the least significant set bit in `value`.
//
FORCEINLINE static uint32_t BitScanForward(uint32_t value)
{
assert(value != 0);

#if defined(_MSC_VER)
// MSVC: use the _BitScanForward compiler intrinsic.
unsigned long result;
::_BitScanForward(&result, value);
return static_cast<uint32_t>(result);
#else
// GCC/Clang: count trailing zeros, which equals the lowest set bit's index.
int32_t result = __builtin_ctz(value);
return static_cast<uint32_t>(result);
#endif
}

//------------------------------------------------------------------------
// BitOperations::BitScanForward: Search the mask data from least significant bit (LSB) to the most significant bit
// (MSB) for a set bit (1)
//
// Arguments:
//    value - the value to scan; must be nonzero (asserted)
//
// Return Value:
//    The zero-based bit position of the least significant set bit in `value`.
//
FORCEINLINE static uint32_t BitScanForward(uint64_t value)
{
    assert(value != 0);

#if defined(_MSC_VER)
#if defined(HOST_64BIT)
    unsigned long result;
    ::_BitScanForward64(&result, value);
    return static_cast<uint32_t>(result);
#else
    // 32-bit host: there is no 64-bit intrinsic, so scan the lower 32 bits
    // first and fall back to the upper 32 bits only when the lower half is
    // all zero (adding 32 to rebase the index).
    uint32_t lower = static_cast<uint32_t>(value);

    if (lower == 0)
    {
        uint32_t upper = static_cast<uint32_t>(value >> 32);
        return 32 + BitScanForward(upper);
    }

    return BitScanForward(lower);
#endif // HOST_64BIT
#else
    int32_t result = __builtin_ctzll(value);
    return static_cast<uint32_t>(result);
#endif
}

static uint32_t BitScanReverse(uint32_t value);

Expand Down