Fix regressions due to pipeline stalls introduced in #59415 (#59497)

Merged: 12 commits, Sep 23, 2021
src/coreclr/jit/codegenxarch.cpp (90 changes: 27 additions & 63 deletions)
@@ -2837,21 +2837,8 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
 #endif
             if (bytesWritten + regSize > size)
             {
-#ifdef TARGET_AMD64
-                if (size - bytesWritten <= XMM_REGSIZE_BYTES)
-                {
-                    regSize = XMM_REGSIZE_BYTES;
-                }
-
-                // Shift dstOffset back to use full SIMD move
-                unsigned shiftBack = regSize - (size - bytesWritten);
-                assert(shiftBack <= regSize);
-                bytesWritten -= shiftBack;
-                dstOffset -= shiftBack;
-#else
                 assert(srcIntReg != REG_NA);
                 break;
-#endif
             }

             if (dstLclNum != BAD_VAR_NUM)
@@ -2866,6 +2853,11 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)

             dstOffset += regSize;
             bytesWritten += regSize;
+
+            if (regSize == YMM_REGSIZE_BYTES && size - bytesWritten < YMM_REGSIZE_BYTES)
+            {
+                regSize = XMM_REGSIZE_BYTES;
+            }
         }

         size -= bytesWritten;
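
For readers outside the JIT, here is a minimal standalone sketch of the init-block strategy after these two hunks (illustrative only: emitStore() and the byte accounting below stand in for the emitIns_* calls and register setup in the diff). Instead of shifting the last store back so that a full-width SIMD move overlaps bytes already written, the loop now stops as soon as a full store would overrun and steps the register size down from YMM to XMM once less than a full YMM remains; any sub-XMM tail is left to the scalar path.

// Standalone sketch (not JIT code) of the init-block unroll after this change.
// emitStore() is a hypothetical stand-in for the emitIns_* calls above.
#include <cstdio>

constexpr unsigned XMM_REGSIZE_BYTES = 16;
constexpr unsigned YMM_REGSIZE_BYTES = 32;

static void emitStore(unsigned offset, unsigned regSize)
{
    std::printf("  store %u bytes at offset %u\n", regSize, offset);
}

static void unrollInitBlock(unsigned size)
{
    std::printf("size = %u\n", size);
    unsigned regSize      = (size >= YMM_REGSIZE_BYTES) ? YMM_REGSIZE_BYTES : XMM_REGSIZE_BYTES;
    unsigned dstOffset    = 0;
    unsigned bytesWritten = 0;

    while (bytesWritten < size)
    {
        // Stop instead of shifting the last store back for an overlapping
        // full-width move; the tail goes to the scalar path.
        if (bytesWritten + regSize > size)
        {
            break;
        }

        emitStore(dstOffset, regSize);
        dstOffset += regSize;
        bytesWritten += regSize;

        // Step down to XMM once less than a full YMM remains.
        if (regSize == YMM_REGSIZE_BYTES && size - bytesWritten < YMM_REGSIZE_BYTES)
        {
            regSize = XMM_REGSIZE_BYTES;
        }
    }
    // size - bytesWritten bytes (fewer than 16) remain for scalar stores.
}

int main()
{
    unrollInitBlock(112); // 32 + 32 + 32 + 16, no scalar tail
    unrollInitBlock(70);  // 32 + 32, then a 6-byte scalar tail
    return 0;
}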
@@ -3083,65 +3075,37 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)
                                ? YMM_REGSIZE_BYTES
                                : XMM_REGSIZE_BYTES;

-        for (; size >= regSize; size -= regSize, srcOffset += regSize, dstOffset += regSize)
-        {
-            if (srcLclNum != BAD_VAR_NUM)
-            {
-                emit->emitIns_R_S(simdMov, EA_ATTR(regSize), tempReg, srcLclNum, srcOffset);
-            }
-            else
-            {
-                emit->emitIns_R_ARX(simdMov, EA_ATTR(regSize), tempReg, srcAddrBaseReg, srcAddrIndexReg,
-                                    srcAddrIndexScale, srcOffset);
-            }
-
-            if (dstLclNum != BAD_VAR_NUM)
-            {
-                emit->emitIns_S_R(simdMov, EA_ATTR(regSize), tempReg, dstLclNum, dstOffset);
-            }
-            else
-            {
-                emit->emitIns_ARX_R(simdMov, EA_ATTR(regSize), tempReg, dstAddrBaseReg, dstAddrIndexReg,
-                                    dstAddrIndexScale, dstOffset);
-            }
-        }
-
-        if (size > 0)
-        {
-            if (size <= XMM_REGSIZE_BYTES)
-            {
-                regSize = XMM_REGSIZE_BYTES;
-            }
-
-            // Copy the remainder by moving the last regSize bytes of the buffer
-            unsigned shiftBack = regSize - size;
-            assert(shiftBack <= regSize);
-
-            srcOffset -= shiftBack;
-            dstOffset -= shiftBack;
-
-            if (srcLclNum != BAD_VAR_NUM)
-            {
-                emit->emitIns_R_S(simdMov, EA_ATTR(regSize), tempReg, srcLclNum, srcOffset);
-            }
-            else
-            {
-                emit->emitIns_R_ARX(simdMov, EA_ATTR(regSize), tempReg, srcAddrBaseReg, srcAddrIndexReg,
-                                    srcAddrIndexScale, srcOffset);
-            }
-
-            if (dstLclNum != BAD_VAR_NUM)
-            {
-                emit->emitIns_S_R(simdMov, EA_ATTR(regSize), tempReg, dstLclNum, dstOffset);
-            }
-            else
-            {
-                emit->emitIns_ARX_R(simdMov, EA_ATTR(regSize), tempReg, dstAddrBaseReg, dstAddrIndexReg,
-                                    dstAddrIndexScale, dstOffset);
-            }
-        }
-
-        return;
+        while (size >= regSize)
+        {
+            for (; size >= regSize; size -= regSize, srcOffset += regSize, dstOffset += regSize)
+            {
+                if (srcLclNum != BAD_VAR_NUM)
+                {
+                    emit->emitIns_R_S(simdMov, EA_ATTR(regSize), tempReg, srcLclNum, srcOffset);
+                }
+                else
+                {
+                    emit->emitIns_R_ARX(simdMov, EA_ATTR(regSize), tempReg, srcAddrBaseReg, srcAddrIndexReg,
+                                        srcAddrIndexScale, srcOffset);
+                }
+
+                if (dstLclNum != BAD_VAR_NUM)
+                {
+                    emit->emitIns_S_R(simdMov, EA_ATTR(regSize), tempReg, dstLclNum, dstOffset);
+                }
+                else
+                {
+                    emit->emitIns_ARX_R(simdMov, EA_ATTR(regSize), tempReg, dstAddrBaseReg, dstAddrIndexReg,
+                                        dstAddrIndexScale, dstOffset);
+                }
+            }
+
+            // Size is too large for YMM moves, try stepping down to XMM size to finish SIMD copies.
+            if (regSize == YMM_REGSIZE_BYTES)
+            {
+                regSize = XMM_REGSIZE_BYTES;
+            }
+        }
     }

     // Fill the remainder with normal loads/stores

Review comment on the new "// Size is too large for YMM moves ..." line:

Member: Probably worth adding a line of comment that further moves are not possible using YMM and so we will try XMM.

Contributor (author): Yes, good point. I'll add a note.
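
The copy path gets the same treatment. A minimal standalone sketch of the new loop shape (illustrative only; emitSimdCopy() stands in for the paired emitIns_R_S/emitIns_S_R or emitIns_R_ARX/emitIns_ARX_R calls, with the source and destination offsets advancing together):

// Standalone sketch (not JIT code) of the copy-block unroll after this change.
#include <cstdio>

constexpr unsigned XMM_REGSIZE_BYTES = 16;
constexpr unsigned YMM_REGSIZE_BYTES = 32;

static void emitSimdCopy(unsigned offset, unsigned regSize)
{
    // e.g. a 32-byte step is "vmovdqu ymm, [src+offset]" + "vmovdqu [dst+offset], ymm"
    std::printf("  copy %u bytes at offset %u\n", regSize, offset);
}

static void unrollCopyBlock(unsigned size)
{
    std::printf("size = %u\n", size);
    unsigned regSize = (size >= YMM_REGSIZE_BYTES) ? YMM_REGSIZE_BYTES : XMM_REGSIZE_BYTES;
    unsigned offset  = 0;

    // The outer loop runs at most twice: once with YMM-sized copies,
    // then (after stepping down) once more with XMM-sized copies.
    while (size >= regSize)
    {
        for (; size >= regSize; size -= regSize, offset += regSize)
        {
            emitSimdCopy(offset, regSize);
        }

        // Size is now too small for YMM moves; finish the SIMD part with XMM.
        if (regSize == YMM_REGSIZE_BYTES)
        {
            regSize = XMM_REGSIZE_BYTES;
        }
    }
    // Anything below 16 bytes falls through to the scalar load/store path.
}

int main()
{
    unrollCopyBlock(48); // one 32-byte copy, then one 16-byte copy
    unrollCopyBlock(50); // 32 + 16, then a 2-byte scalar tail
    return 0;
}

The lowering and LSRA hunks below are the other half of this: they make sure an integer register is available for that scalar tail.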
src/coreclr/jit/lowerxarch.cpp (2 changes: 1 addition & 1 deletion)
@@ -225,7 +225,7 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
             {
                 const bool canUse16BytesSimdMov = !blkNode->IsOnHeapAndContainsReferences();
 #ifdef TARGET_AMD64
-                const bool willUseOnlySimdMov = canUse16BytesSimdMov && (size >= XMM_REGSIZE_BYTES);
+                const bool willUseOnlySimdMov = canUse16BytesSimdMov && (size % XMM_REGSIZE_BYTES == 0);
 #else
                 const bool willUseOnlySimdMov = (size % 8 == 0);
 #endif
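
With the overlapping trailing move gone, a block can be initialized by SIMD stores alone only when its size is an exact multiple of the XMM width; any other size now leaves a scalar tail, so lowering must keep the fill value available in an integer register. A tiny sketch of just that predicate change (constants assumed; not the JIT's data structures):

// Sketch of the willUseOnlySimdMov predicate before and after this change.
#include <cassert>

constexpr unsigned XMM_REGSIZE_BYTES = 16;

static bool willUseOnlySimdMov(unsigned size, bool canUse16BytesSimdMov)
{
    // Before: canUse16BytesSimdMov && (size >= XMM_REGSIZE_BYTES)
    return canUse16BytesSimdMov && (size % XMM_REGSIZE_BYTES == 0);
}

int main()
{
    assert(willUseOnlySimdMov(64, true));   // four full XMM stores, no GPR needed
    assert(!willUseOnlySimdMov(70, true));  // the 6-byte tail now needs an integer register
    assert(!willUseOnlySimdMov(64, false)); // heap block containing GC references
    return 0;
}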
src/coreclr/jit/lsraxarch.cpp (2 changes: 1 addition & 1 deletion)
@@ -1363,7 +1363,7 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
     switch (blkNode->gtBlkOpKind)
     {
         case GenTreeBlk::BlkOpKindUnroll:
-            if (size < XMM_REGSIZE_BYTES)
+            if ((size % XMM_REGSIZE_BYTES) != 0)
             {
                 regMaskTP regMask = allRegs(TYP_INT);
 #ifdef TARGET_X86

A review conversation on the changed line was marked as resolved by kunalspathak.
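
The LSRA side mirrors the lowering change: an internal integer temp is reserved whenever the unrolled size is not a multiple of the XMM width, not only when the whole block is smaller than one XMM register. A before/after sketch of the condition (illustrative helpers, not LSRA APIs):

// Sketch of the condition for reserving an internal integer register.
#include <cstdio>

constexpr unsigned XMM_REGSIZE_BYTES = 16;

static bool needsInternalIntRegOld(unsigned size) { return size < XMM_REGSIZE_BYTES; }
static bool needsInternalIntRegNew(unsigned size) { return (size % XMM_REGSIZE_BYTES) != 0; }

int main()
{
    // 70 = 4 * 16 + 6: the 6-byte tail is now stored with scalar moves, so a
    // GPR temp must be reserved even though the block is larger than 16 bytes.
    std::printf("old=%d new=%d\n", needsInternalIntRegOld(70), needsInternalIntRegNew(70)); // old=0 new=1
    return 0;
}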