Handle mis-sized structs in XARCH PUTARG_STK codegen #68611

Merged · 2 commits · Jun 13, 2022
Changes from all commits
73 changes: 33 additions & 40 deletions src/coreclr/jit/codegenxarch.cpp
@@ -3471,36 +3471,34 @@ unsigned CodeGen::genMove1IfNeeded(unsigned size, regNumber intTmpReg, GenTree*
 void CodeGen::genStructPutArgUnroll(GenTreePutArgStk* putArgNode)
 {
     GenTree* src = putArgNode->AsOp()->gtOp1;
-    // We will never call this method for SIMD types, which are stored directly
-    // in genPutStructArgStk().
+    // We will never call this method for SIMD types, which are stored directly in genPutStructArgStk().
     assert(src->isContained() && src->OperIs(GT_OBJ) && src->TypeIs(TYP_STRUCT));
-    assert(!src->AsObj()->GetLayout()->HasGCPtr());
 #ifdef TARGET_X86
     assert(!m_pushStkArg);
 #endif

-    unsigned size = putArgNode->GetStackByteSize();
-#ifdef TARGET_X86
-    assert((XMM_REGSIZE_BYTES <= size) && (size <= CPBLK_UNROLL_LIMIT));
-#else // !TARGET_X86
-    assert(size <= CPBLK_UNROLL_LIMIT);
-#endif // !TARGET_X86
-
     if (src->AsOp()->gtOp1->isUsedFromReg())
     {
         genConsumeReg(src->AsOp()->gtOp1);
     }

+    unsigned loadSize = putArgNode->GetArgLoadSize();
+    assert(!src->AsObj()->GetLayout()->HasGCPtr() && (loadSize <= CPBLK_UNROLL_LIMIT));
+
     unsigned offset = 0;
     regNumber xmmTmpReg = REG_NA;
     regNumber intTmpReg = REG_NA;
     regNumber longTmpReg = REG_NA;

-    if (size >= XMM_REGSIZE_BYTES)
+#ifdef TARGET_X86
+    if (loadSize >= 8)
+#else
+    if (loadSize >= XMM_REGSIZE_BYTES)
+#endif
     {
         xmmTmpReg = putArgNode->GetSingleTempReg(RBM_ALLFLOAT);
     }
-    if ((size % XMM_REGSIZE_BYTES) != 0)
+    if ((loadSize % XMM_REGSIZE_BYTES) != 0)
     {
         intTmpReg = putArgNode->GetSingleTempReg(RBM_ALLINT);
     }
@@ -3512,7 +3510,7 @@ void CodeGen::genStructPutArgUnroll(GenTreePutArgStk* putArgNode)
 #endif

     // Let's use SSE2 to be able to do 16 byte at a time with loads and stores.
-    size_t slots = size / XMM_REGSIZE_BYTES;
+    size_t slots = loadSize / XMM_REGSIZE_BYTES;
     while (slots-- > 0)
     {
         // TODO: In the below code the load and store instructions are for 16 bytes, but the
@@ -3528,13 +3526,13 @@ void CodeGen::genStructPutArgUnroll(GenTreePutArgStk* putArgNode)
     }

     // Fill the remainder (15 bytes or less) if there's one.
-    if ((size % XMM_REGSIZE_BYTES) != 0)
+    if ((loadSize % XMM_REGSIZE_BYTES) != 0)
     {
-        offset += genMove8IfNeeded(size, longTmpReg, src->AsOp()->gtOp1, offset);
-        offset += genMove4IfNeeded(size, intTmpReg, src->AsOp()->gtOp1, offset);
-        offset += genMove2IfNeeded(size, intTmpReg, src->AsOp()->gtOp1, offset);
-        offset += genMove1IfNeeded(size, intTmpReg, src->AsOp()->gtOp1, offset);
-        assert(offset == size);
+        offset += genMove8IfNeeded(loadSize, longTmpReg, src->AsOp()->gtOp1, offset);
+        offset += genMove4IfNeeded(loadSize, intTmpReg, src->AsOp()->gtOp1, offset);
+        offset += genMove2IfNeeded(loadSize, intTmpReg, src->AsOp()->gtOp1, offset);
+        offset += genMove1IfNeeded(loadSize, intTmpReg, src->AsOp()->gtOp1, offset);
+        assert(offset == loadSize);
     }
 }

@@ -3570,8 +3568,9 @@ void CodeGen::genStructPutArgRepMovs(GenTreePutArgStk* putArgNode)
 //    putArgNode - the PutArgStk tree.
 //
 // Notes:
-//    Used only on x86, in two cases:
+//    Used (only) on x86 for:
 //    - Structs 4, 8, or 12 bytes in size (less than XMM_REGSIZE_BYTES, multiple of TARGET_POINTER_SIZE).
+//    - Local structs less than 16 bytes in size (it is ok to load "too much" from our stack frame).
 //    - Structs that contain GC pointers - they are guaranteed to be sized correctly by the VM.
 //
 void CodeGen::genStructPutArgPush(GenTreePutArgStk* putArgNode)
@@ -3605,9 +3604,9 @@ void CodeGen::genStructPutArgPush(GenTreePutArgStk* putArgNode)
     }

     ClassLayout* layout = src->AsObj()->GetLayout();
-    const unsigned byteSize = putArgNode->GetStackByteSize();
-    assert((byteSize % TARGET_POINTER_SIZE == 0) && ((byteSize < XMM_REGSIZE_BYTES) || layout->HasGCPtr()));
-    const unsigned numSlots = byteSize / TARGET_POINTER_SIZE;
+    const unsigned loadSize = putArgNode->GetArgLoadSize();
+    assert(((loadSize < XMM_REGSIZE_BYTES) || layout->HasGCPtr()) && ((loadSize % TARGET_POINTER_SIZE) == 0));
+    const unsigned numSlots = loadSize / TARGET_POINTER_SIZE;

     for (int i = numSlots - 1; i >= 0; --i)
     {
@@ -3654,9 +3653,9 @@ void CodeGen::genStructPutArgPartialRepMovs(GenTreePutArgStk* putArgNode)
 #endif // DEBUG

     assert(layout->HasGCPtr());
-    const unsigned byteSize = putArgNode->GetStackByteSize();
-    assert(byteSize % TARGET_POINTER_SIZE == 0);
-    const unsigned numSlots = byteSize / TARGET_POINTER_SIZE;
+    const unsigned argSize = putArgNode->GetStackByteSize();
+    assert(argSize % TARGET_POINTER_SIZE == 0);
+    const unsigned numSlots = argSize / TARGET_POINTER_SIZE;

     // No need to disable GC the way COPYOBJ does. Here the refs are copied in atomic operations always.
     for (unsigned i = 0; i < numSlots;)
@@ -5525,23 +5524,17 @@ void CodeGen::genCall(GenTreeCall* call)
     GenTree* argNode = arg.GetEarlyNode();
     if (argNode->OperIs(GT_PUTARG_STK) && (arg.GetLateNode() == nullptr))
     {
-        GenTree* source = argNode->AsPutArgStk()->gtGetOp1();
-        unsigned size = argNode->AsPutArgStk()->GetStackByteSize();
-        stackArgBytes += size;
+        GenTree* source = argNode->AsPutArgStk()->gtGetOp1();
+        unsigned argSize = argNode->AsPutArgStk()->GetStackByteSize();
+        stackArgBytes += argSize;

 #ifdef DEBUG
-        assert(size == arg.AbiInfo.ByteSize);
+        assert(argSize == arg.AbiInfo.ByteSize);
 #ifdef FEATURE_PUT_STRUCT_ARG_STK
-        if (!source->OperIs(GT_FIELD_LIST) && (source->TypeGet() == TYP_STRUCT))
+        if (source->TypeIs(TYP_STRUCT) && !source->OperIs(GT_FIELD_LIST))
         {
-            GenTreeObj* obj = source->AsObj();
-            unsigned argBytes = roundUp(obj->GetLayout()->GetSize(), TARGET_POINTER_SIZE);
-#ifdef TARGET_X86
-            // If we have an OBJ, we must have created a copy if the original arg was not a
-            // local and was not a multiple of TARGET_POINTER_SIZE.
-            // Note that on x64/ux this will be handled by unrolling in genStructPutArgUnroll.
-            assert((argBytes == obj->GetLayout()->GetSize()) || obj->Addr()->IsLocalAddrExpr());
-#endif // TARGET_X86
-            assert(arg.AbiInfo.ByteSize == argBytes);
+            unsigned loadSize = source->AsObj()->GetLayout()->GetSize();
+            assert(argSize == roundUp(loadSize, TARGET_POINTER_SIZE));
         }
 #endif // FEATURE_PUT_STRUCT_ARG_STK
 #endif // DEBUG
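Editorial note on the codegenxarch.cpp hunks above: genStructPutArgUnroll now sizes the copy from GetArgLoadSize() instead of the padded stack size, but the copy schedule itself is unchanged: 16-byte XMM chunks first, then a remainder covered by at most one 8-, 4-, 2-, and 1-byte move. A minimal standalone C++ sketch of that schedule (illustrative only; moveSchedule and its output are not JIT code):

    #include <cstdio>
    #include <initializer_list>

    // Illustrative only: print the move sequence implied by the unroll
    // strategy for a given load size (16-byte chunks, then an 8/4/2/1
    // remainder, each remainder size used at most once).
    void moveSchedule(unsigned loadSize)
    {
        unsigned offset = 0;
        while (loadSize - offset >= 16)
        {
            std::printf("16-byte SIMD copy at offset %u\n", offset);
            offset += 16;
        }
        for (unsigned moveSize : {8u, 4u, 2u, 1u})
        {
            if (loadSize - offset >= moveSize)
            {
                std::printf("%u-byte move at offset %u\n", moveSize, offset);
                offset += moveSize;
            }
        }
        // Mirrors the codegen assert: the schedule covers the load exactly,
        // so offset == loadSize holds here.
    }

    int main()
    {
        moveSchedule(23); // 16 + 4 + 2 + 1
        return 0;
    }

For example, a 23-byte load becomes one 16-byte chunk plus 4-, 2-, and 1-byte moves, which is why an odd-sized load needs the byte-capable integer temp reserved in lsraxarch.cpp below.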
43 changes: 41 additions & 2 deletions src/coreclr/jit/gentree.h
@@ -7442,12 +7442,18 @@ struct GenTreePutArgStk : public GenTreeUnOp
     // TODO-Throughput: The following information should be obtained from the child
     // block node.

-    enum class Kind : __int8{
+    enum class Kind : int8_t{
         Invalid, RepInstr, PartialRepInstr, Unroll, Push,
     };
     Kind gtPutArgStkKind;
-#endif

+#ifdef TARGET_XARCH
+private:
+    uint8_t m_argLoadSizeDelta;
+#endif // TARGET_XARCH
+#endif // FEATURE_PUT_STRUCT_ARG_STK
+
+public:
     GenTreePutArgStk(genTreeOps oper,
                      var_types type,
                      GenTree* op1,
@@ -7473,7 +7479,10 @@ struct GenTreePutArgStk : public GenTreeUnOp
 #endif // FEATURE_FASTTAILCALL
 #if defined(FEATURE_PUT_STRUCT_ARG_STK)
         , gtPutArgStkKind(Kind::Invalid)
+#if defined(TARGET_XARCH)
+        , m_argLoadSizeDelta(UINT8_MAX)
 #endif
+#endif // FEATURE_PUT_STRUCT_ARG_STK
     {
     }

@@ -7520,6 +7529,36 @@ struct GenTreePutArgStk : public GenTreeUnOp
         return m_byteSize;
     }

+#ifdef TARGET_XARCH
+    //------------------------------------------------------------------------
+    // SetArgLoadSize: Set the optimal number of bytes to load for this argument.
+    //
+    // On XARCH, it is profitable to use wider loads when our source is a local
+    // variable. To not duplicate the logic between lowering, LSRA and codegen,
+    // we do the legality check once, in lowering, and save the result here, as
+    // a negative delta relative to the size of the argument with padding.
+    //
+    // Arguments:
+    //    loadSize - The optimal number of bytes to load
+    //
+    void SetArgLoadSize(unsigned loadSize)
+    {
+        unsigned argSize = GetStackByteSize();
+        assert(roundUp(loadSize, TARGET_POINTER_SIZE) == argSize);
+
+        m_argLoadSizeDelta = static_cast<uint8_t>(argSize - loadSize);
+    }
+
+    //------------------------------------------------------------------------
+    // GetArgLoadSize: Get the optimal number of bytes to load for this argument.
+    //
+    unsigned GetArgLoadSize() const
+    {
+        assert(m_argLoadSizeDelta != UINT8_MAX);
+        return GetStackByteSize() - m_argLoadSizeDelta;
+    }
+#endif // TARGET_XARCH
+
     // Return true if this is a PutArgStk of a SIMD12 struct.
     // This is needed because such values are re-typed to SIMD16, and the type of PutArgStk is VOID.
     unsigned isSIMD12() const
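The new accessors store the load size indirectly, as a one-byte delta below the padded argument size, so the node grows by only a single byte and an unset state remains representable. A self-contained sketch of the same encoding (ArgLoadSizeDemo is hypothetical; the real node derives argSize from GetStackByteSize()):

    #include <cassert>
    #include <cstdint>

    // Illustrative restatement of the m_argLoadSizeDelta encoding: the load
    // size differs from the padded argument size only by the padding, so one
    // byte holds the delta and UINT8_MAX serves as a "not yet set" sentinel.
    struct ArgLoadSizeDemo
    {
        unsigned argSize;           // padded stack size of the argument
        uint8_t  delta = UINT8_MAX; // UINT8_MAX: lowering has not set the size

        void SetArgLoadSize(unsigned loadSize)
        {
            assert((loadSize <= argSize) && (argSize - loadSize < UINT8_MAX));
            delta = static_cast<uint8_t>(argSize - loadSize);
        }

        unsigned GetArgLoadSize() const
        {
            assert(delta != UINT8_MAX);
            return argSize - delta;
        }
    };

Storing the delta rather than the size keeps the field to one byte while still letting LSRA and codegen recover exactly the value lowering validated.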
20 changes: 7 additions & 13 deletions src/coreclr/jit/lowerxarch.cpp
@@ -545,30 +545,24 @@ void Lowering::LowerPutArgStk(GenTreePutArgStk* putArgStk)
     // The cpyXXXX code is rather complex and this could cause it to be more complex, but
     // it might be the right thing to do.

-    unsigned size = putArgStk->GetStackByteSize();
-    unsigned loadSize = layout->GetSize();
-
-    assert(loadSize <= size);
+    // If possible, widen the load, this results in more compact code.
+    unsigned loadSize = srcIsLocal ? roundUp(layout->GetSize(), TARGET_POINTER_SIZE) : layout->GetSize();
+    putArgStk->SetArgLoadSize(loadSize);

     // TODO-X86-CQ: The helper call either is not supported on x86 or required more work
     // (I don't know which).

     if (!layout->HasGCPtr())
     {
 #ifdef TARGET_X86
         // Codegen for "Kind::Push" will always load bytes in TARGET_POINTER_SIZE
-        // chunks. As such, the correctness of this code depends on the fact that
-        // morph will copy any "mis-sized" (too small) non-local OBJs into a temp,
-        // thus preventing any possible out-of-bounds memory reads.
-        assert(((layout->GetSize() % TARGET_POINTER_SIZE) == 0) || src->OperIsLocalRead() ||
-               (src->OperIsIndir() && src->AsIndir()->Addr()->IsLocalAddrExpr()));
-        if (size < XMM_REGSIZE_BYTES)
+        // chunks. As such, we'll only use this path for correctly-sized sources.
+        if ((loadSize < XMM_REGSIZE_BYTES) && ((loadSize % TARGET_POINTER_SIZE) == 0))
         {
             putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push;
         }
         else
 #endif // TARGET_X86
-            if (size <= CPBLK_UNROLL_LIMIT)
+            if (loadSize <= CPBLK_UNROLL_LIMIT)
         {
             putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Unroll;
         }
@@ -604,7 +598,7 @@ void Lowering::LowerPutArgStk(GenTreePutArgStk* putArgStk)
     // so if possible, widen the load to avoid the sign/zero-extension.
     if (varTypeIsSmall(regType) && srcIsLocal)
     {
-        assert(putArgStk->GetStackByteSize() <= genTypeSize(TYP_INT));
+        assert(genTypeSize(TYP_INT) <= putArgStk->GetStackByteSize());
         regType = TYP_INT;
     }
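The lowering change concentrates the widening policy in one place: only local sources are widened, because over-reading a local merely reads our own stack frame, while over-reading an arbitrary address could cross into an unmapped page. A sketch of the decision, assuming a 4-byte TARGET_POINTER_SIZE as on x86 (ChooseLoadSize is a hypothetical stand-in):

    #include <cassert>

    constexpr unsigned kPointerSize = 4; // TARGET_POINTER_SIZE on x86, for illustration

    // Hypothetical stand-in for the lowering policy: widen the load to the
    // padded argument size only when the source is a local variable.
    unsigned ChooseLoadSize(unsigned structSize, bool srcIsLocal)
    {
        unsigned padded = (structSize + kPointerSize - 1) & ~(kPointerSize - 1); // roundUp
        return srcIsLocal ? padded : structSize;
    }

    int main()
    {
        assert(ChooseLoadSize(11, /* srcIsLocal */ true) == 12);  // wide load from our frame
        assert(ChooseLoadSize(11, /* srcIsLocal */ false) == 11); // exact load, no over-read
        return 0;
    }

This is also why the x86 Kind::Push path above now checks (loadSize % TARGET_POINTER_SIZE) == 0 itself instead of relying on morph to have copied mis-sized sources into a temp.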
20 changes: 15 additions & 5 deletions src/coreclr/jit/lsraxarch.cpp
@@ -1561,21 +1561,31 @@ int LinearScan::BuildPutArgStk(GenTreePutArgStk* putArgStk)
         return BuildOperandUses(src);
     }

-    ssize_t size = putArgStk->GetStackByteSize();
+    unsigned loadSize = putArgStk->GetArgLoadSize();
     switch (putArgStk->gtPutArgStkKind)
     {
         case GenTreePutArgStk::Kind::Unroll:
             // If we have a remainder smaller than XMM_REGSIZE_BYTES, we need an integer temp reg.
-            if ((size % XMM_REGSIZE_BYTES) != 0)
+            if ((loadSize % XMM_REGSIZE_BYTES) != 0)
             {
                 regMaskTP regMask = allRegs(TYP_INT);
+#ifdef TARGET_X86
+                // Storing at byte granularity requires a byteable register.
+                if ((loadSize & 1) != 0)
+                {
+                    regMask &= allByteRegs();
+                }
+#endif // TARGET_X86
                 buildInternalIntRegisterDefForNode(putArgStk, regMask);
             }

-            if (size >= XMM_REGSIZE_BYTES)
+#ifdef TARGET_X86
+            if (loadSize >= 8)
+#else
+            if (loadSize >= XMM_REGSIZE_BYTES)
+#endif
             {
-                // If we have a buffer larger than or equal to XMM_REGSIZE_BYTES, reserve
-                // an XMM register to use it for a series of 16-byte loads and stores.
+                // See "genStructPutArgUnroll" -- we will use this XMM register for wide stores.
                 buildInternalFloatRegisterDefForNode(putArgStk, internalFloatRegCandidates());
                 SetContainsAVXFlags();
             }
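The x86-only mask narrowing above exists because only EAX, EBX, ECX, and EDX have byte-addressable forms (AL/BL/CL/DL); when the unrolled copy ends in a 1-byte move, the internal temp must come from that subset. A sketch with made-up mask constants (the JIT's actual RBM_* values differ):

    // Made-up register masks for illustration; the JIT's RBM_* constants differ.
    enum : unsigned
    {
        EAX = 1u << 0, ECX = 1u << 1, EDX = 1u << 2, EBX = 1u << 3,
        ESI = 1u << 4, EDI = 1u << 5,
    };
    constexpr unsigned kAllIntRegs = EAX | ECX | EDX | EBX | ESI | EDI;
    constexpr unsigned kByteRegs   = EAX | ECX | EDX | EBX; // have AL/CL/DL/BL forms

    // An odd load size ends in a 1-byte store, so the temp-register candidates
    // must be restricted to the byteable set, mirroring BuildPutArgStk above.
    constexpr unsigned TempRegCandidates(unsigned loadSize)
    {
        return ((loadSize & 1) != 0) ? (kAllIntRegs & kByteRegs) : kAllIntRegs;
    }

    static_assert(TempRegCandidates(7) == kByteRegs, "odd size needs a byteable register");
    static_assert(TempRegCandidates(12) == kAllIntRegs, "even size can use any integer register");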
30 changes: 0 additions & 30 deletions src/coreclr/jit/morph.cpp
@@ -3435,36 +3435,6 @@ GenTreeCall* Compiler::fgMorphArgs(GenTreeCall* call)
             assert(varTypeIsEnregisterable(argObj->TypeGet()) ||
                    (makeOutArgCopy && varTypeIsEnregisterable(structBaseType)));
         }
-
-#if !defined(UNIX_AMD64_ABI) && !defined(TARGET_ARMARCH) && !defined(TARGET_LOONGARCH64)
-        // TODO-CQ-XARCH: there is no need for a temp copy if we improve our code generation in
-        // `genPutStructArgStk` for xarch like we did it for Arm/Arm64.
-
-        // We still have a struct unless we converted the GT_OBJ into a GT_IND above...
-        if (isHfaArg && passUsingFloatRegs)
-        {
-        }
-        else if (structBaseType == TYP_STRUCT)
-        {
-            // If the valuetype size is not a multiple of TARGET_POINTER_SIZE,
-            // we must copyblk to a temp before doing the obj to avoid
-            // the obj reading memory past the end of the valuetype
-            if (roundupSize > originalSize)
-            {
-                makeOutArgCopy = true;
-
-                // There are a few special cases where we can omit using a CopyBlk
-                // where we normally would need to use one.
-
-                if (argObj->OperIs(GT_OBJ) &&
-                    argObj->AsObj()->gtGetOp1()->IsLocalAddrExpr() != nullptr) // Is the source a LclVar?
-                {
-                    makeOutArgCopy = false;
-                }
-            }
-        }
-
-#endif // !UNIX_AMD64_ABI
     }
 }

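The morph.cpp deletion is the payoff of the whole change: the copy-to-temp that previously papered over mis-sized structs on XARCH is no longer needed, because codegen now loads exactly GetArgLoadSize() bytes from non-local sources. A sketch of the hazard the old copy guarded against (SixBytes and PassStruct are hypothetical):

    #include <cstring>

    // A 6-byte struct occupies two 4-byte stack slots on x86 (8 bytes with
    // padding).
    struct SixBytes
    {
        char b[6];
    };

    void PassStruct(const SixBytes* src, char* argSlot /* 8 bytes, padded */)
    {
        // Exact-sized load: always safe, even if *src is the last object on a
        // mapped page.
        std::memcpy(argSlot, src, sizeof(SixBytes));

        // Padded 8-byte load -- what slot-granular codegen used to imply. This
        // is safe only when src points into our own stack frame, which is
        // exactly the srcIsLocal condition lowering now checks:
        // std::memcpy(argSlot, src, 8);
    }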