
Commit ce655e3

Merged stores: Fix alignment-related issues and enable SIMD where possible (#92939)
Co-authored-by: SingleAccretion <[email protected]>
1 parent 28c6415 commit ce655e3

File tree (2 files changed: +175 -23 lines)

  src/coreclr/jit/gentree.h
  src/coreclr/jit/lower.cpp

src/coreclr/jit/gentree.h

Lines changed: 1 addition & 0 deletions
@@ -490,6 +490,7 @@ enum GenTreeFlags : unsigned int
     GTF_IND_INVARIANT        = 0x01000000, // GT_IND -- the target is invariant (a prejit indirection)
     GTF_IND_NONNULL          = 0x00400000, // GT_IND -- the indirection never returns null (zero)
     GTF_IND_INITCLASS        = 0x00200000, // OperIsIndir() -- the indirection requires preceding static cctor
+    GTF_IND_ALLOW_NON_ATOMIC = 0x00100000, // GT_IND -- this memory access does not need to be atomic
 
     GTF_IND_COPYABLE_FLAGS = GTF_IND_VOLATILE | GTF_IND_NONFAULTING | GTF_IND_UNALIGNED | GTF_IND_INITCLASS,
     GTF_IND_FLAGS = GTF_IND_COPYABLE_FLAGS | GTF_IND_NONNULL | GTF_IND_TGT_NOT_HEAP | GTF_IND_TGT_HEAP | GTF_IND_INVARIANT,
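The new flag takes the previously unused 0x00100000 bit in the GT_IND flag space. A minimal standalone sketch (illustrative names only, not the JIT's actual headers) of keeping that bit distinct and then setting/testing it:

    // Standalone sketch: the flag values below are copied from the diff; everything else is illustrative.
    #include <cstdint>

    enum GenTreeFlagsSketch : uint32_t
    {
        GTF_IND_INVARIANT        = 0x01000000,
        GTF_IND_NONNULL          = 0x00400000,
        GTF_IND_INITCLASS        = 0x00200000,
        GTF_IND_ALLOW_NON_ATOMIC = 0x00100000, // new in this commit
    };

    // Each flag must occupy its own bit.
    static_assert((GTF_IND_ALLOW_NON_ATOMIC &
                   (GTF_IND_INVARIANT | GTF_IND_NONNULL | GTF_IND_INITCLASS)) == 0,
                  "GTF_IND_ALLOW_NON_ATOMIC must not collide with the other GT_IND flags");

    int main()
    {
        uint32_t gtFlags = 0;
        gtFlags |= GTF_IND_ALLOW_NON_ATOMIC;                        // set by store coalescing
        return ((gtFlags & GTF_IND_ALLOW_NON_ATOMIC) != 0) ? 0 : 1; // tested later in lowering
    }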

src/coreclr/jit/lower.cpp

Lines changed: 174 additions & 23 deletions
@@ -7833,26 +7833,34 @@ static bool GetStoreCoalescingData(Compiler* comp, GenTreeStoreInd* ind, StoreCo
     }
 
     // Data has to be INT_CNS, can be also VEC_CNS in future.
-    if (!ind->Data()->IsCnsIntOrI())
+    if (!ind->Data()->IsCnsIntOrI() && !ind->Data()->IsVectorConst())
     {
         return false;
     }
 
+    auto isNodeInvariant = [](Compiler* comp, GenTree* node, bool allowNull) {
+        if (node == nullptr)
+        {
+            return allowNull;
+        }
+        // We can allow bigger trees here, but it's not clear if it's worth it.
+        return node->OperIs(GT_LCL_VAR) && !comp->lvaVarAddrExposed(node->AsLclVar()->GetLclNum());
+    };
+
     data->targetType = ind->TypeGet();
     data->value      = ind->Data();
     if (ind->Addr()->OperIs(GT_LEA))
     {
         GenTree* base  = ind->Addr()->AsAddrMode()->Base();
         GenTree* index = ind->Addr()->AsAddrMode()->Index();
-        if ((base == nullptr) || !base->OperIs(GT_LCL_VAR) || comp->lvaVarAddrExposed(base->AsLclVar()->GetLclNum()))
+        if (!isNodeInvariant(comp, base, false))
         {
             // Base must be a local. It's possible for it to be nullptr when index is not null,
             // but let's ignore such cases.
             return false;
         }
 
-        if ((index != nullptr) &&
-            (!index->OperIs(GT_LCL_VAR) || comp->lvaVarAddrExposed(index->AsLclVar()->GetLclNum())))
+        if (!isNodeInvariant(comp, index, true))
         {
             // Index should be either nullptr or a local.
             return false;
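The isNodeInvariant lambda folds the two earlier checks into one helper: the base must be a non-address-exposed local, while the index may additionally be null (allowNull). A standalone analogue with a mock node type (none of the real GenTree/Compiler API is used):

    // Mock-type analogue of the isNodeInvariant refactoring; Node is a stand-in, not a JIT type.
    #include <cstdio>

    struct Node
    {
        bool isLocalVar;
        bool isAddressExposed;
    };

    int main()
    {
        // allowNull captures the only difference between the base check (false) and the index check (true).
        auto isNodeInvariant = [](const Node* node, bool allowNull) {
            if (node == nullptr)
            {
                return allowNull;
            }
            return node->isLocalVar && !node->isAddressExposed;
        };

        Node  base  = {/* isLocalVar */ true, /* isAddressExposed */ false};
        Node* index = nullptr;

        std::printf("base ok: %d, index ok: %d\n",
                    isNodeInvariant(&base, /* allowNull */ false),
                    isNodeInvariant(index, /* allowNull */ true));
        return 0;
    }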
@@ -7863,7 +7871,7 @@ static bool GetStoreCoalescingData(Compiler* comp, GenTreeStoreInd* ind, StoreCo
         data->scale    = ind->Addr()->AsAddrMode()->GetScale();
         data->offset   = ind->Addr()->AsAddrMode()->Offset();
     }
-    else if (ind->Addr()->OperIs(GT_LCL_VAR) && !comp->lvaVarAddrExposed(ind->Addr()->AsLclVar()->GetLclNum()))
+    else if (isNodeInvariant(comp, ind->Addr(), true))
     {
         // Address is just a local, no offset, scale is 1
         data->baseAddr = ind->Addr();
@@ -7919,6 +7927,15 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeStoreInd* ind)
         return;
     }
 
+    // TODO-ARM64-CQ: enable TYP_REF if we find a case where it's beneficial.
+    // The algorithm does support TYP_REF (with null value), but it seems to be not worth
+    // it on ARM64 where it's pretty efficient to do "stp xzr, xzr, [addr]" to clear two
+    // items at once. Although, it may be profitable to do "stp q0, q0, [addr]".
+    if (!varTypeIsIntegral(ind) && !varTypeIsSIMD(ind))
+    {
+        return;
+    }
+
     // We're going to do it in a loop while we see suitable STOREINDs to coalesce.
     // E.g.: we have the following LIR sequence:
     //
@@ -7933,12 +7950,6 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeStoreInd* ind)
     // to get a single store of 8 bytes.
     do
     {
-        // This check is not really needed, just for better throughput.
-        if (!ind->TypeIs(TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT))
-        {
-            return;
-        }
-
         StoreCoalescingData currData;
         StoreCoalescingData prevData;
 
@@ -8002,6 +8013,57 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeStoreInd* ind)
             return;
         }
 
+        // Now the hardest part: decide whether it's safe to use an unaligned write.
+        //
+        // IND<byte> is always fine (and all IND<X> created here from such)
+        // IND<simd> is not required to be atomic per our Memory Model
+        const bool allowsNonAtomic =
+            ((ind->gtFlags & GTF_IND_ALLOW_NON_ATOMIC) != 0) && ((prevInd->gtFlags & GTF_IND_ALLOW_NON_ATOMIC) != 0);
+
+        if (!allowsNonAtomic && (genTypeSize(ind) > 1) && !varTypeIsSIMD(ind))
+        {
+            // TODO-CQ: if we see that the target is a local memory (non address exposed)
+            // we can use any type (including SIMD) for a new load.
+
+            // Ignore indices for now, they can invalidate our alignment assumptions.
+            // Although, we can take scale into account.
+            if (currData.index != nullptr)
+            {
+                return;
+            }
+
+            // Base address being TYP_REF gives us a hint that data is pointer-aligned.
+            if (!currData.baseAddr->TypeIs(TYP_REF))
+            {
+                return;
+            }
+
+            // Check whether the combined indir is still aligned.
+            bool isCombinedIndirAtomic = (genTypeSize(ind) < TARGET_POINTER_SIZE) &&
+                                         (min(prevData.offset, currData.offset) % (genTypeSize(ind) * 2)) == 0;
+
+            if (genTypeSize(ind) == TARGET_POINTER_SIZE)
+            {
+#ifdef TARGET_ARM64
+                // Per Arm Architecture Reference Manual for A-profile architecture:
+                //
+                // * Writes from SIMD and floating-point registers of a 128-bit value that is 64-bit aligned in memory
+                //   are treated as a pair of single-copy atomic 64-bit writes.
+                //
+                // Thus, we can allow 2xLONG -> SIMD, same for TYP_REF (for value being null)
+                //
+                // And we assume on ARM64 TYP_LONG/TYP_REF are always 64-bit aligned, otherwise
+                // we're already doing a load that has no atomicity guarantees.
+                isCombinedIndirAtomic = true;
+#endif
+            }
+
+            if (!isCombinedIndirAtomic)
+            {
+                return;
+            }
+        }
+
         // Since we're merging two stores of the same type, the new type is twice wider.
         var_types oldType = ind->TypeGet();
         var_types newType;
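For the non-SIMD, non-byte path the check above boils down to: the combined store must stay below pointer size, and its start offset must be a multiple of the doubled store size, given a TYP_REF base that is at least pointer-aligned. A standalone worked example (the pointer size and offsets are assumptions for a 64-bit target):

    // Worked example of the alignment check; TARGET_POINTER_SIZE and the offsets are assumptions.
    #include <algorithm>
    #include <cstdio>

    int main()
    {
        const unsigned TARGET_POINTER_SIZE = 8; // 64-bit target
        const unsigned storeSize           = 4; // two TYP_INT stores being merged into one 8-byte store

        // Offsets of the two adjacent stores from a TYP_REF base (assumed pointer-aligned).
        const unsigned pairs[][2] = {{8, 12}, {12, 16}};

        for (const auto& p : pairs)
        {
            bool atomic = (storeSize < TARGET_POINTER_SIZE) &&
                          (std::min(p[0], p[1]) % (storeSize * 2)) == 0;
            // Prints: offsets 8/12 -> yes (starts on an 8-byte boundary); offsets 12/16 -> no.
            std::printf("offsets %u/%u -> combined 8-byte store aligned: %s\n", p[0], p[1], atomic ? "yes" : "no");
        }
        return 0;
    }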
@@ -8014,32 +8076,80 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeStoreInd* ind)
 
             case TYP_SHORT:
             case TYP_USHORT:
-                newType = TYP_INT; // TYP_UINT is not legal in IR
+                newType = TYP_INT;
                 break;
 
 #ifdef TARGET_64BIT
             case TYP_INT:
                 newType = TYP_LONG;
                 break;
+
+#if defined(FEATURE_HW_INTRINSICS)
+            case TYP_LONG:
+            case TYP_REF:
+                if (comp->IsBaselineSimdIsaSupported())
+                {
+                    // TLDR: we should be here only if one of the conditions is true:
+                    //   1) Both GT_INDs have GTF_IND_ALLOW_NON_ATOMIC flag
+                    //   2) ARM64: Data is at least 8-byte aligned
+                    //   3) AMD64: Data is at least 16-byte aligned on AMD/Intel with AVX+
+                    //
+                    newType = TYP_SIMD16;
+                    if ((oldType == TYP_REF) &&
+                        (!currData.value->IsIntegralConst(0) || !prevData.value->IsIntegralConst(0)))
+                    {
+                        // For TYP_REF we only support null values. In theory, we can also support frozen handles, e.g.:
+                        //
+                        //   arr[1] = "hello";
+                        //   arr[0] = "world";
+                        //
+                        // but we don't want to load managed references into SIMD registers (we can only do so
+                        // when we can issue a nongc region for a block)
+                        return;
+                    }
+                    break;
+                }
+                return;
+
+#if defined(TARGET_AMD64)
+            case TYP_SIMD16:
+                if (comp->getPreferredVectorByteLength() >= 32)
+                {
+                    newType = TYP_SIMD32;
+                    break;
+                }
+                return;
+
+            case TYP_SIMD32:
+                if (comp->getPreferredVectorByteLength() >= 64)
+                {
+                    newType = TYP_SIMD64;
+                    break;
+                }
+                return;
+#endif // TARGET_AMD64
+#endif // FEATURE_HW_INTRINSICS
 #endif // TARGET_64BIT
 
             // TYP_FLOAT and TYP_DOUBLE aren't needed here - they're expected to
             // be converted to TYP_INT/TYP_LONG for constant value.
             //
-            // TODO-CQ:
-            //   2 x LONG/REF -> SIMD16
-            //   2 x SIMD16   -> SIMD32
-            //   2 x SIMD32   -> SIMD64
-            //
-            // where it's legal (e.g. SIMD is not atomic on x64)
+            // TYP_UINT and TYP_ULONG are not legal for GT_IND.
             //
             default:
                 return;
         }
 
+        // We should not be here for stores requiring write barriers.
+        assert(!comp->codeGen->gcInfo.gcIsWriteBarrierStoreIndNode(ind));
+        assert(!comp->codeGen->gcInfo.gcIsWriteBarrierStoreIndNode(prevInd));
+
         // Delete previous STOREIND entirely
         BlockRange().Remove(std::move(prevIndRange));
 
+        // It's not expected to be contained yet, but just in case...
+        ind->Data()->ClearContained();
+
         // We know it's always LEA for now
         GenTreeAddrMode* addr = ind->Addr()->AsAddrMode();
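Taken together with the enclosing loop, this switch grows the coalesced store one doubling at a time: 1 -> 2 -> 4 -> 8 bytes as plain integer stores, then 16/32/64 bytes as SIMD stores where the target allows it (SIMD16 needs the baseline SIMD ISA; SIMD32/SIMD64 are x64-only and gated on the preferred vector length). A small sketch of that ladder in terms of widths in bytes (the cap value stands in for the target query and is an assumption):

    // Sketch of the promotion ladder; maxSimdBytes stands in for the target's vector-length query.
    #include <cstdio>

    // Returns the doubled store width, or 0 when no further coalescing is possible.
    unsigned nextWidth(unsigned width, unsigned maxSimdBytes)
    {
        unsigned doubled = width * 2;
        if (doubled <= 8)
        {
            return doubled; // 1 -> 2 -> 4 -> 8: plain integer stores
        }
        return (doubled <= maxSimdBytes) ? doubled : 0; // 16/32/64: SIMD stores where supported
    }

    int main()
    {
        // With a 32-byte cap (e.g. AVX2-class hardware) the ladder stops at a 32-byte store.
        for (unsigned w = 1; w != 0; w = nextWidth(w, 32))
        {
            std::printf("%u-byte store\n", w);
        }
        return 0;
    }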

@@ -8050,8 +8160,29 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeStoreInd* ind)
         ind->gtType         = newType;
         ind->Data()->gtType = newType;
 
-        // We currently only support these constants for val
-        assert(prevData.value->IsCnsIntOrI() && currData.value->IsCnsIntOrI());
+#if defined(TARGET_AMD64) && defined(FEATURE_HW_INTRINSICS)
+        // Upgrading two SIMD stores to a wider SIMD store.
+        // Only on x64 since ARM64 has no options above SIMD16
+        if (varTypeIsSIMD(oldType))
+        {
+            int8_t* lowerCns = prevData.value->AsVecCon()->gtSimdVal.i8;
+            int8_t* upperCns = currData.value->AsVecCon()->gtSimdVal.i8;
+
+            // if the previous store was at a higher address, swap the constants
+            if (prevData.offset > currData.offset)
+            {
+                std::swap(lowerCns, upperCns);
+            }
+
+            simd_t   newCns   = {};
+            uint32_t oldWidth = genTypeSize(oldType);
+            memcpy(newCns.i8, lowerCns, oldWidth);
+            memcpy(newCns.i8 + oldWidth, upperCns, oldWidth);
+
+            ind->Data()->AsVecCon()->gtSimdVal = newCns;
+            continue;
+        }
+#endif
 
         size_t lowerCns = (size_t)prevData.value->AsIntCon()->IconValue();
         size_t upperCns = (size_t)currData.value->AsIntCon()->IconValue();
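The SIMD merge is a plain byte-wise concatenation: whichever constant belonged to the store at the lower offset ends up in the low half of the doubled constant. A standalone sketch with raw byte arrays (simd_t and the JIT's vector-constant nodes are not modeled here):

    // Standalone sketch: merge two 16-byte constants into one 32-byte constant, low offset first.
    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main()
    {
        int8_t prevCns[16];
        int8_t currCns[16];
        std::memset(prevCns, 0xAA, sizeof(prevCns)); // constant of the earlier (prev) store
        std::memset(currCns, 0xBB, sizeof(currCns)); // constant of the store being lowered

        const unsigned prevOffset = 16, currOffset = 0; // prev store happens to be at the higher address

        const int8_t* lowerCns = prevCns;
        const int8_t* upperCns = currCns;
        if (prevOffset > currOffset)
        {
            std::swap(lowerCns, upperCns); // lower address goes into the low half
        }

        int8_t newCns[32] = {};
        std::memcpy(newCns, lowerCns, 16);
        std::memcpy(newCns + 16, upperCns, 16);

        // Prints 0xBB for byte 0 and 0xAA for byte 16.
        std::printf("byte 0: 0x%02X, byte 16: 0x%02X\n",
                    (unsigned)(uint8_t)newCns[0], (unsigned)(uint8_t)newCns[16]);
        return 0;
    }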
@@ -8062,6 +8193,24 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeStoreInd* ind)
             std::swap(lowerCns, upperCns);
         }
 
+#if defined(TARGET_64BIT) && defined(FEATURE_HW_INTRINSICS)
+        // We're promoting two TYP_LONG/TYP_REF into TYP_SIMD16
+        // All legality checks were done above.
+        if (varTypeIsSIMD(newType))
+        {
+            // Replace two 64bit constants with a single 128bit constant
+            int8_t val[16];
+            memcpy(val, &lowerCns, 8);
+            memcpy(val + 8, &upperCns, 8);
+            GenTreeVecCon* vecCns = comp->gtNewVconNode(newType, &val);
+
+            BlockRange().InsertAfter(ind->Data(), vecCns);
+            BlockRange().Remove(ind->Data());
+            ind->gtOp2 = vecCns;
+            continue;
+        }
+#endif // TARGET_64BIT && FEATURE_HW_INTRINSICS
+
         // Trim the constants to the size of the type, e.g. for TYP_SHORT and TYP_USHORT
         // the mask will be 0xFFFF, for TYP_INT - 0xFFFFFFFF.
         size_t mask = ~(size_t(0)) >> (sizeof(size_t) - genTypeSize(oldType)) * BITS_IN_BYTE;
@@ -8071,10 +8220,12 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeStoreInd* ind)
         size_t val = (lowerCns | (upperCns << (genTypeSize(oldType) * BITS_IN_BYTE)));
         JITDUMP("Coalesced two stores into a single store with value %lld\n", (int64_t)val);
 
-        // It's not expected to be contained yet, but just in case...
-        ind->Data()->ClearContained();
         ind->Data()->AsIntCon()->gtIconVal = (ssize_t)val;
-        ind->gtFlags |= GTF_IND_UNALIGNED;
+        if (genTypeSize(oldType) == 1)
+        {
+            // A mark for future foldings that this IND doesn't need to be atomic.
+            ind->gtFlags |= GTF_IND_ALLOW_NON_ATOMIC;
+        }
 
     } while (true);
 #endif // TARGET_XARCH || TARGET_ARM64
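For the integer path, the merged constant is built with the mask/shift arithmetic shown above; on a little-endian target the store at the lower address lands in the low bits. A standalone worked example for two TYP_SHORT stores (the input values are made up; BITS_IN_BYTE is assumed to be 8):

    // Worked example of the constant-merging arithmetic for two 2-byte (TYP_SHORT) stores.
    #include <cstddef>
    #include <cstdio>

    int main()
    {
        const unsigned BITS_IN_BYTE = 8;
        const unsigned oldTypeSize  = 2;      // genTypeSize(TYP_SHORT)
        size_t lowerCns             = 0x1234; // constant stored at the lower address
        size_t upperCns             = 0x5678; // constant stored at the higher address

        // Same mask and packing formulas as in the hunk above; applying the mask is shown for completeness.
        size_t mask = ~(size_t(0)) >> (sizeof(size_t) - oldTypeSize) * BITS_IN_BYTE;
        lowerCns &= mask;
        upperCns &= mask;

        size_t val = (lowerCns | (upperCns << (oldTypeSize * BITS_IN_BYTE)));

        // Prints: mask = 0xFFFF, merged value = 0x56781234
        std::printf("mask = 0x%zX, merged value = 0x%zX\n", mask, val);
        return 0;
    }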
