@@ -7833,26 +7833,34 @@ static bool GetStoreCoalescingData(Compiler* comp, GenTreeStoreInd* ind, StoreCo
     }
 
     // Data has to be INT_CNS, can be also VEC_CNS in future.
-    if (!ind->Data()->IsCnsIntOrI())
+    if (!ind->Data()->IsCnsIntOrI() && !ind->Data()->IsVectorConst())
     {
         return false;
     }
 
+    auto isNodeInvariant = [](Compiler* comp, GenTree* node, bool allowNull) {
+        if (node == nullptr)
+        {
+            return allowNull;
+        }
+        // We can allow bigger trees here, but it's not clear if it's worth it.
+        return node->OperIs(GT_LCL_VAR) && !comp->lvaVarAddrExposed(node->AsLclVar()->GetLclNum());
+    };
+
     data->targetType = ind->TypeGet();
     data->value      = ind->Data();
     if (ind->Addr()->OperIs(GT_LEA))
     {
         GenTree* base  = ind->Addr()->AsAddrMode()->Base();
         GenTree* index = ind->Addr()->AsAddrMode()->Index();
-        if ((base == nullptr) || !base->OperIs(GT_LCL_VAR) || comp->lvaVarAddrExposed(base->AsLclVar()->GetLclNum()))
+        if (!isNodeInvariant(comp, base, false))
         {
             // Base must be a local. It's possible for it to be nullptr when index is not null,
             // but let's ignore such cases.
             return false;
         }
 
-        if ((index != nullptr) &&
-            (!index->OperIs(GT_LCL_VAR) || comp->lvaVarAddrExposed(index->AsLclVar()->GetLclNum())))
+        if (!isNodeInvariant(comp, index, true))
         {
             // Index should be either nullptr or a local.
             return false;
         }
@@ -7863,7 +7871,7 @@ static bool GetStoreCoalescingData(Compiler* comp, GenTreeStoreInd* ind, StoreCo
         data->scale    = ind->Addr()->AsAddrMode()->GetScale();
         data->offset   = ind->Addr()->AsAddrMode()->Offset();
     }
-    else if (ind->Addr()->OperIs(GT_LCL_VAR) && !comp->lvaVarAddrExposed(ind->Addr()->AsLclVar()->GetLclNum()))
+    else if (isNodeInvariant(comp, ind->Addr(), true))
     {
         // Address is just a local, no offset, scale is 1
         data->baseAddr = ind->Addr();
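The two removed address checks above collapse into a single null-tolerant predicate: a node qualifies when it is a non-address-exposed GT_LCL_VAR, and a null node qualifies only where the caller permits it (the index and the bare address, but not the base). A standalone plain C++ sketch of that shape, using hypothetical Node and isInvariant names rather than the JIT types:

#include <cstdio>

struct Node
{
    bool isLocalVar;  // stands in for OperIs(GT_LCL_VAR)
    bool addrExposed; // stands in for lvaVarAddrExposed(...)
};

static bool isInvariant(const Node* node, bool allowNull)
{
    if (node == nullptr)
    {
        return allowNull; // null is acceptable only where the caller says so
    }
    return node->isLocalVar && !node->addrExposed;
}

int main()
{
    Node local{true, false};
    Node exposed{true, true};
    printf("%d %d %d\n",
           isInvariant(&local, false),    // 1: base may be a plain local
           isInvariant(nullptr, true),    // 1: a missing index is fine
           isInvariant(&exposed, false)); // 0: address-exposed locals can change between the stores
}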
@@ -7919,6 +7927,15 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeStoreInd* ind)
         return;
     }
 
+    // TODO-ARM64-CQ: enable TYP_REF if we find a case where it's beneficial.
+    // The algorithm does support TYP_REF (with null value), but it seems to be not worth
+    // it on ARM64 where it's pretty efficient to do "stp xzr, xzr, [addr]" to clear two
+    // items at once. Although, it may be profitable to do "stp q0, q0, [addr]".
+    if (!varTypeIsIntegral(ind) && !varTypeIsSIMD(ind))
+    {
+        return;
+    }
+
     // We're going to do it in a loop while we see suitable STOREINDs to coalesce.
     // E.g.: we have the following LIR sequence:
     //
@@ -7933,12 +7950,6 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeStoreInd* ind)
     // to get a single store of 8 bytes.
    do
     {
-        // This check is not really needed, just for better throughput.
-        if (!ind->TypeIs(TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT))
-        {
-            return;
-        }
-
         StoreCoalescingData currData;
         StoreCoalescingData prevData;
@@ -8002,6 +8013,57 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeStoreInd* ind)
             return;
         }
 
+        // Now the hardest part: decide whether it's safe to use an unaligned write.
+        //
+        // IND<byte> is always fine (and all IND<X> created here from such)
+        // IND<simd> is not required to be atomic per our Memory Model
+        const bool allowsNonAtomic =
+            ((ind->gtFlags & GTF_IND_ALLOW_NON_ATOMIC) != 0) && ((prevInd->gtFlags & GTF_IND_ALLOW_NON_ATOMIC) != 0);
+
+        if (!allowsNonAtomic && (genTypeSize(ind) > 1) && !varTypeIsSIMD(ind))
+        {
+            // TODO-CQ: if we see that the target is a local memory (non address exposed)
+            // we can use any type (including SIMD) for a new load.
+
+            // Ignore indices for now, they can invalidate our alignment assumptions.
+            // Although, we can take scale into account.
+            if (currData.index != nullptr)
+            {
+                return;
+            }
+
+            // Base address being TYP_REF gives us a hint that data is pointer-aligned.
+            if (!currData.baseAddr->TypeIs(TYP_REF))
+            {
+                return;
+            }
+
+            // Check whether the combined indir is still aligned.
+            bool isCombinedIndirAtomic = (genTypeSize(ind) < TARGET_POINTER_SIZE) &&
+                                         (min(prevData.offset, currData.offset) % (genTypeSize(ind) * 2)) == 0;
+
+            if (genTypeSize(ind) == TARGET_POINTER_SIZE)
+            {
+#ifdef TARGET_ARM64
+                // Per Arm Architecture Reference Manual for A-profile architecture:
+                //
+                // * Writes from SIMD and floating-point registers of a 128-bit value that is 64-bit aligned in memory
+                //   are treated as a pair of single-copy atomic 64-bit writes.
+                //
+                // Thus, we can allow 2xLONG -> SIMD, same for TYP_REF (for value being null)
+                //
+                // And we assume on ARM64 TYP_LONG/TYP_REF are always 64-bit aligned, otherwise
+                // we're already doing a load that has no atomicity guarantees.
+                isCombinedIndirAtomic = true;
+#endif
+            }
+
+            if (!isCombinedIndirAtomic)
+            {
+                return;
+            }
+        }
+
         // Since we're merging two stores of the same type, the new type is twice wider.
         var_types oldType = ind->TypeGet();
         var_types newType;
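The alignment reasoning above reduces to simple arithmetic: with a pointer-aligned base and no index, the combined store of 2 * size bytes stays naturally aligned only when the lower of the two offsets is a multiple of 2 * size. A plain C++ sketch of that check with made-up offsets:

#include <algorithm>
#include <cstdio>

// Returns true when two adjacent stores of 'size' bytes at offsets off1/off2 from a
// pointer-aligned base can be merged into one naturally aligned store of 2 * size bytes.
static bool combinedStoreIsAligned(int size, int off1, int off2)
{
    return (std::min(off1, off2) % (size * 2)) == 0;
}

int main()
{
    printf("%d\n", combinedStoreIsAligned(4, 8, 12)); // 1: INT stores at +8/+12 -> 8-byte aligned LONG store at +8
    printf("%d\n", combinedStoreIsAligned(4, 4, 8));  // 0: INT stores at +4/+8  -> LONG store at +4, not 8-byte aligned
}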
@@ -8014,32 +8076,80 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeStoreInd* ind)
 
             case TYP_SHORT:
             case TYP_USHORT:
-                newType = TYP_INT; // TYP_UINT is not legal in IR
+                newType = TYP_INT;
                 break;
 
 #ifdef TARGET_64BIT
             case TYP_INT:
                 newType = TYP_LONG;
                 break;
+
+#if defined(FEATURE_HW_INTRINSICS)
+            case TYP_LONG:
+            case TYP_REF:
+                if (comp->IsBaselineSimdIsaSupported())
+                {
+                    // TLDR: we should be here only if one of the conditions is true:
+                    // 1) Both GT_INDs have GTF_IND_ALLOW_NON_ATOMIC flag
+                    // 2) ARM64: Data is at least 8-byte aligned
+                    // 3) AMD64: Data is at least 16-byte aligned on AMD/Intel with AVX+
+                    //
+                    newType = TYP_SIMD16;
+                    if ((oldType == TYP_REF) &&
+                        (!currData.value->IsIntegralConst(0) || !prevData.value->IsIntegralConst(0)))
+                    {
+                        // For TYP_REF we only support null values. In theory, we can also support frozen handles, e.g.:
+                        //
+                        //   arr[1] = "hello";
+                        //   arr[0] = "world";
+                        //
+                        // but we don't want to load managed references into SIMD registers (we can only do so
+                        // when we can issue a nongc region for a block)
+                        return;
+                    }
+                    break;
+                }
+                return;
+
+#if defined(TARGET_AMD64)
+            case TYP_SIMD16:
+                if (comp->getPreferredVectorByteLength() >= 32)
+                {
+                    newType = TYP_SIMD32;
+                    break;
+                }
+                return;
+
+            case TYP_SIMD32:
+                if (comp->getPreferredVectorByteLength() >= 64)
+                {
+                    newType = TYP_SIMD64;
+                    break;
+                }
+                return;
+#endif // TARGET_AMD64
+#endif // FEATURE_HW_INTRINSICS
 #endif // TARGET_64BIT
 
             // TYP_FLOAT and TYP_DOUBLE aren't needed here - they're expected to
             // be converted to TYP_INT/TYP_LONG for constant value.
             //
-            // TODO-CQ:
-            //   2 x LONG/REF  -> SIMD16
-            //   2 x SIMD16    -> SIMD32
-            //   2 x SIMD32    -> SIMD64
-            //
-            // where it's legal (e.g. SIMD is not atomic on x64)
+            // TYP_UINT and TYP_ULONG are not legal for GT_IND.
             //
             default:
                 return;
         }
 
+        // We should not be here for stores requiring write barriers.
+        assert(!comp->codeGen->gcInfo.gcIsWriteBarrierStoreIndNode(ind));
+        assert(!comp->codeGen->gcInfo.gcIsWriteBarrierStoreIndNode(prevInd));
+
         // Delete previous STOREIND entirely
         BlockRange().Remove(std::move(prevIndRange));
 
+        // It's not expected to be contained yet, but just in case...
+        ind->Data()->ClearContained();
+
         // We know it's always LEA for now
         GenTreeAddrMode* addr = ind->Addr()->AsAddrMode();
 
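The switch above is effectively a widening ladder over store sizes, with each rung gated by target and ISA checks. A plain C++ sketch of that ladder expressed in bytes (the type names in the comments are labels only, not the guarded conditions):

#include <cstdio>

// Maps the size of each coalesced store to the size of the next, wider store type.
static int widenedStoreSize(int sizeInBytes)
{
    switch (sizeInBytes)
    {
        case 1:  return 2;  // BYTE/UBYTE   -> SHORT
        case 2:  return 4;  // SHORT/USHORT -> INT
        case 4:  return 8;  // INT          -> LONG (64-bit targets)
        case 8:  return 16; // LONG/REF     -> SIMD16 (when baseline SIMD is usable)
        case 16: return 32; // SIMD16       -> SIMD32 (x64 with 32-byte vectors)
        case 32: return 64; // SIMD32       -> SIMD64 (x64 with 64-byte vectors)
        default: return 0;  // not coalescable
    }
}

int main()
{
    for (int size = 1; size <= 32; size *= 2)
    {
        printf("%2d -> %2d\n", size, widenedStoreSize(size));
    }
}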
@@ -8050,8 +8160,29 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeStoreInd* ind)
         ind->gtType         = newType;
         ind->Data()->gtType = newType;
 
-        // We currently only support these constants for val
-        assert(prevData.value->IsCnsIntOrI() && currData.value->IsCnsIntOrI());
+#if defined(TARGET_AMD64) && defined(FEATURE_HW_INTRINSICS)
+        // Upgrading two SIMD stores to a wider SIMD store.
+        // Only on x64 since ARM64 has no options above SIMD16
+        if (varTypeIsSIMD(oldType))
+        {
+            int8_t* lowerCns = prevData.value->AsVecCon()->gtSimdVal.i8;
+            int8_t* upperCns = currData.value->AsVecCon()->gtSimdVal.i8;
+
+            // if the previous store was at a higher address, swap the constants
+            if (prevData.offset > currData.offset)
+            {
+                std::swap(lowerCns, upperCns);
+            }
+
+            simd_t   newCns   = {};
+            uint32_t oldWidth = genTypeSize(oldType);
+            memcpy(newCns.i8, lowerCns, oldWidth);
+            memcpy(newCns.i8 + oldWidth, upperCns, oldWidth);
+
+            ind->Data()->AsVecCon()->gtSimdVal = newCns;
+            continue;
+        }
+#endif
 
         size_t lowerCns = (size_t)prevData.value->AsIntCon()->IconValue();
         size_t upperCns = (size_t)currData.value->AsIntCon()->IconValue();
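The SIMD branch above concatenates the two vector constants byte-wise, taking the constant of the lower-address store as the low half. A plain C++ sketch of the same memcpy pattern over hypothetical 16-byte buffers and offsets:

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <utility>

int main()
{
    uint8_t prevCns[16], currCns[16], merged[32];
    memset(prevCns, 0xAA, sizeof(prevCns)); // constant of the previously seen store
    memset(currCns, 0xBB, sizeof(currCns)); // constant of the current store

    int prevOffset = 16, currOffset = 0;    // hypothetical offsets of the two stores

    const uint8_t* lower = prevCns;
    const uint8_t* upper = currCns;
    if (prevOffset > currOffset)
    {
        std::swap(lower, upper);            // the previous store sits at the higher address here
    }

    memcpy(merged, lower, 16);
    memcpy(merged + 16, upper, 16);
    printf("%02x %02x\n", merged[0], merged[16]); // bb aa
}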
@@ -8062,6 +8193,24 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeStoreInd* ind)
             std::swap(lowerCns, upperCns);
         }
 
+#if defined(TARGET_64BIT) && defined(FEATURE_HW_INTRINSICS)
+        // We're promoting two TYP_LONG/TYP_REF into TYP_SIMD16
+        // All legality checks were done above.
+        if (varTypeIsSIMD(newType))
+        {
+            // Replace two 64bit constants with a single 128bit constant
+            int8_t val[16];
+            memcpy(val, &lowerCns, 8);
+            memcpy(val + 8, &upperCns, 8);
+            GenTreeVecCon* vecCns = comp->gtNewVconNode(newType, &val);
+
+            BlockRange().InsertAfter(ind->Data(), vecCns);
+            BlockRange().Remove(ind->Data());
+            ind->gtOp2 = vecCns;
+            continue;
+        }
+#endif // TARGET_64BIT && FEATURE_HW_INTRINSICS
+
         // Trim the constants to the size of the type, e.g. for TYP_SHORT and TYP_USHORT
         // the mask will be 0xFFFF, for TYP_INT - 0xFFFFFFFF.
         size_t mask = ~(size_t(0)) >> (sizeof(size_t) - genTypeSize(oldType)) * BITS_IN_BYTE;
@@ -8071,10 +8220,12 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeStoreInd* ind)
         size_t val = (lowerCns | (upperCns << (genTypeSize(oldType) * BITS_IN_BYTE)));
         JITDUMP("Coalesced two stores into a single store with value %lld\n", (int64_t)val);
 
-        // It's not expected to be contained yet, but just in case...
-        ind->Data()->ClearContained();
         ind->Data()->AsIntCon()->gtIconVal = (ssize_t)val;
-        ind->gtFlags |= GTF_IND_UNALIGNED;
+        if (genTypeSize(oldType) == 1)
+        {
+            // A mark for future foldings that this IND doesn't need to be atomic.
+            ind->gtFlags |= GTF_IND_ALLOW_NON_ATOMIC;
+        }
 
     } while (true);
 #endif // TARGET_XARCH || TARGET_ARM64
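For the scalar path, the merged constant is built by masking each value to the old store width and shifting the higher store's constant up, which on a little-endian target reproduces the two original stores. A plain C++ sketch with example USHORT-sized constants:

#include <cstdint>
#include <cstdio>

int main()
{
    const size_t oldSize  = 2; // e.g. two TYP_USHORT stores being merged into one TYP_INT store
    size_t       lowerCns = 0x1234;
    size_t       upperCns = 0xABCD;

    // Trim both constants to the old width: the mask is 0xFFFF for a 2-byte store.
    size_t mask = ~(size_t)0 >> (sizeof(size_t) - oldSize) * 8;
    lowerCns &= mask;
    upperCns &= mask;

    // Shift the higher store's constant past the lower one and combine.
    size_t val = lowerCns | (upperCns << (oldSize * 8));
    printf("0x%zx\n", val); // 0xabcd1234
}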