diff --git a/src/coreclr/inc/clrconfigvalues.h b/src/coreclr/inc/clrconfigvalues.h index 4aebcc3e201f46..e127f57a719183 100644 --- a/src/coreclr/inc/clrconfigvalues.h +++ b/src/coreclr/inc/clrconfigvalues.h @@ -274,6 +274,8 @@ CONFIG_DWORD_INFO(INTERNAL_GCUseGlobalAllocationContext, W("GCUseGlobalAllocatio /// CONFIG_DWORD_INFO(INTERNAL_JitBreakEmit, W("JitBreakEmit"), (DWORD)-1, "") RETAIL_CONFIG_DWORD_INFO(EXTERNAL_JitDebuggable, W("JitDebuggable"), 0, "If set, suppress JIT optimizations that make debugging code difficult") +CONFIG_DWORD_INFO(INTERNAL_UseSveForVectorT, W("UseSveForVectorT"), 1, "Prefer SVE instructions for VectorT") + #if !defined(DEBUG) && !defined(_DEBUG) #define INTERNAL_JitEnableNoWayAssert_Default 0 #else diff --git a/src/coreclr/inc/corhdr.h b/src/coreclr/inc/corhdr.h index 0bd7755e3b0d5e..da45c9ad591016 100644 --- a/src/coreclr/inc/corhdr.h +++ b/src/coreclr/inc/corhdr.h @@ -1754,6 +1754,8 @@ typedef enum CorInfoHFAElemType : unsigned { CORINFO_HFA_ELEM_DOUBLE, CORINFO_HFA_ELEM_VECTOR64, CORINFO_HFA_ELEM_VECTOR128, + CORINFO_HFA_ELEM_VECTOR256, + CORINFO_HFA_ELEM_VECTOR512, } CorInfoHFAElemType; // diff --git a/src/coreclr/inc/corinfoinstructionset.h b/src/coreclr/inc/corinfoinstructionset.h index 68826ed36392ed..a2e1840ea2e596 100644 --- a/src/coreclr/inc/corinfoinstructionset.h +++ b/src/coreclr/inc/corinfoinstructionset.h @@ -25,24 +25,25 @@ enum CORINFO_InstructionSet InstructionSet_Sha1=7, InstructionSet_Sha256=8, InstructionSet_Atomics=9, - InstructionSet_Vector64=10, - InstructionSet_Vector128=11, - InstructionSet_Dczva=12, - InstructionSet_Rcpc=13, - InstructionSet_VectorT128=14, - InstructionSet_Rcpc2=15, - InstructionSet_Sve=16, - InstructionSet_Sve2=17, - InstructionSet_ArmBase_Arm64=18, - InstructionSet_AdvSimd_Arm64=19, - InstructionSet_Aes_Arm64=20, - InstructionSet_Crc32_Arm64=21, - InstructionSet_Dp_Arm64=22, - InstructionSet_Rdm_Arm64=23, - InstructionSet_Sha1_Arm64=24, - InstructionSet_Sha256_Arm64=25, - InstructionSet_Sve_Arm64=26, - InstructionSet_Sve2_Arm64=27, + InstructionSet_Vector=10, + InstructionSet_Vector64=11, + InstructionSet_Vector128=12, + InstructionSet_Dczva=13, + InstructionSet_Rcpc=14, + InstructionSet_VectorT128=15, + InstructionSet_Rcpc2=16, + InstructionSet_Sve=17, + InstructionSet_Sve2=18, + InstructionSet_ArmBase_Arm64=19, + InstructionSet_AdvSimd_Arm64=20, + InstructionSet_Aes_Arm64=21, + InstructionSet_Crc32_Arm64=22, + InstructionSet_Dp_Arm64=23, + InstructionSet_Rdm_Arm64=24, + InstructionSet_Sha1_Arm64=25, + InstructionSet_Sha256_Arm64=26, + InstructionSet_Sve_Arm64=27, + InstructionSet_Sve2_Arm64=28, #endif // TARGET_ARM64 #ifdef TARGET_RISCV64 InstructionSet_RiscV64Base=1, @@ -379,6 +380,8 @@ inline CORINFO_InstructionSetFlags EnsureInstructionSetFlagsAreValid(CORINFO_Ins resultflags.RemoveInstructionSet(InstructionSet_Sve); if (resultflags.HasInstructionSet(InstructionSet_Sve2) && !resultflags.HasInstructionSet(InstructionSet_Sve)) resultflags.RemoveInstructionSet(InstructionSet_Sve2); + if (resultflags.HasInstructionSet(InstructionSet_Vector) && !resultflags.HasInstructionSet(InstructionSet_Sve)) + resultflags.RemoveInstructionSet(InstructionSet_Vector); #endif // TARGET_ARM64 #ifdef TARGET_RISCV64 if (resultflags.HasInstructionSet(InstructionSet_Zbb) && !resultflags.HasInstructionSet(InstructionSet_RiscV64Base)) @@ -627,6 +630,8 @@ inline const char *InstructionSetToString(CORINFO_InstructionSet instructionSet) return "Sha256_Arm64"; case InstructionSet_Atomics : return "Atomics"; + case InstructionSet_Vector : + 
return "Vector"; case InstructionSet_Vector64 : return "Vector64"; case InstructionSet_Vector128 : diff --git a/src/coreclr/inc/corjit.h b/src/coreclr/inc/corjit.h index d8e6a774784f52..a29f81767804f3 100644 --- a/src/coreclr/inc/corjit.h +++ b/src/coreclr/inc/corjit.h @@ -438,6 +438,8 @@ class ICorJitInfo : public ICorDynamicInfo // virtual uint32_t getExpectedTargetArchitecture() = 0; + virtual uint32_t getTargetVectorLength() = 0; + // Fetches extended flags for a particular compilation instance. Returns // the number of bytes written to the provided buffer. virtual uint32_t getJitFlags( diff --git a/src/coreclr/inc/icorjitinfoimpl_generated.h b/src/coreclr/inc/icorjitinfoimpl_generated.h index ee74e9c984fa98..85d1751b340268 100644 --- a/src/coreclr/inc/icorjitinfoimpl_generated.h +++ b/src/coreclr/inc/icorjitinfoimpl_generated.h @@ -748,6 +748,8 @@ uint16_t getRelocTypeHint( uint32_t getExpectedTargetArchitecture() override; +uint32_t getTargetVectorLength() override; + uint32_t getJitFlags( CORJIT_FLAGS* flags, uint32_t sizeInBytes) override; diff --git a/src/coreclr/inc/jiteeversionguid.h b/src/coreclr/inc/jiteeversionguid.h index 38a900d0178d00..12468e087b1f5a 100644 --- a/src/coreclr/inc/jiteeversionguid.h +++ b/src/coreclr/inc/jiteeversionguid.h @@ -37,11 +37,11 @@ #include -constexpr GUID JITEEVersionIdentifier = { /* 2004006b-bdff-4357-8e60-3ae950a4f165 */ - 0x2004006b, - 0xbdff, - 0x4357, - {0x8e, 0x60, 0x3a, 0xe9, 0x50, 0xa4, 0xf1, 0x65} +constexpr GUID JITEEVersionIdentifier = { /* 49287d16-74bd-42e9-9d47-132d7a5f67eb */ + 0x49287d16, + 0x74bd, + 0x42e9, + {0x9d, 0x47, 0x13, 0x2d, 0x7a, 0x5f, 0x67, 0xeb} }; #endif // JIT_EE_VERSIONING_GUID_H diff --git a/src/coreclr/jit/ICorJitInfo_names_generated.h b/src/coreclr/jit/ICorJitInfo_names_generated.h index e8e089f0b1dd59..b93eb24f3ae72f 100644 --- a/src/coreclr/jit/ICorJitInfo_names_generated.h +++ b/src/coreclr/jit/ICorJitInfo_names_generated.h @@ -181,6 +181,7 @@ DEF_CLR_API(recordCallSite) DEF_CLR_API(recordRelocation) DEF_CLR_API(getRelocTypeHint) DEF_CLR_API(getExpectedTargetArchitecture) +DEF_CLR_API(getTargetVectorLength) DEF_CLR_API(getJitFlags) DEF_CLR_API(getSpecialCopyHelper) diff --git a/src/coreclr/jit/ICorJitInfo_wrapper_generated.hpp b/src/coreclr/jit/ICorJitInfo_wrapper_generated.hpp index c2a8418e30256d..4a10529727e237 100644 --- a/src/coreclr/jit/ICorJitInfo_wrapper_generated.hpp +++ b/src/coreclr/jit/ICorJitInfo_wrapper_generated.hpp @@ -1752,6 +1752,14 @@ uint32_t WrapICorJitInfo::getExpectedTargetArchitecture() return temp; } +uint32_t WrapICorJitInfo::getTargetVectorLength() +{ + API_ENTER(getTargetVectorLength); + uint32_t temp = wrapHnd->getTargetVectorLength(); + API_LEAVE(getTargetVectorLength); + return temp; +} + uint32_t WrapICorJitInfo::getJitFlags( CORJIT_FLAGS* flags, uint32_t sizeInBytes) diff --git a/src/coreclr/jit/abi.cpp b/src/coreclr/jit/abi.cpp index 21e20fd00820c1..28a8c4aa1b52e2 100644 --- a/src/coreclr/jit/abi.cpp +++ b/src/coreclr/jit/abi.cpp @@ -123,7 +123,15 @@ var_types ABIPassingSegment::GetRegisterType() const #ifdef FEATURE_SIMD case 16: return TYP_SIMD16; -#endif +#ifdef TARGET_ARM64 + case 32: + assert(Compiler::SizeMatchesVectorTLength(Size)); + return TYP_SIMD32; + case 64: + assert(Compiler::SizeMatchesVectorTLength(Size)); + return TYP_SIMD64; +#endif // TARGET_ARM64 +#endif // FEATURE_SIMD default: assert(!"Unexpected size for floating point register"); return TYP_UNDEF; diff --git a/src/coreclr/jit/assertionprop.cpp b/src/coreclr/jit/assertionprop.cpp index 
b63ea4084e11d8..85978a64527283 100644 --- a/src/coreclr/jit/assertionprop.cpp +++ b/src/coreclr/jit/assertionprop.cpp @@ -284,6 +284,8 @@ bool IntegralRange::Contains(int64_t value) const // Example: IntCns = 42 gives [0..127] with a non -precise range, [42,42] with a precise range. return {SymbolicIntegerValue::Zero, SymbolicIntegerValue::ByteMax}; #elif defined(TARGET_ARM64) + case NI_Vector_op_Equality: + case NI_Vector_op_Inequality: case NI_Vector64_op_Equality: case NI_Vector64_op_Inequality: case NI_Vector128_op_Equality: @@ -2983,8 +2985,7 @@ GenTree* Compiler::optVNBasedFoldConstExpr(BasicBlock* block, GenTree* parent, G conValTree = vecCon; break; } - -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) case TYP_SIMD32: { simd32_t value = vnStore->ConstantValue(vnCns); @@ -3008,7 +3009,7 @@ GenTree* Compiler::optVNBasedFoldConstExpr(BasicBlock* block, GenTree* parent, G } break; -#endif // TARGET_XARCH +#endif // TARGET_XARCH || TARGET_ARM64 #endif // FEATURE_SIMD #if defined(FEATURE_MASKED_HW_INTRINSICS) diff --git a/src/coreclr/jit/codegenarm64.cpp b/src/coreclr/jit/codegenarm64.cpp index d63fa0d036d69d..a672201004aa43 100644 --- a/src/coreclr/jit/codegenarm64.cpp +++ b/src/coreclr/jit/codegenarm64.cpp @@ -2280,6 +2280,9 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTre { // We ignore any differences between SIMD12 and SIMD16 here if we can broadcast the value // via mvni/movi. + // Also, even if UseSveForVectorT == true, we will continue generating loading in V* registers + // instead of Z* registers, because their size is same if VL == 16. + const bool is8 = tree->TypeIs(TYP_SIMD8); if (vecCon->IsAllBitsSet()) { @@ -2329,6 +2332,92 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTre } break; } + case TYP_SIMD32: + { + // Use scalable registers + if (vecCon->IsAllBitsSet()) + { + // Use Scalable_B because for Ones, it doesn't matter. + emit->emitIns_R_I(INS_sve_mov, EA_SCALABLE, targetReg, -1, INS_OPTS_SCALABLE_B); + } + else if (vecCon->IsZero()) + { + // Use Scalable_B because for Zero, it doesn't matter. + emit->emitIns_R_I(INS_sve_mov, EA_SCALABLE, targetReg, 0, INS_OPTS_SCALABLE_B); + } + else + { + simd32_t val = vecCon->gtSimd32Val; + if (ElementsAreSame(val.i8, 32)) + { + emit->emitIns_R_I(INS_sve_dup, EA_SCALABLE, targetReg, val.i8[0], INS_OPTS_SCALABLE_B); + } + else if (ElementsAreSame(val.i16, 16)) + { + emit->emitIns_R_I(INS_sve_dup, EA_SCALABLE, targetReg, val.i16[0], INS_OPTS_SCALABLE_H); + } + else if (ElementsAreSame(val.i32, 8)) + { + emit->emitIns_R_I(INS_sve_dup, EA_SCALABLE, targetReg, val.i32[0], INS_OPTS_SCALABLE_S); + } + else + { + // Get a temp integer register to compute long address. + regNumber addrReg = internalRegisters.GetSingle(tree); + CORINFO_FIELD_HANDLE hnd; + hnd = emit->emitSimdConst(&vecCon->gtSimdVal, emitTypeSize(tree->TypeGet())); + emit->emitIns_R_C(INS_sve_ldr, attr, targetReg, addrReg, hnd, 0); + // emit->emitIns_R_C(INS_adr, EA_8BYTE, addrReg, REG_NA, hnd, 0); + // emit->emitIns_R_R_R_I(INS_sve_ld1b, EA_SCALABLE, targetReg, REG_P1, addrReg, 0, + // INS_OPTS_SCALABLE_B); + } + } + break; + } + case TYP_SIMD64: + { + // Use scalable registers + if (vecCon->IsAllBitsSet()) + { + // Use Scalable_B because for Ones, it doesn't matter. + emit->emitIns_R_I(INS_sve_mov, EA_SCALABLE, targetReg, -1, INS_OPTS_SCALABLE_B); + } + else if (vecCon->IsZero()) + { + // Use Scalable_B because for Zero, it doesn't matter. 
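+                // (For an all-zero or all-ones pattern every byte of the register is identical,
+                // so the element size chosen for the mov encoding does not affect the result.)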
+ emit->emitIns_R_I(INS_sve_mov, EA_SCALABLE, targetReg, 0, INS_OPTS_SCALABLE_B); + } + else + { + simd64_t val = vecCon->gtSimd64Val; + if (ElementsAreSame(val.i32, 16) && emitter::isValidSimm_MultipleOf<8, 256>(val.i32[0])) + { + emit->emitIns_R_I(INS_sve_mov, EA_SCALABLE, targetReg, val.i32[0], INS_OPTS_SCALABLE_S, + INS_SCALABLE_OPTS_IMM_BITMASK); + } + else if (ElementsAreSame(val.i16, 32) && emitter::isValidSimm_MultipleOf<8, 256>(val.i16[0])) + { + emit->emitIns_R_I(INS_sve_mov, EA_SCALABLE, targetReg, val.i16[0], INS_OPTS_SCALABLE_H, + INS_SCALABLE_OPTS_IMM_BITMASK); + } + else if (ElementsAreSame(val.i8, 64) && emitter::isValidSimm<8>(val.i8[0])) + { + emit->emitIns_R_I(INS_sve_mov, EA_SCALABLE, targetReg, val.i8[0], INS_OPTS_SCALABLE_B, + INS_SCALABLE_OPTS_IMM_BITMASK); + } + else + { + // Get a temp integer register to compute long address. + regNumber addrReg = internalRegisters.GetSingle(tree); + CORINFO_FIELD_HANDLE hnd; + simd64_t constValue; + memcpy(&constValue, &vecCon->gtSimdVal, sizeof(simd64_t)); + hnd = emit->emitSimdConst(&vecCon->gtSimdVal, emitTypeSize(tree->TypeGet())); + emit->emitIns_R_C(INS_sve_ldr, attr, targetReg, addrReg, hnd, 0); + } + } + break; + } default: { @@ -2998,8 +3087,19 @@ void CodeGen::genSimpleReturn(GenTree* treeNode) } } } - emitAttr attr = emitActualTypeSize(targetType); - GetEmitter()->emitIns_Mov(INS_mov, attr, retReg, op1->GetRegNum(), /* canSkip */ !movRequired); + emitAttr attr = emitActualTypeSize(targetType); + bool isScalable = (attr == EA_SCALABLE) || (Compiler::UseSveForType(targetType)); + + if (isScalable) + { + // TODO-VL: Should we check the baseType or it doesn't matter because it is just reg->reg move + GetEmitter()->emitIns_Mov(INS_sve_mov, attr, retReg, op1->GetRegNum(), /* canSkip */ !movRequired, + INS_OPTS_SCALABLE_Q); + } + else + { + GetEmitter()->emitIns_Mov(INS_mov, attr, retReg, op1->GetRegNum(), /* canSkip */ !movRequired); + } } /*********************************************************************************************** @@ -5299,14 +5399,28 @@ void CodeGen::genSimdUpperSave(GenTreeIntrinsic* node) GenTreeLclVar* lclNode = op1->AsLclVar(); LclVarDsc* varDsc = compiler->lvaGetDesc(lclNode); - assert(emitTypeSize(varDsc->GetRegisterType(lclNode)) == 16); - - regNumber tgtReg = node->GetRegNum(); - assert(tgtReg != REG_NA); + unsigned varSize = emitTypeSize(varDsc->GetRegisterType(lclNode)); + assert((varSize == 16) || (Compiler::SizeMatchesVectorTLength(varSize))); regNumber op1Reg = genConsumeReg(op1); assert(op1Reg != REG_NA); + regNumber tgtReg = node->GetRegNum(); +#ifdef TARGET_ARM64 + // TODO-VL: Write a helper to do this check for LclVars*, GenTree*, etc. + if (Compiler::UseStrictSveForType(op1->TypeGet())) + { + // Until we custom ABI for SVE, we will just store entire contents of Z* registers + // on stack. If we don't do it, we will need multiple free registers to save the + // contents of everything but lower 8-bytes. 
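+        // (Per the AArch64 ABI, a callee only preserves the low 8 bytes of each FP/SIMD register,
+        // so the whole Z register is spilled to the local's stack home with sve_str here and
+        // genSimdUpperRestore reloads it with sve_ldr; no separate upper-half temp register is needed.)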
+ assert(tgtReg == REG_NA); + + GetEmitter()->emitIns_S_R(INS_sve_str, EA_SCALABLE, op1Reg, lclNode->GetLclNum(), 0); + return; + } +#endif // TARGET_ARM64 + assert(tgtReg != REG_NA); + GetEmitter()->emitIns_R_R_I_I(INS_mov, EA_8BYTE, tgtReg, op1Reg, 0, 1); if ((node->gtFlags & GTF_SPILL) != 0) @@ -5355,10 +5469,12 @@ void CodeGen::genSimdUpperRestore(GenTreeIntrinsic* node) GenTreeLclVar* lclNode = op1->AsLclVar(); LclVarDsc* varDsc = compiler->lvaGetDesc(lclNode); - assert(emitTypeSize(varDsc->GetRegisterType(lclNode)) == 16); + + unsigned varSize = emitTypeSize(varDsc->GetRegisterType(lclNode)); + assert((varSize == 16) || (Compiler::SizeMatchesVectorTLength(varSize))); regNumber srcReg = node->GetRegNum(); - assert(srcReg != REG_NA); + assert((srcReg != REG_NA) || (Compiler::UseStrictSveForType(node->TypeGet()))); regNumber lclVarReg = genConsumeReg(lclNode); assert(lclVarReg != REG_NA); @@ -5370,6 +5486,19 @@ void CodeGen::genSimdUpperRestore(GenTreeIntrinsic* node) // The localVar must have a stack home. assert(varDsc->lvOnFrame); +#ifdef TARGET_ARM64 + // TODO-VL: Write a helper to do this check for LclVars*, GenTree*, etc. + if (Compiler::UseStrictSveForType(op1->TypeGet())) + { + // Until we custom ABI for SVE, we will just store entire contents of Z* registers + // on stack. If we don't do it, we will need multiple free registers to save the + // contents of everything but lower 8-bytes. + + GetEmitter()->emitIns_R_S(INS_sve_ldr, EA_SCALABLE, lclVarReg, varNum, 0); + return; + } +#endif // TARGET_ARM64 + // We will load this from the upper 8 bytes of this localVar's home. int offset = 8; diff --git a/src/coreclr/jit/codegencommon.cpp b/src/coreclr/jit/codegencommon.cpp index 22ec04909917e9..848bc0cedc2f82 100644 --- a/src/coreclr/jit/codegencommon.cpp +++ b/src/coreclr/jit/codegencommon.cpp @@ -3173,8 +3173,15 @@ void CodeGen::genHomeRegisterParams(regNumber initReg, bool* initRegStillZeroed) busyRegs |= genRegMask(node->copiedReg); instruction ins = ins_Copy(node->reg, copyType); +#ifdef TARGET_ARM64 + insOpts opts = Compiler::UseStrictSveForType(copyType) ? INS_OPTS_SCALABLE_D : INS_OPTS_NONE; + GetEmitter()->emitIns_Mov(ins, emitActualTypeSize(copyType), node->copiedReg, node->reg, + /* canSkip */ false, opts); +#else GetEmitter()->emitIns_Mov(ins, emitActualTypeSize(copyType), node->copiedReg, node->reg, /* canSkip */ false); +#endif + if (node->copiedReg == initReg) { *initRegStillZeroed = false; @@ -3191,8 +3198,15 @@ void CodeGen::genHomeRegisterParams(regNumber initReg, bool* initRegStillZeroed) regNumber sourceReg = edge->from->copiedReg != REG_NA ? edge->from->copiedReg : edge->from->reg; instruction ins = ins_Copy(sourceReg, genActualType(edge->type)); +#ifdef TARGET_ARM64 + insOpts opts = Compiler::UseStrictSveForType(edge->type) ? INS_OPTS_SCALABLE_D : INS_OPTS_NONE; + GetEmitter()->emitIns_Mov(ins, emitActualTypeSize(edge->type), node->reg, sourceReg, + /* canSkip */ true, opts); +#else GetEmitter()->emitIns_Mov(ins, emitActualTypeSize(edge->type), node->reg, sourceReg, /* canSkip */ true); +#endif + break; } diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index 29bdd306e7cd2c..9bcae10c464508 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -644,6 +644,13 @@ var_types Compiler::getPrimitiveTypeForStruct(unsigned structSize, CORINFO_CLASS { return useType; } +#ifdef TARGET_ARM64 + if (SizeMatchesVectorTLength(structSize)) + { + var_types hfaType = GetHfaType(clsHnd); + return UseSveForType(hfaType) ? 
hfaType : TYP_UNKNOWN; + } +#endif } // Now deal with non-HFA/HVA structs. @@ -869,7 +876,12 @@ var_types Compiler::getReturnTypeForStruct(CORINFO_CLASS_HANDLE clsHnd, // The largest "primitive type" is MAX_PASS_SINGLEREG_BYTES // so we can skip calling getPrimitiveTypeForStruct when we // have a struct that is larger than that. - if (canReturnInRegister && (useType == TYP_UNKNOWN) && (structSize <= MAX_PASS_SINGLEREG_BYTES)) + if (canReturnInRegister && (useType == TYP_UNKNOWN) && + ((structSize <= MAX_PASS_SINGLEREG_BYTES) +#ifdef TARGET_ARM64 + || SizeMatchesVectorTLength(structSize) +#endif + )) { // We set the "primitive" useType based upon the structSize // and also examine the clsHnd to see if it is an HFA of count one @@ -1990,6 +2002,10 @@ void Compiler::compSetProcessor() // Add virtual vector ISAs. These are both supported as part of the required baseline. instructionSetFlags.AddInstructionSet(InstructionSet_Vector64); instructionSetFlags.AddInstructionSet(InstructionSet_Vector128); + if (instructionSetFlags.HasInstructionSet(InstructionSet_Sve)) + { + instructionSetFlags.AddInstructionSet(InstructionSet_Vector); + } #endif // TARGET_ARM64 assert(instructionSetFlags.Equals(EnsureInstructionSetFlagsAreValid(instructionSetFlags))); @@ -2077,6 +2093,12 @@ unsigned ReinterpretHexAsDecimal(unsigned in) return result; } +#ifdef TARGET_ARM64 +unsigned Compiler::compVectorTLength = 0; +// unsigned Compiler::compMinVectorTLengthForSve = 0; +bool Compiler::compUseSveForVectorT = false; +#endif + void Compiler::compInitOptions(JitFlags* jitFlags) { opts = {}; @@ -2522,6 +2544,79 @@ void Compiler::compInitOptions(JitFlags* jitFlags) } } +#if defined(TARGET_ARM64) + + /* + * #ifdef DEBUG + * if (matched) + * { + * compVectorTLength = getTargetLength(); + * compUseSveForVectorT = (compVectorTLength > 16) && (compVectorTLength <= 256) && ((compVectorTLength & + * (compVectorTLength - 1)) == 0); compUseSveForVectorT |= JitConfig.UseSveForVectorT(); + * } + * else + * { + * compVectorTLength = 16; + * compUseSveForVectorT = JitConfig.UseSveForVectorT(); + * } + * #else + * if (matched) + * { + * compVectorTLength = getTargetLength(); + * compUseSveForVectorT = (compVectorTLength > 16) && (compVectorTLength <= 256) && ((compVectorTLength & + * (compVectorTLength - 1)) == 0); + * } + * else + * { + * compVectorTLength = 0; + * compUseSveForVectorT = false; + * } + * #endif + * + */ + + if (info.compMatchedVM) + { + compVectorTLength = info.compCompHnd->getTargetVectorLength(); + + if (compExactlyDependsOn(InstructionSet_Sve_Arm64)) + { + compUseSveForVectorT = (compVectorTLength > 16) && (compVectorTLength <= 256) && + ((compVectorTLength & (compVectorTLength - 1)) == 0); +#ifdef DEBUG + compUseSveForVectorT |= (bool)JitConfig.UseSveForVectorT(); +#endif // DEBUG + } + } + else + { + // altjit +#ifdef DEBUG + compUseSveForVectorT = JitConfig.UseSveForVectorT(); + // In test mode, if UseSveForVectorT=1, then mimic that + // we are generating for VL > 16B + compVectorTLength = 16; // 32; +#else + compVectorTLength = 0; +#endif // DEBUG + } + +// #ifdef DEBUG +// compUseSveForVectorT = JitConfig.UseSveForVectorT(); +// if (compUseSveForVectorT) +// { +// // In test mode, if UseSveForVectorT=1, then mimic that +// // we are generating for VL > 16B +// compVectorTLength = 16; //32; +// } +// else +// #endif // DEBUG +// { +// compVectorTLength = info.compCompHnd->getTargetVectorLength(); +// compUseSveForVectorT = (compVectorTLength > 16) && (compVectorTLength <= 256); +// } +#endif // TARGET_ARM64 + 
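+    // Illustrative sketch of the vector-length test above (the SVE ISA check is handled
+    // separately); the helper name is hypothetical and shown for clarity only:
+    //
+    //   static bool IsSveEligibleVectorTLength(unsigned vlBytes)
+    //   {
+    //       // Prefer SVE for Vector<T> only when the reported vector length is a power of
+    //       // two strictly larger than the 16-byte NEON width, up to 256 bytes.
+    //       return (vlBytes > 16) && (vlBytes <= 256) && ((vlBytes & (vlBytes - 1)) == 0);
+    //   }
+    //
+    //   16 -> false (stay on NEON), 32/64/128/256 -> true, 48 -> false (not a power of two).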
bool enableInliningMethodsWithEH = JitConfig.JitInlineMethodsWithEH() > 0; #ifdef DEBUG diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 99ebd6ac1eed82..fe53172b33ddae 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -157,6 +157,12 @@ inline var_types HfaTypeFromElemKind(CorInfoHFAElemType kind) return TYP_SIMD8; case CORINFO_HFA_ELEM_VECTOR128: return TYP_SIMD16; +#ifdef TARGET_ARM64 + case CORINFO_HFA_ELEM_VECTOR256: + return TYP_SIMD32; + case CORINFO_HFA_ELEM_VECTOR512: + return TYP_SIMD64; +#endif // TARGET_ARM64 #endif case CORINFO_HFA_ELEM_NONE: return TYP_UNDEF; @@ -178,6 +184,12 @@ inline CorInfoHFAElemType HfaElemKindFromType(var_types type) return CORINFO_HFA_ELEM_VECTOR64; case TYP_SIMD16: return CORINFO_HFA_ELEM_VECTOR128; +#ifdef TARGET_ARM64 + case TYP_SIMD32: + return CORINFO_HFA_ELEM_VECTOR256; + case TYP_SIMD64: + return CORINFO_HFA_ELEM_VECTOR512; +#endif #endif case TYP_UNDEF: return CORINFO_HFA_ELEM_NONE; @@ -3146,7 +3158,7 @@ class Compiler var_types type, GenTree* op1, CorInfoType simdBaseJitType, unsigned simdSize); #if defined(TARGET_ARM64) - GenTree* gtNewSimdAllTrueMaskNode(CorInfoType simdBaseJitType); + GenTree* gtNewSimdAllTrueMaskNode(CorInfoType simdBaseJitType, unsigned simdSize); GenTree* gtNewSimdFalseMaskByteNode(); #endif @@ -3155,7 +3167,9 @@ class Compiler GenTree* op1, GenTree* op2, CorInfoType simdBaseJitType, - unsigned simdSize); + unsigned simdSize + ARM64_ARG(bool isScalable) + ); GenTree* gtNewSimdCeilNode( var_types type, GenTree* op1, CorInfoType simdBaseJitType, unsigned simdSize); @@ -3165,21 +3179,28 @@ class Compiler GenTree* op1, GenTree* op2, CorInfoType simdBaseJitType, - unsigned simdSize); + unsigned simdSize + ARM64_ARG(bool isScalable) + ARM64_ARG(bool wrapInCmtv = true) + ); GenTree* gtNewSimdCmpOpAllNode(genTreeOps op, var_types type, GenTree* op1, GenTree* op2, CorInfoType simdBaseJitType, - unsigned simdSize); + unsigned simdSize + ARM64_ARG(bool isScalable) + ); GenTree* gtNewSimdCmpOpAnyNode(genTreeOps op, var_types type, GenTree* op1, GenTree* op2, CorInfoType simdBaseJitType, - unsigned simdSize); + unsigned simdSize + ARM64_ARG(bool isScalable) + ); GenTree* gtNewSimdCndSelNode(var_types type, GenTree* op1, @@ -3240,7 +3261,8 @@ class Compiler GenTree* op1, GenTree* op2, CorInfoType simdBaseJitType, - unsigned simdSize); + unsigned simdSize + ARM64_ARG(bool isScalable)); GenTree* gtNewSimdGetIndicesNode(var_types type, CorInfoType simdBaseJitType, unsigned simdSize); @@ -3277,12 +3299,14 @@ class Compiler GenTree* gtNewSimdIsNaNNode(var_types type, GenTree* op1, CorInfoType simdBaseJitType, - unsigned simdSize); + unsigned simdSize + ARM64_ARG(bool isScalable)); GenTree* gtNewSimdIsNegativeNode(var_types type, GenTree* op1, CorInfoType simdBaseJitType, - unsigned simdSize); + unsigned simdSize + ARM64_ARG(bool isScalable)); GenTree* gtNewSimdIsNegativeInfinityNode(var_types type, GenTree* op1, @@ -3302,12 +3326,14 @@ class Compiler GenTree* gtNewSimdIsPositiveNode(var_types type, GenTree* op1, CorInfoType simdBaseJitType, - unsigned simdSize); + unsigned simdSize + ARM64_ARG(bool isScalable)); GenTree* gtNewSimdIsPositiveInfinityNode(var_types type, GenTree* op1, CorInfoType simdBaseJitType, - unsigned simdSize); + unsigned simdSize + ARM64_ARG(bool isScalable)); GenTree* gtNewSimdIsSubnormalNode(var_types type, GenTree* op1, @@ -3317,7 +3343,8 @@ class Compiler GenTree* gtNewSimdIsZeroNode(var_types type, GenTree* op1, CorInfoType simdBaseJitType, - unsigned simdSize); 
+ unsigned simdSize + ARM64_ARG(bool isScalable)); GenTree* gtNewSimdLoadNode( var_types type, GenTree* op1, CorInfoType simdBaseJitType, unsigned simdSize); @@ -3335,14 +3362,17 @@ class Compiler unsigned simdSize, bool isMax, bool isMagnitude, - bool isNumber); + bool isNumber + ARM64_ARG(bool isScalable)); + GenTree* gtNewSimdMinMaxNativeNode(var_types type, GenTree* op1, GenTree* op2, CorInfoType simdBaseJitType, unsigned simdSize, - bool isMax); + bool isMax + ARM64_ARG(bool isScalable)); GenTree* gtNewSimdNarrowNode(var_types type, GenTree* op1, @@ -3405,7 +3435,7 @@ class Compiler var_types type, GenTree* op1, CorInfoType simdBaseJitType, - unsigned simdSize); + unsigned simdSize ARM64_ARG(bool isScalable)); GenTree* gtNewSimdWidenLowerNode( var_types type, GenTree* op1, CorInfoType simdBaseJitType, unsigned simdSize); @@ -6695,7 +6725,7 @@ class Compiler GenTree* fgMorphHWIntrinsic(GenTreeHWIntrinsic* tree); GenTree* fgMorphHWIntrinsicRequired(GenTreeHWIntrinsic* tree); GenTree* fgMorphHWIntrinsicOptional(GenTreeHWIntrinsic* tree); - GenTree* fgOptimizeHWIntrinsic(GenTreeHWIntrinsic* node); + GenTree* fgOptimizeHWIntrinsic(GenTreeHWIntrinsic* node ARM64_ARG(bool isScalable)); GenTree* fgOptimizeHWIntrinsicAssociative(GenTreeHWIntrinsic* node); #if defined(FEATURE_MASKED_HW_INTRINSICS) GenTreeHWIntrinsic* fgOptimizeForMaskedIntrinsic(GenTreeHWIntrinsic* node); @@ -8236,7 +8266,7 @@ class Compiler assert(type != TYP_STRUCT); // ARM64 ABI FP Callee save registers only require Callee to save lower 8 Bytes // For SIMD types longer than 8 bytes Caller is responsible for saving and restoring Upper bytes. - return ((type == TYP_SIMD16) || (type == TYP_SIMD12)); + return ((type == TYP_SIMD16) || (type == TYP_SIMD12) || (UseSveForType(type))); } #else // !defined(TARGET_AMD64) && !defined(TARGET_ARM64) #error("Unknown target architecture for FEATURE_PARTIAL_SIMD_CALLEE_SAVE") @@ -8921,6 +8951,40 @@ class Compiler XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX */ +#if defined(TARGET_ARM64) +private: + + static unsigned compVectorTLength; + // static unsigned compMinVectorTLengthForSve; + static bool compUseSveForVectorT; + +public: + FORCEINLINE static unsigned GetVectorTLength() + { + return compVectorTLength; + } + FORCEINLINE static bool UseSveForVectorT() + { + return compUseSveForVectorT; + } + FORCEINLINE static bool UseSveForType(var_types type) + { + return UseSveForVectorT() && varTypeIsSIMDOrMask(type) && (type != TYP_SIMD8) && (type != TYP_SIMD12); + } + FORCEINLINE static bool UseStrictSveForType(var_types type) + { + // This method is used in scenarios where we do not know the type of HIR node or how the LIR node was formed. + // For such cases, we will generate SVE, only if we are guaranteed to have VL >= 32B. 
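+        // For example, when UseSveForVectorT() is true: UseSveForType(TYP_SIMD16) is true, but
+        // UseStrictSveForType(TYP_SIMD16) is false, because a 16-byte value also fits a NEON V
+        // register; only strictly wider types (TYP_SIMD32, TYP_SIMD64, mask types) take the SVE
+        // path through this check.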
+ return UseSveForType(type) && (type != TYP_SIMD12) && (type != TYP_SIMD16); + } + FORCEINLINE static bool SizeMatchesVectorTLength(unsigned simdSize) + { + return simdSize == compVectorTLength; + } +#endif + +public: + bool isIntrinsicType(CORINFO_CLASS_HANDLE clsHnd) { return info.compCompHnd->isIntrinsicType(clsHnd); @@ -9164,7 +9228,11 @@ class Compiler return XMM_REGSIZE_BYTES; } #elif defined(TARGET_ARM64) - if (compExactlyDependsOn(InstructionSet_VectorT128)) + if (compExactlyDependsOn(InstructionSet_Sve_Arm64)) + { + return GetVectorTLength(); + } + else if (compExactlyDependsOn(InstructionSet_VectorT128)) { return FP_REGSIZE_BYTES; } @@ -9206,6 +9274,15 @@ class Compiler return XMM_REGSIZE_BYTES; } #elif defined(TARGET_ARM64) + // TODO-VL: There are several optimizations that use this method + // to decide to use higher vector length. E.g. ReadUtf8, Memmove, etc. + // To make them functional, some of them need SVE2 intrinsics/instructions. + // We will incrementally enable them as we add support for SVE2 APIs. + // if (compExactlyDependsOn(InstructionSet_Sve_Arm64)) + //{ + // return Compiler::compVectorTLength; + //} + // else return FP_REGSIZE_BYTES; #else assert(!"getMaxVectorByteLength() unimplemented on target arch"); @@ -9304,7 +9381,19 @@ class Compiler // Return 0 if size is even less than XMM, otherwise - XMM return (size >= XMM_REGSIZE_BYTES) ? XMM_REGSIZE_BYTES : 0; #elif defined(TARGET_ARM64) - assert(getMaxVectorByteLength() == FP_REGSIZE_BYTES); + // if (FP_REGSIZE_BYTES < Compiler::compVectorTLength) + //{ + // if (size >= Compiler::compVectorTLength) + // { + // return Compiler::compVectorTLength; + // } + // } + // else + // TODO-VL: For now, disable most of the optimizations like memmove, struct copy, + // etc. for VL + { + assert(getMaxVectorByteLength() == FP_REGSIZE_BYTES); + } return (size >= FP_REGSIZE_BYTES) ? FP_REGSIZE_BYTES : 0; #else assert(!"roundDownSIMDSize() unimplemented on target arch"); @@ -9333,7 +9422,7 @@ class Compiler { simdType = TYP_SIMD16; } -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) else if (size == 32) { simdType = TYP_SIMD32; @@ -9342,7 +9431,7 @@ class Compiler { simdType = TYP_SIMD64; } -#endif // TARGET_XARCH +#endif // TARGET_XARCH || TARGET_ARM64 else { noway_assert(!"Unexpected size for SIMD type"); @@ -9471,7 +9560,12 @@ class Compiler #if defined(FEATURE_SIMD) if (canUseSimd) { - maxRegSize = getPreferredVectorByteLength(); +#if defined(TARGET_ARM64) + // For now, just use SIMD register size for unroll threshold + // decisions + // maxRegSize = getPreferredVectorByteLength(); + maxRegSize = FP_REGSIZE_BYTES; +#endif // TARGET_ARM64 #if defined(TARGET_XARCH) assert(maxRegSize <= ZMM_REGSIZE_BYTES); @@ -9548,7 +9642,11 @@ class Compiler bool structSizeMightRepresentSIMDType(size_t structSize) { #ifdef FEATURE_SIMD +#if defined(TARGET_ARM64) + return (structSize >= getMinVectorByteLength()) && (structSize <= getVectorTByteLength()); +#else return (structSize >= getMinVectorByteLength()) && (structSize <= getMaxVectorByteLength()); +#endif // TARGET_ARM64 #else return false; #endif // FEATURE_SIMD diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index 0c51e4e3716ec5..766cbf9dcb1127 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -2874,9 +2874,6 @@ void* emitter::emitAddLabel(VARSET_VALARG_TP GCvars, regMaskTP gcrefRegs, regMas } else { - // This is not an EXTEND group. 
- assert((emitCurIG->igFlags & IGF_EXTEND) == 0); - #if defined(DEBUG) || defined(LATE_DISASM) emitCurIG->igWeight = getCurrentBlockWeight(); emitCurIG->igPerfScore = 0.0; @@ -8221,7 +8218,7 @@ CORINFO_FIELD_HANDLE emitter::emitSimd16Const(simd16_t constValue) return emitComp->eeFindJitDataOffs(cnum); } -#ifdef TARGET_XARCH +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) //------------------------------------------------------------------------ // emitSimdConst: Create a simd data section constant. // @@ -8253,7 +8250,9 @@ CORINFO_FIELD_HANDLE emitter::emitSimdConst(simd_t* constValue, emitAttr attr) UNATIVE_OFFSET cnum = emitDataConst(constValue, cnsSize, cnsAlign, dataType); return emitComp->eeFindJitDataOffs(cnum); } +#endif // TARGET_XARCH || TARGET_ARM64 +#if defined(TARGET_XARCH) //------------------------------------------------------------------------ // emitSimdConstCompressedLoad: Create a simd data section constant, // compressing it if possible, and emit an appropiate instruction diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index c398e4c26058e6..3c565c5fdaf389 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -2674,9 +2674,11 @@ class emitter #if defined(FEATURE_SIMD) CORINFO_FIELD_HANDLE emitSimd8Const(simd8_t constValue); CORINFO_FIELD_HANDLE emitSimd16Const(simd16_t constValue); -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) CORINFO_FIELD_HANDLE emitSimdConst(simd_t* constValue, emitAttr attr); - void emitSimdConstCompressedLoad(simd_t* constValue, emitAttr attr, regNumber targetReg); +#endif // TARGET_XARCH || TARGET_ARM64 +#if defined(TARGET_XARCH) + void emitSimdConstCompressedLoad(simd_t* constValue, emitAttr attr, regNumber targetReg); #endif // TARGET_XARCH #if defined(FEATURE_MASKED_HW_INTRINSICS) CORINFO_FIELD_HANDLE emitSimdMaskConst(simdmask_t constValue); diff --git a/src/coreclr/jit/emitarm64.cpp b/src/coreclr/jit/emitarm64.cpp index 71f070973bf18c..777e8be800fe33 100644 --- a/src/coreclr/jit/emitarm64.cpp +++ b/src/coreclr/jit/emitarm64.cpp @@ -2363,6 +2363,33 @@ emitter::code_t emitter::emitInsCode(instruction ins, insFormat fmt) return code; } +/*static*/ bool emitter::emitIns_valid_imm_for_sve_mov(INT64 imm, emitAttr elemsize) +{ + switch (elemsize) + { + case EA_1BYTE: + { + return (-128 <= imm) && (imm <= 127); + } + case EA_2BYTE: + case EA_4BYTE: + case EA_8BYTE: + + { + if ((-32768 <= imm) && (imm <= 32512) && (imm != 0)) + { + return imm % 256 == 0; + } + break; + } + default: + { + unreached(); + } + } + return false; +} + // true if this 'imm' can be encoded as a input operand to a mov instruction /*static*/ bool emitter::emitIns_valid_imm_for_mov(INT64 imm, emitAttr size) { diff --git a/src/coreclr/jit/emitarm64.h b/src/coreclr/jit/emitarm64.h index c30ab5a57dec82..46b03078a2c5bc 100644 --- a/src/coreclr/jit/emitarm64.h +++ b/src/coreclr/jit/emitarm64.h @@ -801,6 +801,8 @@ static bool isValidUimm_MultipleOf(ssize_t value) return isValidUimm(value / mod) && (value % mod == 0); } +public: + // Returns true if 'value' is a legal signed immediate with 'bits' number of bits. 
template static bool isValidSimm(ssize_t value) @@ -817,6 +819,8 @@ static bool isValidSimm_MultipleOf(ssize_t value) return isValidSimm(value / mod) && (value % mod == 0); } +private: + // Returns true if 'imm' is a valid broadcast immediate for some SVE DUP variants static bool isValidBroadcastImm(ssize_t imm, emitAttr laneSize) { @@ -1031,6 +1035,9 @@ static unsigned insGetRegisterListSize(instruction ins); /************************************************************************/ public: +// true if this 'imm' can be encoded as a input operand to a SVE mov instruction +static bool emitIns_valid_imm_for_sve_mov(INT64 imm, emitAttr size); + // true if this 'imm' can be encoded as a input operand to a mov instruction static bool emitIns_valid_imm_for_mov(INT64 imm, emitAttr size); diff --git a/src/coreclr/jit/emitarm64sve.cpp b/src/coreclr/jit/emitarm64sve.cpp index e660f93b3bc9cd..eaf5c055436f30 100644 --- a/src/coreclr/jit/emitarm64sve.cpp +++ b/src/coreclr/jit/emitarm64sve.cpp @@ -21,6 +21,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX /*****************************************************************************/ #include "instr.h" +#include "codegen.h" /*****************************************************************************/ @@ -2206,10 +2207,19 @@ void emitter::emitInsSve_R_R(instruction ins, { assert(size == EA_8BYTE); } - else + else if (opt == INS_OPTS_SCALABLE_S) { assert(size == EA_4BYTE); } + else if (opt == INS_OPTS_SCALABLE_H) + { + assert(size == EA_2BYTE); + } + else + { + assert(opt == INS_OPTS_SCALABLE_B); + assert(size == EA_1BYTE); + } #endif // DEBUG reg2 = encodingSPtoZR(reg2); fmt = IF_SVE_CB_2A; @@ -2663,12 +2673,32 @@ void emitter::emitInsSve_R_R_I(instruction ins, case INS_sve_ldr: assert(insOptsNone(opt)); - assert(isScalableVectorSize(size)); + assert(isScalableVectorSize(size) || (size == EA_16BYTE)); assert(isGeneralRegister(reg2)); // nnnnn assert(isValidSimm<9>(imm)); // iii // iiiiii assert(insScalableOptsNone(sopt)); + + // Since SVE uses "mul vl", we need to make sure that we calculate + // the offset correctly. + if (Compiler::UseSveForVectorT()) + { + if ((imm % Compiler::GetVectorTLength()) == 0) + { + // If imm is a multiple of Compiler::compVectorTLength, + // we can use the `[#imm mul vl]` + imm = imm / Compiler::GetVectorTLength(); + } + else + { + regNumber rsvdReg = codeGen->rsGetRsvdReg(); + // For larger imm values (> 9 bits), calculate base + imm in a reserved register first. + codeGen->instGen_Set_Reg_To_Base_Plus_Imm(EA_PTRSIZE, rsvdReg, reg2, imm); + reg2 = rsvdReg; + imm = 0; + } + } if (isVectorRegister(reg1)) { fmt = IF_SVE_IE_2A; @@ -2682,12 +2712,32 @@ void emitter::emitInsSve_R_R_I(instruction ins, case INS_sve_str: assert(insOptsNone(opt)); - assert(isScalableVectorSize(size)); + assert(isScalableVectorSize(size) || (size == EA_16BYTE)); assert(isGeneralRegister(reg2)); // nnnnn assert(isValidSimm<9>(imm)); // iii // iiiiii assert(insScalableOptsNone(sopt)); + + // Since SVE uses "mul vl", we need to make sure that we calculate + // the offset correctly. + if (Compiler::UseSveForVectorT()) + { + if ((imm % Compiler::GetVectorTLength()) == 0) + { + // If imm is a multiple of Compiler::compVectorTLength, + // we can use the `[#imm mul vl]` + imm = imm / Compiler::GetVectorTLength(); + } + else + { + regNumber rsvdReg = codeGen->rsGetRsvdReg(); + // For larger imm values (> 9 bits), calculate base + imm in a reserved register first. 
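+                    // e.g. with a 32-byte VL, an offset of 96 encodes directly as [Xn, #3, mul vl]
+                    // above, while an offset of 40 is not a multiple of the VL and is folded into
+                    // the reserved register here, leaving an immediate of 0.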
+ codeGen->instGen_Set_Reg_To_Base_Plus_Imm(EA_PTRSIZE, rsvdReg, reg2, imm); + reg2 = rsvdReg; + imm = 0; + } + } if (isVectorRegister(reg1)) { fmt = IF_SVE_JH_2A; @@ -14185,7 +14235,7 @@ void emitter::emitInsSveSanityCheck(instrDesc* id) case IF_SVE_IE_2A: // ..........iiiiii ...iiinnnnnttttt -- SVE load vector register case IF_SVE_JH_2A: // ..........iiiiii ...iiinnnnnttttt -- SVE store vector register assert(insOptsNone(id->idInsOpt())); - assert(isScalableVectorSize(id->idOpSize())); + assert(isScalableVectorSize(id->idOpSize()) || (id->idOpSize() == EA_16BYTE)); assert(isVectorRegister(id->idReg1())); // ttttt assert(isGeneralRegisterOrZR(id->idReg2())); // nnnnn assert(isValidSimm<9>(emitGetInsSC(id))); // iii @@ -16212,9 +16262,12 @@ void emitter::emitDispInsSveHelp(instrDesc* id) // ., case IF_SVE_CB_2A: // ........xx...... ......nnnnnddddd -- SVE broadcast general register + { emitDispSveReg(id->idReg1(), id->idInsOpt(), true); - emitDispReg(encodingZRtoSP(id->idReg2()), size, false); + emitAttr gprSize = (size == EA_8BYTE) ? size : EA_4BYTE; + emitDispReg(encodingZRtoSP(id->idReg2()), gprSize, false); break; + } // .H, .B case IF_SVE_HH_2A: // ................ ......nnnnnddddd -- SVE2 FP8 upconverts diff --git a/src/coreclr/jit/fgbasic.cpp b/src/coreclr/jit/fgbasic.cpp index a0b4cd8d7b6c09..9a012efd3d8e2b 100644 --- a/src/coreclr/jit/fgbasic.cpp +++ b/src/coreclr/jit/fgbasic.cpp @@ -1239,6 +1239,7 @@ void Compiler::fgFindJumpTargets(const BYTE* codeAddr, IL_OFFSET codeSize, Fixed case NI_ArmBase_Arm64_ReverseElementBits: case NI_ArmBase_LeadingZeroCount: case NI_ArmBase_ReverseElementBits: + case NI_Vector_Create: case NI_Vector64_Create: case NI_Vector64_CreateScalar: case NI_Vector64_CreateScalarUnsafe: @@ -1470,6 +1471,19 @@ void Compiler::fgFindJumpTargets(const BYTE* codeAddr, IL_OFFSET codeSize, Fixed #if defined(FEATURE_HW_INTRINSICS) #if defined(TARGET_ARM64) + case NI_Vector_As: + case NI_Vector_AsVectorByte: + case NI_Vector_AsVectorDouble: + case NI_Vector_AsVectorInt16: + case NI_Vector_AsVectorInt32: + case NI_Vector_AsVectorInt64: + case NI_Vector_AsVectorNInt: + case NI_Vector_AsVectorNUInt: + case NI_Vector_AsVectorSByte: + case NI_Vector_AsVectorSingle: + case NI_Vector_AsVectorUInt16: + case NI_Vector_AsVectorUInt32: + case NI_Vector_AsVectorUInt64: case NI_Vector64_As: case NI_Vector64_AsByte: case NI_Vector64_AsDouble: @@ -1484,7 +1498,8 @@ void Compiler::fgFindJumpTargets(const BYTE* codeAddr, IL_OFFSET codeSize, Fixed case NI_Vector64_AsUInt32: case NI_Vector64_AsUInt64: case NI_Vector64_op_UnaryPlus: -#endif // TARGET_XARCH + case NI_Vector_op_UnaryPlus: +#endif // TARGET_ARM64 case NI_Vector128_As: case NI_Vector128_AsByte: case NI_Vector128_AsDouble: @@ -1547,6 +1562,9 @@ void Compiler::fgFindJumpTargets(const BYTE* codeAddr, IL_OFFSET codeSize, Fixed #if defined(FEATURE_HW_INTRINSICS) #if defined(TARGET_ARM64) + case NI_Vector_get_AllBitsSet: + case NI_Vector_get_One: + case NI_Vector_get_Zero: case NI_Vector64_get_AllBitsSet: case NI_Vector64_get_One: case NI_Vector64_get_Zero: diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index ee57e958e439f9..8b836957021b02 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -240,7 +240,7 @@ void GenTree::InitNodeSize() // clang-format off GenTree::s_gtNodeSizes[GT_CALL] = TREE_NODE_SZ_LARGE; -#if defined(FEATURE_SIMD) && defined(TARGET_XARCH) +#if defined(FEATURE_SIMD) && (defined(TARGET_XARCH) || defined(TARGET_ARM64)) GenTree::s_gtNodeSizes[GT_CNS_VEC] = 
TREE_NODE_SZ_LARGE; #endif // FEATURE_SIMD && TARGET_XARCH GenTree::s_gtNodeSizes[GT_CAST] = TREE_NODE_SZ_LARGE; @@ -281,7 +281,7 @@ void GenTree::InitNodeSize() static_assert_no_msg(sizeof(GenTreeDblCon) <= TREE_NODE_SZ_SMALL); static_assert_no_msg(sizeof(GenTreeStrCon) <= TREE_NODE_SZ_SMALL); #if defined(FEATURE_SIMD) -#ifdef TARGET_XARCH +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) static_assert_no_msg(sizeof(GenTreeVecCon) <= TREE_NODE_SZ_LARGE); // *** large node #else static_assert_no_msg(sizeof(GenTreeVecCon) <= TREE_NODE_SZ_SMALL); @@ -3193,7 +3193,7 @@ unsigned Compiler::gtHashValue(GenTree* tree) switch (vecCon->TypeGet()) { -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) case TYP_SIMD64: { add = genTreeHashAdd(ulo32(add), vecCon->gtSimdVal.u32[15]); @@ -3215,7 +3215,7 @@ unsigned Compiler::gtHashValue(GenTree* tree) add = genTreeHashAdd(ulo32(add), vecCon->gtSimdVal.u32[4]); FALLTHROUGH; } -#endif // TARGET_XARCH +#endif // TARGET_XARCH || TARGET_ARM64 case TYP_SIMD16: { @@ -3900,6 +3900,7 @@ unsigned Compiler::gtSetMultiOpOrder(GenTreeMultiOp* multiOp) case NI_Vector512_CreateScalar: case NI_Vector512_CreateScalarUnsafe: #elif defined(TARGET_ARM64) + case NI_Vector_Create: case NI_Vector64_Create: case NI_Vector64_CreateScalar: case NI_Vector64_CreateScalarUnsafe: @@ -8130,10 +8131,10 @@ GenTree* Compiler::gtNewGenericCon(var_types type, uint8_t* cnsVal) case TYP_SIMD8: case TYP_SIMD12: case TYP_SIMD16: -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) case TYP_SIMD32: case TYP_SIMD64: -#endif // TARGET_XARCH +#endif // TARGET_XARCH || TARGET_ARM64 { return gtNewVconNode(type, cnsVal); } @@ -8195,10 +8196,10 @@ GenTree* Compiler::gtNewConWithPattern(var_types type, uint8_t pattern) case TYP_SIMD8: case TYP_SIMD12: case TYP_SIMD16: -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) case TYP_SIMD32: case TYP_SIMD64: -#endif // TARGET_XARCH +#endif // TARGET_XARCH || TARGET_ARM64 { GenTreeVecCon* node = gtNewVconNode(type); memset(&node->gtSimdVal, pattern, sizeof(node->gtSimdVal)); @@ -12234,7 +12235,7 @@ void Compiler::gtDispConst(GenTree* tree) break; } -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) case TYP_SIMD32: { printf("<0x%016llx, 0x%016llx, 0x%016llx, 0x%016llx>", vecCon->gtSimdVal.u64[0], @@ -12251,7 +12252,7 @@ void Compiler::gtDispConst(GenTree* tree) break; } -#endif // TARGET_XARCH +#endif // TARGET_XARCH || TARGET_ARM64 default: { @@ -18508,7 +18509,7 @@ void GenTreeVecCon::EvaluateUnaryInPlace(genTreeOps oper, bool scalar, var_types break; } -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) case TYP_SIMD32: { simd32_t result = {}; @@ -18524,7 +18525,7 @@ void GenTreeVecCon::EvaluateUnaryInPlace(genTreeOps oper, bool scalar, var_types gtSimd64Val = result; break; } -#endif // TARGET_XARCH +#endif // TARGET_XARCH || TARGET_ARM64 default: { @@ -18570,7 +18571,7 @@ void GenTreeVecCon::EvaluateBinaryInPlace(genTreeOps oper, bool scalar, var_type break; } -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) case TYP_SIMD32: { simd32_t result = {}; @@ -18586,7 +18587,7 @@ void GenTreeVecCon::EvaluateBinaryInPlace(genTreeOps oper, bool scalar, var_type gtSimd64Val = result; break; } -#endif // TARGET_XARCH +#endif // TARGET_XARCH || TARGET_ARM64 default: { @@ -20831,7 +20832,6 @@ GenTree* Compiler::gtNewSimdAbsNode(var_types type, GenTree* op1, CorInfoType si } #elif defined(TARGET_ARM64) 
NamedIntrinsic intrinsic = NI_AdvSimd_Abs; - if (simdBaseType == TYP_DOUBLE) { intrinsic = (simdSize == 8) ? NI_AdvSimd_AbsScalar : NI_AdvSimd_Arm64_Abs; @@ -20841,6 +20841,8 @@ GenTree* Compiler::gtNewSimdAbsNode(var_types type, GenTree* op1, CorInfoType si intrinsic = (simdSize == 8) ? NI_AdvSimd_Arm64_AbsScalar : NI_AdvSimd_Arm64_Abs; } + // intrinsic = GenTreeHWIntrinsic::GetScalableHWIntrinsicId(type, intrinsic); + assert(intrinsic != NI_Illegal); return gtNewSimdHWIntrinsicNode(type, op1, intrinsic, simdBaseJitType, simdSize); #else @@ -20848,8 +20850,12 @@ GenTree* Compiler::gtNewSimdAbsNode(var_types type, GenTree* op1, CorInfoType si #endif } -GenTree* Compiler::gtNewSimdBinOpNode( - genTreeOps op, var_types type, GenTree* op1, GenTree* op2, CorInfoType simdBaseJitType, unsigned simdSize) +GenTree* Compiler::gtNewSimdBinOpNode(genTreeOps op, + var_types type, + GenTree* op1, + GenTree* op2, + CorInfoType simdBaseJitType, + unsigned simdSize ARM64_ARG(bool isScalable)) { assert(varTypeIsSIMD(type)); assert(getSIMDTypeForSize(simdSize) == type); @@ -20863,14 +20869,21 @@ GenTree* Compiler::gtNewSimdBinOpNode( assert(op2 != nullptr); + bool isOp2SimdType = (genActualType(op2) == genActualType(type)) || + (genActualType(op2) == genActualType(simdBaseType)) || + (op2->TypeIs(TYP_SIMD12) && (type == TYP_SIMD16)); + if ((op == GT_LSH) || (op == GT_RSH) || (op == GT_RSZ)) { - assert(genActualType(op2) == TYP_INT); + bool op2Type = genActualType(op2) == TYP_INT; +#if defined(TARGET_ARM64) + op2Type |= (isScalable && isOp2SimdType); +#endif + assert(op2Type && "op2's type is unexpected."); } else { - assert((genActualType(op2) == genActualType(type)) || (genActualType(op2) == genActualType(simdBaseType)) || - (op2->TypeIs(TYP_SIMD12) && (type == TYP_SIMD16))); + assert(isOp2SimdType); } bool needsReverseOps = false; @@ -20927,6 +20940,13 @@ GenTree* Compiler::gtNewSimdBinOpNode( } #endif // TARGET_ARM64 } +#ifdef TARGET_ARM64 + else if (UseSveForType(type) && isScalable && varTypeIsSIMD(op2->TypeGet())) + { + // SVE already have variant that operates on vector operands. + // Do not do anything. 
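+        // (When op2 is already a vector of per-lane shift amounts, the SVE shift variant consumes
+        // it directly, so the scalar mask-and-broadcast sequence below is skipped.)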
+ } +#endif else { op2 = gtNewOperNode(GT_AND, TYP_INT, op2, gtNewIconNode(shiftCountMask)); @@ -20940,7 +20960,15 @@ GenTree* Compiler::gtNewSimdBinOpNode( op2 = gtNewOperNode(GT_NEG, TYP_INT, op2); } - op2 = gtNewSimdCreateBroadcastNode(type, op2, simdBaseJitType, simdSize); + if (UseSveForType(type) && isScalable) + { + op2 = + gtNewSimdHWIntrinsicNode(type, op2, NI_Sve_DuplicateScalarToVector, simdBaseJitType, simdSize); + } + else + { + op2 = gtNewSimdCreateBroadcastNode(type, op2, simdBaseJitType, simdSize); + } #endif // !TARGET_XARCH && !TARGET_ARM64 } break; @@ -20970,7 +20998,12 @@ GenTree* Compiler::gtNewSimdBinOpNode( if (broadcastOp != nullptr) { #if defined(TARGET_ARM64) - if (varTypeIsLong(simdBaseType)) + if (isScalable) + { + *broadcastOp = gtNewSimdHWIntrinsicNode(type, *broadcastOp, NI_Sve_DuplicateScalarToVector, + simdBaseJitType, simdSize); + } + else if (varTypeIsLong(simdBaseType)) { // This is handled via emulation and the scalar is consumed directly break; @@ -20981,9 +21014,11 @@ GenTree* Compiler::gtNewSimdBinOpNode( *broadcastOp = gtNewSimdCreateScalarUnsafeNode(TYP_SIMD8, *broadcastOp, simdBaseJitType, 8); break; } + else #endif // TARGET_ARM64 - - *broadcastOp = gtNewSimdCreateBroadcastNode(type, *broadcastOp, simdBaseJitType, simdSize); + { + *broadcastOp = gtNewSimdCreateBroadcastNode(type, *broadcastOp, simdBaseJitType, simdSize); + } } break; } @@ -21035,8 +21070,8 @@ GenTree* Compiler::gtNewSimdBinOpNode( assert(op2ForLookup != op1); } - NamedIntrinsic intrinsic = - GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(this, op, op1, op2ForLookup, simdBaseType, simdSize, false); + NamedIntrinsic intrinsic = GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(this, op, op1, op2ForLookup, simdBaseType, + simdSize, false ARM64_ARG(isScalable)); if (intrinsic != NI_Illegal) { @@ -21064,8 +21099,8 @@ GenTree* Compiler::gtNewSimdBinOpNode( // and produce overall better codegen. 
assert(fgNodeThreading != NodeThreading::LIR); - op2 = gtNewSimdUnOpNode(GT_NOT, type, op2, simdBaseJitType, simdSize); - return gtNewSimdBinOpNode(GT_AND, type, op1, op2, simdBaseJitType, simdSize); + op2 = gtNewSimdUnOpNode(GT_NOT, type, op2, simdBaseJitType, simdSize ARM64_ARG(isScalable)); + return gtNewSimdBinOpNode(GT_AND, type, op1, op2, simdBaseJitType, simdSize ARM64_ARG(isScalable)); } #if defined(TARGET_XARCH) @@ -21423,11 +21458,36 @@ GenTree* Compiler::gtNewSimdBinOpNode( return gtNewSimdBinOpNode(GT_ADD, type, low, mid, simdBaseJitType, simdSize); } #elif defined(TARGET_ARM64) - if (varTypeIsLong(simdBaseType)) + if (isScalable) { - GenTree** op2ToDup = nullptr; + if (!varTypeIsSIMD(op2)) + { + if (varTypeIsFloating(op2)) + { + double op2Cns = 0.0; + if (op2->IsCnsFltOrDbl()) + { + op2Cns = op2->AsDblCon()->DconValue(); + } + if ((op2Cns == 0.5) || (op2Cns == 2.0)) + { + // GenTree* trueMask = gtNewSimdAllTrueMaskNode(simdBaseJitType, simdSize); + return gtNewSimdHWIntrinsicNode(type, op1, op2, NI_Sve_MultiplyByScalar, simdBaseJitType, + simdSize); + } + } + op2 = + gtNewSimdHWIntrinsicNode(type, op2, NI_Sve_DuplicateScalarToVector, simdBaseJitType, simdSize); + } + return gtNewSimdHWIntrinsicNode(type, op1, op2, NI_Sve_Multiply, simdBaseJitType, simdSize); + } + else if (varTypeIsLong(simdBaseType)) + { assert(varTypeIsSIMD(op1)); + + GenTree** op2ToDup = nullptr; + op1 = gtNewSimdToScalarNode(TYP_LONG, op1, simdBaseJitType, simdSize); GenTree** op1ToDup = &op1->AsHWIntrinsic()->Op(1); @@ -21457,11 +21517,13 @@ GenTree* Compiler::gtNewSimdBinOpNode( GenTree* op2Dup = fgMakeMultiUse(op2ToDup); assert(!varTypeIsArithmetic(op1Dup)); - op1Dup = gtNewSimdGetElementNode(TYP_LONG, op1Dup, gtNewIconNode(1), simdBaseJitType, simdSize); + op1Dup = gtNewSimdGetElementNode(TYP_LONG, op1Dup, gtNewIconNode(1), simdBaseJitType, + simdSize ARM64_ARG(isScalable)); if (!varTypeIsArithmetic(op2Dup)) { - op2Dup = gtNewSimdGetElementNode(TYP_LONG, op2Dup, gtNewIconNode(1), simdBaseJitType, simdSize); + op2Dup = gtNewSimdGetElementNode(TYP_LONG, op2Dup, gtNewIconNode(1), simdBaseJitType, + simdSize ARM64_ARG(isScalable)); } // upper = op1.GetElement(1) * op2.GetElement(1) @@ -21524,6 +21586,8 @@ GenTree* Compiler::gtNewSimdCeilNode(var_types type, GenTree* op1, CorInfoType s #error Unsupported platform #endif // !TARGET_XARCH && !TARGET_ARM64 + // intrinsic = GenTreeHWIntrinsic::GetScalableHWIntrinsicId(type, intrinsic); + assert(intrinsic != NI_Illegal); return gtNewSimdHWIntrinsicNode(type, op1, intrinsic, simdBaseJitType, simdSize); } @@ -21860,7 +21924,7 @@ GenTree* Compiler::gtNewSimdCvtNativeNode(var_types type, unreached(); } #elif defined(TARGET_ARM64) - assert((simdSize == 8) || (simdSize == 16)); + assert((simdSize == 8) || (simdSize == 16) || (SizeMatchesVectorTLength(simdSize))); switch (simdSourceBaseJitType) { @@ -21917,6 +21981,7 @@ GenTree* Compiler::gtNewSimdCvtNativeNode(var_types type, #error Unsupported platform #endif // !TARGET_XARCH && !TARGET_ARM64 + // hwIntrinsicID = GenTreeHWIntrinsic::GetScalableHWIntrinsicId(type, hwIntrinsicID); assert(hwIntrinsicID != NI_Illegal); return gtNewSimdHWIntrinsicNode(type, op1, hwIntrinsicID, simdSourceBaseJitType, simdSize); } @@ -21955,8 +22020,12 @@ GenTree* Compiler::gtNewSimdCvtVectorToMaskNode(var_types type, } #endif // FEATURE_MASKED_HW_INTRINSICS -GenTree* Compiler::gtNewSimdCmpOpNode( - genTreeOps op, var_types type, GenTree* op1, GenTree* op2, CorInfoType simdBaseJitType, unsigned simdSize) +GenTree* 
Compiler::gtNewSimdCmpOpNode(genTreeOps op, + var_types type, + GenTree* op1, + GenTree* op2, + CorInfoType simdBaseJitType, + unsigned simdSize ARM64_ARG(bool isScalable) ARM64_ARG(bool wrapInCmtv)) { assert(varTypeIsSIMD(type)); assert(getSIMDTypeForSize(simdSize) == type); @@ -21970,19 +22039,45 @@ GenTree* Compiler::gtNewSimdCmpOpNode( var_types simdBaseType = JitType2PreciseVarType(simdBaseJitType); assert(varTypeIsArithmetic(simdBaseType)); - var_types lookupType = GenTreeHWIntrinsic::GetLookupTypeForCmpOp(this, op, type, simdBaseType, simdSize); + var_types lookupType = + GenTreeHWIntrinsic::GetLookupTypeForCmpOp(this, op, type, simdBaseType, simdSize ARM64_ARG(isScalable)); NamedIntrinsic intrinsic = - GenTreeHWIntrinsic::GetHWIntrinsicIdForCmpOp(this, op, lookupType, op1, op2, simdBaseType, simdSize, false); + GenTreeHWIntrinsic::GetHWIntrinsicIdForCmpOp(this, op, lookupType, op1, op2, simdBaseType, simdSize, + false ARM64_ARG(isScalable)); if (intrinsic != NI_Illegal) { #if defined(FEATURE_MASKED_HW_INTRINSICS) - if (lookupType != type) +#if defined(TARGET_ARM64) + if (isScalable) { assert(varTypeIsMask(lookupType)); - GenTree* retNode = gtNewSimdHWIntrinsicNode(lookupType, op1, op2, intrinsic, simdBaseJitType, simdSize); - return gtNewSimdCvtMaskToVectorNode(type, retNode, simdBaseJitType, simdSize); + + if (wrapInCmtv) + { + // cndsel(result, 0xFF, 0) + GenTree* retNode = gtNewSimdHWIntrinsicNode(lookupType, op1, op2, intrinsic, simdBaseJitType, simdSize); + GenTree* allOnes = gtNewAllBitsSetConNode(type); + GenTree* allZeros = gtNewZeroConNode(Compiler::getSIMDTypeForSize(simdSize)); + return gtNewSimdHWIntrinsicNode(type, retNode, allOnes, allZeros, NI_Sve_ConditionalSelect, + simdBaseJitType, simdSize); + } + else + { + // will be wrapped by GetActiveElementCount + return gtNewSimdHWIntrinsicNode(lookupType, op1, op2, intrinsic, simdBaseJitType, simdSize); + } + } + else +#endif // TARGET_ARM64 + { + if (lookupType != type) + { + assert(varTypeIsMask(lookupType)); + GenTree* retNode = gtNewSimdHWIntrinsicNode(lookupType, op1, op2, intrinsic, simdBaseJitType, simdSize); + return gtNewSimdCvtMaskToVectorNode(type, retNode, simdBaseJitType, simdSize); + } } #else assert(lookupType == type); @@ -22234,8 +22329,9 @@ GenTree* Compiler::gtNewSimdCmpOpNode( assert(!canUseEvexEncodingDebugOnly()); #endif // TARGET_XARCH - GenTree* result = gtNewSimdCmpOpNode(GT_EQ, type, op1, op2, simdBaseJitType, simdSize); - return gtNewSimdUnOpNode(GT_NOT, type, result, simdBaseJitType, simdSize); + GenTree* result = + gtNewSimdCmpOpNode(GT_EQ, type, op1, op2, simdBaseJitType, simdSize ARM64_ARG(isScalable)); + return gtNewSimdUnOpNode(GT_NOT, type, result, simdBaseJitType, simdSize ARM64_ARG(isScalable)); } default: @@ -22245,8 +22341,12 @@ GenTree* Compiler::gtNewSimdCmpOpNode( } } -GenTree* Compiler::gtNewSimdCmpOpAllNode( - genTreeOps op, var_types type, GenTree* op1, GenTree* op2, CorInfoType simdBaseJitType, unsigned simdSize) +GenTree* Compiler::gtNewSimdCmpOpAllNode(genTreeOps op, + var_types type, + GenTree* op1, + GenTree* op2, + CorInfoType simdBaseJitType, + unsigned simdSize ARM64_ARG(bool isScalable)) { assert(type == TYP_INT); @@ -22333,7 +22433,39 @@ GenTree* Compiler::gtNewSimdCmpOpAllNode( #elif defined(TARGET_ARM64) case GT_EQ: { - intrinsic = (simdSize == 8) ? 
NI_Vector64_op_Equality : NI_Vector128_op_Equality; + if (UseSveForType(simdType) && isScalable) + { + assert(UseSveForType(simdType) && isScalable); + + intrinsic = NI_Vector_op_Equality; + GenTree* cmpResult = gtNewSimdCmpOpNode(op, simdType, op1, op2, simdBaseJitType, simdSize, + /* isScalable */ true, /* wrapInCmtv */ false); + + // The operation `p1 = SVE_CMP_CC(a, b)` returns predicate mask, having `1` for lanes for which `a CC b` + // is true. For `All` operation, we can perform `r1 = CNTP(p1)` and then if `r1 == VL`, it means `ALL` + // lanes satisfies the CC condition and hence can return true. So the operations will be: + // p1 = SVE_CMP_CC(a, b) + // r1 = CNTP(p1) + // r2 = CNT{B,H,W,D} // only for NativeAOT. For JIT, this is a constant + // cmp r1, r2 + // + // It can also be done without having to find out VL using CNT{B,H,W,D}, using something like: + // p1 = SVE_CMP_CC(a, b) + // p2 = SVE_NOT(p1) + // r1 = CNTP(p2) + // if r1 == 0 return true else false + // + // However, NOT() operation only operates on "byte" variant i.e. `p1.B`, while the result of `p1` from + // `SVE_CMP_CC` can be of other variants like `p1.S` or `p1.D`, etc. + GenTree* allTrue = gtNewSimdAllTrueMaskNode(simdBaseJitType, simdSize); + op1 = gtNewSimdHWIntrinsicNode(TYP_LONG, allTrue, cmpResult, NI_Sve_GetActiveElementCount, + simdBaseJitType, simdSize); + op2 = gtNewSimdFalseMaskByteNode(); + } + else + { + intrinsic = (simdSize == 8) ? NI_Vector64_op_Equality : NI_Vector128_op_Equality; + } break; } @@ -22345,18 +22477,43 @@ GenTree* Compiler::gtNewSimdCmpOpAllNode( // We want to generate a comparison along the lines of // GT_XX(op1, op2).As() == Vector128.AllBitsSet - if (simdSize == 8) + if (UseSveForType(simdType) && isScalable) { - intrinsic = NI_Vector64_op_Equality; + intrinsic = NI_Vector_op_Equality; + + GenTree* cmpResult = + gtNewSimdCmpOpNode(op, simdType, op1, op2, simdBaseJitType, + simdSize /* isScalable */ ARM64_ARG(true) /* wrapInCmtv */ ARM64_ARG(false)); + + // The operation `p1 = SVE_CMP_CC(a, b)` returns predicate mask, having `1` for lanes for which `a CC b` + // is true. For `All` operation, we can perform `r1 = CNTP(p1)` and then if `r1 == VL`, it means `ALL` + // lanes satisfies the CC condition and hence can return true. So the operations will be: + // p1 = SVE_CMP_CC(a, b) + // r1 = CNTP(p1) + // r2 = CNT{B,H,W,D} // only for NativeAOT. For JIT, this is a constant + // cmp r1, r2 + // + // It can also be done without having to find out VL using CNT{B,H,W,D}, using something like: + // p1 = SVE_CMP_CC(a, b) + // p2 = SVE_NOT(p1) + // r1 = CNTP(p2) + // if r1 == 0 return true else false + // + // However, NOT() operation only operates on "byte" variant i.e. `p1.B`, while the result of `p1` from + // `SVE_CMP_CC` can be of other variants like `p1.S` or `p1.D`, etc. + GenTree* allTrue = gtNewSimdAllTrueMaskNode(simdBaseJitType, simdSize); + op1 = gtNewSimdHWIntrinsicNode(TYP_LONG, allTrue, cmpResult, NI_Sve_GetActiveElementCount, + simdBaseJitType, simdSize); + op2 = gtNewSimdFalseMaskByteNode(); } else { - intrinsic = NI_Vector128_op_Equality; + intrinsic = (simdSize == 8) ? 
NI_Vector64_op_Equality : NI_Vector128_op_Equality; + op1 = gtNewSimdCmpOpNode(op, simdType, op1, op2, simdBaseJitType, simdSize, /* isScalable */ false, + /* wrapInCmtv */ false); + op2 = gtNewAllBitsSetConNode(simdType); } - op1 = gtNewSimdCmpOpNode(op, simdType, op1, op2, simdBaseJitType, simdSize); - op2 = gtNewAllBitsSetConNode(simdType); - if (simdBaseType == TYP_FLOAT) { simdBaseType = TYP_INT; @@ -22379,12 +22536,18 @@ GenTree* Compiler::gtNewSimdCmpOpAllNode( } } + // intrinsic = GenTreeHWIntrinsic::GetScalableHWIntrinsicId(type, intrinsic); + assert(intrinsic != NI_Illegal); return gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, simdBaseJitType, simdSize); } -GenTree* Compiler::gtNewSimdCmpOpAnyNode( - genTreeOps op, var_types type, GenTree* op1, GenTree* op2, CorInfoType simdBaseJitType, unsigned simdSize) +GenTree* Compiler::gtNewSimdCmpOpAnyNode(genTreeOps op, + var_types type, + GenTree* op1, + GenTree* op2, + CorInfoType simdBaseJitType, + unsigned simdSize ARM64_ARG(bool isScalable)) { assert(type == TYP_INT); @@ -22480,10 +22643,33 @@ GenTree* Compiler::gtNewSimdCmpOpAnyNode( // We want to generate a comparison along the lines of // GT_XX(op1, op2).As() != Vector128.Zero - intrinsic = (simdSize == 8) ? NI_Vector64_op_Inequality : NI_Vector128_op_Inequality; + if (UseSveForType(simdType) && isScalable) + { + intrinsic = NI_Vector_op_Inequality; + + GenTree* cmpResult = gtNewSimdCmpOpNode(op, simdType, op1, op2, simdBaseJitType, simdSize, + /* isScalable */ true, /* wrapInCmtv */ false); - op1 = gtNewSimdCmpOpNode(op, simdType, op1, op2, simdBaseJitType, simdSize); - op2 = gtNewZeroConNode(simdType); + // The operation `p1 = SVE_CMP_CC(a, b)` returns a predicate mask, having `1` for lanes for which `a CC b` + // is true. For the `Any` operation, we can perform `r1 = CNTP(p1)` and then if `r1 != 0`, it means `SOME` + // lanes satisfy the CC condition and hence we can return true. So the operations will be: + // p1 = SVE_CMP_CC(a, b) + // r1 = CNTP(p1) + // if r1 != 0 return true else false + + GenTree* allTrue = gtNewSimdAllTrueMaskNode(simdBaseJitType, simdSize); + op1 = gtNewSimdHWIntrinsicNode(TYP_LONG, allTrue, cmpResult, NI_Sve_GetActiveElementCount, + simdBaseJitType, simdSize); + + op2 = gtNewSimdFalseMaskByteNode(); + } + else + { + intrinsic = (simdSize == 8) ? NI_Vector64_op_Inequality : NI_Vector128_op_Inequality; + op1 = gtNewSimdCmpOpNode(op, simdType, op1, op2, simdBaseJitType, simdSize, /* isScalable */ false, + /* wrapInCmtv */ false); + op2 = gtNewZeroConNode(simdType); + } if (simdBaseType == TYP_FLOAT) { @@ -22500,7 +22686,30 @@ GenTree* Compiler::gtNewSimdCmpOpAnyNode( case GT_NE: { - intrinsic = (simdSize == 8) ? NI_Vector64_op_Inequality : NI_Vector128_op_Inequality; + if (UseSveForType(simdType) && isScalable) + { + intrinsic = NI_Vector_op_Inequality; + + GenTree* cmpResult = gtNewSimdCmpOpNode(op, simdType, op1, op2, simdBaseJitType, simdSize, + /* isScalable */ true, /* wrapInCmtv */ false); + + // The operation `p1 = SVE_CMP_CC(a, b)` returns a predicate mask, having `1` for lanes for which `a CC b` + // is true. For the `Any` operation, we can perform `r1 = CNTP(p1)` and then if `r1 != 0`, it means `SOME` + // lanes satisfy the CC condition and hence we can return true.
So the operations will be: + // p1 = SVE_CMP_CC(a, b) + // r1 = CNTP(p1) + // if r1 != 0 return true else false + + GenTree* allTrue = gtNewSimdAllTrueMaskNode(simdBaseJitType, simdSize); + op1 = gtNewSimdHWIntrinsicNode(TYP_LONG, allTrue, cmpResult, NI_Sve_GetActiveElementCount, + simdBaseJitType, simdSize); + + op2 = gtNewSimdFalseMaskByteNode(); + } + else + { + intrinsic = (simdSize == 8) ? NI_Vector64_op_Inequality : NI_Vector128_op_Inequality; + } break; } #else @@ -22513,6 +22722,8 @@ GenTree* Compiler::gtNewSimdCmpOpAnyNode( } } + // intrinsic = GenTreeHWIntrinsic::GetScalableHWIntrinsicId(type, intrinsic); + assert(intrinsic != NI_Illegal); return gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsic, simdBaseJitType, simdSize); } @@ -22661,7 +22872,6 @@ GenTree* Compiler::gtNewSimdCreateBroadcastNode(var_types type, unreached(); } } - return vecCon; } @@ -22675,7 +22885,11 @@ GenTree* Compiler::gtNewSimdCreateBroadcastNode(var_types type, hwIntrinsicID = NI_Vector256_Create; } #elif defined(TARGET_ARM64) - if (simdSize == 8) + if ((simdSize == 64) || (simdSize == 32)) + { + hwIntrinsicID = NI_Vector_Create; + } + else if (simdSize == 8) { hwIntrinsicID = NI_Vector64_Create; } @@ -23103,13 +23317,13 @@ GenTree* Compiler::gtNewSimdCreateSequenceNode( else { GenTree* indices = gtNewSimdGetIndicesNode(type, simdBaseJitType, simdSize); - result = gtNewSimdBinOpNode(GT_MUL, type, indices, op2, simdBaseJitType, simdSize); + result = gtNewSimdBinOpNode(GT_MUL, type, indices, op2, simdBaseJitType, simdSize ARM64_ARG(false)); } if (isPartial) { GenTree* start = gtNewSimdCreateBroadcastNode(type, op1, simdBaseJitType, simdSize); - result = gtNewSimdBinOpNode(GT_ADD, type, result, start, simdBaseJitType, simdSize); + result = gtNewSimdBinOpNode(GT_ADD, type, result, start, simdBaseJitType, simdSize ARM64_ARG(false)); } return result; @@ -23200,6 +23414,8 @@ GenTree* Compiler::gtNewSimdFloorNode(var_types type, GenTree* op1, CorInfoType #error Unsupported platform #endif // !TARGET_XARCH && !TARGET_ARM64 + // intrinsic = GenTreeHWIntrinsic::GetScalableHWIntrinsicId(type, intrinsic); + assert(intrinsic != NI_Illegal); return gtNewSimdHWIntrinsicNode(type, op1, intrinsic, simdBaseJitType, simdSize); } @@ -23253,12 +23469,17 @@ GenTree* Compiler::gtNewSimdFmaNode( #error Unsupported platform #endif // !TARGET_XARCH && !TARGET_ARM64 + // intrinsic = GenTreeHWIntrinsic::GetScalableHWIntrinsicId(type, intrinsic); + assert(intrinsic != NI_Illegal); return gtNewSimdHWIntrinsicNode(type, op1, op2, op3, intrinsic, simdBaseJitType, simdSize); } -GenTree* Compiler::gtNewSimdGetElementNode( - var_types type, GenTree* op1, GenTree* op2, CorInfoType simdBaseJitType, unsigned simdSize) +GenTree* Compiler::gtNewSimdGetElementNode(var_types type, + GenTree* op1, + GenTree* op2, + CorInfoType simdBaseJitType, + unsigned simdSize ARM64_ARG(bool isScalable)) { NamedIntrinsic intrinsicId = NI_Vector128_GetElement; var_types simdBaseType = JitType2PreciseVarType(simdBaseJitType); @@ -23336,6 +23557,16 @@ GenTree* Compiler::gtNewSimdGetElementNode( op2 = addRangeCheckForHWIntrinsic(op2, 0, immUpperBound); } +#if defined(TARGET_ARM64) + if (isScalable) + { + var_types op1Type = op1->TypeGet(); + op1 = gtNewSimdHWIntrinsicNode(op1Type, op1, op2, NI_Sve_DuplicateSelectedScalarToVector, simdBaseJitType, + simdSize); + return gtNewSimdToScalarNode(type, op1, simdBaseJitType, 16); + } +#endif + return gtNewSimdHWIntrinsicNode(type, op1, op2, intrinsicId, simdBaseJitType, simdSize); } @@ -23514,8 +23745,9 @@ GenTree* 
Compiler::gtNewSimdIsEvenIntegerNode(var_types type, var_types simdBaseType = JitType2PreciseVarType(simdBaseJitType); assert(varTypeIsIntegral(simdBaseType)); - op1 = gtNewSimdBinOpNode(GT_AND, type, op1, gtNewOneConNode(type, simdBaseType), simdBaseJitType, simdSize); - return gtNewSimdIsZeroNode(type, op1, simdBaseJitType, simdSize); + op1 = gtNewSimdBinOpNode(GT_AND, type, op1, gtNewOneConNode(type, simdBaseType), simdBaseJitType, + simdSize ARM64_ARG(false)); + return gtNewSimdIsZeroNode(type, op1, simdBaseJitType, simdSize ARM64_ARG(false)); } //---------------------------------------------------------------------------------------------- @@ -23561,8 +23793,9 @@ GenTree* Compiler::gtNewSimdIsFiniteNode(var_types type, GenTree* op1, CorInfoTy } cnsNode = gtNewSimdCreateBroadcastNode(type, cnsNode, simdBaseJitType, simdSize); - op1 = gtNewSimdBinOpNode(GT_AND_NOT, type, cnsNode, op1, simdBaseJitType, simdSize); - return gtNewSimdCmpOpNode(GT_NE, type, op1, gtNewZeroConNode(type), simdBaseJitType, simdSize); + assert(varTypeIsNeonSIMD(type)); + op1 = gtNewSimdBinOpNode(GT_AND_NOT, type, cnsNode, op1, simdBaseJitType, simdSize ARM64_ARG(false)); + return gtNewSimdCmpOpNode(GT_NE, type, op1, gtNewZeroConNode(type), simdBaseJitType, simdSize ARM64_ARG(false)); } assert(varTypeIsIntegral(simdBaseType)); @@ -23595,7 +23828,7 @@ GenTree* Compiler::gtNewSimdIsInfinityNode(var_types type, GenTree* op1, CorInfo if (varTypeIsFloating(simdBaseType)) { op1 = gtNewSimdAbsNode(type, op1, simdBaseJitType, simdSize); - return gtNewSimdIsPositiveInfinityNode(type, op1, simdBaseJitType, simdSize); + return gtNewSimdIsPositiveInfinityNode(type, op1, simdBaseJitType, simdSize ARM64_ARG(false)); } return gtNewZeroConNode(type); } @@ -23622,6 +23855,7 @@ GenTree* Compiler::gtNewSimdIsIntegerNode(var_types type, GenTree* op1, CorInfoT var_types simdBaseType = JitType2PreciseVarType(simdBaseJitType); assert(varTypeIsArithmetic(simdBaseType)); + assert(varTypeIsNeonSIMD(type)); if (varTypeIsFloating(simdBaseType)) { @@ -23631,9 +23865,9 @@ GenTree* Compiler::gtNewSimdIsIntegerNode(var_types type, GenTree* op1, CorInfoT op1 = gtNewSimdIsFiniteNode(type, op1, simdBaseJitType, simdSize); op1Dup1 = gtNewSimdTruncNode(type, op1Dup1, simdBaseJitType, simdSize); - GenTree* op2 = gtNewSimdCmpOpNode(GT_EQ, type, op1Dup1, op1Dup2, simdBaseJitType, simdSize); + GenTree* op2 = gtNewSimdCmpOpNode(GT_EQ, type, op1Dup1, op1Dup2, simdBaseJitType, simdSize ARM64_ARG(false)); - return gtNewSimdBinOpNode(GT_AND, type, op1, op2, simdBaseJitType, simdSize); + return gtNewSimdBinOpNode(GT_AND, type, op1, op2, simdBaseJitType, simdSize ARM64_ARG(false)); } assert(varTypeIsIntegral(simdBaseType)); @@ -23652,7 +23886,10 @@ GenTree* Compiler::gtNewSimdIsIntegerNode(var_types type, GenTree* op1, CorInfoT // Returns: // The created IsNaN node // -GenTree* Compiler::gtNewSimdIsNaNNode(var_types type, GenTree* op1, CorInfoType simdBaseJitType, unsigned simdSize) +GenTree* Compiler::gtNewSimdIsNaNNode(var_types type, + GenTree* op1, + CorInfoType simdBaseJitType, + unsigned simdSize ARM64_ARG(bool isScalable)) { assert(varTypeIsSIMD(type)); assert(getSIMDTypeForSize(simdSize) == type); @@ -23666,7 +23903,8 @@ GenTree* Compiler::gtNewSimdIsNaNNode(var_types type, GenTree* op1, CorInfoType if (varTypeIsFloating(simdBaseType)) { GenTree* op1Dup = fgMakeMultiUse(&op1); - return gtNewSimdCmpOpNode(GT_NE, type, op1, op1Dup, simdBaseJitType, simdSize); + return gtNewSimdCmpOpNode(GT_NE, type, op1, op1Dup, simdBaseJitType, + simdSize 
ARM64_ARG(isScalable) ARM64_ARG(true)); } return gtNewZeroConNode(type); } @@ -23683,7 +23921,10 @@ GenTree* Compiler::gtNewSimdIsNaNNode(var_types type, GenTree* op1, CorInfoType // Returns: // The created IsNegative node // -GenTree* Compiler::gtNewSimdIsNegativeNode(var_types type, GenTree* op1, CorInfoType simdBaseJitType, unsigned simdSize) +GenTree* Compiler::gtNewSimdIsNegativeNode(var_types type, + GenTree* op1, + CorInfoType simdBaseJitType, + unsigned simdSize ARM64_ARG(bool isScalable)) { assert(varTypeIsSIMD(type)); assert(getSIMDTypeForSize(simdSize) == type); @@ -23707,7 +23948,8 @@ GenTree* Compiler::gtNewSimdIsNegativeNode(var_types type, GenTree* op1, CorInfo { return gtNewZeroConNode(type); } - return gtNewSimdCmpOpNode(GT_LT, type, op1, gtNewZeroConNode(type), simdBaseJitType, simdSize); + return gtNewSimdCmpOpNode(GT_LT, type, op1, gtNewZeroConNode(type), simdBaseJitType, + simdSize ARM64_ARG(isScalable)); } //---------------------------------------------------------------------------------------------- @@ -23756,7 +23998,7 @@ GenTree* Compiler::gtNewSimdIsNegativeInfinityNode(var_types type, } cnsNode = gtNewSimdCreateBroadcastNode(type, cnsNode, simdBaseJitType, simdSize); - return gtNewSimdCmpOpNode(GT_EQ, type, op1, cnsNode, simdBaseJitType, simdSize); + return gtNewSimdCmpOpNode(GT_EQ, type, op1, cnsNode, simdBaseJitType, simdSize ARM64_ARG(false)); } return gtNewZeroConNode(type); } @@ -23783,6 +24025,7 @@ GenTree* Compiler::gtNewSimdIsNormalNode(var_types type, GenTree* op1, CorInfoTy var_types simdBaseType = JitType2PreciseVarType(simdBaseJitType); assert(varTypeIsArithmetic(simdBaseType)); + assert(varTypeIsNeonSIMD(type)); if (varTypeIsFloating(simdBaseType)) { @@ -23813,12 +24056,12 @@ GenTree* Compiler::gtNewSimdIsNormalNode(var_types type, GenTree* op1, CorInfoTy cnsNode1 = gtNewSimdCreateBroadcastNode(type, cnsNode1, simdBaseJitType, simdSize); cnsNode2 = gtNewSimdCreateBroadcastNode(type, cnsNode2, simdBaseJitType, simdSize); - op1 = gtNewSimdBinOpNode(GT_SUB, type, op1, cnsNode1, simdBaseJitType, simdSize); - return gtNewSimdCmpOpNode(GT_LT, type, op1, cnsNode2, simdBaseJitType, simdSize); + op1 = gtNewSimdBinOpNode(GT_SUB, type, op1, cnsNode1, simdBaseJitType, simdSize ARM64_ARG(false)); + return gtNewSimdCmpOpNode(GT_LT, type, op1, cnsNode2, simdBaseJitType, simdSize ARM64_ARG(false)); } assert(varTypeIsIntegral(simdBaseType)); - return gtNewSimdCmpOpNode(GT_NE, type, op1, gtNewZeroConNode(type), simdBaseJitType, simdSize); + return gtNewSimdCmpOpNode(GT_NE, type, op1, gtNewZeroConNode(type), simdBaseJitType, simdSize ARM64_ARG(false)); } //---------------------------------------------------------------------------------------------- @@ -23840,6 +24083,7 @@ GenTree* Compiler::gtNewSimdIsOddIntegerNode(var_types type, { assert(varTypeIsSIMD(type)); assert(getSIMDTypeForSize(simdSize) == type); + assert(varTypeIsNeonSIMD(type)); assert(op1 != nullptr); assert(op1->TypeIs(type)); @@ -23847,8 +24091,9 @@ GenTree* Compiler::gtNewSimdIsOddIntegerNode(var_types type, var_types simdBaseType = JitType2PreciseVarType(simdBaseJitType); assert(varTypeIsIntegral(simdBaseType)); - op1 = gtNewSimdBinOpNode(GT_AND, type, op1, gtNewOneConNode(type, simdBaseType), simdBaseJitType, simdSize); - return gtNewSimdCmpOpNode(GT_NE, type, op1, gtNewZeroConNode(type), simdBaseJitType, simdSize); + op1 = gtNewSimdBinOpNode(GT_AND, type, op1, gtNewOneConNode(type, simdBaseType), simdBaseJitType, + simdSize ARM64_ARG(false)); + return gtNewSimdCmpOpNode(GT_NE, type, op1, 
gtNewZeroConNode(type), simdBaseJitType, simdSize ARM64_ARG(false)); } //---------------------------------------------------------------------------------------------- @@ -23863,7 +24108,10 @@ GenTree* Compiler::gtNewSimdIsOddIntegerNode(var_types type, // Returns: // The created IsPositive node // -GenTree* Compiler::gtNewSimdIsPositiveNode(var_types type, GenTree* op1, CorInfoType simdBaseJitType, unsigned simdSize) +GenTree* Compiler::gtNewSimdIsPositiveNode(var_types type, + GenTree* op1, + CorInfoType simdBaseJitType, + unsigned simdSize ARM64_ARG(bool isScalable)) { assert(varTypeIsSIMD(type)); assert(getSIMDTypeForSize(simdSize) == type); @@ -23887,7 +24135,8 @@ GenTree* Compiler::gtNewSimdIsPositiveNode(var_types type, GenTree* op1, CorInfo { return gtNewAllBitsSetConNode(type); } - return gtNewSimdCmpOpNode(GT_GE, type, op1, gtNewZeroConNode(type), simdBaseJitType, simdSize); + return gtNewSimdCmpOpNode(GT_GE, type, op1, gtNewZeroConNode(type), simdBaseJitType, + simdSize ARM64_ARG(isScalable) ARM64_ARG(true)); } //---------------------------------------------------------------------------------------------- @@ -23902,10 +24151,10 @@ GenTree* Compiler::gtNewSimdIsPositiveNode(var_types type, GenTree* op1, CorInfo // Returns: // The created IsPositiveInfinity node // -GenTree* Compiler::gtNewSimdIsPositiveInfinityNode(var_types type, - GenTree* op1, - CorInfoType simdBaseJitType, - unsigned simdSize) +GenTree* Compiler::gtNewSimdIsPositiveInfinityNode(var_types type, + GenTree* op1, + CorInfoType simdBaseJitType, + unsigned simdSize ARM64_ARG(bool isScalable)) { assert(varTypeIsSIMD(type)); assert(getSIMDTypeForSize(simdSize) == type); @@ -23936,7 +24185,8 @@ GenTree* Compiler::gtNewSimdIsPositiveInfinityNode(var_types type, } cnsNode = gtNewSimdCreateBroadcastNode(type, cnsNode, simdBaseJitType, simdSize); - return gtNewSimdCmpOpNode(GT_EQ, type, op1, cnsNode, simdBaseJitType, simdSize); + return gtNewSimdCmpOpNode(GT_EQ, type, op1, cnsNode, simdBaseJitType, + simdSize ARM64_ARG(isScalable) ARM64_ARG(true)); } return gtNewZeroConNode(type); } @@ -23994,9 +24244,9 @@ GenTree* Compiler::gtNewSimdIsSubnormalNode(var_types type, cnsNode1 = gtNewOneConNode(type, simdBaseType); cnsNode2 = gtNewSimdCreateBroadcastNode(type, cnsNode2, simdBaseJitType, simdSize); - op1 = gtNewSimdBinOpNode(GT_SUB, type, op1, cnsNode1, simdBaseJitType, simdSize); + op1 = gtNewSimdBinOpNode(GT_SUB, type, op1, cnsNode1, simdBaseJitType, simdSize ARM64_ARG(false)); - return gtNewSimdCmpOpNode(GT_LT, type, op1, cnsNode2, simdBaseJitType, simdSize); + return gtNewSimdCmpOpNode(GT_LT, type, op1, cnsNode2, simdBaseJitType, simdSize ARM64_ARG(false)); } return gtNewZeroConNode(type); } @@ -24013,7 +24263,10 @@ GenTree* Compiler::gtNewSimdIsSubnormalNode(var_types type, // Returns: // The created IsZero node // -GenTree* Compiler::gtNewSimdIsZeroNode(var_types type, GenTree* op1, CorInfoType simdBaseJitType, unsigned simdSize) +GenTree* Compiler::gtNewSimdIsZeroNode(var_types type, + GenTree* op1, + CorInfoType simdBaseJitType, + unsigned simdSize ARM64_ARG(bool isScalable)) { assert(varTypeIsSIMD(type)); assert(getSIMDTypeForSize(simdSize) == type); @@ -24024,7 +24277,8 @@ GenTree* Compiler::gtNewSimdIsZeroNode(var_types type, GenTree* op1, CorInfoType var_types simdBaseType = JitType2PreciseVarType(simdBaseJitType); assert(varTypeIsArithmetic(simdBaseType)); - return gtNewSimdCmpOpNode(GT_EQ, type, op1, gtNewZeroConNode(type), simdBaseJitType, simdSize); + return gtNewSimdCmpOpNode(GT_EQ, type, op1, 
gtNewZeroConNode(type), simdBaseJitType, + simdSize ARM64_ARG(isScalable)); } //---------------------------------------------------------------------------------------------- @@ -24216,14 +24470,14 @@ GenTree* Compiler::gtNewSimdLoadNonTemporalNode(var_types type, // Return Value: // The node representing the minimum or maximum operation // -GenTree* Compiler::gtNewSimdMinMaxNode(var_types type, - GenTree* op1, - GenTree* op2, - CorInfoType simdBaseJitType, - unsigned simdSize, - bool isMax, - bool isMagnitude, - bool isNumber) +GenTree* Compiler::gtNewSimdMinMaxNode(var_types type, + GenTree* op1, + GenTree* op2, + CorInfoType simdBaseJitType, + unsigned simdSize, + bool isMax, + bool isMagnitude, + bool isNumber ARM64_ARG(bool isScalable)) { assert(op1 != nullptr); assert(op1->TypeIs(type)); @@ -24686,7 +24940,7 @@ GenTree* Compiler::gtNewSimdMinMaxNode(var_types type, #elif defined(TARGET_ARM64) if (!isMagnitude && !isNumber) { - return gtNewSimdMinMaxNativeNode(type, op1, op2, simdBaseJitType, simdSize, isMax); + return gtNewSimdMinMaxNativeNode(type, op1, op2, simdBaseJitType, simdSize, isMax ARM64_ARG(isScalable)); } if (isScalar) @@ -24703,6 +24957,8 @@ GenTree* Compiler::gtNewSimdMinMaxNode(var_types type, if (retNode == nullptr) { + // intrinsic = GenTreeHWIntrinsic::GetScalableHWIntrinsicId(type, intrinsic); + GenTree* op1Dup = fgMakeMultiUse(&op1); GenTree* op2Dup = fgMakeMultiUse(&op2); @@ -24733,58 +24989,68 @@ GenTree* Compiler::gtNewSimdMinMaxNode(var_types type, absOp1Dup = fgMakeMultiUse(&absOp1); absOp2Dup = fgMakeMultiUse(&absOp2); - equalsMask = gtNewSimdCmpOpNode(GT_EQ, type, absOp1, absOp2, simdBaseJitType, simdSize); + equalsMask = + gtNewSimdCmpOpNode(GT_EQ, type, absOp1, absOp2, simdBaseJitType, simdSize ARM64_ARG(isScalable)); if (isMax) { - signMask = gtNewSimdIsPositiveNode(type, op1Dup, simdBaseJitType, simdSize); - cmpMask = gtNewSimdCmpOpNode(GT_GT, type, absOp1Dup, absOp2Dup, simdBaseJitType, simdSize); + signMask = gtNewSimdIsPositiveNode(type, op1Dup, simdBaseJitType, simdSize ARM64_ARG(isScalable)); + cmpMask = gtNewSimdCmpOpNode(GT_GT, type, absOp1Dup, absOp2Dup, simdBaseJitType, + simdSize ARM64_ARG(isScalable)); } else { - signMask = gtNewSimdIsNegativeNode(type, op1Dup, simdBaseJitType, simdSize); - cmpMask = gtNewSimdCmpOpNode(GT_LT, type, absOp1Dup, absOp2Dup, simdBaseJitType, simdSize); + signMask = gtNewSimdIsNegativeNode(type, op1Dup, simdBaseJitType, simdSize ARM64_ARG(isScalable)); + cmpMask = gtNewSimdCmpOpNode(GT_LT, type, absOp1Dup, absOp2Dup, simdBaseJitType, + simdSize ARM64_ARG(isScalable)); } if (isNumber) { - nanMask = gtNewSimdIsNaNNode(type, gtCloneExpr(absOp2Dup), simdBaseJitType, simdSize); + nanMask = gtNewSimdIsNaNNode(type, gtCloneExpr(absOp2Dup), simdBaseJitType, + simdSize ARM64_ARG(isScalable)); } else { - nanMask = gtNewSimdIsNaNNode(type, gtCloneExpr(absOp1Dup), simdBaseJitType, simdSize); + nanMask = gtNewSimdIsNaNNode(type, gtCloneExpr(absOp1Dup), simdBaseJitType, + simdSize ARM64_ARG(isScalable)); } } else { - equalsMask = gtNewSimdCmpOpNode(GT_EQ, type, op1, op2, simdBaseJitType, simdSize); + equalsMask = gtNewSimdCmpOpNode(GT_EQ, type, op1, op2, simdBaseJitType, simdSize ARM64_ARG(isScalable)); if (isMax) { - signMask = gtNewSimdIsNegativeNode(type, op2Dup, simdBaseJitType, simdSize); - cmpMask = gtNewSimdCmpOpNode(GT_LT, type, gtCloneExpr(op2Dup), op1Dup, simdBaseJitType, simdSize); + signMask = gtNewSimdIsNegativeNode(type, op2Dup, simdBaseJitType, simdSize ARM64_ARG(isScalable)); + cmpMask = gtNewSimdCmpOpNode(GT_LT, 
type, gtCloneExpr(op2Dup), op1Dup, simdBaseJitType, + simdSize ARM64_ARG(isScalable)); } else { - signMask = gtNewSimdIsNegativeNode(type, op1Dup, simdBaseJitType, simdSize); - cmpMask = gtNewSimdCmpOpNode(GT_LT, type, gtCloneExpr(op1Dup), op2Dup, simdBaseJitType, simdSize); + signMask = gtNewSimdIsNegativeNode(type, op1Dup, simdBaseJitType, simdSize ARM64_ARG(isScalable)); + cmpMask = gtNewSimdCmpOpNode(GT_LT, type, gtCloneExpr(op1Dup), op2Dup, simdBaseJitType, + simdSize ARM64_ARG(isScalable)); } if (isNumber) { - nanMask = gtNewSimdIsNaNNode(type, gtCloneExpr(op2Dup), simdBaseJitType, simdSize); + nanMask = + gtNewSimdIsNaNNode(type, gtCloneExpr(op2Dup), simdBaseJitType, simdSize ARM64_ARG(isScalable)); } else { - nanMask = gtNewSimdIsNaNNode(type, gtCloneExpr(op1Dup), simdBaseJitType, simdSize); + nanMask = + gtNewSimdIsNaNNode(type, gtCloneExpr(op1Dup), simdBaseJitType, simdSize ARM64_ARG(isScalable)); } op2Dup = gtCloneExpr(op2Dup); } - GenTree* mask = gtNewSimdBinOpNode(GT_AND, type, equalsMask, signMask, simdBaseJitType, simdSize); - mask = gtNewSimdBinOpNode(GT_OR, type, mask, nanMask, simdBaseJitType, simdSize); - mask = gtNewSimdBinOpNode(GT_OR, type, mask, cmpMask, simdBaseJitType, simdSize); + GenTree* mask = + gtNewSimdBinOpNode(GT_AND, type, equalsMask, signMask, simdBaseJitType, simdSize ARM64_ARG(isScalable)); + mask = gtNewSimdBinOpNode(GT_OR, type, mask, nanMask, simdBaseJitType, simdSize ARM64_ARG(isScalable)); + mask = gtNewSimdBinOpNode(GT_OR, type, mask, cmpMask, simdBaseJitType, simdSize ARM64_ARG(isScalable)); retNode = gtNewSimdCndSelNode(type, mask, gtCloneExpr(op1Dup), op2Dup, simdBaseJitType, simdSize); } @@ -24796,7 +25062,6 @@ GenTree* Compiler::gtNewSimdMinMaxNode(var_types type, } return retNode; } - assert(!isScalar); if (isMagnitude) @@ -24810,7 +25075,8 @@ GenTree* Compiler::gtNewSimdMinMaxNode(var_types type, GenTree* absOp1Dup = fgMakeMultiUse(&absOp1); GenTree* absOp2Dup = fgMakeMultiUse(&absOp2); - GenTree* equalsMask = gtNewSimdCmpOpNode(GT_EQ, type, absOp1, absOp2, simdBaseJitType, simdSize); + GenTree* equalsMask = + gtNewSimdCmpOpNode(GT_EQ, type, absOp1, absOp2, simdBaseJitType, simdSize ARM64_ARG(isScalable)); ; GenTree* signMask1 = nullptr; GenTree* signMask2 = nullptr; @@ -24819,29 +25085,31 @@ GenTree* Compiler::gtNewSimdMinMaxNode(var_types type, if (isMax) { - signMask1 = gtNewSimdIsNegativeNode(type, op2Dup, simdBaseJitType, simdSize); - signMask2 = gtNewSimdIsPositiveNode(type, absOp2Dup, simdBaseJitType, simdSize); - signMask3 = gtNewSimdIsNegativeNode(type, absOp1Dup, simdBaseJitType, simdSize); + signMask1 = gtNewSimdIsNegativeNode(type, op2Dup, simdBaseJitType, simdSize ARM64_ARG(isScalable)); + signMask2 = gtNewSimdIsPositiveNode(type, absOp2Dup, simdBaseJitType, simdSize ARM64_ARG(isScalable)); + signMask3 = gtNewSimdIsNegativeNode(type, absOp1Dup, simdBaseJitType, simdSize ARM64_ARG(isScalable)); cmpMask = gtNewSimdCmpOpNode(GT_GT, type, gtCloneExpr(absOp1Dup), gtCloneExpr(absOp2Dup), simdBaseJitType, - simdSize); + simdSize ARM64_ARG(isScalable)); } else { - signMask1 = gtNewSimdIsNegativeNode(type, op1Dup, simdBaseJitType, simdSize); - signMask2 = gtNewSimdIsPositiveNode(type, absOp1Dup, simdBaseJitType, simdSize); - signMask3 = gtNewSimdIsNegativeNode(type, absOp2Dup, simdBaseJitType, simdSize); + signMask1 = gtNewSimdIsNegativeNode(type, op1Dup, simdBaseJitType, simdSize ARM64_ARG(isScalable)); + signMask2 = gtNewSimdIsPositiveNode(type, absOp1Dup, simdBaseJitType, simdSize ARM64_ARG(isScalable)); + signMask3 = 
gtNewSimdIsNegativeNode(type, absOp2Dup, simdBaseJitType, simdSize ARM64_ARG(isScalable)); cmpMask = gtNewSimdCmpOpNode(GT_LT, type, gtCloneExpr(absOp1Dup), gtCloneExpr(absOp2Dup), simdBaseJitType, - simdSize); + simdSize ARM64_ARG(isScalable)); } - GenTree* mask1 = gtNewSimdBinOpNode(GT_AND, type, equalsMask, signMask1, simdBaseJitType, simdSize); - GenTree* mask2 = gtNewSimdBinOpNode(GT_AND, type, cmpMask, signMask2, simdBaseJitType, simdSize); - GenTree* mask3 = gtNewSimdBinOpNode(GT_OR, type, mask1, mask2, simdBaseJitType, simdSize); - mask3 = gtNewSimdBinOpNode(GT_OR, type, mask3, signMask3, simdBaseJitType, simdSize); + GenTree* mask1 = + gtNewSimdBinOpNode(GT_AND, type, equalsMask, signMask1, simdBaseJitType, simdSize ARM64_ARG(isScalable)); + GenTree* mask2 = + gtNewSimdBinOpNode(GT_AND, type, cmpMask, signMask2, simdBaseJitType, simdSize ARM64_ARG(isScalable)); + GenTree* mask3 = gtNewSimdBinOpNode(GT_OR, type, mask1, mask2, simdBaseJitType, simdSize ARM64_ARG(isScalable)); + mask3 = gtNewSimdBinOpNode(GT_OR, type, mask3, signMask3, simdBaseJitType, simdSize ARM64_ARG(isScalable)); return gtNewSimdCndSelNode(type, mask3, gtCloneExpr(op1Dup), gtCloneExpr(op2Dup), simdBaseJitType, simdSize); } - return gtNewSimdMinMaxNativeNode(type, op1, op2, simdBaseJitType, simdSize, isMax); + return gtNewSimdMinMaxNativeNode(type, op1, op2, simdBaseJitType, simdSize, isMax ARM64_ARG(isScalable)); } //------------------------------------------------------------------------ @@ -24864,8 +25132,12 @@ GenTree* Compiler::gtNewSimdMinMaxNode(var_types type, // is most efficient. This means that the exact result returned if either input is // NaN or -0 can differ based on the underlying hardware. // -GenTree* Compiler::gtNewSimdMinMaxNativeNode( - var_types type, GenTree* op1, GenTree* op2, CorInfoType simdBaseJitType, unsigned simdSize, bool isMax) +GenTree* Compiler::gtNewSimdMinMaxNativeNode(var_types type, + GenTree* op1, + GenTree* op2, + CorInfoType simdBaseJitType, + unsigned simdSize, + bool isMax ARM64_ARG(bool isScalable)) { assert(op1 != nullptr); assert(op1->TypeIs(type)); @@ -25099,7 +25371,7 @@ GenTree* Compiler::gtNewSimdMinMaxNativeNode( // op1 = op1 < op2 // -or- // op1 = op1 > op2 - op1 = gtNewSimdCmpOpNode(isMax ? GT_GT : GT_LT, type, op1, op2, simdBaseJitType, simdSize); + op1 = gtNewSimdCmpOpNode(isMax ? 
GT_GT : GT_LT, type, op1, op2, simdBaseJitType, simdSize ARM64_ARG(isScalable)); // result = ConditionalSelect(op1, op1Dup, op2Dup) return gtNewSimdCndSelNode(type, op1, op1Dup, op2Dup, simdBaseJitType, simdSize); @@ -25662,6 +25934,8 @@ GenTree* Compiler::gtNewSimdRoundNode(var_types type, GenTree* op1, CorInfoType #error Unsupported platform #endif // !TARGET_XARCH && !TARGET_ARM64 + // intrinsic = GenTreeHWIntrinsic::GetScalableHWIntrinsicId(type, intrinsic); + assert(intrinsic != NI_Illegal); return gtNewSimdHWIntrinsicNode(type, op1, intrinsic, simdBaseJitType, simdSize); } @@ -26186,7 +26460,7 @@ GenTree* Compiler::gtNewSimdShuffleVariableNode( cnsNode = gtNewVconNode(type); cnsNode->AsVecCon()->gtSimdVal = orCns; - op2 = gtNewSimdBinOpNode(GT_OR, type, op2, cnsNode, simdBaseJitType, simdSize); + op2 = gtNewSimdBinOpNode(GT_OR, type, op2, cnsNode, simdBaseJitType, simdSize ARM64_ARG(false)); } retNode = gtNewSimdHWIntrinsicNode(type, op1, op2, lookupIntrinsic, simdBaseJitType, simdSize); @@ -26272,8 +26546,8 @@ GenTree* Compiler::gtNewSimdShuffleVariableNode( assert(genTypeSize(JitType2PreciseVarType(corType)) == elementSize); // create the mask node (op2 < comparand), and the result node (mask & nativeResult) - GenTree* mask = gtNewSimdCmpOpNode(GT_LT, type, op2DupSafe, comparand, corType, simdSize); - retNode = gtNewSimdBinOpNode(GT_AND, type, retNode, mask, simdBaseJitType, simdSize); + GenTree* mask = gtNewSimdCmpOpNode(GT_LT, type, op2DupSafe, comparand, corType, simdSize ARM64_ARG(false)); + retNode = gtNewSimdBinOpNode(GT_AND, type, retNode, mask, simdBaseJitType, simdSize ARM64_ARG(false)); } else { @@ -27216,6 +27490,12 @@ GenTree* Compiler::gtNewSimdSumNode(var_types type, GenTree* op1, CorInfoType si return gtNewSimdToScalarNode(type, op1, simdBaseJitType, simdSize); #elif defined(TARGET_ARM64) + if (UseSveForType(type)) + { + tmp = gtNewSimdHWIntrinsicNode(TYP_SIMD8, op1, NI_Sve_AddAcross, simdBaseJitType, simdSize); + return gtNewSimdToScalarNode(type, tmp, simdBaseJitType, 16); + } + switch (simdBaseType) { case TYP_BYTE: @@ -27446,12 +27726,17 @@ GenTree* Compiler::gtNewSimdTruncNode(var_types type, GenTree* op1, CorInfoType #error Unsupported platform #endif // !TARGET_XARCH && !TARGET_ARM64 + // intrinsic = GenTreeHWIntrinsic::GetScalableHWIntrinsicId(type, intrinsic); + assert(intrinsic != NI_Illegal); return gtNewSimdHWIntrinsicNode(type, op1, intrinsic, simdBaseJitType, simdSize); } -GenTree* Compiler::gtNewSimdUnOpNode( - genTreeOps op, var_types type, GenTree* op1, CorInfoType simdBaseJitType, unsigned simdSize) +GenTree* Compiler::gtNewSimdUnOpNode(genTreeOps op, + var_types type, + GenTree* op1, + CorInfoType simdBaseJitType, + unsigned simdSize ARM64_ARG(bool isScalable)) { assert(varTypeIsSIMD(type)); assert(getSIMDTypeForSize(simdSize) == type); @@ -27504,7 +27789,7 @@ GenTree* Compiler::gtNewSimdUnOpNode( #endif // TARGET_ARM64 NamedIntrinsic intrinsic = - GenTreeHWIntrinsic::GetHWIntrinsicIdForUnOp(this, op, op1, simdBaseType, simdSize, false); + GenTreeHWIntrinsic::GetHWIntrinsicIdForUnOp(this, op, op1, simdBaseType, simdSize, false ARM64_ARG(isScalable)); if (intrinsic != NI_Illegal) { @@ -27726,7 +28011,7 @@ GenTree* Compiler::gtNewSimdWidenLowerNode(var_types type, GenTree* op1, CorInfo } else { - assert(simdSize == 8); + assert((simdSize == 8) || (SizeMatchesVectorTLength(simdSize))); tmp1 = op1; } @@ -27744,8 +28029,10 @@ GenTree* Compiler::gtNewSimdWidenLowerNode(var_types type, GenTree* op1, CorInfo intrinsic = NI_AdvSimd_ZeroExtendWideningLower; 
} + // intrinsic = GenTreeHWIntrinsic::GetScalableHWIntrinsicId(type, intrinsic); + assert(intrinsic != NI_Illegal); - tmp1 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, tmp1, intrinsic, simdBaseJitType, 8); + tmp1 = gtNewSimdHWIntrinsicNode(type, tmp1, intrinsic, simdBaseJitType, 8); if (simdSize == 8) { @@ -27938,7 +28225,7 @@ GenTree* Compiler::gtNewSimdWidenUpperNode(var_types type, GenTree* op1, CorInfo return gtNewSimdHWIntrinsicNode(type, op1, tmp1, NI_X86Base_UnpackHigh, simdBaseJitType, simdSize); } #elif defined(TARGET_ARM64) - if (simdSize == 16) + if ((simdSize == 16) || (SizeMatchesVectorTLength(simdSize))) { if (varTypeIsFloating(simdBaseType)) { @@ -27954,6 +28241,7 @@ GenTree* Compiler::gtNewSimdWidenUpperNode(var_types type, GenTree* op1, CorInfo intrinsic = NI_AdvSimd_ZeroExtendWideningUpper; } + // intrinsic = GenTreeHWIntrinsic::GetScalableHWIntrinsicId(type, intrinsic); assert(intrinsic != NI_Illegal); return gtNewSimdHWIntrinsicNode(type, op1, intrinsic, simdBaseJitType, simdSize); } @@ -27981,6 +28269,7 @@ GenTree* Compiler::gtNewSimdWidenUpperNode(var_types type, GenTree* op1, CorInfo tmp1 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, intrinsic, simdBaseJitType, simdSize); return gtNewSimdGetUpperNode(TYP_SIMD8, tmp1, simdBaseJitType, 16); } + #else #error Unsupported platform #endif // !TARGET_XARCH && !TARGET_ARM64 @@ -29157,6 +29446,7 @@ genTreeOps GenTreeHWIntrinsic::GetOperForHWIntrinsicId(bool* isScalar, bool getE // genTreeOps GenTreeHWIntrinsic::GetOperForHWIntrinsicId(NamedIntrinsic id, var_types simdBaseType, bool* isScalar) { + // TODO-VL: Update this method with SVE_ intrinsics as well *isScalar = false; switch (id) @@ -29652,6 +29942,183 @@ genTreeOps GenTreeHWIntrinsic::GetOperForHWIntrinsicId(NamedIntrinsic id, var_ty } } +//------------------------------------------------------------------------------ +// GetScalableHWIntrinsicId: Returns SVE equivalent of given intrinsic ID, if applicable +// +// NamedIntrinsic GenTreeHWIntrinsic::GetScalableHWIntrinsicId(unsigned simdSize, NamedIntrinsic id) +NamedIntrinsic GenTreeHWIntrinsic::GetScalableHWIntrinsicId(var_types simdType, + var_types simdBaseType, + NamedIntrinsic id) +{ + NamedIntrinsic sveId = id; + +#ifdef TARGET_ARM64 + if ((id == NI_Illegal) || ((FIRST_NI_Sve <= sveId) && (sveId <= LAST_NI_Sve))) + { + return sveId; + } + + // TODO-VL: Look for all places where NI_AdvSimd_* is used and add logic for NI_Sve_* at all those places + + if (Compiler::UseSveForType(simdType)) + { + switch (id) + { + case NI_AdvSimd_Abs: + case NI_AdvSimd_Arm64_Abs: + sveId = NI_Sve_Abs; + break; + case NI_AdvSimd_Add: + case NI_AdvSimd_Arm64_Add: + sveId = NI_Sve_Add; + break; + case NI_AdvSimd_And: + sveId = NI_Sve_And; + break; + case NI_AdvSimd_BitwiseSelect: + sveId = NI_Sve2_BitwiseSelect; + break; + case NI_AdvSimd_Ceiling: + case NI_AdvSimd_Arm64_Ceiling: + sveId = NI_Sve_RoundToPositiveInfinity; + break; + case NI_AdvSimd_CompareEqual: + case NI_AdvSimd_Arm64_CompareEqual: + sveId = NI_Sve_CompareEqual; + break; + case NI_AdvSimd_Arm64_CompareGreaterThanOrEqual: + case NI_AdvSimd_CompareGreaterThanOrEqual: + sveId = NI_Sve_CompareGreaterThanOrEqual; + break; + case NI_AdvSimd_Arm64_CompareGreaterThan: + case NI_AdvSimd_CompareGreaterThan: + sveId = NI_Sve_CompareGreaterThan; + break; + case NI_AdvSimd_Arm64_CompareLessThanOrEqual: + case NI_AdvSimd_CompareLessThanOrEqual: + sveId = NI_Sve_CompareLessThanOrEqual; + break; + case NI_AdvSimd_Arm64_CompareLessThan: + case NI_AdvSimd_CompareLessThan: + sveId = 
NI_Sve_CompareLessThan; + break; + case NI_AdvSimd_Arm64_ConvertToDouble: + sveId = NI_Sve_ConvertToDouble; + break; + case NI_AdvSimd_Arm64_ConvertToDoubleUpper: + sveId = NI_Sve_ConvertToDoubleUpper; + break; + case NI_AdvSimd_ConvertToSingle: + sveId = NI_Sve_ConvertToSingle; + break; + case NI_AdvSimd_ConvertToInt32RoundToZero: + sveId = NI_Sve_ConvertToInt32; + break; + case NI_AdvSimd_ConvertToUInt32RoundToZero: + sveId = NI_Sve_ConvertToUInt32; + break; + case NI_AdvSimd_Arm64_ConvertToInt64RoundToZero: + sveId = NI_Sve_ConvertToInt64; + break; + case NI_AdvSimd_Arm64_ConvertToUInt64RoundToZero: + sveId = NI_Sve_ConvertToUInt64; + break; + case NI_AdvSimd_Arm64_Divide: + sveId = NI_Sve_Divide; + break; + case NI_AdvSimd_Floor: + case NI_AdvSimd_Arm64_Floor: + sveId = NI_Sve_RoundToNegativeInfinity; + break; + case NI_AdvSimd_FusedMultiplyAdd: + case NI_AdvSimd_Arm64_FusedMultiplyAdd: + sveId = NI_Sve_FusedMultiplyAdd; + break; + case NI_AdvSimd_Max: + case NI_AdvSimd_Arm64_Max: + sveId = NI_Sve_Max; + break; + case NI_AdvSimd_Min: + case NI_AdvSimd_Arm64_Min: + sveId = NI_Sve_Min; + break; + case NI_AdvSimd_Multiply: + case NI_AdvSimd_Arm64_Multiply: + sveId = NI_Sve_Multiply; + break; + case NI_AdvSimd_MultiplyByScalar: + case NI_AdvSimd_Arm64_MultiplyByScalar: + sveId = NI_Sve_MultiplyByScalar; + break; + case NI_AdvSimd_Negate: + case NI_AdvSimd_Arm64_Negate: + sveId = NI_Sve_Negate; + break; + case NI_AdvSimd_Not: + sveId = NI_Sve_Not; + break; + case NI_AdvSimd_Or: + sveId = NI_Sve_Or; + break; + case NI_AdvSimd_RoundToNearest: + case NI_AdvSimd_Arm64_RoundToNearest: + sveId = NI_Sve_RoundToNearest; + break; + case NI_AdvSimd_RoundToZero: + case NI_AdvSimd_Arm64_RoundToZero: + sveId = NI_Sve_RoundToZero; + break; + case NI_AdvSimd_ShiftLogical: + case NI_AdvSimd_ShiftArithmetic: + sveId = NI_Sve_ShiftLeftLogical; + break; + case NI_AdvSimd_ShiftLeftLogical: + sveId = NI_Sve_ShiftLeftLogicalImm; + break; + case NI_AdvSimd_ShiftRightArithmetic: + sveId = NI_Sve_ShiftRightArithmeticImm; + break; + case NI_AdvSimd_ShiftRightLogical: + sveId = NI_Sve_ShiftRightLogicalImm; + break; + case NI_AdvSimd_SignExtendWideningLower: + sveId = NI_Sve_SignExtendWideningLower; + break; + case NI_AdvSimd_SignExtendWideningUpper: + sveId = NI_Sve_SignExtendWideningUpper; + break; + case NI_AdvSimd_Subtract: + case NI_AdvSimd_Arm64_Subtract: + sveId = NI_Sve_Subtract; + break; + case NI_AdvSimd_ZeroExtendWideningLower: + sveId = NI_Sve_ZeroExtendWideningLower; + break; + case NI_AdvSimd_ZeroExtendWideningUpper: + sveId = NI_Sve_ZeroExtendWideningUpper; + break; + case NI_Vector128_op_Equality: + sveId = NI_Vector_op_Equality; + break; + case NI_Vector128_op_Inequality: + sveId = NI_Vector_op_Inequality; + break; + case NI_AdvSimd_Xor: + sveId = NI_Sve_Xor; + break; + default: + assert(!"Did not find matching AdvSimd -> Sve"); + break; + } + } + // Make sure if we are using VL SIMD, we are not generating AdvSimd/NEON intrinsics + assert((simdType == TYP_SIMD8) || (simdType == TYP_SIMD16) || (sveId < FIRST_NI_AdvSimd) || + (sveId > LAST_NI_AdvSimd)); +#endif // TARGET_ARM64 + + return sveId; +} + //------------------------------------------------------------------------------ // GetHWIntrinsicIdForUnOp: Returns intrinsic ID based on the oper, base type, and simd size // @@ -29666,15 +30133,23 @@ genTreeOps GenTreeHWIntrinsic::GetOperForHWIntrinsicId(NamedIntrinsic id, var_ty // Returns: // The intrinsic ID based on the oper, base type, and simd size // -NamedIntrinsic 
GenTreeHWIntrinsic::GetHWIntrinsicIdForUnOp( - Compiler* comp, genTreeOps oper, GenTree* op1, var_types simdBaseType, unsigned simdSize, bool isScalar) +NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForUnOp(Compiler* comp, + genTreeOps oper, + GenTree* op1, + var_types simdBaseType, + unsigned simdSize, + bool isScalar ARM64_ARG(bool isScalable)) { var_types simdType = comp->getSIMDTypeForSize(simdSize); assert(varTypeIsArithmetic(simdBaseType)); assert(varTypeIsSIMD(simdType)); -#if defined(TARGET_XARCH) +#if defined(TARGET_ARM64) + assert(!isScalar || (simdSize == 8)); + assert(!isScalar || varTypeIsFloating(simdBaseType)); + assert((simdSize <= 16) || (Compiler::SizeMatchesVectorTLength(simdSize))); +#elif defined(TARGET_XARCH) if (simdSize == 64) { assert(!isScalar); @@ -29686,14 +30161,10 @@ NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForUnOp( assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX)); } else -#endif // TARGET_XARCH { -#if defined(TARGET_ARM64) - assert(!isScalar || (simdSize == 8)); -#endif // TARGET_ARM64 - assert(!isScalar || varTypeIsFloating(simdBaseType)); } +#endif // TARGET_ARM64 || TARGET_XARCH assert(op1 != nullptr); assert(op1->TypeIs(simdType)); @@ -29743,6 +30214,13 @@ NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForUnOp( } } +#if defined(TARGET_ARM64) + if (isScalable && Compiler::UseSveForType(simdType)) + { + id = GetScalableHWIntrinsicId(simdType, simdBaseType, id); + } +#endif + return id; } @@ -29761,13 +30239,13 @@ NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForUnOp( // Returns: // The intrinsic ID based on the oper, base type, and simd size // -NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(Compiler* comp, - genTreeOps oper, - GenTree* op1, - GenTree* op2, - var_types simdBaseType, - unsigned simdSize, - bool isScalar) +NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(Compiler* comp, + genTreeOps oper, + GenTree* op1, + GenTree* op2, + var_types simdBaseType, + unsigned simdSize, + bool isScalar ARM64_ARG(bool isScalable)) { var_types simdType = comp->getSIMDTypeForSize(simdSize); @@ -29778,7 +30256,11 @@ NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(Compiler* comp, assert(op1->TypeIs(simdType)); assert(op2 != nullptr); -#if defined(TARGET_XARCH) +#if defined(TARGET_ARM64) + assert(!isScalar || (simdSize == 8)); + assert(!isScalar || varTypeIsFloating(simdBaseType)); + assert((simdSize <= 16) || (Compiler::SizeMatchesVectorTLength(simdSize))); +#elif defined(TARGET_XARCH) if (simdSize == 64) { assert(!isScalar); @@ -29790,14 +30272,10 @@ NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(Compiler* comp, assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX)); } else -#endif // TARGET_XARCH { -#if defined(TARGET_ARM64) - assert(!isScalar || (simdSize == 8)); -#endif // TARGET_ARM64 - assert(!isScalar || varTypeIsFloating(simdBaseType)); } +#endif // TARGET_ARM64 || TARGET_XARCH NamedIntrinsic id = NI_Illegal; @@ -30102,17 +30580,24 @@ NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(Compiler* comp, id = NI_X86Base_MultiplyLow; } #elif defined(TARGET_ARM64) - if ((simdSize == 8) && (isScalar || (simdBaseType == TYP_DOUBLE))) + if (isScalable) { - id = NI_AdvSimd_MultiplyScalar; + id = varTypeIsSIMD(op2) ? NI_Sve_Multiply : NI_Illegal; } - else if (simdBaseType == TYP_DOUBLE) - { - id = op2->TypeIs(simdType) ? NI_AdvSimd_Arm64_Multiply : NI_AdvSimd_Arm64_MultiplyByScalar; - } - else if (!varTypeIsLong(simdBaseType)) + else { - id = op2->TypeIs(simdType) ? 
NI_AdvSimd_Multiply : NI_AdvSimd_MultiplyByScalar; + if ((simdSize == 8) && (simdBaseType == TYP_DOUBLE)) + { + id = NI_AdvSimd_MultiplyScalar; + } + else if (simdBaseType == TYP_DOUBLE) + { + id = op2->TypeIs(simdType) ? NI_AdvSimd_Arm64_Multiply : NI_AdvSimd_Arm64_MultiplyByScalar; + } + else if (!varTypeIsLong(simdBaseType)) + { + id = op2->TypeIs(simdType) ? NI_AdvSimd_Multiply : NI_AdvSimd_MultiplyByScalar; + } } #endif // !TARGET_XARCH && !TARGET_ARM64 break; @@ -30441,6 +30926,17 @@ NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(Compiler* comp, } } +#if defined(TARGET_ARM64) + // simdType can be `TYP_SIMD16` for three cases: + // - We originally had Vector128, then we should retain AdvSimd + // - We originally had VectorT, and UseSve=0, then we should retain AdvSimd + // - We originally had VectorT, and UseSve=1, then we should convert to Sve + if (isScalable && Compiler::UseSveForType(simdType)) + { + id = GetScalableHWIntrinsicId(simdType, simdBaseType, id); + } +#endif + return id; } @@ -30461,15 +30957,15 @@ NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(Compiler* comp, // Returns: // The intrinsic ID based on the oper, base type, and simd size // -NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForCmpOp(Compiler* comp, - genTreeOps oper, - var_types type, - GenTree* op1, - GenTree* op2, - var_types simdBaseType, - unsigned simdSize, - bool isScalar, - bool reverseCond) +NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForCmpOp(Compiler* comp, + genTreeOps oper, + var_types type, + GenTree* op1, + GenTree* op2, + var_types simdBaseType, + unsigned simdSize, + bool isScalar ARM64_ARG(bool isScalable), + bool reverseCond) { var_types simdType = comp->getSIMDTypeForSize(simdSize); assert(varTypeIsMask(type) || (type == simdType)); @@ -30485,17 +30981,25 @@ NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForCmpOp(Compiler* comp, if (varTypeIsMask(type)) { assert(!isScalar); +#if defined(TARGET_XARCH) assert(comp->canUseEvexEncodingDebugOnly()); +#endif } +#if !defined(TARGET_ARM64) else if (simdSize == 32) { assert(!isScalar); assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX)); } +#endif // !TARGET_ARM64 else #endif // TARGET_XARCH { - assert((simdSize == 8) || (simdSize == 12) || (simdSize == 16)); + bool validSimdSize = (simdSize == 8) || (simdSize == 12) || (simdSize == 16); +#if defined(TARGET_ARM64) + validSimdSize |= (Compiler::SizeMatchesVectorTLength(simdSize)); +#endif + assert(validSimdSize); #if defined(TARGET_ARM64) assert(!isScalar || (simdSize == 8)); @@ -30800,6 +31304,11 @@ NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForCmpOp(Compiler* comp, { id = isScalar ? NI_X86Base_CompareScalarNotEqual : NI_X86Base_CompareNotEqual; } +#elif defined(TARGET_ARM64) + if (Compiler::UseSveForType(simdType) && isScalable) + { + id = NI_Sve_CompareNotEqualTo; + } #endif // TARGET_XARCH break; } @@ -30810,6 +31319,13 @@ NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForCmpOp(Compiler* comp, } } +#if defined(TARGET_ARM64) + if (Compiler::UseSveForType(simdType) && isScalable) + { + id = GetScalableHWIntrinsicId(simdType, simdBaseType, id); + } +#endif + return id; } @@ -30831,8 +31347,12 @@ NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForCmpOp(Compiler* comp, // type doesn't match with the type IR wants us to be producing. For example, the consuming node // may expect a TYP_SIMD16 but the underlying instruction may produce a TYP_MASK. 
// -var_types GenTreeHWIntrinsic::GetLookupTypeForCmpOp( - Compiler* comp, genTreeOps oper, var_types type, var_types simdBaseType, unsigned simdSize, bool reverseCond) +var_types GenTreeHWIntrinsic::GetLookupTypeForCmpOp(Compiler* comp, + genTreeOps oper, + var_types type, + var_types simdBaseType, + unsigned simdSize ARM64_ARG(bool isScalable), + bool reverseCond) { var_types simdType = comp->getSIMDTypeForSize(simdSize); assert(varTypeIsMask(type) || (type == simdType)); @@ -30885,7 +31405,28 @@ var_types GenTreeHWIntrinsic::GetLookupTypeForCmpOp( unreached(); } } -#endif // TARGET_XARCH +#elif defined(TARGET_ARM64) + switch (oper) + { + case GT_EQ: + case GT_GE: + case GT_LE: + case GT_NE: + case GT_GT: + case GT_LT: + { + if (Compiler::UseSveForType(type) && isScalable) + { + lookupType = TYP_MASK; + } + break; + } + default: + { + unreached(); + } + } +#endif // TARGET_XARCH || TARGET_ARM64 return lookupType; } @@ -30914,6 +31455,8 @@ bool GenTreeHWIntrinsic::ShouldConstantProp(GenTree* operand, GenTreeVecCon* vec switch (gtHWIntrinsicId) { #if defined(TARGET_ARM64) + case NI_Vector_op_Equality: + case NI_Vector_op_Inequality: case NI_Vector64_op_Equality: case NI_Vector64_op_Inequality: #endif // TARGET_ARM64 @@ -32273,12 +32816,12 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree) resultNode = gtNewVconNode(retType, &simdVal); } -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) else if (tree->OperIsConvertVectorToMask()) { resultNode = gtFoldExprConvertVecCnsToMask(tree, cnsNode->AsVecCon()); } -#endif // TARGET_XARCH +#endif // TARGET_XARCH || TARGET_ARM64 #endif // FEATURE_MASKED_HW_INTRINSICS else { @@ -32721,6 +33264,7 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree) case NI_Vector128_op_Equality: #if defined(TARGET_ARM64) + case NI_Vector_op_Equality: case NI_Vector64_op_Equality: #elif defined(TARGET_XARCH) case NI_Vector256_op_Equality: @@ -32734,6 +33278,7 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree) case NI_Vector128_op_Inequality: #if defined(TARGET_ARM64) + case NI_Vector_op_Inequality: case NI_Vector64_op_Inequality: #elif defined(TARGET_XARCH) case NI_Vector256_op_Inequality: @@ -33199,6 +33744,7 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree) case NI_Vector128_op_Equality: #if defined(TARGET_ARM64) + case NI_Vector_op_Equality: case NI_Vector64_op_Equality: #elif defined(TARGET_XARCH) case NI_Vector256_op_Equality: @@ -33220,6 +33766,7 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree) case NI_Vector128_op_Inequality: #if defined(TARGET_ARM64) + case NI_Vector_op_Inequality: case NI_Vector64_op_Inequality: #elif defined(TARGET_XARCH) case NI_Vector256_op_Inequality: diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index 148cc40d0ba97b..3740baed6eb473 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -6622,33 +6622,40 @@ struct GenTreeHWIntrinsic : public GenTreeJitIntrinsic static bool Equals(GenTreeHWIntrinsic* op1, GenTreeHWIntrinsic* op2); - static NamedIntrinsic GetHWIntrinsicIdForUnOp( - Compiler* comp, genTreeOps oper, GenTree* op1, var_types simdBaseType, unsigned simdSize, bool isScalar); - - static NamedIntrinsic GetHWIntrinsicIdForBinOp(Compiler* comp, - genTreeOps oper, - GenTree* op1, - GenTree* op2, - var_types simdBaseType, - unsigned simdSize, - bool isScalar); - - static NamedIntrinsic GetHWIntrinsicIdForCmpOp(Compiler* comp, - genTreeOps oper, - var_types type, - GenTree* op1, 
- GenTree* op2, - var_types simdBaseType, - unsigned simdSize, - bool isScalar, - bool reverseCond = false); - - static var_types GetLookupTypeForCmpOp(Compiler* comp, - genTreeOps oper, - var_types type, - var_types simdBaseType, - unsigned simdSize, - bool reverseCond = false); + // static NamedIntrinsic GetScalableHWIntrinsicId(unsigned simdSize, NamedIntrinsic id); + static NamedIntrinsic GetScalableHWIntrinsicId(var_types simdType, var_types simdBaseType, NamedIntrinsic id); + + static NamedIntrinsic GetHWIntrinsicIdForUnOp(Compiler* comp, + genTreeOps oper, + GenTree* op1, + var_types simdBaseType, + unsigned simdSize, + bool isScalar ARM64_ARG(bool isScalable)); + + static NamedIntrinsic GetHWIntrinsicIdForBinOp(Compiler* comp, + genTreeOps oper, + GenTree* op1, + GenTree* op2, + var_types simdBaseType, + unsigned simdSize, + bool isScalar ARM64_ARG(bool isScalable)); + + static NamedIntrinsic GetHWIntrinsicIdForCmpOp(Compiler* comp, + genTreeOps oper, + var_types type, + GenTree* op1, + GenTree* op2, + var_types simdBaseType, + unsigned simdSize, + bool isScalar ARM64_ARG(bool isScalable), + bool reverseCond = false); + + static var_types GetLookupTypeForCmpOp(Compiler* comp, + genTreeOps oper, + var_types type, + var_types simdBaseType, + unsigned simdSize ARM64_ARG(bool isScalable), + bool reverseCond = false); static genTreeOps GetOperForHWIntrinsicId(NamedIntrinsic id, var_types simdBaseType, bool* isScalar); @@ -6673,12 +6680,10 @@ struct GenTreeVecCon : public GenTree simd8_t gtSimd8Val; simd12_t gtSimd12Val; simd16_t gtSimd16Val; - -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) simd32_t gtSimd32Val; simd64_t gtSimd64Val; -#endif // TARGET_XARCH - +#endif // TARGET_XARCH || TARGET_ARM64 simd_t gtSimdVal; }; @@ -6709,6 +6714,7 @@ struct GenTreeVecCon : public GenTree case NI_Vector256_CreateScalarUnsafe: case NI_Vector512_CreateScalarUnsafe: #elif defined(TARGET_ARM64) + case NI_Vector_Create: case NI_Vector64_Create: case NI_Vector64_CreateScalar: case NI_Vector64_CreateScalarUnsafe: @@ -6929,7 +6935,7 @@ struct GenTreeVecCon : public GenTree break; } -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) case TYP_SIMD32: { simd32_t result = {}; @@ -6945,7 +6951,7 @@ struct GenTreeVecCon : public GenTree gtSimd64Val = result; break; } -#endif // TARGET_XARCH +#endif // TARGET_XARCH || TARGET_ARM64 default: { @@ -6985,7 +6991,7 @@ struct GenTreeVecCon : public GenTree break; } -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) case TYP_SIMD32: { simd32_t result = {}; @@ -7001,7 +7007,7 @@ struct GenTreeVecCon : public GenTree gtSimd64Val = result; break; } -#endif // TARGET_XARCH +#endif // TARGET_XARCH || TARGET_ARM64 default: { @@ -7038,7 +7044,7 @@ struct GenTreeVecCon : public GenTree break; } -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) case TYP_SIMD32: { simd32_t result = {}; @@ -7054,7 +7060,7 @@ struct GenTreeVecCon : public GenTree gtSimd64Val = result; break; } -#endif // TARGET_XARCH +#endif // TARGET_XARCH || TARGET_ARM64 default: { @@ -7082,7 +7088,7 @@ struct GenTreeVecCon : public GenTree return gtSimd16Val.IsAllBitsSet(); } -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) case TYP_SIMD32: { return gtSimd32Val.IsAllBitsSet(); @@ -7093,7 +7099,7 @@ struct GenTreeVecCon : public GenTree return gtSimd64Val.IsAllBitsSet(); } -#endif // TARGET_XARCH +#endif // TARGET_XARCH || TARGET_ARM64 default: { @@ -7130,7 +7136,7 @@ 
struct GenTreeVecCon : public GenTree return left->gtSimd16Val == right->gtSimd16Val; } -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) case TYP_SIMD32: { return left->gtSimd32Val == right->gtSimd32Val; @@ -7141,7 +7147,7 @@ struct GenTreeVecCon : public GenTree return left->gtSimd64Val == right->gtSimd64Val; } -#endif // TARGET_XARCH +#endif // TARGET_XARCH || TARGET_ARM64 default: { @@ -7173,7 +7179,7 @@ struct GenTreeVecCon : public GenTree return gtSimd16Val.IsZero(); } -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) case TYP_SIMD32: { return gtSimd32Val.IsZero(); @@ -7184,7 +7190,7 @@ struct GenTreeVecCon : public GenTree return gtSimd64Val.IsZero(); } -#endif // TARGET_XARCH +#endif // TARGET_XARCH || TARGET_ARM64 default: { @@ -7212,7 +7218,7 @@ struct GenTreeVecCon : public GenTree return EvaluateGetElementFloating(simdBaseType, gtSimd16Val, index); } -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) case TYP_SIMD32: { return EvaluateGetElementFloating(simdBaseType, gtSimd32Val, index); @@ -7222,7 +7228,7 @@ struct GenTreeVecCon : public GenTree { return EvaluateGetElementFloating(simdBaseType, gtSimd64Val, index); } -#endif // TARGET_XARCH +#endif // TARGET_XARCH || TARGET_ARM64 default: { @@ -7250,7 +7256,7 @@ struct GenTreeVecCon : public GenTree return EvaluateGetElementIntegral(simdBaseType, gtSimd16Val, index); } -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) case TYP_SIMD32: { return EvaluateGetElementIntegral(simdBaseType, gtSimd32Val, index); @@ -7260,7 +7266,7 @@ struct GenTreeVecCon : public GenTree { return EvaluateGetElementIntegral(simdBaseType, gtSimd64Val, index); } -#endif // TARGET_XARCH +#endif // TARGET_XARCH || TARGET_ARM64 default: { @@ -7336,7 +7342,7 @@ struct GenTreeVecCon : public GenTree // buffer will cause determinism issues with the compiler. 
memset(&gtSimdVal, 0, sizeof(gtSimdVal)); -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) assert(sizeof(simd_t) == sizeof(simd64_t)); #else assert(sizeof(simd_t) == sizeof(simd16_t)); @@ -9549,6 +9555,7 @@ inline bool GenTree::IsVectorCreate() const case NI_Vector256_Create: case NI_Vector512_Create: #elif defined(TARGET_ARMARCH) + case NI_Vector_Create: case NI_Vector64_Create: #endif return true; diff --git a/src/coreclr/jit/hwintrinsic.cpp b/src/coreclr/jit/hwintrinsic.cpp index fbb9b984bd4e06..cd0ee7c6ba14dc 100644 --- a/src/coreclr/jit/hwintrinsic.cpp +++ b/src/coreclr/jit/hwintrinsic.cpp @@ -958,7 +958,8 @@ static const HWIntrinsicIsaRange hwintrinsicIsaRangeArray[] = { { FIRST_NI_Sha1, LAST_NI_Sha1 }, // Sha1 { FIRST_NI_Sha256, LAST_NI_Sha256 }, // Sha256 { NI_Illegal, NI_Illegal }, // Atomics - { FIRST_NI_Vector64, LAST_NI_Vector64 }, // Vector64 + { FIRST_NI_Vector, LAST_NI_Vector }, + { FIRST_NI_Vector64, LAST_NI_Vector64 }, { FIRST_NI_Vector128, LAST_NI_Vector128 }, // Vector128 { NI_Illegal, NI_Illegal }, // Dczva { NI_Illegal, NI_Illegal }, // Rcpc @@ -1328,6 +1329,13 @@ NamedIntrinsic HWIntrinsicInfo::lookupId(Compiler* comp, return NI_Illegal; } } + else if (isa == InstructionSet_Vector) + { + if (!isHWIntrinsicEnabled) + { + return NI_Illegal; + } + } #endif #if defined(TARGET_XARCH) @@ -1384,6 +1392,13 @@ unsigned HWIntrinsicInfo::lookupSimdSize(Compiler* comp, NamedIntrinsic id, CORI { return simdSize; } +#if defined(TARGET_ARM64) + else if ((FIRST_NI_Vector <= id) && (id <= LAST_NI_Vector)) + { + assert(Compiler::UseSveForVectorT()); + return Compiler::GetVectorTLength(); + } +#endif CORINFO_CLASS_HANDLE typeHnd = nullptr; @@ -1631,7 +1646,7 @@ static bool isSupportedBaseType(NamedIntrinsic intrinsic, CorInfoType baseJitTyp assert((isa == InstructionSet_Vector512) || (isa == InstructionSet_Vector256) || (isa == InstructionSet_Vector128)); #endif // TARGET_XARCH #ifdef TARGET_ARM64 - assert((isa == InstructionSet_Vector64) || (isa == InstructionSet_Vector128)); + assert((isa == InstructionSet_Vector64) || (isa == InstructionSet_Vector128) || (isa == InstructionSet_Vector)); #endif // TARGET_ARM64 #endif // DEBUG return false; @@ -2155,7 +2170,7 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic, } #if defined(TARGET_ARM64) - if ((simdSize != 8) && (simdSize != 16)) + if ((simdSize != 8) && (simdSize != 16) && (!SizeMatchesVectorTLength(simdSize))) #elif defined(TARGET_XARCH) if ((simdSize != 16) && (simdSize != 32) && (simdSize != 64)) #endif // TARGET_* diff --git a/src/coreclr/jit/hwintrinsicarm64.cpp b/src/coreclr/jit/hwintrinsicarm64.cpp index c82a568c3b33a9..2dda9a6da308bf 100644 --- a/src/coreclr/jit/hwintrinsicarm64.cpp +++ b/src/coreclr/jit/hwintrinsicarm64.cpp @@ -120,6 +120,26 @@ static CORINFO_InstructionSet lookupInstructionSet(const char* className) { return InstructionSet_Vector128; } + else if (strncmp(className, "Vector`1", 8) == 0) + { + return InstructionSet_Vector; + } + else if (strncmp(className, "Vector256", 9) == 0) + { + return InstructionSet_ILLEGAL; + } + else if (strncmp(className, "Vector512", 9) == 0) + { + return InstructionSet_ILLEGAL; + } + else if (strncmp(className, "VectorMath", 10) == 0) + { + return InstructionSet_ILLEGAL; + } + else if (strncmp(className, "Vector", 6) == 0) + { + return InstructionSet_Vector; + } } return InstructionSet_ILLEGAL; @@ -493,6 +513,11 @@ void HWIntrinsicInfo::lookupImmBounds( immUpperBound = 7; break; + case NI_Sve_DuplicateScalarToVector: + immLowerBound = -128; +
immUpperBound = 127; + break; + default: unreached(); } @@ -633,6 +658,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, bool isValidScalarIntrinsic = false; #endif + bool isScalable = false; bool isMinMaxIntrinsic = false; bool isMax = false; bool isMagnitude = false; @@ -641,24 +667,34 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, switch (intrinsic) { + case NI_Vector_Abs: case NI_Vector64_Abs: case NI_Vector128_Abs: { assert(sig->numArgs == 1); op1 = impSIMDPopStack(); retNode = gtNewSimdAbsNode(retType, op1, simdBaseJitType, simdSize); + // if (intrinsic == NI_Vector_Abs) + //{ + // intrinsic = GenTreeHWIntrinsic::GetScalableHWIntrinsicId(retType, + // retNode->AsHWIntrinsic()->GetHWIntrinsicId()); + // retNode->AsHWIntrinsic()->ChangeHWIntrinsicId(intrinsic); + // } break; } + case NI_Vector_Add: + case NI_Vector_op_Addition: case NI_Vector64_op_Addition: case NI_Vector128_op_Addition: { + bool isScalable = (intrinsic == NI_Vector_Add) || (intrinsic == NI_Vector_op_Addition); assert(sig->numArgs == 2); op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); - retNode = gtNewSimdBinOpNode(GT_ADD, retType, op1, op2, simdBaseJitType, simdSize); + retNode = gtNewSimdBinOpNode(GT_ADD, retType, op1, op2, simdBaseJitType, simdSize, isScalable); break; } @@ -672,7 +708,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, if (varTypeIsFloating(simdBaseType)) { - retNode = gtNewSimdBinOpNode(GT_ADD, retType, op1, op2, simdBaseJitType, simdSize); + retNode = gtNewSimdBinOpNode(GT_ADD, retType, op1, op2, simdBaseJitType, simdSize, false); } else { @@ -689,9 +725,11 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, } case NI_AdvSimd_BitwiseClear: + case NI_Vector_AndNot: case NI_Vector64_AndNot: case NI_Vector128_AndNot: { + bool isScalable = intrinsic == NI_Vector_AndNot; assert(sig->numArgs == 2); // We don't want to support creating AND_NOT nodes prior to LIR @@ -702,8 +740,9 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); - op2 = gtFoldExpr(gtNewSimdUnOpNode(GT_NOT, retType, op2, simdBaseJitType, simdSize)); - retNode = gtNewSimdBinOpNode(GT_AND, retType, op1, op2, simdBaseJitType, simdSize); + GenTree* notNode = gtNewSimdUnOpNode(GT_NOT, retType, op2, simdBaseJitType, simdSize, isScalable); + op2 = gtFoldExpr(notNode); + retNode = gtNewSimdBinOpNode(GT_AND, retType, op1, op2, simdBaseJitType, simdSize, isScalable); break; } @@ -719,11 +758,24 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); - op2 = gtFoldExpr(gtNewSimdUnOpNode(GT_NOT, retType, op2, simdBaseJitType, simdSize)); - retNode = gtNewSimdBinOpNode(GT_OR, retType, op1, op2, simdBaseJitType, simdSize); + op2 = gtFoldExpr(gtNewSimdUnOpNode(GT_NOT, retType, op2, simdBaseJitType, simdSize, false)); + retNode = gtNewSimdBinOpNode(GT_OR, retType, op1, op2, simdBaseJitType, simdSize, false); break; } + case NI_Vector_As: + case NI_Vector_AsVectorByte: + case NI_Vector_AsVectorDouble: + case NI_Vector_AsVectorInt16: + case NI_Vector_AsVectorInt32: + case NI_Vector_AsVectorInt64: + case NI_Vector_AsVectorNInt: + case NI_Vector_AsVectorNUInt: + case NI_Vector_AsVectorSByte: + case NI_Vector_AsVectorSingle: + case NI_Vector_AsVectorUInt16: + case NI_Vector_AsVectorUInt32: + case NI_Vector_AsVectorUInt64: case NI_Vector64_As: case NI_Vector64_AsByte: case NI_Vector64_AsDouble: @@ -882,6 +934,7 @@ GenTree* 
Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } + case NI_Vector_op_BitwiseAnd: case NI_Vector64_op_BitwiseAnd: case NI_Vector128_op_BitwiseAnd: { @@ -890,10 +943,12 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); - retNode = gtNewSimdBinOpNode(GT_AND, retType, op1, op2, simdBaseJitType, simdSize); + retNode = gtNewSimdBinOpNode(GT_AND, retType, op1, op2, simdBaseJitType, simdSize, + intrinsic == NI_Vector_op_BitwiseAnd); break; } + case NI_Vector_op_BitwiseOr: case NI_Vector64_op_BitwiseOr: case NI_Vector128_op_BitwiseOr: { @@ -902,10 +957,12 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); - retNode = gtNewSimdBinOpNode(GT_OR, retType, op1, op2, simdBaseJitType, simdSize); + retNode = gtNewSimdBinOpNode(GT_OR, retType, op1, op2, simdBaseJitType, simdSize, + intrinsic == NI_Vector_op_BitwiseOr); break; } + case NI_Vector_Ceiling: case NI_Vector64_Ceiling: case NI_Vector128_Ceiling: { @@ -919,9 +976,16 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op1 = impSIMDPopStack(); retNode = gtNewSimdCeilNode(retType, op1, simdBaseJitType, simdSize); + // if (intrinsic == NI_Vector_Ceiling) + //{ + // intrinsic = GenTreeHWIntrinsic::GetScalableHWIntrinsicId(retType, + // retNode->AsHWIntrinsic()->GetHWIntrinsicId()); + // retNode->AsHWIntrinsic()->ChangeHWIntrinsicId(intrinsic); + // } break; } + case NI_Vector_ConditionalSelect: case NI_Vector64_ConditionalSelect: case NI_Vector128_ConditionalSelect: { @@ -935,6 +999,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } + case NI_Vector_ConvertToDouble: case NI_Vector64_ConvertToDouble: case NI_Vector128_ConvertToDouble: { @@ -943,11 +1008,14 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, intrinsic = (simdSize == 8) ? 
NI_AdvSimd_Arm64_ConvertToDoubleScalar : NI_AdvSimd_Arm64_ConvertToDouble; + // intrinsic = GenTreeHWIntrinsic::GetScalableHWIntrinsicId(retType, intrinsic); + op1 = impSIMDPopStack(); retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, simdBaseJitType, simdSize); break; } + case NI_Vector_ConvertToInt32Native: case NI_Vector64_ConvertToInt32Native: case NI_Vector128_ConvertToInt32Native: { @@ -958,6 +1026,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, FALLTHROUGH; } + case NI_Vector_ConvertToInt32: case NI_Vector64_ConvertToInt32: case NI_Vector128_ConvertToInt32: { @@ -966,9 +1035,16 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op1 = impSIMDPopStack(); retNode = gtNewSimdCvtNativeNode(retType, op1, CORINFO_TYPE_INT, simdBaseJitType, simdSize); + // if (intrinsic == NI_Vector_ConvertToInt32) + //{ + // intrinsic = GenTreeHWIntrinsic::GetScalableHWIntrinsicId(retType, + // retNode->AsHWIntrinsic()->GetHWIntrinsicId()); + // retNode->AsHWIntrinsic()->ChangeHWIntrinsicId(intrinsic); + // } break; } + case NI_Vector_ConvertToInt64Native: case NI_Vector64_ConvertToInt64Native: case NI_Vector128_ConvertToInt64Native: { @@ -979,6 +1055,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, FALLTHROUGH; } + case NI_Vector_ConvertToInt64: case NI_Vector64_ConvertToInt64: case NI_Vector128_ConvertToInt64: { @@ -987,9 +1064,16 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op1 = impSIMDPopStack(); retNode = gtNewSimdCvtNativeNode(retType, op1, CORINFO_TYPE_LONG, simdBaseJitType, simdSize); + // if (intrinsic == NI_Vector_ConvertToInt64) + //{ + // intrinsic = GenTreeHWIntrinsic::GetScalableHWIntrinsicId(retType, + // retNode->AsHWIntrinsic()->GetHWIntrinsicId()); + // retNode->AsHWIntrinsic()->ChangeHWIntrinsicId(intrinsic); + // } break; } + case NI_Vector_ConvertToSingle: case NI_Vector64_ConvertToSingle: case NI_Vector128_ConvertToSingle: { @@ -998,9 +1082,16 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op1 = impSIMDPopStack(); retNode = gtNewSimdHWIntrinsicNode(retType, op1, NI_AdvSimd_ConvertToSingle, simdBaseJitType, simdSize); + // if (intrinsic == NI_Vector_ConvertToSingle) + //{ + // intrinsic = GenTreeHWIntrinsic::GetScalableHWIntrinsicId(retType, + // retNode->AsHWIntrinsic()->GetHWIntrinsicId()); + // retNode->AsHWIntrinsic()->ChangeHWIntrinsicId(intrinsic); + // } break; } + case NI_Vector_ConvertToUInt32Native: case NI_Vector64_ConvertToUInt32Native: case NI_Vector128_ConvertToUInt32Native: { @@ -1011,6 +1102,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, FALLTHROUGH; } + case NI_Vector_ConvertToUInt32: case NI_Vector64_ConvertToUInt32: case NI_Vector128_ConvertToUInt32: { @@ -1019,9 +1111,16 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op1 = impSIMDPopStack(); retNode = gtNewSimdCvtNativeNode(retType, op1, CORINFO_TYPE_UINT, simdBaseJitType, simdSize); + // if ((intrinsic == NI_Vector_ConvertToUInt32Native) || (intrinsic == NI_Vector_ConvertToUInt32)) + //{ + // intrinsic = GenTreeHWIntrinsic::GetScalableHWIntrinsicId(retType, + // retNode->AsHWIntrinsic()->GetHWIntrinsicId()); + // retNode->AsHWIntrinsic()->ChangeHWIntrinsicId(intrinsic); + // } break; } + case NI_Vector_ConvertToUInt64Native: case NI_Vector64_ConvertToUInt64Native: case NI_Vector128_ConvertToUInt64Native: { @@ -1032,6 +1131,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, FALLTHROUGH; } + case NI_Vector_ConvertToUInt64: case 
NI_Vector64_ConvertToUInt64: case NI_Vector128_ConvertToUInt64: { @@ -1040,6 +1140,21 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op1 = impSIMDPopStack(); retNode = gtNewSimdCvtNativeNode(retType, op1, CORINFO_TYPE_ULONG, simdBaseJitType, simdSize); + // if ((intrinsic == NI_Vector_ConvertToUInt64Native) || (intrinsic == NI_Vector_ConvertToUInt64)) + //{ + // intrinsic = GenTreeHWIntrinsic::GetScalableHWIntrinsicId(retType, + // retNode->AsHWIntrinsic()->GetHWIntrinsicId()); + // retNode->AsHWIntrinsic()->ChangeHWIntrinsicId(intrinsic); + // } + break; + } + + case NI_Vector_Create: + { + assert(sig->numArgs == 1); + + op1 = impPopStack().val; + retNode = gtNewSimdHWIntrinsicNode(retType, op1, NI_Sve_DuplicateScalarToVector, simdBaseJitType, simdSize); break; } @@ -1231,6 +1346,21 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } + case NI_Vector_CreateSequence: + { + assert(Compiler::UseSveForType(retType)); + + if ((simdBaseJitType != CORINFO_TYPE_FLOAT) && (simdBaseJitType != CORINFO_TYPE_DOUBLE)) + { + // There is no way to do floating point `initial and `step` in SVE, corresponding + // to the `Vector.CreateSequence(). + op2 = impPopStack().val; + op1 = impPopStack().val; + retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, NI_Sve_Index, simdBaseJitType, simdSize); + } + break; + } + case NI_Vector64_CreateSequence: case NI_Vector128_CreateSequence: { @@ -1262,6 +1392,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } + case NI_Vector_op_Division: case NI_Vector64_op_Division: case NI_Vector128_op_Division: { @@ -1284,10 +1415,12 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, arg1, &argClass))); op1 = getArgForHWIntrinsic(argType, argClass); - retNode = gtNewSimdBinOpNode(GT_DIV, retType, op1, op2, simdBaseJitType, simdSize); + retNode = gtNewSimdBinOpNode(GT_DIV, retType, op1, op2, simdBaseJitType, simdSize, + intrinsic == NI_Vector_op_Division); break; } + case NI_Vector_Dot: // TODO-VL : Fix DOT to use SVE case NI_Vector64_Dot: case NI_Vector128_Dot: { @@ -1301,11 +1434,12 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op1 = impSIMDPopStack(); retNode = gtNewSimdDotProdNode(simdType, op1, op2, simdBaseJitType, simdSize); - retNode = gtNewSimdGetElementNode(retType, retNode, gtNewIconNode(0), simdBaseJitType, simdSize); + retNode = gtNewSimdGetElementNode(retType, retNode, gtNewIconNode(0), simdBaseJitType, simdSize, false); } break; } + case NI_Vector_Equals: case NI_Vector64_Equals: case NI_Vector128_Equals: { @@ -1314,10 +1448,12 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); - retNode = gtNewSimdCmpOpNode(GT_EQ, retType, op1, op2, simdBaseJitType, simdSize); + retNode = gtNewSimdCmpOpNode(GT_EQ, retType, op1, op2, simdBaseJitType, simdSize, + intrinsic == NI_Vector_Equals, true); break; } + case NI_Vector_op_Equality: case NI_Vector64_op_Equality: case NI_Vector128_op_Equality: { @@ -1326,10 +1462,18 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); - retNode = gtNewSimdCmpOpAllNode(GT_EQ, retType, op1, op2, simdBaseJitType, simdSize); + retNode = gtNewSimdCmpOpAllNode(GT_EQ, retType, op1, op2, simdBaseJitType, simdSize, + intrinsic == NI_Vector_op_Equality); + // if (intrinsic == NI_Vector_op_Equality) + //{ + // intrinsic = 
GenTreeHWIntrinsic::GetScalableHWIntrinsicId(retType, + // retNode->AsHWIntrinsic()->GetHWIntrinsicId()); + // retNode->AsHWIntrinsic()->ChangeHWIntrinsicId(intrinsic); + // } break; } + case NI_Vector_EqualsAny: case NI_Vector64_EqualsAny: case NI_Vector128_EqualsAny: { @@ -1338,7 +1482,14 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); - retNode = gtNewSimdCmpOpAnyNode(GT_EQ, retType, op1, op2, simdBaseJitType, simdSize); + retNode = gtNewSimdCmpOpAnyNode(GT_EQ, retType, op1, op2, simdBaseJitType, simdSize, + intrinsic == NI_Vector_EqualsAny); + // if (intrinsic == NI_Vector_EqualsAny) + //{ + // intrinsic = GenTreeHWIntrinsic::GetScalableHWIntrinsicId(retType, + // retNode->AsHWIntrinsic()->GetHWIntrinsicId()); + // retNode->AsHWIntrinsic()->ChangeHWIntrinsicId(intrinsic); + // } break; } @@ -1509,6 +1660,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } + case NI_Vector_Floor: case NI_Vector64_Floor: case NI_Vector128_Floor: { @@ -1522,9 +1674,16 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op1 = impSIMDPopStack(); retNode = gtNewSimdFloorNode(retType, op1, simdBaseJitType, simdSize); + // if (intrinsic == NI_Vector_Floor) + //{ + // intrinsic = GenTreeHWIntrinsic::GetScalableHWIntrinsicId(retType, + // retNode->AsHWIntrinsic()->GetHWIntrinsicId()); + // retNode->AsHWIntrinsic()->ChangeHWIntrinsicId(intrinsic); + // } break; } + case NI_Vector_FusedMultiplyAdd: case NI_Vector64_FusedMultiplyAdd: case NI_Vector128_FusedMultiplyAdd: { @@ -1542,9 +1701,25 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op1 = impSIMDPopStack(); retNode = gtNewSimdFmaNode(retType, op1, op2, op3, simdBaseJitType, simdSize); + // if (intrinsic == NI_Vector_FusedMultiplyAdd) + //{ + // intrinsic = GenTreeHWIntrinsic::GetScalableHWIntrinsicId(retType, + // retNode->AsHWIntrinsic()->GetHWIntrinsicId()); + // retNode->AsHWIntrinsic()->ChangeHWIntrinsicId(intrinsic); + // } + break; + } + + case NI_Vector_ToScalar: + { + op1 = impSIMDPopStack(); + + // Even for SVE, to scalar always would fetch 0th element from the overlapping SIMD register. 
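+ // The low 128 bits of each SVE Z register alias the corresponding NEON V register, so a fixed simdSize of 16 is enough to read element 0.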
+ retNode = gtNewSimdToScalarNode(genActualType(simdBaseType), op1, simdBaseJitType, 16); break; } + case NI_Vector_get_AllBitsSet: case NI_Vector64_get_AllBitsSet: case NI_Vector128_get_AllBitsSet: { @@ -1553,6 +1728,16 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } + case NI_Vector_get_Indices: + { + if ((simdBaseJitType != CORINFO_TYPE_FLOAT) && (simdBaseJitType != CORINFO_TYPE_DOUBLE)) + { + GenTree* start = gtNewIconNode(0, TYP_INT); + GenTree* step = gtNewIconNode(1, TYP_INT); + retNode = gtNewSimdHWIntrinsicNode(retType, start, step, NI_Sve_Index, simdBaseJitType, simdSize); + } + break; + } case NI_Vector64_get_Indices: case NI_Vector128_get_Indices: { @@ -1561,6 +1746,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } + case NI_Vector_get_One: case NI_Vector64_get_One: case NI_Vector128_get_One: { @@ -1569,6 +1755,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } + case NI_Vector_get_Zero: case NI_Vector64_get_Zero: case NI_Vector128_get_Zero: { @@ -1577,6 +1764,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } + case NI_Vector_GetElement: case NI_Vector64_GetElement: case NI_Vector128_GetElement: { @@ -1586,7 +1774,8 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op2 = impPopStack().val; op1 = impSIMDPopStack(); - retNode = gtNewSimdGetElementNode(retType, op1, op2, simdBaseJitType, simdSize); + retNode = gtNewSimdGetElementNode(retType, op1, op2, simdBaseJitType, simdSize, + intrinsic == NI_Vector_GetElement); break; } @@ -1608,6 +1797,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } + case NI_Vector_GreaterThan: case NI_Vector64_GreaterThan: case NI_Vector128_GreaterThan: { @@ -1616,10 +1806,12 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); - retNode = gtNewSimdCmpOpNode(GT_GT, retType, op1, op2, simdBaseJitType, simdSize); + retNode = gtNewSimdCmpOpNode(GT_GT, retType, op1, op2, simdBaseJitType, simdSize, + intrinsic == NI_Vector_GreaterThan, true); break; } + case NI_Vector_GreaterThanAll: case NI_Vector64_GreaterThanAll: case NI_Vector128_GreaterThanAll: { @@ -1628,10 +1820,18 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); - retNode = gtNewSimdCmpOpAllNode(GT_GT, retType, op1, op2, simdBaseJitType, simdSize); + retNode = gtNewSimdCmpOpAllNode(GT_GT, retType, op1, op2, simdBaseJitType, simdSize, + intrinsic == NI_Vector_GreaterThanAll); + // if (intrinsic == NI_Vector_GreaterThanAll) + //{ + // intrinsic = GenTreeHWIntrinsic::GetScalableHWIntrinsicId(retType, + // retNode->AsHWIntrinsic()->GetHWIntrinsicId()); + // retNode->AsHWIntrinsic()->ChangeHWIntrinsicId(intrinsic); + // } break; } + case NI_Vector_GreaterThanAny: case NI_Vector64_GreaterThanAny: case NI_Vector128_GreaterThanAny: { @@ -1640,10 +1840,18 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); - retNode = gtNewSimdCmpOpAnyNode(GT_GT, retType, op1, op2, simdBaseJitType, simdSize); + retNode = gtNewSimdCmpOpAnyNode(GT_GT, retType, op1, op2, simdBaseJitType, simdSize, + intrinsic == NI_Vector_GreaterThanAny); + // if (intrinsic == NI_Vector_GreaterThanAny) + //{ + // intrinsic = GenTreeHWIntrinsic::GetScalableHWIntrinsicId(retType, + // retNode->AsHWIntrinsic()->GetHWIntrinsicId()); + // retNode->AsHWIntrinsic()->ChangeHWIntrinsicId(intrinsic); 
+ // } break; } + case NI_Vector_GreaterThanOrEqual: case NI_Vector64_GreaterThanOrEqual: case NI_Vector128_GreaterThanOrEqual: { @@ -1652,10 +1860,18 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); - retNode = gtNewSimdCmpOpNode(GT_GE, retType, op1, op2, simdBaseJitType, simdSize); + retNode = gtNewSimdCmpOpNode(GT_GE, retType, op1, op2, simdBaseJitType, simdSize, + intrinsic == NI_Vector_GreaterThanOrEqual, true); + // if (intrinsic == NI_Vector_GreaterThanOrEqual) + //{ + // intrinsic = GenTreeHWIntrinsic::GetScalableHWIntrinsicId(retType, + // retNode->AsHWIntrinsic()->GetHWIntrinsicId()); + // retNode->AsHWIntrinsic()->ChangeHWIntrinsicId(intrinsic); + // } break; } + case NI_Vector_GreaterThanOrEqualAll: case NI_Vector64_GreaterThanOrEqualAll: case NI_Vector128_GreaterThanOrEqualAll: { @@ -1664,10 +1880,18 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); - retNode = gtNewSimdCmpOpAllNode(GT_GE, retType, op1, op2, simdBaseJitType, simdSize); + retNode = gtNewSimdCmpOpAllNode(GT_GE, retType, op1, op2, simdBaseJitType, simdSize, + intrinsic == NI_Vector_GreaterThanOrEqualAll); + // if (intrinsic == NI_Vector_GreaterThanOrEqualAll) + //{ + // intrinsic = GenTreeHWIntrinsic::GetScalableHWIntrinsicId(retType, + // retNode->AsHWIntrinsic()->GetHWIntrinsicId()); + // retNode->AsHWIntrinsic()->ChangeHWIntrinsicId(intrinsic); + // } break; } + case NI_Vector_GreaterThanOrEqualAny: case NI_Vector64_GreaterThanOrEqualAny: case NI_Vector128_GreaterThanOrEqualAny: { @@ -1676,7 +1900,14 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); - retNode = gtNewSimdCmpOpAnyNode(GT_GE, retType, op1, op2, simdBaseJitType, simdSize); + retNode = gtNewSimdCmpOpAnyNode(GT_GE, retType, op1, op2, simdBaseJitType, simdSize, + intrinsic == NI_Vector_GreaterThanOrEqualAny); + // if (intrinsic == NI_Vector_GreaterThanOrEqualAny) + //{ + // intrinsic = GenTreeHWIntrinsic::GetScalableHWIntrinsicId(retType, + // retNode->AsHWIntrinsic()->GetHWIntrinsicId()); + // retNode->AsHWIntrinsic()->ChangeHWIntrinsicId(intrinsic); + // } break; } @@ -1724,21 +1955,24 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } + case NI_Vector_IsNaN: case NI_Vector64_IsNaN: case NI_Vector128_IsNaN: { assert(sig->numArgs == 1); op1 = impSIMDPopStack(); - retNode = gtNewSimdIsNaNNode(retType, op1, simdBaseJitType, simdSize); + retNode = gtNewSimdIsNaNNode(retType, op1, simdBaseJitType, simdSize, intrinsic == NI_Vector_IsNaN); break; } + case NI_Vector_IsNegative: case NI_Vector64_IsNegative: case NI_Vector128_IsNegative: { assert(sig->numArgs == 1); - op1 = impSIMDPopStack(); - retNode = gtNewSimdIsNegativeNode(retType, op1, simdBaseJitType, simdSize); + op1 = impSIMDPopStack(); + retNode = + gtNewSimdIsNegativeNode(retType, op1, simdBaseJitType, simdSize, intrinsic == NI_Vector_IsNegative); break; } @@ -1777,21 +2011,25 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } + case NI_Vector_IsPositive: case NI_Vector64_IsPositive: case NI_Vector128_IsPositive: { assert(sig->numArgs == 1); - op1 = impSIMDPopStack(); - retNode = gtNewSimdIsPositiveNode(retType, op1, simdBaseJitType, simdSize); + op1 = impSIMDPopStack(); + retNode = + gtNewSimdIsPositiveNode(retType, op1, simdBaseJitType, simdSize, intrinsic == NI_Vector_IsPositive); break; } + case NI_Vector_IsPositiveInfinity: case 
NI_Vector64_IsPositiveInfinity: case NI_Vector128_IsPositiveInfinity: { assert(sig->numArgs == 1); op1 = impSIMDPopStack(); - retNode = gtNewSimdIsPositiveInfinityNode(retType, op1, simdBaseJitType, simdSize); + retNode = gtNewSimdIsPositiveInfinityNode(retType, op1, simdBaseJitType, simdSize, + intrinsic == NI_Vector_IsPositiveInfinity); break; } @@ -1804,15 +2042,17 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } + case NI_Vector_IsZero: case NI_Vector64_IsZero: case NI_Vector128_IsZero: { assert(sig->numArgs == 1); op1 = impSIMDPopStack(); - retNode = gtNewSimdIsZeroNode(retType, op1, simdBaseJitType, simdSize); + retNode = gtNewSimdIsZeroNode(retType, op1, simdBaseJitType, simdSize, intrinsic == NI_Vector_IsZero); break; } + case NI_Vector_LessThan: case NI_Vector64_LessThan: case NI_Vector128_LessThan: { @@ -1821,10 +2061,18 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); - retNode = gtNewSimdCmpOpNode(GT_LT, retType, op1, op2, simdBaseJitType, simdSize); + retNode = gtNewSimdCmpOpNode(GT_LT, retType, op1, op2, simdBaseJitType, simdSize, + intrinsic == NI_Vector_LessThan, true); + // if (intrinsic == NI_Vector_LessThan) + //{ + // intrinsic = GenTreeHWIntrinsic::GetScalableHWIntrinsicId(retType, + // retNode->AsHWIntrinsic()->GetHWIntrinsicId()); + // retNode->AsHWIntrinsic()->ChangeHWIntrinsicId(intrinsic); + // } break; } + case NI_Vector_LessThanAll: case NI_Vector64_LessThanAll: case NI_Vector128_LessThanAll: { @@ -1833,10 +2081,18 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); - retNode = gtNewSimdCmpOpAllNode(GT_LT, retType, op1, op2, simdBaseJitType, simdSize); + retNode = gtNewSimdCmpOpAllNode(GT_LT, retType, op1, op2, simdBaseJitType, simdSize, + intrinsic == NI_Vector_LessThanAll); + // if (intrinsic == NI_Vector_LessThanAll) + //{ + // intrinsic = GenTreeHWIntrinsic::GetScalableHWIntrinsicId(retType, + // retNode->AsHWIntrinsic()->GetHWIntrinsicId()); + // retNode->AsHWIntrinsic()->ChangeHWIntrinsicId(intrinsic); + // } break; } + case NI_Vector_LessThanAny: case NI_Vector64_LessThanAny: case NI_Vector128_LessThanAny: { @@ -1845,10 +2101,18 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); - retNode = gtNewSimdCmpOpAnyNode(GT_LT, retType, op1, op2, simdBaseJitType, simdSize); + retNode = gtNewSimdCmpOpAnyNode(GT_LT, retType, op1, op2, simdBaseJitType, simdSize, + intrinsic == NI_Vector_LessThanAny); + // if (intrinsic == NI_Vector_LessThanAny) + //{ + // intrinsic = GenTreeHWIntrinsic::GetScalableHWIntrinsicId(retType, + // retNode->AsHWIntrinsic()->GetHWIntrinsicId()); + // retNode->AsHWIntrinsic()->ChangeHWIntrinsicId(intrinsic); + // } break; } + case NI_Vector_LessThanOrEqual: case NI_Vector64_LessThanOrEqual: case NI_Vector128_LessThanOrEqual: { @@ -1857,10 +2121,12 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); - retNode = gtNewSimdCmpOpNode(GT_LE, retType, op1, op2, simdBaseJitType, simdSize); + retNode = gtNewSimdCmpOpNode(GT_LE, retType, op1, op2, simdBaseJitType, simdSize, + intrinsic == NI_Vector_LessThanOrEqual, true); break; } + case NI_Vector_LessThanOrEqualAll: case NI_Vector64_LessThanOrEqualAll: case NI_Vector128_LessThanOrEqualAll: { @@ -1869,10 +2135,18 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op2 = impSIMDPopStack(); op1 = 
impSIMDPopStack(); - retNode = gtNewSimdCmpOpAllNode(GT_LE, retType, op1, op2, simdBaseJitType, simdSize); + retNode = gtNewSimdCmpOpAllNode(GT_LE, retType, op1, op2, simdBaseJitType, simdSize, + intrinsic == NI_Vector_LessThanOrEqualAll); + // if (intrinsic == NI_Vector_LessThanOrEqualAll) + //{ + // intrinsic = GenTreeHWIntrinsic::GetScalableHWIntrinsicId(retType, + // retNode->AsHWIntrinsic()->GetHWIntrinsicId()); + // retNode->AsHWIntrinsic()->ChangeHWIntrinsicId(intrinsic); + // } break; } + case NI_Vector_LessThanOrEqualAny: case NI_Vector64_LessThanOrEqualAny: case NI_Vector128_LessThanOrEqualAny: { @@ -1881,12 +2155,20 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); - retNode = gtNewSimdCmpOpAnyNode(GT_LE, retType, op1, op2, simdBaseJitType, simdSize); + retNode = gtNewSimdCmpOpAnyNode(GT_LE, retType, op1, op2, simdBaseJitType, simdSize, + intrinsic == NI_Vector_LessThanOrEqualAny); + // if (intrinsic == NI_Vector_LessThanOrEqualAny) + //{ + // intrinsic = GenTreeHWIntrinsic::GetScalableHWIntrinsicId(retType, + // retNode->AsHWIntrinsic()->GetHWIntrinsicId()); + // retNode->AsHWIntrinsic()->ChangeHWIntrinsicId(intrinsic); + // } break; } case NI_AdvSimd_LoadVector64: case NI_AdvSimd_LoadVector128: + case NI_Vector_LoadUnsafe: case NI_Vector64_LoadUnsafe: case NI_Vector128_LoadUnsafe: { @@ -1918,6 +2200,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } + case NI_Vector_LoadAligned: case NI_Vector64_LoadAligned: case NI_Vector128_LoadAligned: { @@ -1943,6 +2226,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } + case NI_Vector_LoadAlignedNonTemporal: case NI_Vector64_LoadAlignedNonTemporal: case NI_Vector128_LoadAlignedNonTemporal: { @@ -1968,6 +2252,12 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } + case NI_Vector_Max: + { + isScalable = true; + FALLTHROUGH; + } + // case NI_Vector_MaxNumber: case NI_Vector64_Max: case NI_Vector128_Max: { @@ -1995,6 +2285,11 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } + case NI_Vector_MaxNative: + { + isScalable = true; + FALLTHROUGH; + } case NI_Vector64_MaxNative: case NI_Vector128_MaxNative: { @@ -2010,9 +2305,20 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, isMinMaxIntrinsic = true; isMax = true; isNumber = true; + //{ + // intrinsic = GenTreeHWIntrinsic::GetScalableHWIntrinsicId(retType, + // retNode->AsHWIntrinsic()->GetHWIntrinsicId()); + // retNode->AsHWIntrinsic()->ChangeHWIntrinsicId(intrinsic); + // } break; } + case NI_Vector_Min: + // case NI_Vector_MinNumber: + { + isScalable = true; + FALLTHROUGH; + } case NI_Vector64_Min: case NI_Vector128_Min: { @@ -2034,9 +2340,15 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, isMinMaxIntrinsic = true; isMagnitude = true; isNumber = true; + // } break; } + case NI_Vector_MinNative: + { + isScalable = true; + FALLTHROUGH; + } case NI_Vector64_MinNative: case NI_Vector128_MinNative: { @@ -2050,6 +2362,31 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, { isMinMaxIntrinsic = true; isNumber = true; + // if (intrinsic == NI_Vector_MinNative) + //{ + // intrinsic = GenTreeHWIntrinsic::GetScalableHWIntrinsicId(retType, + // retNode->AsHWIntrinsic()->GetHWIntrinsicId()); + // retNode->AsHWIntrinsic()->ChangeHWIntrinsicId(intrinsic); + // } + break; + } + + case NI_Vector_op_Multiply: + { + assert(sig->numArgs == 2); + + CORINFO_ARG_LIST_HANDLE arg1 = 
sig->args; + CORINFO_ARG_LIST_HANDLE arg2 = info.compCompHnd->getArgNext(arg1); + var_types argType = TYP_UNKNOWN; + CORINFO_CLASS_HANDLE argClass = NO_CLASS_HANDLE; + + argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, arg2, &argClass))); + op2 = getArgForHWIntrinsic(argType, argClass); + + argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, arg1, &argClass))); + op1 = getArgForHWIntrinsic(argType, argClass); + + retNode = gtNewSimdBinOpNode(GT_MUL, retType, op1, op2, simdBaseJitType, simdSize, true); break; } @@ -2069,10 +2406,11 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, arg1, &argClass))); op1 = getArgForHWIntrinsic(argType, argClass); - retNode = gtNewSimdBinOpNode(GT_MUL, retType, op1, op2, simdBaseJitType, simdSize); + retNode = gtNewSimdBinOpNode(GT_MUL, retType, op1, op2, simdBaseJitType, simdSize, false); break; } + case NI_Vector_MultiplyAddEstimate: case NI_Vector64_MultiplyAddEstimate: case NI_Vector128_MultiplyAddEstimate: { @@ -2099,15 +2437,24 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, if (varTypeIsFloating(simdBaseType)) { retNode = gtNewSimdFmaNode(retType, op1, op2, op3, simdBaseJitType, simdSize); + // if (intrinsic == NI_Vector_MultiplyAddEstimate) + //{ + // intrinsic = GenTreeHWIntrinsic::GetScalableHWIntrinsicId(retType, + // retNode->AsHWIntrinsic()->GetHWIntrinsicId()); + // retNode->AsHWIntrinsic()->ChangeHWIntrinsicId(intrinsic); + // } } else { - GenTree* mulNode = gtNewSimdBinOpNode(GT_MUL, retType, op1, op2, simdBaseJitType, simdSize); - retNode = gtNewSimdBinOpNode(GT_ADD, retType, mulNode, op3, simdBaseJitType, simdSize); + GenTree* mulNode = gtNewSimdBinOpNode(GT_MUL, retType, op1, op2, simdBaseJitType, simdSize, + intrinsic == NI_Vector_MultiplyAddEstimate); + retNode = gtNewSimdBinOpNode(GT_ADD, retType, mulNode, op3, simdBaseJitType, simdSize, + intrinsic == NI_Vector_MultiplyAddEstimate); } break; } + case NI_Vector_Narrow: case NI_Vector64_Narrow: case NI_Vector128_Narrow: { @@ -2153,24 +2500,35 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } + case NI_Vector_op_UnaryNegation: case NI_Vector64_op_UnaryNegation: case NI_Vector128_op_UnaryNegation: { assert(sig->numArgs == 1); op1 = impSIMDPopStack(); - retNode = gtNewSimdUnOpNode(GT_NEG, retType, op1, simdBaseJitType, simdSize); + retNode = gtNewSimdUnOpNode(GT_NEG, retType, op1, simdBaseJitType, simdSize, + intrinsic == NI_Vector_op_UnaryNegation); break; } + case NI_Vector_op_OnesComplement: case NI_Vector64_op_OnesComplement: case NI_Vector128_op_OnesComplement: { assert(sig->numArgs == 1); op1 = impSIMDPopStack(); - retNode = gtNewSimdUnOpNode(GT_NOT, retType, op1, simdBaseJitType, simdSize); + retNode = gtNewSimdUnOpNode(GT_NOT, retType, op1, simdBaseJitType, simdSize, + intrinsic == NI_Vector_op_OnesComplement); + // if (intrinsic == NI_Vector_op_OnesComplement) + //{ + // intrinsic = GenTreeHWIntrinsic::GetScalableHWIntrinsicId(retType, + // retNode->AsHWIntrinsic()->GetHWIntrinsicId()); + // retNode->AsHWIntrinsic()->ChangeHWIntrinsicId(intrinsic); + // } break; } + case NI_Vector_op_Inequality: case NI_Vector64_op_Inequality: case NI_Vector128_op_Inequality: { @@ -2179,10 +2537,18 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); - retNode = gtNewSimdCmpOpAnyNode(GT_NE, retType, op1, op2, simdBaseJitType, simdSize); + retNode = 
gtNewSimdCmpOpAnyNode(GT_NE, retType, op1, op2, simdBaseJitType, simdSize, + intrinsic == NI_Vector_op_Inequality); + // if (intrinsic == NI_Vector_op_Inequality) + //{ + // intrinsic = GenTreeHWIntrinsic::GetScalableHWIntrinsicId(retType, + // retNode->AsHWIntrinsic()->GetHWIntrinsicId()); + // retNode->AsHWIntrinsic()->ChangeHWIntrinsicId(intrinsic); + // } break; } + case NI_Vector_op_UnaryPlus: case NI_Vector64_op_UnaryPlus: case NI_Vector128_op_UnaryPlus: { @@ -2191,6 +2557,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } + case NI_Vector_op_Subtraction: case NI_Vector64_op_Subtraction: case NI_Vector128_op_Subtraction: { @@ -2199,10 +2566,23 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); - retNode = gtNewSimdBinOpNode(GT_SUB, retType, op1, op2, simdBaseJitType, simdSize); + retNode = gtNewSimdBinOpNode(GT_SUB, retType, op1, op2, simdBaseJitType, simdSize, + intrinsic == NI_Vector_op_Subtraction); break; } + case NI_Vector_ShiftLeft: + case NI_Vector_op_LeftShift: + { + assert(sig->numArgs == 2); + + op2 = impPopStack().val; + op1 = impSIMDPopStack(); + + retNode = gtNewSimdBinOpNode(GT_LSH, retType, op1, op2, simdBaseJitType, simdSize, true); + retNode->AsHWIntrinsic()->SetAuxiliaryJitType(simdBaseJitType); + break; + } case NI_Vector64_op_LeftShift: case NI_Vector128_op_LeftShift: { @@ -2211,7 +2591,21 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op2 = impPopStack().val; op1 = impSIMDPopStack(); - retNode = gtNewSimdBinOpNode(GT_LSH, retType, op1, op2, simdBaseJitType, simdSize); + retNode = gtNewSimdBinOpNode(GT_LSH, retType, op1, op2, simdBaseJitType, simdSize, false); + break; + } + + case NI_Vector_ShiftRightLogical: + case NI_Vector_op_RightShift: + { + assert(sig->numArgs == 2); + genTreeOps op = varTypeIsUnsigned(simdBaseType) ? 
GT_RSZ : GT_RSH; + + op2 = impPopStack().val; + op1 = impSIMDPopStack(); + + retNode = gtNewSimdBinOpNode(op, retType, op1, op2, simdBaseJitType, simdSize, true); + retNode->AsHWIntrinsic()->SetAuxiliaryJitType(simdBaseJitType); break; } @@ -2224,7 +2618,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op2 = impPopStack().val; op1 = impSIMDPopStack(); - retNode = gtNewSimdBinOpNode(op, retType, op1, op2, simdBaseJitType, simdSize); + retNode = gtNewSimdBinOpNode(op, retType, op1, op2, simdBaseJitType, simdSize, false); break; } @@ -2236,10 +2630,11 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op2 = impPopStack().val; op1 = impSIMDPopStack(); - retNode = gtNewSimdBinOpNode(GT_RSZ, retType, op1, op2, simdBaseJitType, simdSize); + retNode = gtNewSimdBinOpNode(GT_RSZ, retType, op1, op2, simdBaseJitType, simdSize, false); break; } + case NI_Vector_Round: case NI_Vector64_Round: case NI_Vector128_Round: { @@ -2256,6 +2651,12 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op1 = impSIMDPopStack(); retNode = gtNewSimdRoundNode(retType, op1, simdBaseJitType, simdSize); + // if (intrinsic == NI_Vector_Round) + //{ + // intrinsic = GenTreeHWIntrinsic::GetScalableHWIntrinsicId(retType, + // retNode->AsHWIntrinsic()->GetHWIntrinsicId()); + // retNode->AsHWIntrinsic()->ChangeHWIntrinsicId(intrinsic); + // } break; } @@ -2417,6 +2818,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } + case NI_Vector_StoreUnsafe: case NI_Vector64_StoreUnsafe: case NI_Vector128_StoreUnsafe: { @@ -2458,6 +2860,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } + case NI_Vector_StoreAligned: case NI_Vector64_StoreAligned: case NI_Vector128_StoreAligned: { @@ -2488,6 +2891,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } + case NI_Vector_StoreAlignedNonTemporal: case NI_Vector64_StoreAlignedNonTemporal: case NI_Vector128_StoreAlignedNonTemporal: { @@ -2639,7 +3043,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, if (varTypeIsFloating(simdBaseType)) { - retNode = gtNewSimdBinOpNode(GT_SUB, retType, op1, op2, simdBaseJitType, simdSize); + retNode = gtNewSimdBinOpNode(GT_SUB, retType, op1, op2, simdBaseJitType, simdSize, false); } else { @@ -2655,6 +3059,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } + case NI_Vector_Sum: case NI_Vector64_Sum: case NI_Vector128_Sum: { @@ -2664,6 +3069,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } + case NI_Vector_Truncate: case NI_Vector64_Truncate: case NI_Vector128_Truncate: { @@ -2677,9 +3083,16 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op1 = impSIMDPopStack(); retNode = gtNewSimdTruncNode(retType, op1, simdBaseJitType, simdSize); + // if (intrinsic == NI_Vector_Truncate) + //{ + // intrinsic = GenTreeHWIntrinsic::GetScalableHWIntrinsicId(retType, + // retNode->AsHWIntrinsic()->GetHWIntrinsicId()); + // retNode->AsHWIntrinsic()->ChangeHWIntrinsicId(intrinsic); + // } break; } + // case NI_Vector_WidenLower: case NI_Vector64_WidenLower: case NI_Vector128_WidenLower: { @@ -2688,9 +3101,16 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op1 = impSIMDPopStack(); retNode = gtNewSimdWidenLowerNode(retType, op1, simdBaseJitType, simdSize); + // if (intrinsic == NI_Vector_WidenLower) + //{ + // intrinsic = GenTreeHWIntrinsic::GetScalableHWIntrinsicId(retType, + // retNode->AsHWIntrinsic()->GetHWIntrinsicId()); + // 
retNode->AsHWIntrinsic()->ChangeHWIntrinsicId(intrinsic); + // } break; } + // case NI_Vector_WidenUpper: case NI_Vector64_WidenUpper: case NI_Vector128_WidenUpper: { @@ -2699,6 +3119,12 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op1 = impSIMDPopStack(); retNode = gtNewSimdWidenUpperNode(retType, op1, simdBaseJitType, simdSize); + // if (intrinsic == NI_Vector_WidenUpper) + //{ + // intrinsic = GenTreeHWIntrinsic::GetScalableHWIntrinsicId(retType, + // retNode->AsHWIntrinsic()->GetHWIntrinsicId()); + // retNode->AsHWIntrinsic()->ChangeHWIntrinsicId(intrinsic); + // } break; } @@ -2764,6 +3190,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } + case NI_Vector_op_ExclusiveOr: case NI_Vector64_op_ExclusiveOr: case NI_Vector128_op_ExclusiveOr: { @@ -2772,7 +3199,8 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, op2 = impSIMDPopStack(); op1 = impSIMDPopStack(); - retNode = gtNewSimdBinOpNode(GT_XOR, retType, op1, op2, simdBaseJitType, simdSize); + retNode = gtNewSimdBinOpNode(GT_XOR, retType, op1, op2, simdBaseJitType, simdSize, + intrinsic == NI_Vector_op_ExclusiveOr); break; } @@ -3399,21 +3827,141 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, if (isNative) { assert(!isMagnitude && !isNumber); - retNode = gtNewSimdMinMaxNativeNode(retType, op1, op2, simdBaseJitType, simdSize, isMax); + retNode = + gtNewSimdMinMaxNativeNode(retType, op1, op2, simdBaseJitType, simdSize, isMax ARM64_ARG(isScalable)); } else { - retNode = gtNewSimdMinMaxNode(retType, op1, op2, simdBaseJitType, simdSize, isMax, isMagnitude, isNumber); + retNode = gtNewSimdMinMaxNode(retType, op1, op2, simdBaseJitType, simdSize, isMax, isMagnitude, + isNumber ARM64_ARG(isScalable)); } } +#ifdef TARGET_ARM64 + if ((retNode != nullptr) && (intrinsic >= FIRST_NI_Vector) && (intrinsic <= LAST_NI_Vector)) + { + // For VectorT, map the intrinsics + switch (intrinsic) + { + case NI_Vector_Abs: + case NI_Vector_Ceiling: + case NI_Vector_ConditionalSelect: + case NI_Vector_ConvertToDouble: + case NI_Vector_ConvertToInt32Native: + case NI_Vector_ConvertToInt32: + case NI_Vector_ConvertToInt64Native: + case NI_Vector_ConvertToInt64: + case NI_Vector_ConvertToSingle: + case NI_Vector_ConvertToUInt32Native: + case NI_Vector_ConvertToUInt32: + case NI_Vector_ConvertToUInt64Native: + case NI_Vector_ConvertToUInt64: + case NI_Vector_Floor: + case NI_Vector_FusedMultiplyAdd: + case NI_Vector_Max: + case NI_Vector_MaxNative: + case NI_Vector_Min: + case NI_Vector_MinNative: + case NI_Vector_MultiplyAddEstimate: + case NI_Vector_Round: + case NI_Vector_op_Subtraction: + case NI_Vector_Sum: + case NI_Vector_Truncate: + // case NI_Vector_WidenLower: + // case NI_Vector_WidenUpper: + { + if (retNode->OperIsHWIntrinsic()) + { + intrinsic = + GenTreeHWIntrinsic::GetScalableHWIntrinsicId(retType, simdBaseType, + retNode->AsHWIntrinsic()->GetHWIntrinsicId()); + retNode->AsHWIntrinsic()->ChangeHWIntrinsicId(intrinsic); + } + break; + } + case NI_Vector_Add: + case NI_Vector_op_Addition: + case NI_Vector_AndNot: + case NI_Vector_op_BitwiseAnd: + case NI_Vector_op_BitwiseOr: + case NI_Vector_op_Division: + case NI_Vector_op_Multiply: + case NI_Vector_op_ExclusiveOr: + { + // gtNewSimdBinOpNode should handle this + NamedIntrinsic sveIntrinsic = retNode->AsHWIntrinsic()->GetHWIntrinsicId(); + assert((FIRST_NI_Sve <= sveIntrinsic) && (sveIntrinsic <= LAST_NI_Sve)); + break; + } + case NI_Vector_Equals: + case NI_Vector_op_Equality: + case NI_Vector_EqualsAny: + case 
NI_Vector_GreaterThan: + case NI_Vector_GreaterThanAll: + case NI_Vector_GreaterThanAny: + case NI_Vector_GreaterThanOrEqual: + case NI_Vector_GreaterThanOrEqualAll: + case NI_Vector_GreaterThanOrEqualAny: + case NI_Vector_LessThan: + case NI_Vector_LessThanAll: + case NI_Vector_LessThanAny: + case NI_Vector_LessThanOrEqual: + case NI_Vector_LessThanOrEqualAll: + case NI_Vector_LessThanOrEqualAny: + case NI_Vector_op_Inequality: + { + // gtNewSimdCmpOpNode should handle this + NamedIntrinsic sveIntrinsic = retNode->AsHWIntrinsic()->GetHWIntrinsicId(); + assert(((FIRST_NI_Sve <= sveIntrinsic) && (sveIntrinsic <= LAST_NI_Sve)) || + ((FIRST_NI_Vector <= sveIntrinsic) && (sveIntrinsic <= LAST_NI_Vector)) || + (sveIntrinsic == NI_Sve_ConvertMaskToVector) || (sveIntrinsic == NI_Sve_ConvertVectorToMask)); + break; + } + case NI_Vector_op_OnesComplement: + { + // gtNewSimdUnOpNode should handle this + break; + } + case NI_Vector_get_One: + case NI_Vector_get_Zero: + { + // This are constants + break; + } + case NI_Vector_As: + case NI_Vector_AsVectorByte: + case NI_Vector_AsVectorDouble: + case NI_Vector_AsVectorInt16: + case NI_Vector_AsVectorInt32: + case NI_Vector_AsVectorInt64: + case NI_Vector_AsVectorNInt: + case NI_Vector_AsVectorNUInt: + case NI_Vector_AsVectorSByte: + case NI_Vector_AsVectorSingle: + case NI_Vector_AsVectorUInt16: + case NI_Vector_AsVectorUInt32: + case NI_Vector_AsVectorUInt64: + case NI_Vector_get_Indices: + { + // no-op for these + break; + } + default: + { + // TODO-VL: Enable this + // unreached(); + break; + } + } + } +#endif assert(!isScalar || isValidScalarIntrinsic); return retNode; } //------------------------------------------------------------------------ -// gtNewSimdAllTrueMaskNode: Create a mask with all bits set to true +// gtNewSimdAllTrueMaskNode: Create a AllTrue mask node // // Arguments: // simdBaseJitType -- the base jit type of the nodes being masked @@ -3421,16 +3969,29 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, // Return Value: // The mask // -GenTree* Compiler::gtNewSimdAllTrueMaskNode(CorInfoType simdBaseJitType) +GenTree* Compiler::gtNewSimdAllTrueMaskNode(CorInfoType simdBaseJitType, unsigned simdSize) { // Import as a constant mask var_types simdBaseType = JitType2PreciseVarType(simdBaseJitType); GenTreeMskCon* mskCon = gtNewMskConNode(TYP_MASK); - // TODO-SVE: For agnostic VL, vector type may not be simd16_t + bool found = false; - bool found = EvaluateSimdPatternToMask(simdBaseType, &mskCon->gtSimdMaskVal, SveMaskPatternAll); + switch (simdSize) + { + case 16: + found = EvaluateSimdPatternToMask(simdBaseType, &mskCon->gtSimdMaskVal, SveMaskPatternAll); + break; + case 32: + found = EvaluateSimdPatternToMask(simdBaseType, &mskCon->gtSimdMaskVal, SveMaskPatternAll); + break; + case 64: + found = EvaluateSimdPatternToMask(simdBaseType, &mskCon->gtSimdMaskVal, SveMaskPatternAll); + break; + default: + unreached(); + } assert(found); return mskCon; diff --git a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp index ca38c26ab7c845..c481ef05afcbb5 100644 --- a/src/coreclr/jit/hwintrinsiccodegenarm64.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenarm64.cpp @@ -591,6 +591,7 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) case NI_Sve_ConvertToInt64: case NI_Sve_ConvertToUInt64: case NI_Sve_ConvertToDouble: + case NI_Sve_ConvertToDoubleUpper: { embOpt = emitTypeSize(intrinEmbMask.baseType) == EA_4BYTE ? 
INS_OPTS_S_TO_D : INS_OPTS_SCALABLE_D; @@ -810,6 +811,20 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) emitInsHelper(targetReg, maskReg, embMaskOp2Reg); break; + case NI_Sve_MultiplyByScalar: + { + if (targetReg != embMaskOp1Reg) + { + GetEmitter()->emitIns_R_R_R(INS_sve_movprfx, EA_SCALABLE, targetReg, maskReg, + embMaskOp1Reg, opt); + } + assert(intrinEmbMask.op2->IsCnsFltOrDbl()); + double imm = intrinEmbMask.op2->AsDblCon()->DconValue(); + assert((imm == 0.5) || (imm == 2.0)); + GetEmitter()->emitIns_R_R_F(insEmbMask, emitSize, targetReg, op1Reg, imm, opt); + break; + } + default: assert(targetReg != embMaskOp2Reg); @@ -2684,6 +2699,62 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) break; } + case NI_Sve_Index: + { + if ((op1Reg == REG_NA) && (op2Reg == REG_NA)) + { + int start = (int)intrin.op1->AsIntCon()->gtIconVal; + int step = (int)intrin.op2->AsIntCon()->gtIconVal; + GetEmitter()->emitInsSve_R_I_I(ins, EA_SCALABLE, targetReg, start, step, + emitter::optGetSveInsOpt(emitTypeSize(intrin.baseType))); + } + else if ((op1Reg != REG_NA) && (op2Reg != REG_NA)) + { + emitAttr scalarSize = emitActualTypeSize(node->GetSimdBaseType()); + GetEmitter()->emitInsSve_R_R_R(ins, scalarSize, targetReg, op1Reg, op2Reg, + emitter::optGetSveInsOpt(emitTypeSize(intrin.baseType))); + } + else if (op1Reg != REG_NA) + { + assert(op2Reg == REG_NA); + int step = (int)intrin.op2->AsIntCon()->gtIconVal; + GetEmitter()->emitInsSve_R_R_I(ins, EA_SCALABLE, targetReg, op1Reg, step, + emitter::optGetSveInsOpt(emitTypeSize(intrin.baseType))); + } + else + { + assert(op1Reg == REG_NA); + + int start = (int)intrin.op1->AsIntCon()->gtIconVal; + GetEmitter()->emitInsSve_R_R_I(ins, EA_SCALABLE, targetReg, op2Reg, start, + emitter::optGetSveInsOpt(emitTypeSize(intrin.baseType)), + INS_SCALABLE_OPTS_IMM_FIRST); + } + break; + } + case NI_Sve_DuplicateScalarToVector: + { + if (op1Reg == REG_NA) + { + GetEmitter()->emitIns_R_I(ins, emitTypeSize(intrin.baseType), targetReg, + intrin.op1->AsIntCon()->IconValue(), + emitter::optGetSveInsOpt(emitTypeSize(intrin.baseType))); + } + else + { + if (varTypeIsIntegral(intrin.op1)) + { + GetEmitter()->emitIns_R_R(ins, emitTypeSize(intrin.baseType), targetReg, op1Reg, + emitter::optGetSveInsOpt(emitTypeSize(intrin.baseType))); + } + else + { + GetEmitter()->emitIns_R_R_I(ins, emitTypeSize(intrin.baseType), targetReg, op1Reg, 0, + emitter::optGetSveInsOpt(emitTypeSize(intrin.baseType))); + } + } + break; + } case NI_Sve2_AddCarryWideningLower: case NI_Sve2_AddCarryWideningUpper: @@ -2719,8 +2790,22 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) GetEmitter()->emitInsSve_R_R_R(ins, emitSize, targetReg, op3Reg, op1Reg, INS_OPTS_SCALABLE_D); break; + case NI_Sve_MultiplyByScalar: + { + if (targetReg != op2Reg) + { + GetEmitter()->emitInsSve_R_R(INS_sve_movprfx, EA_SCALABLE, targetReg, op2Reg); + } + assert(node->IsCnsFltOrDbl()); + unsigned imm = node->AsDblCon()->DconValue() == 0.5 ? 
0 : 1; + GetEmitter()->emitInsSve_R_R_I(ins, emitSize, targetReg, op1Reg, opt); + break; + } default: + { unreached(); + break; + } } } diff --git a/src/coreclr/jit/hwintrinsiclistarm64sve.h b/src/coreclr/jit/hwintrinsiclistarm64sve.h index bf13fc05b21104..c743fa4b07d3e1 100644 --- a/src/coreclr/jit/hwintrinsiclistarm64sve.h +++ b/src/coreclr/jit/hwintrinsiclistarm64sve.h @@ -10,13 +10,126 @@ // clang-format off #ifdef FEATURE_HW_INTRINSICS + +// *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** +// ISA Function name SIMD size NumArg Instructions Category Flags +// {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} +// *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** +// Vector +#define FIRST_NI_Vector NI_Vector_Abs +HARDWARE_INTRINSIC(Vector, Abs, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(Vector, Add, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(Vector, AndNot, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(Vector, As, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector, AsVectorByte, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector, AsVectorDouble, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector, AsVectorInt16, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector, AsVectorInt32, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector, AsVectorInt64, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, 
INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector, AsVectorNInt, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector, AsVectorNUInt, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector, AsVectorSByte, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector, AsVectorSingle, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector, AsVectorUInt16, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector, AsVectorUInt32, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector, AsVectorUInt64, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector, BitwiseAnd, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_Commutative) +HARDWARE_INTRINSIC(Vector, BitwiseOr, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_Commutative) +HARDWARE_INTRINSIC(Vector, Ceiling, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(Vector, ConditionalSelect, -1, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(Vector, ConvertToDouble, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector, ConvertToInt32, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector, ConvertToInt32Native, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, 
HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector, ConvertToInt64, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector, ConvertToInt64Native, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector, ConvertToSingle, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector, ConvertToUInt32, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector, ConvertToUInt32Native, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector, ConvertToUInt64, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector, ConvertToUInt64Native, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector, Create, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector, CreateSequence, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(Vector, Division, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(Vector, Dot, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector, Equals, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(Vector, EqualsAll, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector, EqualsAny, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector, 
Floor, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(Vector, FusedMultiplyAdd, -1, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(Vector, GetElement, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_SupportsContainment) +HARDWARE_INTRINSIC(Vector, GreaterThan, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(Vector, GreaterThanAll, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector, GreaterThanAny, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector, GreaterThanOrEqual, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(Vector, GreaterThanOrEqualAll, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector, GreaterThanOrEqualAny, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector, IsNaN, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(Vector, IsNegative, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(Vector, IsPositive, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(Vector, IsPositiveInfinity, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(Vector, IsZero, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(Vector, LessThan, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) 
+HARDWARE_INTRINSIC(Vector, LessThanAll, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector, LessThanAny, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector, LessThanOrEqual, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(Vector, LessThanOrEqualAll, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector, LessThanOrEqualAny, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector, LoadAligned, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(Vector, LoadAlignedNonTemporal, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(Vector, LoadUnsafe, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(Vector, Max, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(Vector, MaxNative, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(Vector, MaxNumber, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(Vector, Min, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(Vector, MinNative, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(Vector, MinNumber, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(Vector, Multiply, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(Vector, MultiplyAddEstimate, -1, 3, {INS_invalid, INS_invalid, 
INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(Vector, Narrow, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(Vector, Negate, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(Vector, OnesComplement, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(Vector, Round, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(Vector, ShiftLeft, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(Vector, ShiftRightLogical, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(Vector, SquareRoot, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(Vector, StoreAligned, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(Vector, StoreAlignedNonTemporal, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(Vector, StoreUnsafe, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector, Subtract, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(Vector, Sum, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(Vector, ToScalar, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg|HW_Flag_SIMDScalar) +HARDWARE_INTRINSIC(Vector, Truncate, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) +// No good equivalent in SVE to simulate WidenLower i.e. 
ConvertToDouble / WidenUpper i.e. ConvertToDoubleUpper
+//HARDWARE_INTRINSIC(Vector, WidenLower, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg)
+//HARDWARE_INTRINSIC(Vector, WidenUpper, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_BaseTypeFromFirstArg)
+HARDWARE_INTRINSIC(Vector, Xor, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
+HARDWARE_INTRINSIC(Vector, get_AllBitsSet, -1, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
+HARDWARE_INTRINSIC(Vector, get_Indices, -1, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
+HARDWARE_INTRINSIC(Vector, get_One, -1, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
+HARDWARE_INTRINSIC(Vector, get_Zero, -1, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
+HARDWARE_INTRINSIC(Vector, op_Addition, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
+HARDWARE_INTRINSIC(Vector, op_BitwiseAnd, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_Commutative)
+HARDWARE_INTRINSIC(Vector, op_BitwiseOr, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId|HW_Flag_Commutative)
+HARDWARE_INTRINSIC(Vector, op_Division, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_Commutative|HW_Flag_CanBenefitFromConstantProp)
+HARDWARE_INTRINSIC(Vector, op_Equality, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_Commutative|HW_Flag_CanBenefitFromConstantProp)
+HARDWARE_INTRINSIC(Vector, op_ExclusiveOr, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId)
+HARDWARE_INTRINSIC(Vector, op_Inequality, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_Commutative|HW_Flag_CanBenefitFromConstantProp)
+HARDWARE_INTRINSIC(Vector, op_LeftShift, -1, 2, {INS_invalid,
INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(Vector, op_Multiply, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(Vector, op_OnesComplement, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(Vector, op_RightShift, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(Vector, op_Subtraction, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(Vector, op_UnaryNegation, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) +HARDWARE_INTRINSIC(Vector, op_UnaryPlus, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_InvalidNodeId) +#define LAST_NI_Vector NI_Vector_op_UnaryPlus + // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // ISA Function name SIMD size NumArg Instructions Category Flags // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // SVE Intrinsics #define FIRST_NI_Sve NI_Sve_Abs -HARDWARE_INTRINSIC(Sve, Abs, -1, -1, {INS_sve_abs, INS_invalid, INS_sve_abs, INS_invalid, INS_sve_abs, INS_invalid, INS_sve_abs, INS_invalid, INS_sve_fabs, INS_sve_fabs}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation) +HARDWARE_INTRINSIC(Sve, Abs, -1, -1, {INS_sve_abs, INS_sve_abs, INS_sve_abs, INS_sve_abs, INS_sve_abs, INS_sve_abs, INS_sve_abs, INS_sve_abs, INS_sve_fabs, INS_sve_fabs}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation) HARDWARE_INTRINSIC(Sve, AbsoluteCompareGreaterThan, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_facgt, INS_sve_facgt}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_ReturnsPerElementMask|HW_Flag_ZeroingMaskedOperation) 
HARDWARE_INTRINSIC(Sve, AbsoluteCompareGreaterThanOrEqual, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_facge, INS_sve_facge}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_ReturnsPerElementMask|HW_Flag_ZeroingMaskedOperation) HARDWARE_INTRINSIC(Sve, AbsoluteCompareLessThan, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_faclt, INS_sve_faclt}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_ReturnsPerElementMask|HW_Flag_ZeroingMaskedOperation) @@ -27,7 +140,7 @@ HARDWARE_INTRINSIC(Sve, AddAcross, HARDWARE_INTRINSIC(Sve, AddRotateComplex, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fcadd, INS_sve_fcadd}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_HasImmediateOperand) HARDWARE_INTRINSIC(Sve, AddSaturate, -1, 2, {INS_sve_sqadd, INS_sve_uqadd, INS_sve_sqadd, INS_sve_uqadd, INS_sve_sqadd, INS_sve_uqadd, INS_sve_sqadd, INS_sve_uqadd, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable) HARDWARE_INTRINSIC(Sve, AddSequentialAcross, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fadda, INS_sve_fadda}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_ReduceOperation) -HARDWARE_INTRINSIC(Sve, And, -1, -1, {INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_OptionalEmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation|HW_Flag_HasAllMaskVariant) +HARDWARE_INTRINSIC(Sve, And, -1, -1, {INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_OptionalEmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation|HW_Flag_HasAllMaskVariant) HARDWARE_INTRINSIC(Sve, AndAcross, -1, -1, {INS_sve_andv, INS_sve_andv, INS_sve_andv, INS_sve_andv, INS_sve_andv, INS_sve_andv, INS_sve_andv, INS_sve_andv, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_ReduceOperation) HARDWARE_INTRINSIC(Sve, BitwiseClear, -1, -1, {INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_OptionalEmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation|HW_Flag_HasAllMaskVariant) HARDWARE_INTRINSIC(Sve, BooleanNot, -1, -1, {INS_sve_cnot, INS_sve_cnot, INS_sve_cnot, INS_sve_cnot, INS_sve_cnot, INS_sve_cnot, INS_sve_cnot, INS_sve_cnot, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation) @@ -49,6 +162,7 @@ HARDWARE_INTRINSIC(Sve, ConditionalExtractLastActiveElement, HARDWARE_INTRINSIC(Sve, ConditionalExtractLastActiveElementAndReplicate, -1, 3, {INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb, INS_sve_clastb}, HW_Category_SIMD, 
HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_HasRMWSemantics) HARDWARE_INTRINSIC(Sve, ConditionalSelect, -1, 3, {INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_SupportsContainment|HW_Flag_HasAllMaskVariant) HARDWARE_INTRINSIC(Sve, ConvertToDouble, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_scvtf, INS_sve_ucvtf, INS_sve_scvtf, INS_sve_ucvtf, INS_sve_fcvt, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation) +HARDWARE_INTRINSIC(Sve, ConvertToDoubleUpper, -1, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fcvtlt, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation) HARDWARE_INTRINSIC(Sve, ConvertToInt32, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fcvtzs, INS_sve_fcvtzs}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation) HARDWARE_INTRINSIC(Sve, ConvertToInt64, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fcvtzs, INS_sve_fcvtzs}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation) HARDWARE_INTRINSIC(Sve, ConvertToSingle, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_scvtf, INS_sve_ucvtf, INS_sve_scvtf, INS_sve_ucvtf, INS_invalid, INS_sve_fcvt}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation) @@ -96,6 +210,7 @@ HARDWARE_INTRINSIC(Sve, CreateWhileLessThanOrEqualMask8Bit, HARDWARE_INTRINSIC(Sve, Divide, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_sdiv, INS_sve_udiv, INS_sve_sdiv, INS_sve_udiv, INS_sve_fdiv, INS_sve_fdiv}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation) HARDWARE_INTRINSIC(Sve, DotProduct, -1, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_sdot, INS_sve_udot, INS_sve_sdot, INS_sve_udot, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_HasRMWSemantics) HARDWARE_INTRINSIC(Sve, DotProductBySelectedScalar, -1, 4, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_sdot, INS_sve_udot, INS_sve_sdot, INS_sve_udot, INS_invalid, INS_invalid}, HW_Category_SIMDByIndexedElement, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg|HW_Flag_HasImmediateOperand|HW_Flag_HasRMWSemantics|HW_Flag_LowVectorOperation) +HARDWARE_INTRINSIC(Sve, DuplicateScalarToVector, -1, 1, {INS_sve_dup, INS_sve_dup, INS_sve_dup, INS_sve_dup, INS_sve_dup, INS_sve_dup, INS_sve_dup, INS_sve_dup, INS_sve_dup, INS_sve_dup}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SupportsContainment|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(Sve, DuplicateSelectedScalarToVector, -1, 2, {INS_sve_dup, INS_sve_dup, INS_sve_dup, INS_sve_dup, INS_sve_dup, INS_sve_dup, INS_sve_dup, INS_sve_dup, INS_sve_dup, INS_sve_dup}, HW_Category_SIMDByIndexedElement, HW_Flag_Scalable|HW_Flag_HasImmediateOperand) HARDWARE_INTRINSIC(Sve, ExtractAfterLastActiveElement, -1, 2, 
{INS_sve_lasta, INS_sve_lasta, INS_sve_lasta, INS_sve_lasta, INS_sve_lasta, INS_sve_lasta, INS_sve_lasta, INS_sve_lasta, INS_sve_lasta, INS_sve_lasta}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(Sve, ExtractAfterLastActiveElementScalar, 0, 2, {INS_sve_lasta, INS_sve_lasta, INS_sve_lasta, INS_sve_lasta, INS_sve_lasta, INS_sve_lasta, INS_sve_lasta, INS_sve_lasta, INS_sve_lasta, INS_sve_lasta}, HW_Category_Scalar, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport) @@ -146,6 +261,7 @@ HARDWARE_INTRINSIC(Sve, GetFfrSByte, HARDWARE_INTRINSIC(Sve, GetFfrUInt16, -1, 0, {INS_invalid, INS_invalid, INS_invalid, INS_sve_rdffr, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialSideEffect_Other) HARDWARE_INTRINSIC(Sve, GetFfrUInt32, -1, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_rdffr, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialSideEffect_Other) HARDWARE_INTRINSIC(Sve, GetFfrUInt64, -1, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_rdffr, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ReturnsPerElementMask|HW_Flag_SpecialSideEffect_Other) +HARDWARE_INTRINSIC(Sve, Index, -1, 2, {INS_sve_index, INS_sve_index, INS_sve_index, INS_sve_index, INS_sve_index, INS_sve_index, INS_sve_index, INS_sve_index, INS_sve_index, INS_sve_index}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_SupportsContainment) HARDWARE_INTRINSIC(Sve, InsertIntoShiftedVector, -1, 2, {INS_sve_insr, INS_sve_insr, INS_sve_insr, INS_sve_insr, INS_sve_insr, INS_sve_insr, INS_sve_insr, INS_sve_insr, INS_sve_insr, INS_sve_insr}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_HasRMWSemantics) HARDWARE_INTRINSIC(Sve, LeadingSignCount, -1, -1, {INS_sve_cls, INS_invalid, INS_sve_cls, INS_invalid, INS_sve_cls, INS_invalid, INS_sve_cls, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation) HARDWARE_INTRINSIC(Sve, LeadingZeroCount, -1, -1, {INS_sve_clz, INS_sve_clz, INS_sve_clz, INS_sve_clz, INS_sve_clz, INS_sve_clz, INS_sve_clz, INS_sve_clz, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation) @@ -201,14 +317,14 @@ HARDWARE_INTRINSIC(Sve, LoadVectorUInt16NonFaultingZeroExtendToInt32, HARDWARE_INTRINSIC(Sve, LoadVectorUInt16NonFaultingZeroExtendToInt64, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ldnf1h, INS_invalid, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_ZeroingMaskedOperation|HW_Flag_SpecialSideEffectMask) HARDWARE_INTRINSIC(Sve, LoadVectorUInt16NonFaultingZeroExtendToUInt32, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ldnf1h, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, 
HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_ZeroingMaskedOperation|HW_Flag_SpecialSideEffectMask) HARDWARE_INTRINSIC(Sve, LoadVectorUInt16NonFaultingZeroExtendToUInt64, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ldnf1h, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_ZeroingMaskedOperation|HW_Flag_SpecialSideEffectMask) -HARDWARE_INTRINSIC(Sve, LoadVectorUInt16ZeroExtendFirstFaulting, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ldff1h, INS_sve_ldff1h, INS_sve_ldff1h, INS_sve_ldff1h, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_ZeroingMaskedOperation|HW_Flag_SpecialCodeGen|HW_Flag_SpecialSideEffectMask) +HARDWARE_INTRINSIC(Sve, LoadVectorUInt16ZeroExtendFirstFaulting, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ldff1h, INS_sve_ldff1h, INS_sve_ldff1h, INS_sve_ldff1h, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_ZeroingMaskedOperation|HW_Flag_SpecialCodeGen|HW_Flag_SpecialSideEffectMask) HARDWARE_INTRINSIC(Sve, LoadVectorUInt16ZeroExtendToInt32, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ld1h, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_ZeroingMaskedOperation) HARDWARE_INTRINSIC(Sve, LoadVectorUInt16ZeroExtendToInt64, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ld1h, INS_invalid, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_ZeroingMaskedOperation) HARDWARE_INTRINSIC(Sve, LoadVectorUInt16ZeroExtendToUInt32, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ld1h, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_ZeroingMaskedOperation) HARDWARE_INTRINSIC(Sve, LoadVectorUInt16ZeroExtendToUInt64, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ld1h, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_ZeroingMaskedOperation) HARDWARE_INTRINSIC(Sve, LoadVectorUInt32NonFaultingZeroExtendToInt64, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ldnf1w, INS_invalid, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_ZeroingMaskedOperation|HW_Flag_SpecialSideEffectMask) HARDWARE_INTRINSIC(Sve, LoadVectorUInt32NonFaultingZeroExtendToUInt64, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ldnf1w, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_ZeroingMaskedOperation|HW_Flag_SpecialSideEffectMask) -HARDWARE_INTRINSIC(Sve, LoadVectorUInt32ZeroExtendFirstFaulting, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, 
INS_sve_ldff1w, INS_sve_ldff1w, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_ZeroingMaskedOperation|HW_Flag_SpecialCodeGen|HW_Flag_SpecialSideEffectMask) +HARDWARE_INTRINSIC(Sve, LoadVectorUInt32ZeroExtendFirstFaulting, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ldff1w, INS_sve_ldff1w, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_ZeroingMaskedOperation|HW_Flag_SpecialCodeGen|HW_Flag_SpecialSideEffectMask) HARDWARE_INTRINSIC(Sve, LoadVectorUInt32ZeroExtendToInt64, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ld1w, INS_invalid, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_ZeroingMaskedOperation) HARDWARE_INTRINSIC(Sve, LoadVectorUInt32ZeroExtendToUInt64, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_ld1w, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_ZeroingMaskedOperation) HARDWARE_INTRINSIC(Sve, Max, -1, -1, {INS_sve_smax, INS_sve_umax, INS_sve_smax, INS_sve_umax, INS_sve_smax, INS_sve_umax, INS_sve_smax, INS_sve_umax, INS_sve_fmax, INS_sve_fmax}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation) @@ -219,16 +335,17 @@ HARDWARE_INTRINSIC(Sve, Min, HARDWARE_INTRINSIC(Sve, MinAcross, -1, -1, {INS_sve_sminv, INS_sve_uminv, INS_sve_sminv, INS_sve_uminv, INS_sve_sminv, INS_sve_uminv, INS_sve_sminv, INS_sve_uminv, INS_sve_fminv, INS_sve_fminv}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_ReduceOperation) HARDWARE_INTRINSIC(Sve, MinNumber, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fminnm, INS_sve_fminnm}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation) HARDWARE_INTRINSIC(Sve, MinNumberAcross, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fminnmv, INS_sve_fminnmv}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_ReduceOperation) -HARDWARE_INTRINSIC(Sve, Multiply, -1, 2, {INS_sve_mul, INS_sve_mul, INS_sve_mul, INS_sve_mul, INS_sve_mul, INS_sve_mul, INS_sve_mul, INS_sve_mul, INS_sve_fmul, INS_sve_fmul}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation) +HARDWARE_INTRINSIC(Sve, Multiply, -1, 2, {INS_sve_mul, INS_sve_mul, INS_sve_mul, INS_sve_mul, INS_sve_mul, INS_sve_mul, INS_sve_mul, INS_sve_mul, INS_sve_fmul, INS_sve_fmul}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_OptionalEmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation) HARDWARE_INTRINSIC(Sve, MultiplyAdd, -1, -1, {INS_sve_mla, INS_sve_mla, INS_sve_mla, INS_sve_mla, INS_sve_mla, INS_sve_mla, INS_sve_mla, INS_sve_mla, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation|HW_Flag_FmaIntrinsic|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(Sve, 
MultiplyAddRotateComplex, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fcmla, INS_sve_fcmla}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_HasImmediateOperand) HARDWARE_INTRINSIC(Sve, MultiplyAddRotateComplexBySelectedScalar, -1, 5, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fcmla, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_LowVectorOperation|HW_Flag_HasRMWSemantics|HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(Sve, MultiplyByScalar, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fmul, INS_sve_fmul}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_SpecialCodeGen|HW_Flag_SupportsContainment) HARDWARE_INTRINSIC(Sve, MultiplyBySelectedScalar, -1, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fmul, INS_sve_fmul}, HW_Category_SIMDByIndexedElement, HW_Flag_Scalable|HW_Flag_HasImmediateOperand|HW_Flag_LowVectorOperation) HARDWARE_INTRINSIC(Sve, MultiplyExtended, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fmulx, INS_sve_fmulx}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation) HARDWARE_INTRINSIC(Sve, MultiplySubtract, -1, -1, {INS_sve_mls, INS_sve_mls, INS_sve_mls, INS_sve_mls, INS_sve_mls, INS_sve_mls, INS_sve_mls, INS_sve_mls, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation|HW_Flag_FmaIntrinsic|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(Sve, Negate, -1, -1, {INS_sve_neg, INS_invalid, INS_sve_neg, INS_invalid, INS_sve_neg, INS_invalid, INS_sve_neg, INS_invalid, INS_sve_fneg, INS_sve_fneg}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation) -HARDWARE_INTRINSIC(Sve, Not, -1, -1, {INS_sve_not, INS_sve_not, INS_sve_not, INS_sve_not, INS_sve_not, INS_sve_not, INS_sve_not, INS_sve_not, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation) -HARDWARE_INTRINSIC(Sve, Or, -1, -1, {INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_OptionalEmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation|HW_Flag_HasAllMaskVariant) +HARDWARE_INTRINSIC(Sve, Not, -1, -1, {INS_sve_not, INS_sve_not, INS_sve_not, INS_sve_not, INS_sve_not, INS_sve_not, INS_sve_not, INS_sve_not, INS_sve_not, INS_sve_not}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation) +HARDWARE_INTRINSIC(Sve, Or, -1, -1, {INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_OptionalEmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation|HW_Flag_HasAllMaskVariant) HARDWARE_INTRINSIC(Sve, OrAcross, -1, -1, {INS_sve_orv, INS_sve_orv, INS_sve_orv, INS_sve_orv, INS_sve_orv, INS_sve_orv, INS_sve_orv, INS_sve_orv, INS_invalid, INS_invalid}, 
HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_ReduceOperation) HARDWARE_INTRINSIC(Sve, PopCount, -1, -1, {INS_sve_cnt, INS_sve_cnt, INS_sve_cnt, INS_sve_cnt, INS_sve_cnt, INS_sve_cnt, INS_sve_cnt, INS_sve_cnt, INS_sve_cnt, INS_sve_cnt}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation) HARDWARE_INTRINSIC(Sve, Prefetch16Bit, -1, 3, {INS_invalid, INS_invalid, INS_invalid, INS_sve_prfh, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_BaseTypeFromFirstArg|HW_Flag_HasImmediateOperand|HW_Flag_SpecialSideEffect_Other) @@ -270,9 +387,12 @@ HARDWARE_INTRINSIC(Sve, Scatter8BitNarrowing, HARDWARE_INTRINSIC(Sve, Scatter8BitWithByteOffsetsNarrowing, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_st1b, INS_sve_st1b, INS_sve_st1b, INS_sve_st1b, INS_invalid, INS_invalid}, HW_Category_MemoryStore, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation) HARDWARE_INTRINSIC(Sve, SetFfr, -1, 1, {INS_sve_wrffr, INS_sve_wrffr, INS_sve_wrffr, INS_sve_wrffr, INS_sve_wrffr, INS_sve_wrffr, INS_sve_wrffr, INS_sve_wrffr, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialSideEffect_Other|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(Sve, ShiftLeftLogical, -1, -1, {INS_sve_lsl, INS_sve_lsl, INS_sve_lsl, INS_sve_lsl, INS_sve_lsl, INS_sve_lsl, INS_sve_lsl, INS_sve_lsl, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_HasRMWSemantics) +HARDWARE_INTRINSIC(Sve, ShiftLeftLogicalImm, -1, -1, {INS_sve_lsl, INS_sve_lsl, INS_sve_lsl, INS_sve_lsl, INS_sve_lsl, INS_sve_lsl, INS_sve_lsl, INS_sve_lsl, INS_invalid, INS_invalid}, HW_Category_ShiftLeftByImmediate, HW_Flag_Scalable|HW_Flag_HasImmediateOperand) HARDWARE_INTRINSIC(Sve, ShiftRightArithmetic, -1, -1, {INS_sve_asr, INS_invalid, INS_sve_asr, INS_invalid, INS_sve_asr, INS_invalid, INS_sve_asr, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_HasRMWSemantics) HARDWARE_INTRINSIC(Sve, ShiftRightArithmeticForDivide, -1, -1, {INS_sve_asrd, INS_invalid, INS_sve_asrd, INS_invalid, INS_sve_asrd, INS_invalid, INS_sve_asrd, INS_invalid, INS_invalid, INS_invalid}, HW_Category_ShiftRightByImmediate, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_HasImmediateOperand) +HARDWARE_INTRINSIC(Sve, ShiftRightArithmeticImm, -1, -1, {INS_sve_asr, INS_invalid, INS_sve_asr, INS_invalid, INS_sve_asr, INS_invalid, INS_sve_asr, INS_invalid, INS_invalid, INS_invalid}, HW_Category_ShiftRightByImmediate, HW_Flag_Scalable|HW_Flag_HasImmediateOperand) HARDWARE_INTRINSIC(Sve, ShiftRightLogical, -1, -1, {INS_invalid, INS_sve_lsr, INS_invalid, INS_sve_lsr, INS_invalid, INS_sve_lsr, INS_invalid, INS_sve_lsr, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_HasRMWSemantics) +HARDWARE_INTRINSIC(Sve, ShiftRightLogicalImm, -1, -1, {INS_invalid, INS_sve_lsr, 
INS_invalid, INS_sve_lsr, INS_invalid, INS_sve_lsr, INS_invalid, INS_sve_lsr, INS_invalid, INS_invalid}, HW_Category_ShiftRightByImmediate, HW_Flag_Scalable|HW_Flag_HasImmediateOperand) HARDWARE_INTRINSIC(Sve, SignExtend16, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_sxth, INS_invalid, INS_sve_sxth, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation) HARDWARE_INTRINSIC(Sve, SignExtend32, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_sxtw, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation) HARDWARE_INTRINSIC(Sve, SignExtend8, -1, -1, {INS_invalid, INS_invalid, INS_sve_sxtb, INS_invalid, INS_sve_sxtb, INS_invalid, INS_sve_sxtb, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation) @@ -296,7 +416,7 @@ HARDWARE_INTRINSIC(Sve, TrigonometricStartingValue, HARDWARE_INTRINSIC(Sve, UnzipEven, -1, 2, {INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1, INS_sve_uzp1}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_HasAllMaskVariant) HARDWARE_INTRINSIC(Sve, UnzipOdd, -1, 2, {INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2, INS_sve_uzp2}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_HasAllMaskVariant) HARDWARE_INTRINSIC(Sve, VectorTableLookup, -1, 2, {INS_sve_tbl, INS_sve_tbl, INS_sve_tbl, INS_sve_tbl, INS_sve_tbl, INS_sve_tbl, INS_sve_tbl, INS_sve_tbl, INS_sve_tbl, INS_sve_tbl}, HW_Category_SIMD, HW_Flag_Scalable) -HARDWARE_INTRINSIC(Sve, Xor, -1, -1, {INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_OptionalEmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation|HW_Flag_HasAllMaskVariant) +HARDWARE_INTRINSIC(Sve, Xor, -1, -1, {INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_OptionalEmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation|HW_Flag_HasAllMaskVariant) HARDWARE_INTRINSIC(Sve, XorAcross, -1, -1, {INS_sve_eorv, INS_sve_eorv, INS_sve_eorv, INS_sve_eorv, INS_sve_eorv, INS_sve_eorv, INS_sve_eorv, INS_sve_eorv, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_ReduceOperation) HARDWARE_INTRINSIC(Sve, ZeroExtend16, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_uxth, INS_invalid, INS_sve_uxth, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation) HARDWARE_INTRINSIC(Sve, ZeroExtend32, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_uxtw, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation) @@ -321,7 +441,7 @@ HARDWARE_INTRINSIC(Sve2, AbsoluteDifferenceWideningUpper, HARDWARE_INTRINSIC(Sve2, AddCarryWideningLower, -1, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_adclb, 
INS_invalid, INS_sve_adclb, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_HasRMWSemantics|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(Sve2, AddCarryWideningUpper, -1, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_adclt, INS_invalid, INS_sve_adclt, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_HasRMWSemantics|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(Sve2, BitwiseClearXor, -1, 3, {INS_sve_bcax, INS_sve_bcax, INS_sve_bcax, INS_sve_bcax, INS_sve_bcax, INS_sve_bcax, INS_sve_bcax, INS_sve_bcax, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_HasRMWSemantics) -HARDWARE_INTRINSIC(Sve2, BitwiseSelect, -1, 3, {INS_sve_bsl, INS_sve_bsl, INS_sve_bsl, INS_sve_bsl, INS_sve_bsl, INS_sve_bsl, INS_sve_bsl, INS_sve_bsl, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_HasRMWSemantics) +HARDWARE_INTRINSIC(Sve2, BitwiseSelect, -1, 3, {INS_sve_bsl, INS_sve_bsl, INS_sve_bsl, INS_sve_bsl, INS_sve_bsl, INS_sve_bsl, INS_sve_bsl, INS_sve_bsl, INS_sve_bsl, INS_sve_bsl}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_HasRMWSemantics) HARDWARE_INTRINSIC(Sve2, BitwiseSelectLeftInverted, -1, 3, {INS_sve_bsl1n, INS_sve_bsl1n, INS_sve_bsl1n, INS_sve_bsl1n, INS_sve_bsl1n, INS_sve_bsl1n, INS_sve_bsl1n, INS_sve_bsl1n, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_HasRMWSemantics) HARDWARE_INTRINSIC(Sve2, BitwiseSelectRightInverted, -1, 3, {INS_sve_bsl2n, INS_sve_bsl2n, INS_sve_bsl2n, INS_sve_bsl2n, INS_sve_bsl2n, INS_sve_bsl2n, INS_sve_bsl2n, INS_sve_bsl2n, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_HasRMWSemantics) HARDWARE_INTRINSIC(Sve2, InterleavingXorEvenOdd, -1, 3, {INS_sve_eorbt, INS_sve_eorbt, INS_sve_eorbt, INS_sve_eorbt, INS_sve_eorbt, INS_sve_eorbt, INS_sve_eorbt, INS_sve_eorbt, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_HasRMWSemantics) @@ -365,8 +485,8 @@ HARDWARE_INTRINSIC(Sve, StoreAndZipx4, // Predicate variants of intrinsics, these are specialized for operating on TYP_MASK type values. 
HARDWARE_INTRINSIC(Sve, And_Predicates, -1, 2, {INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask|HW_Flag_EmbeddedMaskedOperation|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(Sve, BitwiseClear_Predicates, -1, 2, {INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask|HW_Flag_EmbeddedMaskedOperation|HW_Flag_SpecialCodeGen) -HARDWARE_INTRINSIC(Sve, Or_Predicates, -1, 2, {INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask|HW_Flag_EmbeddedMaskedOperation|HW_Flag_SpecialCodeGen) -HARDWARE_INTRINSIC(Sve, Xor_Predicates, -1, 2, {INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask|HW_Flag_EmbeddedMaskedOperation|HW_Flag_SpecialCodeGen) +HARDWARE_INTRINSIC(Sve, Or_Predicates, -1, 2, {INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask|HW_Flag_EmbeddedMaskedOperation|HW_Flag_SpecialCodeGen) +HARDWARE_INTRINSIC(Sve, Xor_Predicates, -1, 2, {INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor, INS_sve_eor}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask|HW_Flag_EmbeddedMaskedOperation|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(Sve, ConditionalSelect_Predicates, -1, 3, {INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel, INS_sve_sel}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask|HW_Flag_ExplicitMaskedOperation|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(Sve, ZipHigh_Predicates, -1, 2, {INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2, INS_sve_zip2}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(Sve, ZipLow_Predicates, -1, 2, {INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1, INS_sve_zip1}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ReturnsPerElementMask) diff --git a/src/coreclr/jit/importer.cpp b/src/coreclr/jit/importer.cpp index 660ddae95cc46f..5d078678d404f1 100644 --- a/src/coreclr/jit/importer.cpp +++ b/src/coreclr/jit/importer.cpp @@ -3910,6 +3910,13 @@ GenTree* Compiler::impImportStaticReadOnlyField(CORINFO_FIELD_HANDLE field, CORI } else #endif // TARGET_XARCH +#ifdef TARGET_ARM64 + if (UseSveForType(simdType)) + { + hwAccelerated = compOpportunisticallyDependsOn(InstructionSet_Sve); + } + else +#endif // TARGET_ARM64 { // SIMD8, SIMD12, SIMD16 are covered by baseline ISA requirement assert((simdType == TYP_SIMD8) || (simdType == TYP_SIMD12) || (simdType == TYP_SIMD16)); diff --git a/src/coreclr/jit/importercalls.cpp b/src/coreclr/jit/importercalls.cpp index 87d01292ac0746..7414552eb168b7 100644 --- a/src/coreclr/jit/importercalls.cpp +++ b/src/coreclr/jit/importercalls.cpp @@ -3253,7 +3253,7 @@ GenTree* 
Compiler::impIntrinsic(CORINFO_CLASS_HANDLE clsHnd, // handled by the AltJit, so limit only the platform specific intrinsics assert((LAST_NI_Vector128 + 1) == FIRST_NI_AdvSimd); - if (ni < LAST_NI_Vector128) + if ((ni < LAST_NI_Vector128) || ((ni >= FIRST_NI_Vector) && (ni < LAST_NI_Vector))) #else #error Unsupported platform #endif @@ -4952,11 +4952,12 @@ GenTree* Compiler::impIntrinsic(CORINFO_CLASS_HANDLE clsHnd, if (isNative) { assert(!isMagnitude && !isNumber); - retNode = gtNewSimdMinMaxNativeNode(callType, op1, op2, callJitType, 0, isMax); + retNode = gtNewSimdMinMaxNativeNode(callType, op1, op2, callJitType, 0, isMax ARM64_ARG(false)); } else { - retNode = gtNewSimdMinMaxNode(callType, op1, op2, callJitType, 0, isMax, isMagnitude, isNumber); + retNode = gtNewSimdMinMaxNode(callType, op1, op2, callJitType, 0, isMax, isMagnitude, + isNumber ARM64_ARG(false)); } #endif // FEATURE_HW_INTRINSICS @@ -10378,38 +10379,46 @@ NamedIntrinsic Compiler::lookupNamedIntrinsic(CORINFO_METHOD_HANDLE method) uint32_t size = getVectorTByteLength(); assert((size == 16) || (size == 32) || (size == 64)); + bool useSizeAgnosticVector = false; +#ifdef TARGET_ARM64 + useSizeAgnosticVector = compExactlyDependsOn(InstructionSet_Sve) && UseSveForVectorT(); +#endif const char* lookupClassName = className; - switch (size) + if (!useSizeAgnosticVector) { - case 16: + switch (size) { - lookupClassName = isVectorT ? "Vector128`1" : "Vector128"; - break; - } + case 16: + { + lookupClassName = isVectorT ? "Vector128`1" : "Vector128"; + break; + } - case 32: - { - lookupClassName = isVectorT ? "Vector256`1" : "Vector256"; - break; - } + case 32: + { + lookupClassName = isVectorT ? "Vector256`1" : "Vector256"; + break; + } - case 64: - { - lookupClassName = isVectorT ? "Vector512`1" : "Vector512"; - break; - } + case 64: + { + lookupClassName = isVectorT ? "Vector512`1" : "Vector512"; + break; + } - default: - { - unreached(); + default: + { + unreached(); + } } } const char* lookupMethodName = methodName; - if ((strncmp(methodName, "As", 2) == 0) && (methodName[2] != '\0')) + if (!useSizeAgnosticVector && + ((strncmp(methodName, "As", 2) == 0) && (methodName[2] != '\0'))) { if (strncmp(methodName + 2, "Vector", 6) == 0) { diff --git a/src/coreclr/jit/importervectorization.cpp b/src/coreclr/jit/importervectorization.cpp index 0f9ac5643e9a92..eeeb04e57d3fff 100644 --- a/src/coreclr/jit/importervectorization.cpp +++ b/src/coreclr/jit/importervectorization.cpp @@ -98,13 +98,15 @@ GenTree* Compiler::impExpandHalfConstEquals( #ifdef FEATURE_HW_INTRINSICS if (varTypeIsSIMD(type)) { - return gtNewSimdBinOpNode(oper, type, op1, op2, CORINFO_TYPE_NATIVEUINT, genTypeSize(type)); + return gtNewSimdBinOpNode(oper, type, op1, op2, CORINFO_TYPE_NATIVEUINT, + genTypeSize(type) ARM64_ARG(false)); } if (varTypeIsSIMD(op1)) { // E.g. 
a comparison of SIMD ops returning TYP_INT; assert(varTypeIsSIMD(op2)); - return gtNewSimdCmpOpAllNode(oper, type, op1, op2, CORINFO_TYPE_NATIVEUINT, genTypeSize(op1)); + return gtNewSimdCmpOpAllNode(oper, type, op1, op2, CORINFO_TYPE_NATIVEUINT, + genTypeSize(op1) ARM64_ARG(false)); } #endif return gtNewOperNode(oper, type, op1, op2); diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index d41a7607c22ed2..86fdf213331eed 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -581,6 +581,9 @@ void CodeGen::inst_Mov(var_types dstType, #ifdef TARGET_ARM GetEmitter()->emitIns_Mov(ins, size, dstReg, srcReg, canSkip, flags); +#elif defined(TARGET_ARM64) + bool isScalable = (size == EA_SCALABLE) || (Compiler::UseStrictSveForType(dstType)); + GetEmitter()->emitIns_Mov(ins, size, dstReg, srcReg, canSkip, isScalable ? INS_OPTS_SCALABLE_B : INS_OPTS_NONE); #else GetEmitter()->emitIns_Mov(ins, size, dstReg, srcReg, canSkip); #endif @@ -1932,6 +1935,12 @@ instruction CodeGenInterface::ins_Load(var_types srcType, bool aligned /*=false* return INS_sve_ldr; #endif } +#ifdef TARGET_ARM64 + else if (Compiler::UseStrictSveForType(srcType)) + { + return INS_sve_ldr; + } +#endif // TARGET_ARM64 #endif // FEATURE_MASKED_HW_INTRINSICS assert(varTypeUsesFloatReg(srcType)); @@ -2020,6 +2029,12 @@ instruction CodeGen::ins_Copy(var_types dstType) return INS_sve_mov; #endif } +#ifdef TARGET_ARM64 + else if (Compiler::UseStrictSveForType(dstType)) + { + return INS_sve_mov; + } +#endif // TARGET_ARM64 #endif // FEATURE_MASKED_HW_INTRINSICS assert(varTypeUsesFloatReg(dstType)); @@ -2143,6 +2158,12 @@ instruction CodeGen::ins_Copy(regNumber srcReg, var_types dstType) return INS_sve_mov; #endif } +#ifdef TARGET_ARM64 + else if (Compiler::UseStrictSveForType(dstType)) + { + return INS_sve_mov; + } +#endif #endif // FEATURE_MASKED_HW_INTRINSICS assert(varTypeUsesFloatReg(dstType)); @@ -2256,6 +2277,12 @@ instruction CodeGenInterface::ins_Store(var_types dstType, bool aligned /*=false return INS_sve_str; #endif } +#ifdef TARGET_ARM64 + else if (Compiler::UseStrictSveForType(dstType)) + { + return INS_sve_str; + } +#endif // TARGET_ARM64 #endif // FEATURE_MASKED_HW_INTRINSICS assert(varTypeUsesFloatReg(dstType)); diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h index 4c6a30de6bc50b..fc321ae6dbdfbf 100644 --- a/src/coreclr/jit/jitconfigvalues.h +++ b/src/coreclr/jit/jitconfigvalues.h @@ -383,6 +383,9 @@ CONFIG_INTEGER(JitStressPromotedEvexEncoding, "JitStressPromotedEvexEncoding", 0 CONFIG_INTEGER(JitStressEvexEncoding, "JitStressEvexEncoding", 0) #endif +#if defined(TARGET_ARM64) +CONFIG_INTEGER(UseSveForVectorT, "UseSveForVectorT", 1) // Prefer SVE instructions for VectorT +#endif // // Hardware Intrinsic ISAs; keep in sync with clrconfigvalues.h // diff --git a/src/coreclr/jit/lclmorph.cpp b/src/coreclr/jit/lclmorph.cpp index 7366dfe9bce1fd..11e320c4bc822a 100644 --- a/src/coreclr/jit/lclmorph.cpp +++ b/src/coreclr/jit/lclmorph.cpp @@ -1698,8 +1698,9 @@ class LocalAddressVisitor final : public GenTreeVisitor { // Handle case 1 or the float field of case 2 GenTree* indexNode = m_compiler->gtNewIconNode(offset / genTypeSize(elementType)); - hwiNode = m_compiler->gtNewSimdGetElementNode(elementType, lclNode, indexNode, - CORINFO_TYPE_FLOAT, genTypeSize(varDsc)); + hwiNode = + m_compiler->gtNewSimdGetElementNode(elementType, lclNode, indexNode, CORINFO_TYPE_FLOAT, + genTypeSize(varDsc) ARM64_ARG(false)); break; } @@ -1777,7 +1778,7 @@ class 
LocalAddressVisitor final : public GenTreeVisitor GenTree* indexNode1 = m_compiler->gtNewIconNode(3, TYP_INT); simdLclNode = m_compiler->gtNewSimdGetElementNode(TYP_FLOAT, simdLclNode, indexNode1, - CORINFO_TYPE_FLOAT, 16); + CORINFO_TYPE_FLOAT, 16 ARM64_ARG(false)); GenTree* indexNode2 = m_compiler->gtNewIconNode(3, TYP_INT); hwiNode = m_compiler->gtNewSimdWithElementNode(TYP_SIMD16, elementNode, indexNode2, simdLclNode, diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp index 246cccee168018..90966494a8221c 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -3128,10 +3128,10 @@ void Compiler::lvaSortByRefCount() case TYP_SIMD8: case TYP_SIMD12: case TYP_SIMD16: -#ifdef TARGET_XARCH +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) case TYP_SIMD32: case TYP_SIMD64: -#endif // TARGET_XARCH +#endif // TARGET_XARCH || TARGET_ARM64 #ifdef FEATURE_MASKED_HW_INTRINSICS case TYP_MASK: #endif // FEATURE_MASKED_HW_INTRINSICS diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index e1025850ef7c8f..6627c96aa09611 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -2695,7 +2695,7 @@ bool Lowering::LowerCallMemcmp(GenTreeCall* call, GenTree** next) loadWidth = 16; loadType = TYP_SIMD16; } -#ifdef TARGET_XARCH +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) else if ((loadWidth == 32) || (MaxUnrollSize == 64)) { loadWidth = 32; @@ -2706,7 +2706,7 @@ bool Lowering::LowerCallMemcmp(GenTreeCall* call, GenTree** next) loadWidth = 64; loadType = TYP_SIMD64; } -#endif // TARGET_XARCH +#endif // TARGET_XARCH || TARGET_ARM64 #endif // FEATURE_SIMD else { @@ -2725,10 +2725,10 @@ bool Lowering::LowerCallMemcmp(GenTreeCall* call, GenTree** next) { assert(type == TYP_INT); return comp->gtNewSimdCmpOpAllNode(oper, TYP_INT, op1, op2, CORINFO_TYPE_NATIVEUINT, - genTypeSize(op1)); + genTypeSize(op1) ARM64_ARG(false)); } return comp->gtNewSimdBinOpNode(oper, op1->TypeGet(), op1, op2, CORINFO_TYPE_NATIVEUINT, - genTypeSize(op1)); + genTypeSize(op1) ARM64_ARG(false)); } #endif return comp->gtNewOperNode(oper, type, op1, op2); @@ -10311,10 +10311,9 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeIndir* ind) case TYP_SIMD16: tryReusingPrevValue = true; break; - -#endif // TARGET_ARM64 -#endif // FEATURE_HW_INTRINSICS -#endif // TARGET_64BIT +#endif // TARGET_AMD64 +#endif // FEATURE_HW_INTRINSICS +#endif // TARGET_64BIT // TYP_FLOAT and TYP_DOUBLE aren't needed here - they're expected to // be converted to TYP_INT/TYP_LONG for constant value. 
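Many of the call sites changed above and below gain a trailing ARM64_ARG(false) or ARM64_ARG(isScalable) argument. The snippet below is a minimal, self-contained sketch of that conditional-argument macro pattern, assuming a definition in the style of the JIT's existing per-target argument macros; the macro and the gtNewSimd* signatures actually used by the JIT live in its headers and may differ in detail.

#include <cstdio>

// Assumed definition (illustrative only): the extra argument exists only in arm64 builds.
#ifdef TARGET_ARM64
#define ARM64_ARG(x) , x // arm64 builds see the trailing argument
#else
#define ARM64_ARG(x)     // other targets compile as if it were never there
#endif

// Stand-in for a gtNewSimd* helper: the trailing "isScalable" parameter is arm64-only,
// so shared call sites can be annotated once and still build on every target.
static unsigned NewSimdNodeSize(unsigned simdSize ARM64_ARG(bool isScalable))
{
#ifdef TARGET_ARM64
    // In this stand-in, pretend a scalable node reports size 0.
    return isScalable ? 0 : simdSize;
#else
    return simdSize;
#endif
}

int main()
{
    // Same call shape as in the diff: genTypeSize(type) ARM64_ARG(false)
    printf("%u\n", NewSimdNodeSize(16 ARM64_ARG(false)));
    return 0;
}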
diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index b012d8cacb269a..1fda2528fdb66a 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -435,6 +435,9 @@ class Lowering final : public Phase #ifdef FEATURE_HW_INTRINSICS GenTree* LowerHWIntrinsic(GenTreeHWIntrinsic* node); void LowerHWIntrinsicCC(GenTreeHWIntrinsic* node, NamedIntrinsic newIntrinsicId, GenCondition condition); +#if defined(TARGET_ARM64) + GenTree* LowerHWIntrinsicCmpOpVL(GenTreeHWIntrinsic* node, genTreeOps cmpOp); +#endif GenTree* LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cmpOp); GenTree* LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node); GenTree* LowerHWIntrinsicDot(GenTreeHWIntrinsic* node); diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index 1d69b329e760bd..4003682a9aa98b 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ b/src/coreclr/jit/lowerarmarch.cpp @@ -1613,6 +1613,10 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) switch (intrinsicId) { +#ifdef TARGET_ARM64 + // TODO-VL: Remove this entry because this is not handled properly inside LowerHWIntrinsicCreate + case NI_Vector_Create: +#endif case NI_Vector64_Create: case NI_Vector128_Create: case NI_Vector64_CreateScalar: @@ -1755,6 +1759,16 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) break; } + case NI_Vector_op_Equality: + { + return LowerHWIntrinsicCmpOpVL(node, GT_EQ); + } + + case NI_Vector_op_Inequality: + { + return LowerHWIntrinsicCmpOpVL(node, GT_NE); + } + case NI_Vector64_op_Equality: case NI_Vector128_op_Equality: { @@ -1971,7 +1985,7 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) var_types simdType = Compiler::getSIMDTypeForSize(simdSize); bool foundUse = BlockRange().TryGetUse(node, &use); - GenTree* trueMask = comp->gtNewSimdAllTrueMaskNode(simdBaseJitType); + GenTree* trueMask = comp->gtNewSimdAllTrueMaskNode(simdBaseJitType, simdSize); GenTree* falseVal = comp->gtNewZeroConNode(simdType); var_types nodeType = simdType; @@ -2041,6 +2055,171 @@ bool Lowering::IsValidConstForMovImm(GenTreeHWIntrinsic* node) return false; } +//---------------------------------------------------------------------------------------------- +// Lowering::LowerHWIntrinsicCmpOpVL: Lowers a Vector comparison intrinsic +// +// Arguments: +// node - The hardware intrinsic node. 
+// cmpOp - The comparison operation, currently must be GT_EQ or GT_NE +// +GenTree* Lowering::LowerHWIntrinsicCmpOpVL(GenTreeHWIntrinsic* node, genTreeOps cmpOp) +{ + NamedIntrinsic intrinsicId = node->GetHWIntrinsicId(); + CorInfoType simdBaseJitType = node->GetSimdBaseJitType(); + var_types simdBaseType = node->GetSimdBaseType(); + unsigned simdSize = node->GetSimdSize(); + var_types simdType = Compiler::getSIMDTypeForSize(simdSize); + assert(Compiler::UseSveForType(simdType)); + + assert((intrinsicId == NI_Vector_op_Equality) || (intrinsicId == NI_Vector_op_Inequality)); + + assert(varTypeIsSIMD(simdType)); + assert(varTypeIsArithmetic(simdBaseType)); + assert(simdSize != 0); + assert(node->TypeIs(TYP_INT)); + assert((cmpOp == GT_EQ) || (cmpOp == GT_NE)); + + // We have the following (with the appropriate simd size and where the intrinsic could be op_Inequality): + // /--* op2 mask + // /--* op1 mask + // node = * HWINTRINSIC simd T op_Equality + + GenTree* op1 = node->Op(1); + GenTree* op2 = node->Op(2); + + // Optimize comparison against Vector.Zero via CNTP: + // + // bool eq = v == Vector.Zero + // + // to: + // + // bool eq = Sve.GetActiveElementCount(v) == 0; + // + + GenTree* op = nullptr; + GenTree* opZero = nullptr; + if (op1->IsFalseMask()) + { + op = op2; + opZero = op1; + } + else if (op2->IsFalseMask()) + { + op = op1; + opZero = op2; + } + + // Currently only `some == Vector.Zero` is handled + if (op != nullptr) + { + + NamedIntrinsic elementCountIntrinsicId; + int elementsCnt = 0; + switch (simdBaseType) + { + case TYP_BYTE: + case TYP_UBYTE: + elementCountIntrinsicId = NI_Sve_Count8BitElements; + elementsCnt = simdSize; + break; + case TYP_SHORT: + case TYP_USHORT: + elementCountIntrinsicId = NI_Sve_Count16BitElements; + elementsCnt = simdSize / 2; + break; + case TYP_INT: + case TYP_UINT: + case TYP_FLOAT: + elementCountIntrinsicId = NI_Sve_Count32BitElements; + elementsCnt = simdSize / 4; + break; + case TYP_LONG: + case TYP_ULONG: + case TYP_DOUBLE: + elementCountIntrinsicId = NI_Sve_Count64BitElements; + elementsCnt = simdSize / 8; + break; + default: + unreached(); + } + + GenTree* cntNode; + + if (cmpOp == GT_EQ) + { + if (comp->IsTargetAbi(CORINFO_NATIVEAOT_ABI)) + { + GenTree* svePattern = comp->gtNewIconNode(31, TYP_LONG); + BlockRange().InsertBefore(node, svePattern); + + cntNode = comp->gtNewSimdHWIntrinsicNode(TYP_LONG, svePattern, elementCountIntrinsicId, + CORINFO_TYPE_LONG, simdSize); + } + else + { + cntNode = comp->gtNewIconNode(elementsCnt, TYP_LONG); + } + } + else + { + // For inequality, we need to just check if all lanes are 0 + cntNode = comp->gtNewIconNode(0, TYP_LONG); + } + + BlockRange().InsertBefore(node, cntNode); + BlockRange().Remove(opZero); + + LowerNode(cntNode); + + node->ChangeOper(cmpOp); + node->gtType = TYP_INT; + node->AsOp()->gtOp1 = op1; + node->AsOp()->gtOp2 = cntNode; + + LowerNodeCC(node, (cmpOp == GT_EQ) ? 
GenCondition::EQ : GenCondition::NE); + + node->gtType = TYP_VOID; + node->ClearUnusedValue(); + LowerNode(node); + return node->gtNext; + } + + GenTree* cmp = comp->gtNewSimdHWIntrinsicNode(simdType, op1, op2, NI_Sve_CompareEqual, simdBaseJitType, simdSize); + BlockRange().InsertBefore(node, cmp); + + // Save cmp into a temp as we're going to need to pass it GetActiveElementCount + node->Op(1) = cmp; + LIR::Use tmp1Use(BlockRange(), &node->Op(1), node); + ReplaceWithLclVar(tmp1Use); + GenTree* cmpResult = node->Op(1); + LowerNode(cmpResult); + + GenTree* allTrue = comp->gtNewSimdAllTrueMaskNode(simdBaseJitType, simdSize); + GenTree* activeElemCnt = comp->gtNewSimdHWIntrinsicNode(TYP_LONG, allTrue, cmpResult, NI_Sve_GetActiveElementCount, + simdBaseJitType, simdSize); + GenTree* cntNode = comp->gtNewIconNode(0, TYP_LONG); + BlockRange().InsertBefore(node, allTrue); + BlockRange().InsertBefore(node, activeElemCnt); + BlockRange().InsertBefore(node, cntNode); + + LowerNode(activeElemCnt); + LowerNode(cntNode); + + LowerNode(cmp); + + node->ChangeOper(cmpOp); + node->gtType = TYP_INT; + node->AsOp()->gtOp1 = activeElemCnt; + node->AsOp()->gtOp2 = cntNode; + + LowerNodeCC(node, (cmpOp == GT_EQ) ? GenCondition::EQ : GenCondition::NE); + + node->gtType = TYP_VOID; + node->ClearUnusedValue(); + LowerNode(node); + return node->gtNext; +} + //---------------------------------------------------------------------------------------------- // Lowering::LowerHWIntrinsicCmpOp: Lowers a Vector128 or Vector256 comparison intrinsic // @@ -4092,6 +4271,59 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) MakeSrcContained(node, intrin.op5); } break; + case NI_Sve_DuplicateScalarToVector: + assert(!hasImmediateOperand); + if (intrin.op1->IsCnsIntOrI()) + { + ssize_t iconValue = intrin.op1->AsIntCon()->IconValue(); + if (emitter::isValidSimm<8>(iconValue) || emitter::isValidSimm_MultipleOf<8, 256>(iconValue)) + { + MakeSrcContained(node, intrin.op1); + } + } + break; + case NI_Sve_Index: + { + assert(!hasImmediateOperand); + assert(varTypeIsIntegral(intrin.op1)); + assert(varTypeIsIntegral(intrin.op2)); + if (intrin.op1->IsCnsIntOrI() && emitter::isValidSimm<5>(intrin.op1->AsIntCon()->IconValue())) + { + MakeSrcContained(node, intrin.op1); + } + if (intrin.op2->IsCnsIntOrI() && emitter::isValidSimm<5>(intrin.op2->AsIntCon()->IconValue())) + { + MakeSrcContained(node, intrin.op2); + } + break; + } + case NI_Sve_ShiftLeftLogicalImm: + { + assert(!hasImmediateOperand); + if (intrin.op2->IsCnsIntOrI() && + emitter::isValidVectorShiftAmount(intrin.op2->AsIntCon()->IconValue(), + emitTypeSize(intrin.baseType), false)) + { + MakeSrcContained(node, intrin.op2); + } + break; + } + case NI_Sve_ShiftRightArithmeticImm: + case NI_Sve_ShiftRightLogicalImm: + { + assert(!hasImmediateOperand); + if (intrin.op2->IsCnsIntOrI() && emitter::isValidVectorShiftAmount(intrin.op2->AsIntCon()->IconValue(), + emitTypeSize(intrin.baseType), true)) + { + MakeSrcContained(node, intrin.op2); + } + break; + } + case NI_Sve_MultiplyByScalar: + { + MakeSrcContained(node, intrin.op2); + break; + } default: unreached(); diff --git a/src/coreclr/jit/lsra.cpp b/src/coreclr/jit/lsra.cpp index 915019445fe619..0bf8cefdd3d71e 100644 --- a/src/coreclr/jit/lsra.cpp +++ b/src/coreclr/jit/lsra.cpp @@ -1657,10 +1657,10 @@ bool LinearScan::isRegCandidate(LclVarDsc* varDsc) case TYP_SIMD8: case TYP_SIMD12: case TYP_SIMD16: -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) case TYP_SIMD32: case TYP_SIMD64: 
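// ----------------------------------------------------------------------------------------------
// Illustrative sketch (not from the PR; stand-in names): the Count{8,16,32,64}BitElements switch
// in LowerHWIntrinsicCmpOpVL above pairs each base type with a constant lane count, and that
// constant is simply simdSize divided by the element size. A standalone illustration of the
// arithmetic, without the JIT's own var_types/genTypeSize machinery:
#include <cassert>
#include <cstdio>

static int ExpectedLaneCount(unsigned simdSize, unsigned elementByteSize)
{
    // Mirrors the switch: bytes -> simdSize, shorts -> simdSize / 2,
    // ints/floats -> simdSize / 4, longs/doubles -> simdSize / 8.
    assert((elementByteSize == 1) || (elementByteSize == 2) || (elementByteSize == 4) ||
           (elementByteSize == 8));
    return (int)(simdSize / elementByteSize);
}

int main()
{
    // For a 32-byte (256-bit) vector: 8 int lanes, 4 double lanes; for 64 bytes: 16 and 8.
    printf("%d %d %d %d\n", ExpectedLaneCount(32, 4), ExpectedLaneCount(32, 8),
           ExpectedLaneCount(64, 4), ExpectedLaneCount(64, 8));
    return 0;
}
// ----------------------------------------------------------------------------------------------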
-#endif // TARGET_XARCH +#endif // TARGET_XARCH || TARGET_ARM64 #ifdef FEATURE_MASKED_HW_INTRINSICS case TYP_MASK: #endif // FEATURE_MASKED_HW_INTRINSICS @@ -6036,6 +6036,14 @@ void LinearScan::allocateRegisters() allocate = false; lclVarInterval->isPartiallySpilled = true; } +#elif defined(TARGET_ARM64) + else if (Compiler::UseStrictSveForType(lclVarInterval->registerType)) + { + // TODO-VL: Need to do this for allocateRegistersMinimal too? + allocate = false; + lclVarInterval->isPartiallySpilled = true; + setIntervalAsSpilled(currentInterval); // Just mark it spill at this point. + } #endif // TARGET_XARCH else { @@ -6048,6 +6056,13 @@ void LinearScan::allocateRegisters() if (lclVarInterval->isPartiallySpilled) { lclVarInterval->isPartiallySpilled = false; +#if defined(TARGET_ARM64) + if (Compiler::UseStrictSveForType(lclVarInterval->registerType)) + { + // TODO-VL: Need to do this for allocateRegistersMinimal too? + allocate = false; + } +#endif // TARGET_ARM64 } else { @@ -7524,8 +7539,9 @@ void LinearScan::insertUpperVectorSave(GenTree* tree, // while on x86 we can spill directly to memory. regNumber spillReg = refPosition->assignedReg(); #ifdef TARGET_ARM64 - bool spillToMem = refPosition->spillAfter; - assert(spillReg != REG_NA); + bool isVariableVL = Compiler::UseStrictSveForType(varDsc->TypeGet()); + bool spillToMem = refPosition->spillAfter || isVariableVL; + assert((spillReg != REG_NA) || isVariableVL); #else bool spillToMem = (spillReg == REG_NA); assert(!refPosition->spillAfter); @@ -7626,7 +7642,7 @@ void LinearScan::insertUpperVectorRestore(GenTree* tree, simdUpperRestore->gtFlags |= GTF_NOREG_AT_USE; #else simdUpperRestore->gtFlags |= GTF_SPILLED; - assert(refPosition->assignedReg() != REG_NA); + assert((refPosition->assignedReg() != REG_NA) || (Compiler::UseStrictSveForType(restoreLcl->TypeGet()))); restoreReg = refPosition->assignedReg(); #endif } @@ -10760,7 +10776,14 @@ void LinearScan::lsraDispNode(GenTree* tree, LsraTupleDumpMode mode, bool hasDes { if (mode == LinearScan::LSRA_DUMP_POST && tree->gtFlags & GTF_SPILLED) { + +#ifdef TARGET_ARM64 + // TODO-VL: Evaluate this + assert(tree->gtHasReg(compiler) || + (tree->OperIs(GT_INTRINSIC) && (tree->AsIntrinsic()->gtIntrinsicName == NI_SIMD_UpperRestore))); +#else assert(tree->gtHasReg(compiler)); +#endif } lsraGetOperandString(tree, mode, operandString, operandStringLength); printf("%-15s =", operandString); diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp index 7cc1a231391c71..d94f4607811df7 100644 --- a/src/coreclr/jit/lsraarm64.cpp +++ b/src/coreclr/jit/lsraarm64.cpp @@ -1736,6 +1736,9 @@ void LinearScan::BuildHWIntrinsicImmediate(GenTreeHWIntrinsic* intrinsicTree, co case NI_Sve_MultiplyAddRotateComplex: needBranchTargetReg = !intrin.op4->isContainedIntOrIImmed(); break; + case NI_Sve_DuplicateScalarToVector: + needBranchTargetReg = !intrin.op1->isContainedIntOrIImmed(); + break; default: unreached(); @@ -2166,6 +2169,7 @@ SingleTypeRegSet LinearScan::getOperandCandidates(GenTreeHWIntrinsic* intrinsicT case NI_Sve_MultiplyAddRotateComplexBySelectedScalar: isLowVectorOpNum = (opNum == 3); break; + case NI_Sve_MultiplyByScalar: case NI_Sve_MultiplyBySelectedScalar: isLowVectorOpNum = (opNum == 2); break; diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp index e4764ad1a38ab2..081759a3bf2d4b 100644 --- a/src/coreclr/jit/lsrabuild.cpp +++ b/src/coreclr/jit/lsrabuild.cpp @@ -1500,6 +1500,12 @@ void LinearScan::buildUpperVectorSaveRefPositions(GenTree* tree, } } + bool 
forceRegOptional = false; +#ifdef TARGET_XARCH + forceRegOptional = true; +#elif TARGET_ARM64 + forceRegOptional = Compiler::UseStrictSveForType(tree->TypeGet()); +#endif if (enregisterLocalVars && !VarSetOps::IsEmpty(compiler, largeVectorVars)) { // We assume that the kill set includes at least some callee-trash registers, but @@ -1541,9 +1547,7 @@ void LinearScan::buildUpperVectorSaveRefPositions(GenTree* tree, varInterval->isPartiallySpilled = true; pos->skipSaveRestore = blockAlwaysReturn; pos->liveVarUpperSave = VarSetOps::IsMember(compiler, liveLargeVectors, varIndex); -#ifdef TARGET_XARCH - pos->regOptional = true; -#endif + pos->regOptional = forceRegOptional; } } } diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index 32bd726984d28a..fd8d417642ab40 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -9284,7 +9284,7 @@ GenTree* Compiler::fgOptimizeRelationalComparisonWithConst(GenTreeOp* cmp) // and if the call is one of these, attempt to optimize. // This is post-order, meaning that it will not morph the children. // -GenTree* Compiler::fgOptimizeHWIntrinsic(GenTreeHWIntrinsic* node) +GenTree* Compiler::fgOptimizeHWIntrinsic(GenTreeHWIntrinsic* node ARM64_ARG(bool isScalable)) { assert(opts.OptimizationEnabled()); @@ -9314,6 +9314,7 @@ GenTree* Compiler::fgOptimizeHWIntrinsic(GenTreeHWIntrinsic* node) switch (intrinsicId) { #if defined(TARGET_ARM64) + case NI_Vector_Create: case NI_Vector64_Create: #endif // TARGET_ARM64 case NI_Vector128_Create: @@ -9558,7 +9559,7 @@ GenTree* Compiler::fgOptimizeHWIntrinsic(GenTreeHWIntrinsic* node) NamedIntrinsic subIntrinsic = GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(this, GT_SUB, op2, op1, simdBaseType, simdSize, - isScalar); + isScalar ARM64_ARG(isScalable)); node->ChangeHWIntrinsicId(subIntrinsic, op2, op1); return fgMorphHWIntrinsicRequired(node); @@ -9590,7 +9591,8 @@ GenTree* Compiler::fgOptimizeHWIntrinsic(GenTreeHWIntrinsic* node) DEBUG_DESTROY_NODE(op2); DEBUG_DESTROY_NODE(node); - node = gtNewSimdUnOpNode(GT_NEG, retType, op1, simdBaseJitType, simdSize)->AsHWIntrinsic(); + node = gtNewSimdUnOpNode(GT_NEG, retType, op1, simdBaseJitType, simdSize ARM64_ARG(isScalable)) + ->AsHWIntrinsic(); #if defined(TARGET_XARCH) if (varTypeIsFloating(simdBaseType)) @@ -9632,7 +9634,7 @@ GenTree* Compiler::fgOptimizeHWIntrinsic(GenTreeHWIntrinsic* node) NamedIntrinsic subIntrinsic = GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(this, GT_SUB, op1, op2, simdBaseType, simdSize, - isScalar); + isScalar ARM64_ARG(isScalable)); node->ChangeHWIntrinsicId(subIntrinsic, op1, op2); return fgMorphHWIntrinsicRequired(node); @@ -9872,12 +9874,12 @@ GenTree* Compiler::fgOptimizeHWIntrinsic(GenTreeHWIntrinsic* node) const bool reverseCond = true; var_types lookupType = - GenTreeHWIntrinsic::GetLookupTypeForCmpOp(this, op1Oper, op1RetType, op1SimdBaseType, op1SimdSize, - reverseCond); + GenTreeHWIntrinsic::GetLookupTypeForCmpOp(this, op1Oper, op1RetType, op1SimdBaseType, + op1SimdSize ARM64_ARG(isScalable), reverseCond); NamedIntrinsic newId = GenTreeHWIntrinsic::GetHWIntrinsicIdForCmpOp(this, op1Oper, lookupType, cmpOp1, cmpOp2, - op1SimdBaseType, op1SimdSize, op1IsScalar, - reverseCond); + op1SimdBaseType, op1SimdSize, + op1IsScalar ARM64_ARG(isScalable), reverseCond); if (newId != NI_Illegal) { @@ -9965,7 +9967,8 @@ GenTree* Compiler::fgOptimizeHWIntrinsic(GenTreeHWIntrinsic* node) } NamedIntrinsic addIntrinsic = - GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(this, GT_ADD, op1, op2, simdBaseType, simdSize, isScalar); + 
GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(this, GT_ADD, op1, op2, simdBaseType, simdSize, + isScalar ARM64_ARG(isScalable)); node->ChangeHWIntrinsicId(addIntrinsic, op1, op2); return fgMorphHWIntrinsicRequired(node); @@ -11527,6 +11530,11 @@ GenTree* Compiler::fgMorphHWIntrinsicRequired(GenTreeHWIntrinsic* tree) CorInfoType simdBaseJitType = tree->GetSimdBaseJitType(); var_types simdBaseType = tree->GetSimdBaseType(); unsigned simdSize = tree->GetSimdSize(); +#if defined(TARGET_ARM64) + bool isScalable = + (((FIRST_NI_Vector <= tree->GetHWIntrinsicId()) && (tree->GetHWIntrinsicId() <= LAST_NI_Vector)) || + ((FIRST_NI_Sve <= tree->GetHWIntrinsicId()) && (tree->GetHWIntrinsicId() <= LAST_NI_Sve))); +#endif bool isScalar = false; genTreeOps oper = tree->GetOperForHWIntrinsicId(&isScalar); @@ -11553,12 +11561,12 @@ GenTree* Compiler::fgMorphHWIntrinsicRequired(GenTreeHWIntrinsic* tree) { // Move constant vectors from op1 to op2 for comparison operations // Noting that we can't handle scalar operations since they can copy upper bits from op1 - - genTreeOps newOper = GenTree::SwapRelop(oper); - var_types lookupType = - GenTreeHWIntrinsic::GetLookupTypeForCmpOp(this, newOper, retType, simdBaseType, simdSize); - NamedIntrinsic newId = GenTreeHWIntrinsic::GetHWIntrinsicIdForCmpOp(this, newOper, lookupType, op2, op1, - simdBaseType, simdSize, isScalar); + genTreeOps newOper = GenTree::SwapRelop(oper); + var_types lookupType = GenTreeHWIntrinsic::GetLookupTypeForCmpOp(this, newOper, retType, simdBaseType, + simdSize ARM64_ARG(isScalable)); + NamedIntrinsic newId = + GenTreeHWIntrinsic::GetHWIntrinsicIdForCmpOp(this, newOper, lookupType, op2, op1, simdBaseType, + simdSize, isScalar ARM64_ARG(isScalable)); if (newId != NI_Illegal) { @@ -11605,7 +11613,7 @@ GenTree* Compiler::fgMorphHWIntrinsicRequired(GenTreeHWIntrinsic* tree) NamedIntrinsic addIntrinsic = GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(this, GT_ADD, op1, op2, simdBaseType, simdSize, - isScalar); + isScalar ARM64_ARG(isScalable)); tree->ChangeHWIntrinsicId(addIntrinsic, op1, op2); return fgMorphHWIntrinsicRequired(tree); @@ -11616,7 +11624,7 @@ GenTree* Compiler::fgMorphHWIntrinsicRequired(GenTreeHWIntrinsic* tree) { #if defined(TARGET_ARM64) // xarch doesn't have a native GT_NEG representation for integers and itself uses (Zero - v1) - op2 = gtNewSimdUnOpNode(GT_NEG, retType, op2, simdBaseJitType, simdSize); + op2 = gtNewSimdUnOpNode(GT_NEG, retType, op2, simdBaseJitType, simdSize ARM64_ARG(isScalable)); DEBUG_DESTROY_NODE(op1); DEBUG_DESTROY_NODE(tree); @@ -11626,7 +11634,7 @@ GenTree* Compiler::fgMorphHWIntrinsicRequired(GenTreeHWIntrinsic* tree) } else { - op2 = gtNewSimdUnOpNode(GT_NEG, retType, op2, simdBaseJitType, simdSize); + op2 = gtNewSimdUnOpNode(GT_NEG, retType, op2, simdBaseJitType, simdSize ARM64_ARG(isScalable)); #if defined(TARGET_XARCH) if (varTypeIsFloating(simdBaseType)) @@ -11641,7 +11649,7 @@ GenTree* Compiler::fgMorphHWIntrinsicRequired(GenTreeHWIntrinsic* tree) NamedIntrinsic addIntrinsic = GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(this, GT_ADD, op2, op1, simdBaseType, simdSize, - isScalar); + isScalar ARM64_ARG(isScalable)); tree->ChangeHWIntrinsicId(addIntrinsic, op2, op1); @@ -11667,7 +11675,7 @@ GenTree* Compiler::fgMorphHWIntrinsicRequired(GenTreeHWIntrinsic* tree) if (op2->IsVectorAllBitsSet()) { // xarch doesn't have a native GT_NOT representation and itself uses (v1 ^ AllBitsSet) - op1 = gtNewSimdUnOpNode(GT_NOT, retType, op1, simdBaseJitType, simdSize); + op1 = gtNewSimdUnOpNode(GT_NOT, retType, op1, 
simdBaseJitType, simdSize ARM64_ARG(isScalable)); DEBUG_DESTROY_NODE(op2); DEBUG_DESTROY_NODE(tree); @@ -11678,7 +11686,7 @@ GenTree* Compiler::fgMorphHWIntrinsicRequired(GenTreeHWIntrinsic* tree) if (varTypeIsFloating(simdBaseType) && op2->IsVectorNegativeZero(simdBaseType)) { // xarch doesn't have a native GT_NEG representation for floating-point and itself uses (v1 ^ -0.0) - op1 = gtNewSimdUnOpNode(GT_NEG, retType, op1, simdBaseJitType, simdSize); + op1 = gtNewSimdUnOpNode(GT_NEG, retType, op1, simdBaseJitType, simdSize ARM64_ARG(isScalable)); DEBUG_DESTROY_NODE(op2); DEBUG_DESTROY_NODE(tree); @@ -11697,7 +11705,7 @@ GenTree* Compiler::fgMorphHWIntrinsicRequired(GenTreeHWIntrinsic* tree) if (opts.OptimizationEnabled()) { - return fgOptimizeHWIntrinsic(tree); + return fgOptimizeHWIntrinsic(tree ARM64_ARG(isScalable)); } return tree; } diff --git a/src/coreclr/jit/optcse.cpp b/src/coreclr/jit/optcse.cpp index 3a6f8e17d454d5..3197fba7ff3cc0 100644 --- a/src/coreclr/jit/optcse.cpp +++ b/src/coreclr/jit/optcse.cpp @@ -4625,7 +4625,7 @@ bool CSE_Heuristic::PromotionCheck(CSE_Candidate* candidate) // int spillSimdRegInProlog = 1; -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) // If we have a SIMD32/64 that is live across a call we have even higher spill costs // if (candidate->Expr()->TypeIs(TYP_SIMD32, TYP_SIMD64)) @@ -4640,7 +4640,7 @@ bool CSE_Heuristic::PromotionCheck(CSE_Candidate* candidate) // cse_use_cost += 2; } -#endif // TARGET_XARCH +#endif // TARGET_XARCH || TARGET_ARM64 extra_yes_cost = (BB_UNITY_WEIGHT_UNSIGNED * spillSimdRegInProlog) * 3; } diff --git a/src/coreclr/jit/promotiondecomposition.cpp b/src/coreclr/jit/promotiondecomposition.cpp index 7f2dbf0257fa2f..63287425f29dd7 100644 --- a/src/coreclr/jit/promotiondecomposition.cpp +++ b/src/coreclr/jit/promotiondecomposition.cpp @@ -373,7 +373,7 @@ class DecompositionPlan primitiveType = TYP_SIMD16; } break; -#ifdef TARGET_XARCH +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) case 32: if (m_compiler->getPreferredVectorByteLength() >= 32) { @@ -387,7 +387,7 @@ class DecompositionPlan primitiveType = TYP_SIMD64; } break; -#endif +#endif // TARGET_XARCH || TARGET_ARM64 #endif } } diff --git a/src/coreclr/jit/regset.cpp b/src/coreclr/jit/regset.cpp index 3d9354b040f48e..4b40cc0a1c1f15 100644 --- a/src/coreclr/jit/regset.cpp +++ b/src/coreclr/jit/regset.cpp @@ -350,12 +350,20 @@ void RegSet::rsSpillTree(regNumber reg, GenTree* tree, unsigned regIdx /* =0 */) var_types tempType = RegSet::tmpNormalizeType(treeType); regMaskTP mask; bool floatSpill = false; + bool maskSpill = false; if (isFloatRegType(treeType)) { floatSpill = true; mask = genRegMaskFloat(reg ARM_ARG(treeType)); } +#if defined(TARGET_ARM64) + if (varTypeUsesMaskReg(treeType)) + { + maskSpill = true; + mask = genRegMask(reg); + } +#endif else { mask = genRegMask(reg); @@ -427,6 +435,10 @@ void RegSet::rsSpillTree(regNumber reg, GenTree* tree, unsigned regIdx /* =0 */) // Generate the code to spill the register var_types storeType = floatSpill ? treeType : tempType; +#if defined(TARGET_ARM64) + storeType = maskSpill ? 
treeType : storeType; +#endif + m_rsCompiler->codeGen->spillReg(storeType, temp, reg); // Mark the tree node as having been spilled @@ -604,6 +616,15 @@ var_types RegSet::tmpNormalizeType(var_types type) { type = TYP_SIMD16; } + +#if defined(TARGET_ARM64) + if (Compiler::UseSveForType(type)) + { + // TODO-VL: temporary work around to allow scalable registers + type = TYP_SIMD16; + } +#endif + #endif // defined(FEATURE_SIMD) && !defined(TARGET_64BIT) return type; @@ -682,6 +703,13 @@ void RegSet::tmpPreAllocateTemps(var_types type, unsigned count) assert(type == tmpNormalizeType(type)); unsigned size = genTypeSize(type); +#ifdef TARGET_ARM64 + if (Compiler::UseSveForType(type)) + { + size = 16; // SIMD registers overlap with SVE registers + } +#endif + // If TYP_STRUCT ever gets in here we do bad things (tmpSlot returns -1) noway_assert(size >= sizeof(int)); diff --git a/src/coreclr/jit/scopeinfo.cpp b/src/coreclr/jit/scopeinfo.cpp index df9b0083798d1a..d1d6c3af94c7d8 100644 --- a/src/coreclr/jit/scopeinfo.cpp +++ b/src/coreclr/jit/scopeinfo.cpp @@ -292,10 +292,10 @@ void CodeGenInterface::siVarLoc::siFillStackVarLoc( case TYP_SIMD8: case TYP_SIMD12: case TYP_SIMD16: -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) case TYP_SIMD32: case TYP_SIMD64: -#endif // TARGET_XARCH +#endif // TARGET_XARCH || TARGET_ARM64 #endif // FEATURE_SIMD #ifdef TARGET_64BIT case TYP_LONG: @@ -432,10 +432,10 @@ void CodeGenInterface::siVarLoc::siFillRegisterVarLoc( case TYP_SIMD8: case TYP_SIMD12: case TYP_SIMD16: -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) case TYP_SIMD32: case TYP_SIMD64: -#endif // TARGET_XARCH +#endif // TARGET_XARCH || TARGET_ARM64 #if defined(FEATURE_MASKED_HW_INTRINSICS) case TYP_MASK: #endif // FEATURE_MASKED_HW_INTRINSICS diff --git a/src/coreclr/jit/simd.h b/src/coreclr/jit/simd.h index 9841bdeb38c93c..3e83c95022ba9f 100644 --- a/src/coreclr/jit/simd.h +++ b/src/coreclr/jit/simd.h @@ -192,7 +192,7 @@ struct simd16_t }; static_assert_no_msg(sizeof(simd16_t) == 16); -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) struct simd32_t { union @@ -303,7 +303,7 @@ struct simd64_t } }; static_assert_no_msg(sizeof(simd64_t) == 64); -#endif // TARGET_XARCH +#endif // TARGET_XARCH || TARGET_ARM64 #if defined(FEATURE_MASKED_HW_INTRINSICS) struct simdmask_t @@ -365,7 +365,7 @@ struct simdmask_t static_assert_no_msg(sizeof(simdmask_t) == 8); #endif // FEATURE_MASKED_HW_INTRINSICS -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) typedef simd64_t simd_t; #else typedef simd16_t simd_t; diff --git a/src/coreclr/jit/targetarm64.h b/src/coreclr/jit/targetarm64.h index 678a05e181e40d..511c5c9d41ddbb 100644 --- a/src/coreclr/jit/targetarm64.h +++ b/src/coreclr/jit/targetarm64.h @@ -26,8 +26,8 @@ #define FEATURE_MULTIREG_ARGS 1 // Support for passing a single argument in more than one register #define FEATURE_MULTIREG_RET 1 // Support for returning a single value in more than one register #define FEATURE_STRUCT_CLASSIFIER 0 // Uses a classifier function to determine is structs are passed/returned in more than one register - #define MAX_PASS_SINGLEREG_BYTES 16 // Maximum size of a struct passed in a single register (16-byte vector). 
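// ----------------------------------------------------------------------------------------------
// Illustrative sketch (not from the PR): the LSRA and regset changes above are gated on
// Compiler::UseSveForType / UseStrictSveForType, whose definitions are not shown in this diff.
// The guess below captures the apparent intent -- "this SIMD type only fits in a scalable (SVE)
// register, not a 16-byte NEON register" -- using stand-in types; the real helpers may differ.
enum VarTypeSketch
{
    SK_TYP_SIMD8,
    SK_TYP_SIMD12,
    SK_TYP_SIMD16,
    SK_TYP_SIMD32,
    SK_TYP_SIMD64,
    SK_TYP_INT
};

static bool UseStrictSveForTypeSketch(VarTypeSketch type)
{
    // SIMD8/SIMD12/SIMD16 still fit a NEON Q register; only the wider types are assumed to
    // need scalable-register treatment (sve ldr/str, spill to memory, upper-save to stack).
    return (type == SK_TYP_SIMD32) || (type == SK_TYP_SIMD64);
}

int main()
{
    return UseStrictSveForTypeSketch(SK_TYP_SIMD32) ? 0 : 1; // trivially exercises the predicate
}
// ----------------------------------------------------------------------------------------------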
- #define MAX_PASS_MULTIREG_BYTES 64 // Maximum size of a struct that could be passed in more than one register (max is 4 16-byte vectors using an HVA) + #define MAX_PASS_SINGLEREG_BYTES 16 // Maximum size of a struct passed in a single register (16-byte vector). //TODO-VL: This can be VL now? + #define MAX_PASS_MULTIREG_BYTES 64 // Maximum size of a struct that could be passed in more than one register (max is 4 16-byte vectors using an HVA) //TODO-VL: This can be VL now? #define MAX_RET_MULTIREG_BYTES 64 // Maximum size of a struct that could be returned in more than one register (Max is an HVA of 4 16-byte vectors) #define MAX_ARG_REG_COUNT 4 // Maximum registers used to pass a single argument in multiple registers. (max is 4 128-bit vectors using an HVA) #define MAX_RET_REG_COUNT 4 // Maximum registers used to return a value. diff --git a/src/coreclr/jit/typelist.h b/src/coreclr/jit/typelist.h index 865c177bc7bc32..ac8fbafa7acb10 100644 --- a/src/coreclr/jit/typelist.h +++ b/src/coreclr/jit/typelist.h @@ -60,10 +60,10 @@ DEF_TP(STRUCT ,"struct" , TYP_STRUCT, 0, 0, 0, 1, 4, VTR_INT, available DEF_TP(SIMD8 ,"simd8" , TYP_SIMD8, 8, 8, 8, 2, 8, VTR_FLOAT, availableDoubleRegs, RBM_FLT_CALLEE_SAVED, RBM_FLT_CALLEE_TRASH, VTF_S|VTF_VEC) DEF_TP(SIMD12 ,"simd12" , TYP_SIMD12, 12,16, 16, 4,16, VTR_FLOAT, availableDoubleRegs, RBM_FLT_CALLEE_SAVED, RBM_FLT_CALLEE_TRASH, VTF_S|VTF_VEC) DEF_TP(SIMD16 ,"simd16" , TYP_SIMD16, 16,16, 16, 4,16, VTR_FLOAT, availableDoubleRegs, RBM_FLT_CALLEE_SAVED, RBM_FLT_CALLEE_TRASH, VTF_S|VTF_VEC) -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) DEF_TP(SIMD32 ,"simd32" , TYP_SIMD32, 32,32, 32, 8,16, VTR_FLOAT, availableDoubleRegs, RBM_FLT_CALLEE_SAVED, RBM_FLT_CALLEE_TRASH, VTF_S|VTF_VEC) DEF_TP(SIMD64 ,"simd64" , TYP_SIMD64, 64,64, 64, 16,16, VTR_FLOAT, availableDoubleRegs, RBM_FLT_CALLEE_SAVED, RBM_FLT_CALLEE_TRASH, VTF_S|VTF_VEC) -#endif // TARGET_XARCH +#endif // TARGET_XARCH || TARGET_ARM64 #if defined(FEATURE_MASKED_HW_INTRINSICS) DEF_TP(MASK ,"mask" , TYP_MASK, 8, 8, 8, 2, 8, VTR_MASK, availableMaskRegs, RBM_MSK_CALLEE_SAVED, RBM_MSK_CALLEE_TRASH, VTF_S) #endif // FEATURE_MASKED_HW_INTRINSICS diff --git a/src/coreclr/jit/valuenum.cpp b/src/coreclr/jit/valuenum.cpp index 79f596806cfb1a..a930ac32413dc2 100644 --- a/src/coreclr/jit/valuenum.cpp +++ b/src/coreclr/jit/valuenum.cpp @@ -434,7 +434,7 @@ ValueNumStore::ValueNumStore(Compiler* comp, CompAllocator alloc) , m_simd8CnsMap(nullptr) , m_simd12CnsMap(nullptr) , m_simd16CnsMap(nullptr) -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) , m_simd32CnsMap(nullptr) , m_simd64CnsMap(nullptr) #endif // TARGET_XARCH @@ -1706,7 +1706,7 @@ ValueNumStore::Chunk::Chunk(CompAllocator alloc, ValueNum* pNextBaseVN, var_type break; } -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) case TYP_SIMD32: { m_defs = new (alloc) Alloc::Type[ChunkSize]; @@ -1718,7 +1718,7 @@ ValueNumStore::Chunk::Chunk(CompAllocator alloc, ValueNum* pNextBaseVN, var_type m_defs = new (alloc) Alloc::Type[ChunkSize]; break; } -#endif // TARGET_XARCH +#endif // TARGET_XARCH || TARGET_ARM64 #if defined(FEATURE_MASKED_HW_INTRINSICS) case TYP_MASK: @@ -1883,7 +1883,7 @@ ValueNum ValueNumStore::VNForSimd16Con(const simd16_t& cnsVal) return VnForConst(cnsVal, GetSimd16CnsMap(), TYP_SIMD16); } -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) ValueNum ValueNumStore::VNForSimd32Con(const simd32_t& cnsVal) { return VnForConst(cnsVal, 
GetSimd32CnsMap(), TYP_SIMD32); @@ -1893,7 +1893,7 @@ ValueNum ValueNumStore::VNForSimd64Con(const simd64_t& cnsVal) { return VnForConst(cnsVal, GetSimd64CnsMap(), TYP_SIMD64); } -#endif // TARGET_XARCH +#endif // TARGET_XARCH || TARGET_ARM64 #if defined(FEATURE_MASKED_HW_INTRINSICS) ValueNum ValueNumStore::VNForSimdMaskCon(const simdmask_t& cnsVal) @@ -1990,7 +1990,7 @@ ValueNum ValueNumStore::VNForGenericCon(var_types typ, uint8_t* cnsVal) READ_VALUE(simd16_t); return VNForSimd16Con(val); } -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) case TYP_SIMD32: { READ_VALUE(simd32_t); @@ -2001,7 +2001,7 @@ ValueNum ValueNumStore::VNForGenericCon(var_types typ, uint8_t* cnsVal) READ_VALUE(simd64_t); return VNForSimd64Con(val); } -#endif // TARGET_XARCH +#endif // TARGET_XARCH || TARGET_ARM64 #if defined(FEATURE_MASKED_HW_INTRINSICS) case TYP_MASK: @@ -2112,7 +2112,7 @@ ValueNum ValueNumStore::VNZeroForType(var_types typ) return VNForSimd16Con(simd16_t::Zero()); } -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) case TYP_SIMD32: { return VNForSimd32Con(simd32_t::Zero()); @@ -2122,7 +2122,7 @@ ValueNum ValueNumStore::VNZeroForType(var_types typ) { return VNForSimd64Con(simd64_t::Zero()); } -#endif // TARGET_XARCH +#endif // TARGET_XARCH || TARGET_ARM64 #if defined(FEATURE_MASKED_HW_INTRINSICS) case TYP_MASK: @@ -2209,7 +2209,7 @@ ValueNum ValueNumStore::VNAllBitsForType(var_types typ, unsigned elementCount) return VNForSimd16Con(simd16_t::AllBitsSet()); } -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) case TYP_SIMD32: { return VNForSimd32Con(simd32_t::AllBitsSet()); @@ -2219,7 +2219,7 @@ ValueNum ValueNumStore::VNAllBitsForType(var_types typ, unsigned elementCount) { return VNForSimd64Con(simd64_t::AllBitsSet()); } -#endif // TARGET_XARCH +#endif // TARGET_XARCH || TARGET_ARM64 #if defined(FEATURE_MASKED_HW_INTRINSICS) case TYP_MASK: @@ -2326,7 +2326,7 @@ ValueNum ValueNumStore::VNBroadcastForSimdType(var_types simdType, var_types sim return VNForSimd16Con(result); } -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) case TYP_SIMD32: { simd32_t result = BroadcastConstantToSimd(this, simdBaseType, valVN); @@ -2339,7 +2339,7 @@ ValueNum ValueNumStore::VNBroadcastForSimdType(var_types simdType, var_types sim return VNForSimd64Con(result); } -#endif // TARGET_XARCH +#endif // TARGET_XARCH || TARGET_ARM64 default: { @@ -2365,7 +2365,7 @@ ValueNum ValueNumStore::VNForSimdType(unsigned simdSize, CorInfoType simdBaseJit bool ValueNumStore::VNIsVectorNaN(var_types simdType, var_types simdBaseType, ValueNum valVN) { - assert(varTypeIsSIMD(simdType)); + assert(varTypeIsSIMDOrMask(simdType)); simd_t vector = {}; @@ -2392,7 +2392,7 @@ bool ValueNumStore::VNIsVectorNaN(var_types simdType, var_types simdBaseType, Va break; } -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) case TYP_SIMD32: { simd32_t tmp = GetConstantSimd32(valVN); @@ -2406,7 +2406,16 @@ bool ValueNumStore::VNIsVectorNaN(var_types simdType, var_types simdBaseType, Va memcpy(&vector, &tmp, genTypeSize(simdType)); break; } -#endif // TARGET_XARCH +#endif // TARGET_XARCH || TARGET_ARM64 + +#if defined(FEATURE_MASKED_HW_INTRINSICS) + case TYP_MASK: + { + simdmask_t tmp = GetConstantSimdMask(valVN); + memcpy(&vector, &tmp, genTypeSize(simdType)); + break; + } +#endif // FEATURE_MASKED_HW_INTRINSICS default: { @@ -2458,7 +2467,7 @@ bool ValueNumStore::VNIsVectorNegativeZero(var_types simdType, var_types 
simdBas break; } -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) case TYP_SIMD32: { simd32_t tmp = GetConstantSimd32(valVN); @@ -2472,7 +2481,16 @@ bool ValueNumStore::VNIsVectorNegativeZero(var_types simdType, var_types simdBas memcpy(&vector, &tmp, genTypeSize(simdType)); break; } -#endif // TARGET_XARCH +#endif // TARGET_XARCH || TARGET_ARM64 + +#if defined(FEATURE_MASKED_HW_INTRINSICS) + case TYP_MASK: + { + simdmask_t tmp = GetConstantSimdMask(valVN); + memcpy(&vector, &tmp, genTypeSize(simdType)); + break; + } +#endif // FEATURE_MASKED_HW_INTRINSICS default: { @@ -4019,7 +4037,7 @@ simd16_t ValueNumStore::GetConstantSimd16(ValueNum argVN) return ConstantValue(argVN); } -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) // Given a simd32 constant value number return its value as a simd32. // simd32_t ValueNumStore::GetConstantSimd32(ValueNum argVN) @@ -4039,7 +4057,7 @@ simd64_t ValueNumStore::GetConstantSimd64(ValueNum argVN) return ConstantValue(argVN); } -#endif // TARGET_XARCH +#endif // TARGET_XARCH || TARGET_ARM64 #if defined(FEATURE_MASKED_HW_INTRINSICS) // Given a simdmask constant value number return its value as a simdmask. @@ -7497,7 +7515,7 @@ simd16_t GetConstantSimd16(ValueNumStore* vns, var_types baseType, ValueNum argV return BroadcastConstantToSimd(vns, baseType, argVN); } -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) simd32_t GetConstantSimd32(ValueNumStore* vns, var_types baseType, ValueNum argVN) { assert(vns->IsVNConstant(argVN)); @@ -7521,7 +7539,7 @@ simd64_t GetConstantSimd64(ValueNumStore* vns, var_types baseType, ValueNum argV return BroadcastConstantToSimd(vns, baseType, argVN); } -#endif // TARGET_XARCH +#endif // TARGET_XARCH || TARGET_ARM64 ValueNum EvaluateUnarySimd( ValueNumStore* vns, genTreeOps oper, bool scalar, var_types simdType, var_types baseType, ValueNum arg0VN) @@ -7555,7 +7573,7 @@ ValueNum EvaluateUnarySimd( return vns->VNForSimd16Con(result); } -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) case TYP_SIMD32: { simd32_t arg0 = GetConstantSimd32(vns, baseType, arg0VN); @@ -7573,7 +7591,7 @@ ValueNum EvaluateUnarySimd( EvaluateUnarySimd(oper, scalar, baseType, &result, arg0); return vns->VNForSimd64Con(result); } -#endif // TARGET_XARCH +#endif // TARGET_XARCH || TARGET_ARM64 default: { @@ -7622,7 +7640,7 @@ ValueNum EvaluateBinarySimd(ValueNumStore* vns, return vns->VNForSimd16Con(result); } -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) case TYP_SIMD32: { simd32_t arg0 = GetConstantSimd32(vns, baseType, arg0VN); @@ -7642,7 +7660,7 @@ ValueNum EvaluateBinarySimd(ValueNumStore* vns, EvaluateBinarySimd(oper, scalar, baseType, &result, arg0, arg1); return vns->VNForSimd64Con(result); } -#endif // TARGET_XARCH +#endif // TARGET_XARCH || TARGET_ARM64 default: { @@ -7747,7 +7765,7 @@ ValueNum EvaluateSimdGetElement( return EvaluateSimdGetElement(vns, baseType, vns->GetConstantSimd16(arg0VN), arg1); } -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) case TYP_SIMD32: { return EvaluateSimdGetElement(vns, baseType, vns->GetConstantSimd32(arg0VN), arg1); @@ -7757,7 +7775,7 @@ ValueNum EvaluateSimdGetElement( { return EvaluateSimdGetElement(vns, baseType, vns->GetConstantSimd64(arg0VN), arg1); } -#endif // TARGET_XARCH +#endif // TARGET_XARCH || TARGET_ARM64 default: { @@ -7793,7 +7811,7 @@ ValueNum EvaluateSimdCvtMaskToVector(ValueNumStore* vns, var_types simdType, var 
return vns->VNForSimd16Con(result); } -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) case TYP_SIMD32: { simd32_t result = {}; @@ -7807,7 +7825,16 @@ ValueNum EvaluateSimdCvtMaskToVector(ValueNumStore* vns, var_types simdType, var EvaluateSimdCvtMaskToVector(baseType, &result, arg0); return vns->VNForSimd64Con(result); } -#endif // TARGET_XARCH +#endif // TARGET_XARCH || TARGET_ARM64 + +#if defined(FEATURE_MASKED_HW_INTRINSICS) + case TYP_MASK: + { + simdmask_t result = {}; + EvaluateSimdCvtMaskToVector(baseType, &result, arg0); + return vns->VNForSimdMaskCon(result); + } +#endif // FEATURE_MASKED_HW_INTRINSICS default: { @@ -7843,7 +7870,7 @@ ValueNum EvaluateSimdCvtVectorToMask(ValueNumStore* vns, var_types simdType, var break; } -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) case TYP_SIMD32: { simd32_t arg0 = GetConstantSimd32(vns, baseType, arg0VN); @@ -7857,7 +7884,7 @@ ValueNum EvaluateSimdCvtVectorToMask(ValueNumStore* vns, var_types simdType, var EvaluateSimdCvtVectorToMask(baseType, &result, arg0); break; } -#endif // TARGET_XARCH +#endif // TARGET_XARCH || TARGET_ARM64 default: { @@ -8827,6 +8854,7 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunBinary( case NI_Vector128_op_Equality: #if defined(TARGET_ARM64) + case NI_Vector_op_Equality: case NI_Vector64_op_Equality: #elif defined(TARGET_XARCH) case NI_Vector256_op_Equality: @@ -8835,8 +8863,17 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunBinary( { if (varTypeIsFloating(baseType)) { - // Handle `(x == NaN) == false` and `(NaN == x) == false` for floating-point types - var_types simdType = Compiler::getSIMDTypeForSize(simdSize); + var_types simdType; + + if (varTypeIsMask(TypeOfVN(cnsVN))) + { + simdType = TYP_MASK; + } + else + { + // Handle `(x == NaN) == false` and `(NaN == x) == false` for floating-point types + simdType = Compiler::getSIMDTypeForSize(simdSize); + } if (VNIsVectorNaN(simdType, baseType, cnsVN)) { @@ -8848,6 +8885,7 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunBinary( case NI_Vector128_op_Inequality: #if defined(TARGET_ARM64) + case NI_Vector_op_Inequality: case NI_Vector64_op_Inequality: #elif defined(TARGET_XARCH) case NI_Vector256_op_Inequality: @@ -8954,6 +8992,7 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunBinary( { case NI_Vector128_op_Equality: #if defined(TARGET_ARM64) + case NI_Vector_op_Equality: case NI_Vector64_op_Equality: #elif defined(TARGET_XARCH) case NI_Vector256_op_Equality: @@ -8971,6 +9010,7 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunBinary( case NI_Vector128_op_Inequality: #if defined(TARGET_ARM64) + case NI_Vector_op_Inequality: case NI_Vector64_op_Inequality: #elif defined(TARGET_XARCH) case NI_Vector256_op_Inequality: @@ -9027,7 +9067,7 @@ ValueNum EvaluateSimdWithElementFloating( return vns->VNForSimd16Con(result); } -#if defined TARGET_XARCH +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) case TYP_SIMD32: { simd32_t result = {}; @@ -9041,7 +9081,7 @@ ValueNum EvaluateSimdWithElementFloating( EvaluateWithElementFloating(baseType, &result, vns->GetConstantSimd64(arg0VN), arg1, arg2); return vns->VNForSimd64Con(result); } -#endif // TARGET_XARCH +#endif // TARGET_XARCH || TARGET_ARM64 default: { @@ -9081,7 +9121,7 @@ ValueNum EvaluateSimdWithElementIntegral( return vns->VNForSimd16Con(result); } -#if defined TARGET_XARCH +#if defined TARGET_XARCH || defined(TARGET_ARM64) case TYP_SIMD32: { simd32_t result = {}; @@ -9095,7 +9135,7 @@ ValueNum EvaluateSimdWithElementIntegral( EvaluateWithElementIntegral(baseType, &result, 
vns->GetConstantSimd64(arg0VN), arg1, arg2); return vns->VNForSimd64Con(result); } -#endif // TARGET_XARCH +#endif // TARGET_XARCH || TARGET_ARM64 default: { @@ -10101,7 +10141,7 @@ void ValueNumStore::vnDump(Compiler* comp, ValueNum vn, bool isPtr) break; } -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) case TYP_SIMD32: { simd32_t cnsVal = GetConstantSimd32(vn); @@ -10119,7 +10159,7 @@ void ValueNumStore::vnDump(Compiler* comp, ValueNum vn, bool isPtr) cnsVal.u64[6], cnsVal.u64[7]); break; } -#endif // TARGET_XARCH +#endif // TARGET_XARCH || TARGET_ARM64 #if defined(FEATURE_MASKED_HW_INTRINSICS) case TYP_MASK: @@ -11709,7 +11749,7 @@ void Compiler::fgValueNumberTreeConst(GenTree* tree) break; } -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) case TYP_SIMD32: { simd32_t simd32Val; @@ -11727,7 +11767,7 @@ void Compiler::fgValueNumberTreeConst(GenTree* tree) tree->gtVNPair.SetBoth(vnStore->VNForSimd64Con(simd64Val)); break; } -#endif // TARGET_XARCH +#endif // TARGET_XARCH || TARGET_ARM64 #if defined(FEATURE_MASKED_HW_INTRINSICS) case TYP_MASK: diff --git a/src/coreclr/jit/valuenum.h b/src/coreclr/jit/valuenum.h index 79f8ae3ff655fe..be7e9cbd05f8e8 100644 --- a/src/coreclr/jit/valuenum.h +++ b/src/coreclr/jit/valuenum.h @@ -382,10 +382,10 @@ class ValueNumStore simd8_t GetConstantSimd8(ValueNum argVN); simd12_t GetConstantSimd12(ValueNum argVN); simd16_t GetConstantSimd16(ValueNum argVN); -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) simd32_t GetConstantSimd32(ValueNum argVN); simd64_t GetConstantSimd64(ValueNum argVN); -#endif // TARGET_XARCH +#endif // TARGET_XARCH || TARGET_ARM64 #if defined(FEATURE_MASKED_HW_INTRINSICS) simdmask_t GetConstantSimdMask(ValueNum argVN); #endif // FEATURE_MASKED_HW_INTRINSICS @@ -468,10 +468,10 @@ class ValueNumStore ValueNum VNForSimd8Con(const simd8_t& cnsVal); ValueNum VNForSimd12Con(const simd12_t& cnsVal); ValueNum VNForSimd16Con(const simd16_t& cnsVal); -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) ValueNum VNForSimd32Con(const simd32_t& cnsVal); ValueNum VNForSimd64Con(const simd64_t& cnsVal); -#endif // TARGET_XARCH +#endif // TARGET_XARCH || TARGET_ARM64 #if defined(FEATURE_MASKED_HW_INTRINSICS) ValueNum VNForSimdMaskCon(const simdmask_t& cnsVal); #endif // FEATURE_MASKED_HW_INTRINSICS @@ -1903,7 +1903,7 @@ class ValueNumStore return m_simd16CnsMap; } -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) struct Simd32PrimitiveKeyFuncs : public JitKeyFuncsDefEquals { static bool Equals(const simd32_t& x, const simd32_t& y) @@ -1981,7 +1981,7 @@ class ValueNumStore } return m_simd64CnsMap; } -#endif // TARGET_XARCH +#endif // TARGET_XARCH || TARGET_ARM64 #if defined(FEATURE_MASKED_HW_INTRINSICS) struct SimdMaskPrimitiveKeyFuncs : public JitKeyFuncsDefEquals @@ -2181,7 +2181,7 @@ struct ValueNumStore::VarTypConv typedef simd16_t Type; typedef simd16_t Lang; }; -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) template <> struct ValueNumStore::VarTypConv { @@ -2195,7 +2195,7 @@ struct ValueNumStore::VarTypConv typedef simd64_t Type; typedef simd64_t Lang; }; -#endif // TARGET_XARCH +#endif // TARGET_XARCH || TARGET_ARM64 #if defined(FEATURE_MASKED_HW_INTRINSICS) template <> @@ -2266,7 +2266,7 @@ FORCEINLINE simd16_t ValueNumStore::SafeGetConstantValue(Chunk* c, uns return reinterpret_cast::Lang*>(c->m_defs)[offset]; } -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || 
defined(TARGET_ARM64) template <> FORCEINLINE simd32_t ValueNumStore::SafeGetConstantValue(Chunk* c, unsigned offset) { @@ -2280,7 +2280,7 @@ FORCEINLINE simd64_t ValueNumStore::SafeGetConstantValue(Chunk* c, uns assert(c->m_typ == TYP_SIMD64); return reinterpret_cast::Lang*>(c->m_defs)[offset]; } -#endif // TARGET_XARCH +#endif // TARGET_XARCH || TARGET_ARM64 #if defined(FEATURE_MASKED_HW_INTRINSICS) template <> @@ -2333,7 +2333,7 @@ FORCEINLINE simd16_t ValueNumStore::ConstantValueInternal(ValueNum vn return SafeGetConstantValue(c, offset); } -#if defined(TARGET_XARCH) +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) template <> FORCEINLINE simd32_t ValueNumStore::ConstantValueInternal(ValueNum vn DEBUGARG(bool coerce)) { @@ -2361,7 +2361,7 @@ FORCEINLINE simd64_t ValueNumStore::ConstantValueInternal(ValueNum vn return SafeGetConstantValue(c, offset); } -#endif // TARGET_XARCH +#endif // TARGET_XARCH || TARGET_ARM64 #if defined(FEATURE_MASKED_HW_INTRINSICS) template <> diff --git a/src/coreclr/jit/vartype.h b/src/coreclr/jit/vartype.h index e214d1f8a346ff..b60508ee5bcb44 100644 --- a/src/coreclr/jit/vartype.h +++ b/src/coreclr/jit/vartype.h @@ -75,6 +75,21 @@ inline bool varTypeIsSIMD(T vt) #endif } +template +inline bool varTypeIsNeonSIMD(T vt) +{ +#ifdef FEATURE_SIMD + bool result = varTypeIsSIMD(vt); +#ifdef TARGET_ARM64 + result = result && ((vt == TYP_SIMD8) || (vt == TYP_SIMD16)); +#endif // TARGET_ARM64 + return result; +#else + // Always return false if FEATURE_SIMD is not enabled + return false; +#endif +} + template inline bool varTypeIsMask(T vt) { diff --git a/src/coreclr/tools/Common/Internal/Runtime/ReadyToRunInstructionSetHelper.cs b/src/coreclr/tools/Common/Internal/Runtime/ReadyToRunInstructionSetHelper.cs index 8069076421aaa2..49ec9ad5f5b8f7 100644 --- a/src/coreclr/tools/Common/Internal/Runtime/ReadyToRunInstructionSetHelper.cs +++ b/src/coreclr/tools/Common/Internal/Runtime/ReadyToRunInstructionSetHelper.cs @@ -40,6 +40,7 @@ public static class ReadyToRunInstructionSetHelper case InstructionSet.ARM64_Sha256: return ReadyToRunInstructionSet.Sha256; case InstructionSet.ARM64_Sha256_Arm64: return ReadyToRunInstructionSet.Sha256; case InstructionSet.ARM64_Atomics: return ReadyToRunInstructionSet.Atomics; + case InstructionSet.ARM64_Vector: return null; case InstructionSet.ARM64_Vector64: return null; case InstructionSet.ARM64_Vector128: return null; case InstructionSet.ARM64_Dczva: return null; diff --git a/src/coreclr/tools/Common/JitInterface/CorInfoImpl.cs b/src/coreclr/tools/Common/JitInterface/CorInfoImpl.cs index 06b34f71550601..8b0f00b432c27c 100644 --- a/src/coreclr/tools/Common/JitInterface/CorInfoImpl.cs +++ b/src/coreclr/tools/Common/JitInterface/CorInfoImpl.cs @@ -3296,7 +3296,7 @@ private CorInfoHFAElemType getHFAType(CORINFO_CLASS_STRUCT_* hClass) ValueTypeShapeCharacteristics.Float32Aggregate => CorInfoHFAElemType.CORINFO_HFA_ELEM_FLOAT, ValueTypeShapeCharacteristics.Float64Aggregate => CorInfoHFAElemType.CORINFO_HFA_ELEM_DOUBLE, ValueTypeShapeCharacteristics.Vector64Aggregate => CorInfoHFAElemType.CORINFO_HFA_ELEM_VECTOR64, - ValueTypeShapeCharacteristics.Vector128Aggregate => CorInfoHFAElemType.CORINFO_HFA_ELEM_VECTOR128, + ValueTypeShapeCharacteristics.Vector128Aggregate => CorInfoHFAElemType.CORINFO_HFA_ELEM_VECTOR128, //TODO-VL: Need for VL too? 
_ => CorInfoHFAElemType.CORINFO_HFA_ELEM_NONE }; } @@ -4143,6 +4143,14 @@ private ushort getRelocTypeHint(void* target) } } +#pragma warning disable CA1822 // Mark members as static + private uint getTargetVectorLength() +#pragma warning restore CA1822 // Mark members as static + { + // Temporary. Can use Sve.GetActiveElementCount or equivalent + return 0; + } + private uint getExpectedTargetArchitecture() { TargetArchitecture arch = _compilation.TypeSystemContext.Target.Architecture; diff --git a/src/coreclr/tools/Common/JitInterface/CorInfoImpl_generated.cs b/src/coreclr/tools/Common/JitInterface/CorInfoImpl_generated.cs index 6be49af5d42404..1ff68fe4375cc5 100644 --- a/src/coreclr/tools/Common/JitInterface/CorInfoImpl_generated.cs +++ b/src/coreclr/tools/Common/JitInterface/CorInfoImpl_generated.cs @@ -2618,6 +2618,21 @@ private static uint _getExpectedTargetArchitecture(IntPtr thisHandle, IntPtr* pp } } + [UnmanagedCallersOnly] + private static uint _getTargetVectorLength(IntPtr thisHandle, IntPtr* ppException) + { + var _this = GetThis(thisHandle); + try + { + return _this.getTargetVectorLength(); + } + catch (Exception ex) + { + *ppException = _this.AllocException(ex); + return default; + } + } + [UnmanagedCallersOnly] private static uint _getJitFlags(IntPtr thisHandle, IntPtr* ppException, CORJIT_FLAGS* flags, uint sizeInBytes) { @@ -2651,7 +2666,7 @@ private static uint _getJitFlags(IntPtr thisHandle, IntPtr* ppException, CORJIT_ private static IntPtr GetUnmanagedCallbacks() { - void** callbacks = (void**)Marshal.AllocCoTaskMem(sizeof(IntPtr) * 179); + void** callbacks = (void**)Marshal.AllocCoTaskMem(sizeof(IntPtr) * 180); callbacks[0] = (delegate* unmanaged)&_isIntrinsic; callbacks[1] = (delegate* unmanaged)&_notifyMethodInfoUsage; @@ -2830,8 +2845,9 @@ private static IntPtr GetUnmanagedCallbacks() callbacks[174] = (delegate* unmanaged)&_recordRelocation; callbacks[175] = (delegate* unmanaged)&_getRelocTypeHint; callbacks[176] = (delegate* unmanaged)&_getExpectedTargetArchitecture; - callbacks[177] = (delegate* unmanaged)&_getJitFlags; - callbacks[178] = (delegate* unmanaged)&_getSpecialCopyHelper; + callbacks[177] = (delegate* unmanaged)&_getTargetVectorLength; + callbacks[178] = (delegate* unmanaged)&_getJitFlags; + callbacks[179] = (delegate* unmanaged)&_getSpecialCopyHelper; return (IntPtr)callbacks; } diff --git a/src/coreclr/tools/Common/JitInterface/CorInfoInstructionSet.cs b/src/coreclr/tools/Common/JitInterface/CorInfoInstructionSet.cs index 3d92b216eb0012..84ef0c96264c4a 100644 --- a/src/coreclr/tools/Common/JitInterface/CorInfoInstructionSet.cs +++ b/src/coreclr/tools/Common/JitInterface/CorInfoInstructionSet.cs @@ -25,6 +25,7 @@ public enum InstructionSet ARM64_Sha1 = InstructionSet_ARM64.Sha1, ARM64_Sha256 = InstructionSet_ARM64.Sha256, ARM64_Atomics = InstructionSet_ARM64.Atomics, + ARM64_Vector = InstructionSet_ARM64.Vector, ARM64_Vector64 = InstructionSet_ARM64.Vector64, ARM64_Vector128 = InstructionSet_ARM64.Vector128, ARM64_Dczva = InstructionSet_ARM64.Dczva, @@ -150,24 +151,25 @@ public enum InstructionSet_ARM64 Sha1 = 7, Sha256 = 8, Atomics = 9, - Vector64 = 10, - Vector128 = 11, - Dczva = 12, - Rcpc = 13, - VectorT128 = 14, - Rcpc2 = 15, - Sve = 16, - Sve2 = 17, - ArmBase_Arm64 = 18, - AdvSimd_Arm64 = 19, - Aes_Arm64 = 20, - Crc32_Arm64 = 21, - Dp_Arm64 = 22, - Rdm_Arm64 = 23, - Sha1_Arm64 = 24, - Sha256_Arm64 = 25, - Sve_Arm64 = 26, - Sve2_Arm64 = 27, + Vector = 10, + Vector64 = 11, + Vector128 = 12, + Dczva = 13, + Rcpc = 14, + VectorT128 = 15, + Rcpc2 = 
16, + Sve = 17, + Sve2 = 18, + ArmBase_Arm64 = 19, + AdvSimd_Arm64 = 20, + Aes_Arm64 = 21, + Crc32_Arm64 = 22, + Dp_Arm64 = 23, + Rdm_Arm64 = 24, + Sha1_Arm64 = 25, + Sha256_Arm64 = 26, + Sve_Arm64 = 27, + Sve2_Arm64 = 28, } public enum InstructionSet_RiscV64 @@ -404,6 +406,7 @@ public static InstructionSet ConvertToImpliedInstructionSetForVectorInstructionS case TargetArchitecture.ARM64: switch (input) { + case InstructionSet.ARM64_Vector: return InstructionSet.ARM64_Sve; case InstructionSet.ARM64_Vector64: return InstructionSet.ARM64_AdvSimd; case InstructionSet.ARM64_Vector128: return InstructionSet.ARM64_AdvSimd; } @@ -503,6 +506,8 @@ public static InstructionSetFlags ExpandInstructionSetByImplicationHelper(Target resultflags.AddInstructionSet(InstructionSet.ARM64_AdvSimd); if (resultflags.HasInstructionSet(InstructionSet.ARM64_Sve2)) resultflags.AddInstructionSet(InstructionSet.ARM64_Sve); + if (resultflags.HasInstructionSet(InstructionSet.ARM64_Vector)) + resultflags.AddInstructionSet(InstructionSet.ARM64_Sve); break; case TargetArchitecture.RiscV64: @@ -771,6 +776,8 @@ private static InstructionSetFlags ExpandInstructionSetByReverseImplicationHelpe resultflags.AddInstructionSet(InstructionSet.ARM64_Sve); if (resultflags.HasInstructionSet(InstructionSet.ARM64_Sve)) resultflags.AddInstructionSet(InstructionSet.ARM64_Sve2); + if (resultflags.HasInstructionSet(InstructionSet.ARM64_Sve)) + resultflags.AddInstructionSet(InstructionSet.ARM64_Vector); break; case TargetArchitecture.RiscV64: @@ -1002,6 +1009,7 @@ public static IEnumerable ArchitectureToValidInstructionSets yield return new InstructionSetInfo("sha1", "Sha1", InstructionSet.ARM64_Sha1, true); yield return new InstructionSetInfo("sha2", "Sha256", InstructionSet.ARM64_Sha256, true); yield return new InstructionSetInfo("lse", "", InstructionSet.ARM64_Atomics, true); + yield return new InstructionSetInfo("Vector", "", InstructionSet.ARM64_Vector, false); yield return new InstructionSetInfo("Vector64", "", InstructionSet.ARM64_Vector64, false); yield return new InstructionSetInfo("Vector128", "", InstructionSet.ARM64_Vector128, false); yield return new InstructionSetInfo("Dczva", "", InstructionSet.ARM64_Dczva, false); diff --git a/src/coreclr/tools/Common/JitInterface/ThunkGenerator/InstructionSetDesc.txt b/src/coreclr/tools/Common/JitInterface/ThunkGenerator/InstructionSetDesc.txt index 5d067eb3311e1c..e185676e087b48 100644 --- a/src/coreclr/tools/Common/JitInterface/ThunkGenerator/InstructionSetDesc.txt +++ b/src/coreclr/tools/Common/JitInterface/ThunkGenerator/InstructionSetDesc.txt @@ -208,6 +208,7 @@ instructionset ,ARM64 ,Rdm , ,24 ,Rdm instructionset ,ARM64 ,Sha1 , ,19 ,Sha1 ,sha1 instructionset ,ARM64 ,Sha256 , ,20 ,Sha256 ,sha2 instructionset ,ARM64 , ,Atomics ,21 ,Atomics ,lse +instructionset ,ARM64 , , , ,Vector , instructionset ,ARM64 , , , ,Vector64 , instructionset ,ARM64 , , , ,Vector128 , instructionset ,ARM64 , , , ,Dczva , @@ -228,6 +229,7 @@ instructionset64bit,ARM64 ,Sha256 instructionset64bit,ARM64 ,Sve instructionset64bit,ARM64 ,Sve2 +vectorinstructionset,ARM64,Vector vectorinstructionset,ARM64,Vector64 vectorinstructionset,ARM64,Vector128 @@ -243,6 +245,7 @@ implication ,ARM64 ,Vector128 ,AdvSimd implication ,ARM64 ,VectorT128 ,AdvSimd implication ,ARM64 ,Sve ,AdvSimd implication ,ARM64 ,Sve2 ,Sve +implication ,ARM64 ,Vector ,Sve ; Definition of Riscv64 instruction sets definearch ,RiscV64 ,64Bit ,RiscV64, RiscV64 diff --git a/src/coreclr/tools/Common/JitInterface/ThunkGenerator/ThunkInput.txt 
b/src/coreclr/tools/Common/JitInterface/ThunkGenerator/ThunkInput.txt index 7b5f8a7d79f292..fd525212d1f3c6 100644 --- a/src/coreclr/tools/Common/JitInterface/ThunkGenerator/ThunkInput.txt +++ b/src/coreclr/tools/Common/JitInterface/ThunkGenerator/ThunkInput.txt @@ -343,5 +343,6 @@ FUNCTIONS void recordRelocation(void* location, void* locationRW, void* target, uint16_t fRelocType, int32_t addlDelta) uint16_t getRelocTypeHint(void* target) uint32_t getExpectedTargetArchitecture() + uint32_t getTargetVectorLength() uint32_t getJitFlags(CORJIT_FLAGS* flags, uint32_t sizeInBytes) CORINFO_METHOD_HANDLE getSpecialCopyHelper(CORINFO_CLASS_HANDLE type) = 0; diff --git a/src/coreclr/tools/aot/jitinterface/jitinterface_generated.h b/src/coreclr/tools/aot/jitinterface/jitinterface_generated.h index f72f9cdeef2ac4..aa6f0155269035 100644 --- a/src/coreclr/tools/aot/jitinterface/jitinterface_generated.h +++ b/src/coreclr/tools/aot/jitinterface/jitinterface_generated.h @@ -188,6 +188,7 @@ struct JitInterfaceCallbacks void (* recordRelocation)(void * thisHandle, CorInfoExceptionClass** ppException, void* location, void* locationRW, void* target, uint16_t fRelocType, int32_t addlDelta); uint16_t (* getRelocTypeHint)(void * thisHandle, CorInfoExceptionClass** ppException, void* target); uint32_t (* getExpectedTargetArchitecture)(void * thisHandle, CorInfoExceptionClass** ppException); + uint32_t (* getTargetVectorLength)(void * thisHandle, CorInfoExceptionClass** ppException); uint32_t (* getJitFlags)(void * thisHandle, CorInfoExceptionClass** ppException, CORJIT_FLAGS* flags, uint32_t sizeInBytes); CORINFO_METHOD_HANDLE (* getSpecialCopyHelper)(void * thisHandle, CorInfoExceptionClass** ppException, CORINFO_CLASS_HANDLE type); @@ -1933,6 +1934,14 @@ class JitInterfaceWrapper : public ICorJitInfo return temp; } + virtual uint32_t getTargetVectorLength() +{ + CorInfoExceptionClass* pException = nullptr; + uint32_t temp = _callbacks->getTargetVectorLength(_thisHandle, &pException); + if (pException != nullptr) throw pException; + return temp; +} + virtual uint32_t getJitFlags( CORJIT_FLAGS* flags, uint32_t sizeInBytes) diff --git a/src/coreclr/tools/superpmi/superpmi-shared/lwmlist.h b/src/coreclr/tools/superpmi/superpmi-shared/lwmlist.h index b65429f3153bd4..b23a29a04900c1 100644 --- a/src/coreclr/tools/superpmi/superpmi-shared/lwmlist.h +++ b/src/coreclr/tools/superpmi/superpmi-shared/lwmlist.h @@ -126,6 +126,7 @@ LWM(GetReadyToRunHelper, GetReadyToRunHelper_TOKENin, GetReadyToRunHelper_TOKENo LWM(GetReadyToRunDelegateCtorHelper, GetReadyToRunDelegateCtorHelper_TOKENIn, Agnostic_CORINFO_LOOKUP) LWM(GetRelocTypeHint, DWORDLONG, DWORD) LWM(GetExpectedTargetArchitecture, DWORD, DWORD) +LWM(GetTargetVectorLength, DWORD, DWORD) LWM(GetSharedCCtorHelper, DWORDLONG, DWORD) LWM(GetStringConfigValue, DWORD, DWORD) LWM(GetSystemVAmd64PassStructInRegisterDescriptor, DWORDLONG, Agnostic_GetSystemVAmd64PassStructInRegisterDescriptor) diff --git a/src/coreclr/tools/superpmi/superpmi-shared/methodcontext.cpp b/src/coreclr/tools/superpmi/superpmi-shared/methodcontext.cpp index 55d5f732db35fb..b4129e5d6c66c8 100644 --- a/src/coreclr/tools/superpmi/superpmi-shared/methodcontext.cpp +++ b/src/coreclr/tools/superpmi/superpmi-shared/methodcontext.cpp @@ -6606,6 +6606,29 @@ WORD MethodContext::repGetRelocTypeHint(void* target) return retVal; } +void MethodContext::recGetTargetVectorLength(DWORD result) +{ + if (GetTargetVectorLength == nullptr) + GetTargetVectorLength = new LightWeightMap(); + + DWORD key = 0; // There is only 
diff --git a/src/coreclr/tools/superpmi/superpmi-shared/methodcontext.cpp b/src/coreclr/tools/superpmi/superpmi-shared/methodcontext.cpp
index 55d5f732db35fb..b4129e5d6c66c8 100644
--- a/src/coreclr/tools/superpmi/superpmi-shared/methodcontext.cpp
+++ b/src/coreclr/tools/superpmi/superpmi-shared/methodcontext.cpp
@@ -6606,6 +6606,29 @@ WORD MethodContext::repGetRelocTypeHint(void* target)
     return retVal;
 }

+void MethodContext::recGetTargetVectorLength(DWORD result)
+{
+    if (GetTargetVectorLength == nullptr)
+        GetTargetVectorLength = new LightWeightMap<DWORD, DWORD>();
+
+    DWORD key = 0; // There is only ever a single entry in this map
+    GetTargetVectorLength->Add(key, result);
+    DEBUG_REC(dmpGetTargetVectorLength(key, result));
+}
+void MethodContext::dmpGetTargetVectorLength(DWORD key, DWORD result)
+{
+    printf("GetTargetVectorLength key %u, res %u", key, result);
+}
+DWORD MethodContext::repGetTargetVectorLength()
+{
+    DWORD key = 0;
+
+    DWORD value = LookupByKeyOrMiss(GetTargetVectorLength, key, ": key %08X", key);
+
+    DEBUG_REP(dmpGetTargetVectorLength(key, value));
+    return value;
+}
+
 void MethodContext::recGetExpectedTargetArchitecture(DWORD result)
 {
     if (GetExpectedTargetArchitecture == nullptr)
diff --git a/src/coreclr/tools/superpmi/superpmi-shared/methodcontext.h b/src/coreclr/tools/superpmi/superpmi-shared/methodcontext.h
index 06c1c1e33bd842..2df71f447e6f86 100644
--- a/src/coreclr/tools/superpmi/superpmi-shared/methodcontext.h
+++ b/src/coreclr/tools/superpmi/superpmi-shared/methodcontext.h
@@ -810,6 +810,10 @@ class MethodContext
     void dmpGetRelocTypeHint(DWORDLONG key, DWORD value);
     WORD repGetRelocTypeHint(void* target);

+    void recGetTargetVectorLength(DWORD result);
+    void dmpGetTargetVectorLength(DWORD key, DWORD result);
+    DWORD repGetTargetVectorLength();
+
     void recGetExpectedTargetArchitecture(DWORD result);
     void dmpGetExpectedTargetArchitecture(DWORD key, DWORD result);
     DWORD repGetExpectedTargetArchitecture();
@@ -1224,6 +1228,7 @@ enum mcPackets
     Packet_GetAsyncResumptionStub = 231,
     Packet_GetCookieForInterpreterCalliSig = 232,
     Packet_GetHelperFtn = 233,
+    Packet_GetTargetVectorLength = 234,
 };

 void SetDebugDumpVariables();
diff --git a/src/coreclr/tools/superpmi/superpmi-shim-collector/icorjitinfo.cpp b/src/coreclr/tools/superpmi/superpmi-shim-collector/icorjitinfo.cpp
index eab692b4d37cab..3ef3c74574acc5 100644
--- a/src/coreclr/tools/superpmi/superpmi-shim-collector/icorjitinfo.cpp
+++ b/src/coreclr/tools/superpmi/superpmi-shim-collector/icorjitinfo.cpp
@@ -2043,6 +2043,14 @@ uint16_t interceptor_ICJI::getRelocTypeHint(void* target)
     return result;
 }

+uint32_t interceptor_ICJI::getTargetVectorLength()
+{
+    mc->cr->AddCall("getTargetVectorLength");
+    DWORD result = original_ICorJitInfo->getTargetVectorLength();
+    mc->recGetTargetVectorLength(result);
+    return result;
+}
+
 // For what machine does the VM expect the JIT to generate code? The VM
 // returns one of the IMAGE_FILE_MACHINE_* values. Note that if the VM
 // is cross-compiling (such as the case for crossgen2), it will return a
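Editorial note, not part of the patch: the SuperPMI plumbing above follows the usual rec/dmp/rep triple. Because getTargetVectorLength takes no arguments, the recorded result is stored under a fixed key of 0 and replay is a lookup of that single entry. A stand-in sketch of the idiom, using plain std::map instead of the real LightWeightMap:

    #include <cstdint>
    #include <map>

    struct VectorLengthRecord
    {
        std::map<uint32_t, uint32_t> entries; // stand-in for LightWeightMap<DWORD, DWORD>

        void Record(uint32_t result) { entries[0] = result; } // mirrors recGetTargetVectorLength
        uint32_t Replay() const { return entries.at(0); }     // mirrors repGetTargetVectorLength
    };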
diff --git a/src/coreclr/tools/superpmi/superpmi-shim-counter/icorjitinfo_generated.cpp b/src/coreclr/tools/superpmi/superpmi-shim-counter/icorjitinfo_generated.cpp
index ffcf2dc749695c..e74b00bf1a8297 100644
--- a/src/coreclr/tools/superpmi/superpmi-shim-counter/icorjitinfo_generated.cpp
+++ b/src/coreclr/tools/superpmi/superpmi-shim-counter/icorjitinfo_generated.cpp
@@ -1447,6 +1447,12 @@ uint32_t interceptor_ICJI::getExpectedTargetArchitecture()
     return original_ICorJitInfo->getExpectedTargetArchitecture();
 }

+uint32_t interceptor_ICJI::getTargetVectorLength()
+{
+    mcs->AddCall("getTargetVectorLength");
+    return original_ICorJitInfo->getTargetVectorLength();
+}
+
 uint32_t interceptor_ICJI::getJitFlags(
     CORJIT_FLAGS* flags,
     uint32_t sizeInBytes)
diff --git a/src/coreclr/tools/superpmi/superpmi-shim-simple/icorjitinfo_generated.cpp b/src/coreclr/tools/superpmi/superpmi-shim-simple/icorjitinfo_generated.cpp
index 718959681da80a..dbaa2b6bffdcee 100644
--- a/src/coreclr/tools/superpmi/superpmi-shim-simple/icorjitinfo_generated.cpp
+++ b/src/coreclr/tools/superpmi/superpmi-shim-simple/icorjitinfo_generated.cpp
@@ -1270,6 +1270,11 @@ uint32_t interceptor_ICJI::getExpectedTargetArchitecture()
     return original_ICorJitInfo->getExpectedTargetArchitecture();
 }

+uint32_t interceptor_ICJI::getTargetVectorLength()
+{
+    return original_ICorJitInfo->getTargetVectorLength();
+}
+
 uint32_t interceptor_ICJI::getJitFlags(
     CORJIT_FLAGS* flags,
     uint32_t sizeInBytes)
diff --git a/src/coreclr/tools/superpmi/superpmi/icorjitinfo.cpp b/src/coreclr/tools/superpmi/superpmi/icorjitinfo.cpp
index fafda8ea9fda18..6eab28352d6965 100644
--- a/src/coreclr/tools/superpmi/superpmi/icorjitinfo.cpp
+++ b/src/coreclr/tools/superpmi/superpmi/icorjitinfo.cpp
@@ -1847,6 +1847,13 @@ uint16_t MyICJI::getRelocTypeHint(void* target)
     return result;
 }

+uint32_t MyICJI::getTargetVectorLength()
+{
+    jitInstance->mc->cr->AddCall("getTargetVectorLength");
+    DWORD result = jitInstance->mc->repGetTargetVectorLength();
+    return result;
+}
+
 // For what machine does the VM expect the JIT to generate code? The VM
 // returns one of the IMAGE_FILE_MACHINE_* values. Note that if the VM
 // is cross-compiling (such as the case for crossgen2), it will return a
diff --git a/src/coreclr/vm/callingconvention.h b/src/coreclr/vm/callingconvention.h
index 98e6eee147fcd6..0bafc2fb7790f6 100644
--- a/src/coreclr/vm/callingconvention.h
+++ b/src/coreclr/vm/callingconvention.h
@@ -60,6 +60,10 @@ struct ArgLocDesc
         case CORINFO_HFA_ELEM_DOUBLE: return 8;
         case CORINFO_HFA_ELEM_VECTOR64: return 8;
         case CORINFO_HFA_ELEM_VECTOR128: return 16;
+#if defined(TARGET_ARM64)
+        case CORINFO_HFA_ELEM_VECTOR256: return 32; // TODO-VL: Need to return the cached value
+        case CORINFO_HFA_ELEM_VECTOR512: return 64; // TODO-VL: Need to return the cached value
+#endif
         default: _ASSERTE(!"Invalid HFA Type"); return 0;
         }
     }
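Editorial note, not part of the patch: the callingconvention.h hunk above and the class.cpp hunks that follow both teach the HFA code about 32- and 64-byte vector elements. The class.cpp hunks repeat the same size-to-kind mapping in GetHFAType and in CheckForHFA; if a cleanup is wanted, it could be factored into a single helper along these lines (the helper name is hypothetical, and the usual VM headers are assumed for CorInfoHFAElemType and _ASSERTE):

    // Hypothetical consolidation of the vector-size -> HFA-kind mapping used below.
    static CorInfoHFAElemType HfaElemTypeForVectorSize(int vectorSize)
    {
        switch (vectorSize)
        {
            case 8:  return CORINFO_HFA_ELEM_VECTOR64;
            case 16: return CORINFO_HFA_ELEM_VECTOR128;
            case 32: return CORINFO_HFA_ELEM_VECTOR256; // scalable (SVE) sizes
            case 64: return CORINFO_HFA_ELEM_VECTOR512;
            default:
                _ASSERTE(!"Invalid vector size");
                return CORINFO_HFA_ELEM_VECTOR128;
        }
    }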
diff --git a/src/coreclr/vm/class.cpp b/src/coreclr/vm/class.cpp
index f29cafd3c5b055..70f892cb14c7b9 100644
--- a/src/coreclr/vm/class.cpp
+++ b/src/coreclr/vm/class.cpp
@@ -1727,7 +1727,27 @@ CorInfoHFAElemType MethodTable::GetHFAType()
             int vectorSize = pMT->GetVectorSize();
             if (vectorSize != 0)
             {
-                return (vectorSize == 8) ? CORINFO_HFA_ELEM_VECTOR64 : CORINFO_HFA_ELEM_VECTOR128;
+                if (vectorSize == 8)
+                {
+                    return CORINFO_HFA_ELEM_VECTOR64;
+                }
+                else if (vectorSize == 16)
+                {
+                    return CORINFO_HFA_ELEM_VECTOR128;
+                }
+                else if (vectorSize == 32)
+                {
+                    return CORINFO_HFA_ELEM_VECTOR256;
+                }
+                else if (vectorSize == 64)
+                {
+                    return CORINFO_HFA_ELEM_VECTOR512;
+                }
+                else
+                {
+                    _ASSERTE(!"Invalid vectorSize");
+                    return CORINFO_HFA_ELEM_VECTOR128;
+                }
             }

             PTR_FieldDesc pFirstField = pMT->GetApproxFieldDescListRaw();
@@ -1832,7 +1852,29 @@ EEClass::CheckForHFA()
             int thisElemSize = pMT->GetVectorSize();
             if (thisElemSize != 0)
             {
-                fieldHFAType = (thisElemSize == 8) ? CORINFO_HFA_ELEM_VECTOR64 : CORINFO_HFA_ELEM_VECTOR128;
+                if (thisElemSize == 8)
+                {
+                    fieldHFAType = CORINFO_HFA_ELEM_VECTOR64;
+                }
+                else if (thisElemSize == 16)
+                {
+                    fieldHFAType = CORINFO_HFA_ELEM_VECTOR128;
+                }
+#ifdef TARGET_ARM64
+                else if (thisElemSize == 32)
+                {
+                    fieldHFAType = CORINFO_HFA_ELEM_VECTOR256;
+                }
+                else if (thisElemSize == 64)
+                {
+                    fieldHFAType = CORINFO_HFA_ELEM_VECTOR512;
+                }
+#endif // TARGET_ARM64
+                else
+                {
+                    _ASSERTE(!"Invalid element size");
+                    fieldHFAType = CORINFO_HFA_ELEM_VECTOR128;
+                }
             }
             else
 #endif // TARGET_ARM64
@@ -1903,6 +1945,12 @@ EEClass::CheckForHFA()
         case CORINFO_HFA_ELEM_VECTOR128:
             elemSize = 16;
             break;
+        case CORINFO_HFA_ELEM_VECTOR256:
+            elemSize = 32;
+            break;
+        case CORINFO_HFA_ELEM_VECTOR512:
+            elemSize = 64;
+            break;
 #endif
         default:
             // ELEMENT_TYPE_END
diff --git a/src/coreclr/vm/codeman.cpp b/src/coreclr/vm/codeman.cpp
index bff4f38957bcf6..6b821ce426d1a2 100644
--- a/src/coreclr/vm/codeman.cpp
+++ b/src/coreclr/vm/codeman.cpp
@@ -1423,10 +1423,7 @@ void EEJitManager::SetCpuInfo()
         uint32_t maxVectorTLength = (maxVectorTBitWidth / 8);
         uint64_t sveLengthFromOS = GetSveLengthFromOS();

-        // For now, enable SVE only when the system vector length is 16 bytes (128-bits)
-        // TODO: https://github.com/dotnet/runtime/issues/101477
-        if (sveLengthFromOS == 16)
-        //    if ((maxVectorTLength >= sveLengthFromOS) || (maxVectorTBitWidth == 0))
+        if ((maxVectorTLength >= sveLengthFromOS) || (maxVectorTBitWidth == 0))
         {
             CPUCompileFlags.Set(InstructionSet_Sve);
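Editorial note, not part of the patch: the codeman.cpp hunk above drops the temporary 128-bit-only gate and restores the intended condition. Isolated for clarity, SVE stays enabled when the MaxVectorTBitWidth configuration is unset (0) or allows a Vector<T> at least as wide as the OS-reported SVE vector length; the helper below simply restates that condition and is not code from the change:

    #include <stdint.h>

    // Names mirror the local variables in EEJitManager::SetCpuInfo above.
    static bool ShouldKeepSveEnabled(uint32_t maxVectorTBitWidth, uint64_t sveLengthFromOS)
    {
        uint32_t maxVectorTLength = maxVectorTBitWidth / 8; // bits -> bytes
        return (maxVectorTLength >= sveLengthFromOS) || (maxVectorTBitWidth == 0);
    }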
diff --git a/src/coreclr/vm/jitinterface.cpp b/src/coreclr/vm/jitinterface.cpp
index 784043ae4b4cb4..fa7817b55e91dd 100644
--- a/src/coreclr/vm/jitinterface.cpp
+++ b/src/coreclr/vm/jitinterface.cpp
@@ -11949,6 +11949,33 @@ WORD CEEJitInfo::getRelocTypeHint(void * target)
     return (WORD)-1;
 }

+#ifdef TARGET_ARM64
+extern "C" uint64_t GetSveLengthFromOS();
+#endif
+
+uint32_t CEEJitInfo::getTargetVectorLength()
+{
+    LIMITED_METHOD_CONTRACT;
+
+    #ifdef TARGET_ARM64
+    CORJIT_FLAGS corjitFlags = ExecutionManager::GetEEJitManager()->GetCPUCompileFlags();
+    if (corjitFlags.IsSet(InstructionSet_Sve) || corjitFlags.IsSet(InstructionSet_Sve_Arm64))
+    {
+        return (uint32_t)GetSveLengthFromOS();
+    }
+    else if (corjitFlags.IsSet(InstructionSet_AdvSimd) || corjitFlags.IsSet(InstructionSet_AdvSimd_Arm64))
+    {
+        return 16;
+    }
+    else
+    {
+        return 0;
+    }
+    #else
+    UNREACHABLE(); // only called on Arm64
+    #endif
+}
+
 uint32_t CEEJitInfo::getExpectedTargetArchitecture()
 {
     LIMITED_METHOD_CONTRACT;
@@ -14898,6 +14925,27 @@ uint32_t CEEInfo::getExpectedTargetArchitecture()
     return IMAGE_FILE_MACHINE_NATIVE;
 }

+uint32_t CEEInfo::getTargetVectorLength()
+{
+    #ifdef TARGET_ARM64
+    CORJIT_FLAGS corjitFlags = ExecutionManager::GetEEJitManager()->GetCPUCompileFlags();
+    if (corjitFlags.IsSet(InstructionSet_Sve) || corjitFlags.IsSet(InstructionSet_Sve_Arm64))
+    {
+        return (uint32_t)GetSveLengthFromOS();
+    }
+    else if (corjitFlags.IsSet(InstructionSet_AdvSimd) || corjitFlags.IsSet(InstructionSet_AdvSimd_Arm64))
+    {
+        return 16;
+    }
+    else
+    {
+        return 0;
+    }
+    #else
+    UNREACHABLE(); // only called on Arm64
+    #endif
+}
+
 void CEEInfo::setBoundaries(CORINFO_METHOD_HANDLE ftn, ULONG32 cMap,
                             ICorDebugInfo::OffsetMapping *pMap)
 {
diff --git a/src/coreclr/vm/jitinterface.h b/src/coreclr/vm/jitinterface.h
index df07408dd942b3..b9c5d6a970e573 100644
--- a/src/coreclr/vm/jitinterface.h
+++ b/src/coreclr/vm/jitinterface.h
@@ -714,7 +714,7 @@ class CEEJitInfo final : public CEECodeGenInfo
                           int32_t addlDelta) override;

     uint16_t getRelocTypeHint(void * target) override;
-
+    uint32_t getTargetVectorLength() override final;
     uint32_t getExpectedTargetArchitecture() override;

     void BackoutJitData(EECodeGenManager * jitMgr) override;
diff --git a/src/coreclr/vm/methodtablebuilder.cpp b/src/coreclr/vm/methodtablebuilder.cpp
index 66d9199dcfc410..1b451826f8ca5c 100644
--- a/src/coreclr/vm/methodtablebuilder.cpp
+++ b/src/coreclr/vm/methodtablebuilder.cpp
@@ -1161,6 +1161,10 @@ MethodTableBuilder::CopyParentVtable()
     }
 }

+#ifdef TARGET_ARM64
+extern "C" uint64_t GetSveLengthFromOS();
+#endif
+
 //*******************************************************************************
 // Determine if this is the special SIMD type System.Numerics.Vector, whose
 // size is determined dynamically based on the hardware and the presence of JIT
@@ -1173,7 +1177,7 @@ BOOL MethodTableBuilder::CheckIfSIMDAndUpdateSize()
 {
     STANDARD_VM_CONTRACT;

-#if defined(TARGET_X86) || defined(TARGET_AMD64)
+#if defined(TARGET_X86) || defined(TARGET_AMD64) || defined(TARGET_ARM64)

     if (!bmtProp->fIsIntrinsicType)
         return false;
@@ -1192,6 +1196,7 @@ BOOL MethodTableBuilder::CheckIfSIMDAndUpdateSize()
     CORJIT_FLAGS CPUCompileFlags = ExecutionManager::GetEEJitManager()->GetCPUCompileFlags();
     uint32_t numInstanceFieldBytes = 16;

+#if defined(TARGET_X86) || defined(TARGET_AMD64)
     if (CPUCompileFlags.IsSet(InstructionSet_VectorT512))
     {
         numInstanceFieldBytes = 64;
@@ -1200,13 +1205,29 @@ BOOL MethodTableBuilder::CheckIfSIMDAndUpdateSize()
     {
         numInstanceFieldBytes = 32;
     }
+#elif defined(TARGET_ARM64)
+    if (CPUCompileFlags.IsSet(InstructionSet_Sve_Arm64))
+    {
+//#ifdef _DEBUG
+//        if (CLRConfig::GetConfigValue(CLRConfig::INTERNAL_UseSveForVectorT) != 0)
+//        {
+//            // For testing purpose, pretend the vector length is 32 bytes
+//            numInstanceFieldBytes = 32;
+//        }
+//        else
+//#endif
+        {
+            numInstanceFieldBytes = (uint32_t)GetSveLengthFromOS();
+        }
+    }
+#endif // TARGET_X86 || TARGET_AMD64 || TARGET_ARM64

     if (numInstanceFieldBytes != 16)
     {
         bmtFP->NumInstanceFieldBytes = numInstanceFieldBytes;
         return true;
     }
-#endif // TARGET_X86 || TARGET_AMD64
+#endif // TARGET_X86 || TARGET_AMD64 || TARGET_ARM64

     return false;
 }
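Editorial note, not part of the patch: the methodtablebuilder.cpp hunks above let ARM64 take the same dynamic-sizing path as x86/x64, sizing System.Numerics.Vector<T> to the OS-reported SVE vector length when SVE is enabled and leaving it at the 16-byte AdvSimd default otherwise. The decision, isolated as a sketch (the helper name is hypothetical; GetSveLengthFromOS is declared exactly as in the hunks above):

    #include <stdint.h>

    extern "C" uint64_t GetSveLengthFromOS(); // same declaration as in the diff

    static uint32_t VectorTInstanceFieldBytes(bool sveEnabled)
    {
        uint32_t numInstanceFieldBytes = 16; // AdvSimd / default size
        if (sveEnabled)
        {
            // With SVE, Vector<T> spans the full vector length reported by the OS.
            numInstanceFieldBytes = (uint32_t)GetSveLengthFromOS();
        }
        return numInstanceFieldBytes;
    }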