diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index 2ec6ed5e5be4ff..05edb2adb5f170 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -29008,6 +29008,7 @@ genTreeOps GenTreeHWIntrinsic::GetOperForHWIntrinsicId(NamedIntrinsic id, var_ty
         case NI_AVX512_AndMask:
 #elif defined(TARGET_ARM64)
         case NI_AdvSimd_And:
+        case NI_Sve_And:
 #endif
         {
             return GT_AND;
@@ -29017,6 +29018,7 @@ genTreeOps GenTreeHWIntrinsic::GetOperForHWIntrinsicId(NamedIntrinsic id, var_ty
         case NI_AVX512_NotMask:
 #elif defined(TARGET_ARM64)
         case NI_AdvSimd_Not:
+        case NI_Sve_Not:
 #endif
         {
             return GT_NOT;
@@ -29030,6 +29032,7 @@ genTreeOps GenTreeHWIntrinsic::GetOperForHWIntrinsicId(NamedIntrinsic id, var_ty
         case NI_AVX512_XorMask:
 #elif defined(TARGET_ARM64)
         case NI_AdvSimd_Xor:
+        case NI_Sve_Xor:
 #endif
         {
             return GT_XOR;
@@ -29043,6 +29046,7 @@ genTreeOps GenTreeHWIntrinsic::GetOperForHWIntrinsicId(NamedIntrinsic id, var_ty
         case NI_AVX512_OrMask:
 #elif defined(TARGET_ARM64)
         case NI_AdvSimd_Or:
+        case NI_Sve_Or:
 #endif
         {
             return GT_OR;
@@ -29056,6 +29060,7 @@ genTreeOps GenTreeHWIntrinsic::GetOperForHWIntrinsicId(NamedIntrinsic id, var_ty
         case NI_AVX512_AndNotMask:
 #elif defined(TARGET_ARM64)
         case NI_AdvSimd_BitwiseClear:
+        case NI_Sve_BitwiseClear:
 #endif
         {
             return GT_AND_NOT;
@@ -29069,6 +29074,7 @@ genTreeOps GenTreeHWIntrinsic::GetOperForHWIntrinsicId(NamedIntrinsic id, var_ty
 #elif defined(TARGET_ARM64)
         case NI_AdvSimd_Add:
         case NI_AdvSimd_Arm64_Add:
+        case NI_Sve_Add:
 #endif
         {
             return GT_ADD;
@@ -29100,6 +29106,7 @@ genTreeOps GenTreeHWIntrinsic::GetOperForHWIntrinsicId(NamedIntrinsic id, var_ty
         case NI_AVX512_Divide:
 #elif defined(TARGET_ARM64)
         case NI_AdvSimd_Arm64_Divide:
+        case NI_Sve_Divide:
 #endif
         {
             return GT_DIV;
@@ -29133,6 +29140,7 @@ genTreeOps GenTreeHWIntrinsic::GetOperForHWIntrinsicId(NamedIntrinsic id, var_ty
 #elif defined(TARGET_ARM64)
         case NI_AdvSimd_Multiply:
         case NI_AdvSimd_Arm64_Multiply:
+        case NI_Sve_Multiply:
 #endif
         {
             return GT_MUL;
@@ -29173,6 +29181,7 @@ genTreeOps GenTreeHWIntrinsic::GetOperForHWIntrinsicId(NamedIntrinsic id, var_ty
 #if defined(TARGET_ARM64)
         case NI_AdvSimd_Negate:
         case NI_AdvSimd_Arm64_Negate:
+        case NI_Sve_Negate:
         {
             return GT_NEG;
         }
@@ -29210,6 +29219,7 @@ genTreeOps GenTreeHWIntrinsic::GetOperForHWIntrinsicId(NamedIntrinsic id, var_ty
         case NI_AVX512_ShiftLeftLogicalVariable:
 #elif defined(TARGET_ARM64)
         case NI_AdvSimd_ShiftLeftLogical:
+        case NI_Sve_ShiftLeftLogical:
 #endif
         {
             return GT_LSH;
@@ -29234,6 +29244,7 @@ genTreeOps GenTreeHWIntrinsic::GetOperForHWIntrinsicId(NamedIntrinsic id, var_ty
         case NI_AVX512_ShiftRightArithmeticVariable:
 #elif defined(TARGET_ARM64)
         case NI_AdvSimd_ShiftRightArithmetic:
+        case NI_Sve_ShiftRightArithmetic:
 #endif
         {
             return GT_RSH;
@@ -29258,6 +29269,7 @@ genTreeOps GenTreeHWIntrinsic::GetOperForHWIntrinsicId(NamedIntrinsic id, var_ty
         case NI_AVX512_ShiftRightLogicalVariable:
 #elif defined(TARGET_ARM64)
         case NI_AdvSimd_ShiftRightLogical:
+        case NI_Sve_ShiftRightLogical:
 #endif
         {
             return GT_RSZ;
@@ -29282,6 +29294,7 @@ genTreeOps GenTreeHWIntrinsic::GetOperForHWIntrinsicId(NamedIntrinsic id, var_ty
 #elif defined(TARGET_ARM64)
         case NI_AdvSimd_Subtract:
         case NI_AdvSimd_Arm64_Subtract:
+        case NI_Sve_Subtract:
 #endif
         {
             return GT_SUB;
@@ -29519,6 +29532,8 @@ NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForUnOp(
 {
 #if defined(TARGET_ARM64)
     assert(!isScalar || (simdSize == 8));
+    // TODO-SVE: Add scalable length support
+    assert((simdSize == 8) || (simdSize == 16));
 #endif // TARGET_ARM64

     assert(!isScalar || varTypeIsFloating(simdBaseType));
@@ -29617,6 +29632,8 @@ NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(Compiler* comp,
 {
 #if defined(TARGET_ARM64)
     assert(!isScalar || (simdSize == 8));
+    // TODO-SVE: Add scalable length support
+    assert((simdSize == 8) || (simdSize == 16));
 #endif // TARGET_ARM64

     assert(!isScalar || varTypeIsFloating(simdBaseType));
@@ -29735,6 +29752,8 @@ NamedIntrinsic GenTreeHWIntrinsic::GetHWIntrinsicIdForBinOp(Compiler* comp,
                 id = NI_X86Base_AndNot;
             }
 #elif defined(TARGET_ARM64)
+            // TODO-SVE: Add scalable length support
+            assert(simdSize == 16 || simdSize == 8);
             id = NI_AdvSimd_BitwiseClear;
 #endif // !TARGET_XARCH && !TARGET_ARM64

@@ -32559,9 +32578,9 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
             }
             else
             {
-#if defined(TARGET_XARCH)
                 if ((oper == GT_LSH) || (oper == GT_RSH) || (oper == GT_RSZ))
                 {
+#if defined(TARGET_XARCH)
                     if (otherNode->TypeIs(TYP_SIMD16))
                     {
                         if (!HWIntrinsicInfo::IsVariableShift(ni))
@@ -32585,8 +32604,23 @@ GenTree* Compiler::gtFoldExprHWIntrinsic(GenTreeHWIntrinsic* tree)
                             otherNode->AsVecCon()->EvaluateBroadcastInPlace(simdBaseType, shiftAmount);
                         }
                     }
-                }
+#elif defined(TARGET_ARM64)
+                    CorInfoType auxJitType = tree->GetAuxiliaryJitType();
+                    if (auxJitType != CORINFO_TYPE_UNDEF &&
+                        genTypeSize(JITtype2varType(auxJitType)) != genTypeSize(simdBaseType))
+                    {
+                        // Handle the "wide elements" variant of shift, where otherNode is a vector of ulongs,
+                        // which is looped over to read the shift values. The values can safely be narrowed
+                        // to the result type.
+                        assert(auxJitType == CORINFO_TYPE_ULONG);
+                        assert(tree->TypeIs(TYP_SIMD16));
+
+                        simd16_t result = {};
+                        NarrowSimdLong(simdBaseType, &result, otherNode->AsVecCon()->gtSimd16Val);
+                        otherNode->AsVecCon()->gtSimd16Val = result;
+                    }
 #endif // TARGET_XARCH
+                }

                 if (otherNode->IsIntegralConst())
                 {
diff --git a/src/coreclr/jit/hwintrinsicarm64.cpp b/src/coreclr/jit/hwintrinsicarm64.cpp
index 9e94fb991a6aa1..ba89a5d8c8f03c 100644
--- a/src/coreclr/jit/hwintrinsicarm64.cpp
+++ b/src/coreclr/jit/hwintrinsicarm64.cpp
@@ -732,6 +732,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
         }

         case NI_AdvSimd_BitwiseClear:
+        case NI_Sve_BitwiseClear:
         case NI_Vector64_AndNot:
         case NI_Vector128_AndNot:
         {
diff --git a/src/coreclr/jit/hwintrinsiclistarm64sve.h b/src/coreclr/jit/hwintrinsiclistarm64sve.h
index 4fc73e41aa168c..75e9603c8946ea 100644
--- a/src/coreclr/jit/hwintrinsiclistarm64sve.h
+++ b/src/coreclr/jit/hwintrinsiclistarm64sve.h
@@ -29,7 +29,7 @@ HARDWARE_INTRINSIC(Sve, AddSaturate,
 HARDWARE_INTRINSIC(Sve, AddSequentialAcross, -1, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_fadda, INS_sve_fadda}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_ReduceOperation)
 HARDWARE_INTRINSIC(Sve, And, -1, -1, {INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_sve_and, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_OptionalEmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation|HW_Flag_HasAllMaskVariant)
 HARDWARE_INTRINSIC(Sve, AndAcross, -1, -1, {INS_sve_andv, INS_sve_andv, INS_sve_andv, INS_sve_andv, INS_sve_andv, INS_sve_andv, INS_sve_andv, INS_sve_andv, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_ReduceOperation)
-HARDWARE_INTRINSIC(Sve, BitwiseClear, -1, -1, {INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_OptionalEmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation|HW_Flag_HasAllMaskVariant)
+HARDWARE_INTRINSIC(Sve, BitwiseClear, -1, -1, {INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_sve_bic, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialImport|HW_Flag_OptionalEmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation|HW_Flag_HasAllMaskVariant)
 HARDWARE_INTRINSIC(Sve, BooleanNot, -1, -1, {INS_sve_cnot, INS_sve_cnot, INS_sve_cnot, INS_sve_cnot, INS_sve_cnot, INS_sve_cnot, INS_sve_cnot, INS_sve_cnot, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation)
 HARDWARE_INTRINSIC(Sve, Compact, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_compact, INS_sve_compact, INS_sve_compact, INS_sve_compact, INS_sve_compact, INS_sve_compact}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation)
 HARDWARE_INTRINSIC(Sve, CompareEqual, -1, -1, {INS_sve_cmpeq, INS_sve_cmpeq, INS_sve_cmpeq, INS_sve_cmpeq, INS_sve_cmpeq, INS_sve_cmpeq, INS_sve_cmpeq, INS_sve_cmpeq, INS_sve_fcmeq, INS_sve_fcmeq}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_ReturnsPerElementMask|HW_Flag_ZeroingMaskedOperation)
diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp
index eb8ea9dee83a5e..c8f677347601cc 100644
--- a/src/coreclr/jit/lowerarmarch.cpp
+++ b/src/coreclr/jit/lowerarmarch.cpp
@@ -1590,6 +1590,9 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node)
             //
             // We want to similarly handle (~op1 | op2) and (op1 | ~op2)

+            // TODO-SVE: Add scalable length support
+            assert(node->gtType == TYP_SIMD16 || node->gtType == TYP_SIMD8);
+
             bool transform = false;

             GenTree* op1 = node->Op(1);
diff --git a/src/coreclr/jit/simd.h b/src/coreclr/jit/simd.h
index 159f626eefa8de..2a904c0a2a2eb4 100644
--- a/src/coreclr/jit/simd.h
+++ b/src/coreclr/jit/simd.h
@@ -2065,6 +2065,69 @@ SveMaskPattern EvaluateSimdMaskToPattern(var_types baseType, simdmask_t arg0)
         }
     }
 }
+
+template <typename TSimd, typename TBase>
+void NarrowSimdLong(TSimd* result, const TSimd& arg0)
+{
+    uint32_t count = sizeof(TSimd) / sizeof(TBase);
+
+    for (uint32_t i = 0; i < count; i++)
+    {
+        uint64_t input0;
+        memcpy(&input0, &arg0.u8[(i * sizeof(TBase) / sizeof(uint64_t)) * sizeof(uint64_t)], sizeof(uint64_t));
+
+        // Saturate to largest value for TBase
+        if (input0 > (TBase)-1)
+        {
+            input0 = (TBase)-1;
+        }
+
+        memcpy(&result->u8[i * sizeof(TBase)], &input0, sizeof(TBase));
+    }
+}
+
+template <typename TSimd>
+void NarrowSimdLong(var_types baseType, TSimd* result, const TSimd& arg0)
+{
+    switch (baseType)
+    {
+        case TYP_FLOAT:
+        case TYP_INT:
+        case TYP_UINT:
+        {
+            NarrowSimdLong<TSimd, uint32_t>(result, arg0);
+            break;
+        }
+
+        case TYP_DOUBLE:
+        case TYP_LONG:
+        case TYP_ULONG:
+        {
+            NarrowSimdLong<TSimd, uint64_t>(result, arg0);
+            break;
+        }
+
+        case TYP_BYTE:
+        case TYP_UBYTE:
+        {
+            NarrowSimdLong<TSimd, uint8_t>(result, arg0);
+            break;
+        }
+
+        case TYP_SHORT:
+        case TYP_USHORT:
+        {
+            NarrowSimdLong<TSimd, uint16_t>(result, arg0);
+            break;
+        }
+
+        default:
+        {
+            unreached();
+        }
+    }
+}
+
 #endif // TARGET_ARM64

 #endif // FEATURE_MASKED_HW_INTRINSICS
diff --git a/src/coreclr/jit/valuenum.cpp b/src/coreclr/jit/valuenum.cpp
index 7cd1f48a075d11..da7f28a59a8a67 100644
--- a/src/coreclr/jit/valuenum.cpp
+++ b/src/coreclr/jit/valuenum.cpp
@@ -8349,9 +8349,9 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunBinary(
                 }
             }

-#if defined(TARGET_XARCH)
             if ((oper == GT_LSH) || (oper == GT_RSH) || (oper == GT_RSZ))
             {
+#if defined(TARGET_XARCH)
                 if (TypeOfVN(arg1VN) == TYP_SIMD16)
                 {
                     if (!HWIntrinsicInfo::IsVariableShift(ni))
@@ -8377,8 +8377,25 @@ ValueNum ValueNumStore::EvalHWIntrinsicFunBinary(
                         }
                     }
                 }
-            }
+#elif defined(TARGET_ARM64)
+                CorInfoType auxJitType = tree->GetAuxiliaryJitType();
+                if (auxJitType != CORINFO_TYPE_UNDEF &&
+                    genTypeSize(JITtype2varType(auxJitType)) != genTypeSize(baseType))
+                {
+                    // Handle the "wide elements" variant of shift, where arg1 is a vector of ulongs,
+                    // which is looped over to read the shift values. The values can safely be narrowed
+                    // to the result type.
+                    assert(auxJitType == CORINFO_TYPE_ULONG);
+                    assert(tree->TypeIs(TYP_SIMD16));
+
+                    simd16_t arg1 = GetConstantSimd16(arg1VN);
+
+                    simd16_t result = {};
+                    NarrowSimdLong(baseType, &result, arg1);
+                    arg1VN = VNForSimd16Con(result);
+                }
 #endif // TARGET_XARCH
+            }

             return EvaluateBinarySimd(this, oper, isScalar, type, baseType, arg0VN, arg1VN);
         }
diff --git a/src/tests/JIT/opt/SVE/PredicateInstructions.cs b/src/tests/JIT/opt/SVE/PredicateInstructions.cs
index 287e90b30e4fe6..787e4c00a50f88 100644
--- a/src/tests/JIT/opt/SVE/PredicateInstructions.cs
+++ b/src/tests/JIT/opt/SVE/PredicateInstructions.cs
@@ -112,7 +112,9 @@ static Vector<short> AndMask(Vector<short> a, Vector<short> b)
     [MethodImpl(MethodImplOptions.NoInlining)]
     static Vector<short> BitwiseClearMask(Vector<short> a, Vector<short> b)
     {
-        //ARM64-FULL-LINE: bic {{p[0-9]+}}.b, {{p[0-9]+}}/z, {{p[0-9]+}}.b, {{p[0-9]+}}.b
+        //TODO-SVE: Restore check for SVE once >128bits is supported
+        //ARM64-FULL-LINE: {{bic .*}}
+        // {{p[0-9]+}}.b, {{p[0-9]+}}/z, {{p[0-9]+}}.b, {{p[0-9]+}}.b
         return Sve.ConditionalSelect(
             Sve.CreateTrueMaskInt16(),
             Sve.BitwiseClear(Sve.CompareGreaterThan(a, b), Sve.CompareEqual(a, b)),
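
For context on the constant-folding hunks above, here is a minimal standalone sketch, not part of the patch, of the narrowing that the new NarrowSimdLong helper is intended to perform for the wide-element shift case, assuming a 128-bit vector and a 32-bit base type. The names Simd16 and NarrowShiftAmounts are illustrative stand-ins for simd16_t/NarrowSimdLong. Per the SVE wide-element shift semantics, each 64-bit shift amount applies to the narrower lanes that overlap it, and amounts too large for the base type saturate to its maximum.

// Illustrative sketch only; compiles as a plain host program.
#include <cstdint>
#include <cstdio>
#include <cstring>

struct Simd16
{
    uint8_t u8[16];
};

// Narrow a vector of two 64-bit shift amounts into four 32-bit lanes: lanes 0-1
// take the value overlapping u64[0], lanes 2-3 take the value overlapping u64[1],
// saturating anything above UINT32_MAX.
static void NarrowShiftAmounts(Simd16* result, const Simd16& arg)
{
    const uint32_t count = sizeof(Simd16) / sizeof(uint32_t); // 4 lanes

    for (uint32_t i = 0; i < count; i++)
    {
        uint64_t wide;
        memcpy(&wide, &arg.u8[(i * sizeof(uint32_t) / sizeof(uint64_t)) * sizeof(uint64_t)], sizeof(uint64_t));

        uint32_t narrow = (wide > UINT32_MAX) ? UINT32_MAX : (uint32_t)wide;
        memcpy(&result->u8[i * sizeof(uint32_t)], &narrow, sizeof(uint32_t));
    }
}

int main()
{
    Simd16 shifts = {};
    uint64_t vals[2] = {3, 0x100000000ULL}; // second amount exceeds 32 bits, so it saturates
    memcpy(shifts.u8, vals, sizeof(vals));

    Simd16 narrowed = {};
    NarrowShiftAmounts(&narrowed, shifts);

    uint32_t out[4];
    memcpy(out, narrowed.u8, sizeof(out));
    printf("%u %u %u %u\n", out[0], out[1], out[2], out[3]); // prints: 3 3 4294967295 4294967295
    return 0;
}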