-
Notifications
You must be signed in to change notification settings - Fork 15.4k
[AArch64][GlobalISel] Add codegen for simd fpcvt instructions #156892
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
|
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-aarch64 Author: None (Lukacma) Changes: This patch adds codegen support for fcvt instructions that keep the result in 32-bit or 64-bit SIMD&FP registers in both SelectionDAG and GlobalISel. For a long time, LLVM primarily generated fpcvt instructions, which store the result in GPRs, resulting in extra moves when the value was used by NEON instructions that operate on SIMD&FP registers. Although patterns existed for generating the SIMD variants, they relied on single-element vector types (such as v1i32 or v1i64) to decide whether the SIMD variant should be selected. This was not useful, because many NEON intrinsics and other LLVM IR operations use scalar types (i32/i64) even though they expect the result to be stored in SIMD&FP registers. This patch is part of a series that addresses this and also adds support for generating these instructions in GlobalISel. To fix this in SelectionDAG, bitcasts of the result to a floating-point type serve as a hint that the SIMD variant of the conversion should be used, rather than relying on single-element vector types. These bitcasts are not currently generated by LLVM, but the goal is to add explicit bitcasts to the inputs and outputs of NEON intrinsics operating on integers in follow-up patches. For GlobalISel, the register bank selection algorithm is used to determine which variant to generate. Patch is 106.59 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/156892.diff 14 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 8958ad129269c..690cb5500875f 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -5299,28 +5299,29 @@ multiclass FPToIntegerUnscaled<bits<2> rmode, bits<3> opcode, string asm,
}
}
-multiclass FPToIntegerSIMDScalar<bits<2> rmode, bits<3> opcode, string asm> {
+multiclass FPToIntegerSIMDScalar<bits<2> rmode, bits<3> opcode, string asm,
+ SDPatternOperator OpN> {
// double-precision to 32-bit SIMD/FPR
def SDr : BaseFPToIntegerUnscaled<0b01, rmode, opcode, FPR64, FPR32, asm,
- []> {
+ [(set FPR32:$Rd, (i32 (OpN (f64 FPR64:$Rn))))]> {
let Inst{31} = 0; // 32-bit FPR flag
}
// half-precision to 32-bit SIMD/FPR
def SHr : BaseFPToIntegerUnscaled<0b11, rmode, opcode, FPR16, FPR32, asm,
- []> {
+ [(set FPR32:$Rd, (i32 (OpN (f16 FPR16:$Rn))))]> {
let Inst{31} = 0; // 32-bit FPR flag
}
// half-precision to 64-bit SIMD/FPR
def DHr : BaseFPToIntegerUnscaled<0b11, rmode, opcode, FPR16, FPR64, asm,
- []> {
+ [(set FPR64:$Rd, (i64 (OpN (f16 FPR16:$Rn))))]> {
let Inst{31} = 1; // 64-bit FPR flag
}
// single-precision to 64-bit SIMD/FPR
def DSr : BaseFPToIntegerUnscaled<0b00, rmode, opcode, FPR32, FPR64, asm,
- []> {
+ [(set FPR64:$Rd, (i64 (OpN (f32 FPR32:$Rn))))]> {
let Inst{31} = 1; // 64-bit FPR flag
}
}
@@ -7949,6 +7950,21 @@ multiclass SIMDFPTwoScalar<bit U, bit S, bits<5> opc, string asm> {
}
}
+let mayRaiseFPException = 1, Uses = [FPCR] in
+multiclass SIMDFPTwoScalarFCVT<bit U, bit S, bits<5> opc, string asm,
+ SDPatternOperator OpN> {
+ let Predicates = [HasNEONandIsStreamingSafe], FastISelShouldIgnore = 1 in {
+ def v1i64 : BaseSIMDTwoScalar<U, {S,1}, 0b00, opc, FPR64, FPR64, asm,
+ [(set (i64 FPR64:$Rd), (OpN (f64 FPR64:$Rn)))]>;
+ def v1i32 : BaseSIMDTwoScalar<U, {S,0}, 0b00, opc, FPR32, FPR32, asm,
+ [(set FPR32:$Rd, (i32 (OpN (f32 FPR32:$Rn))))]>;
+ }
+ let Predicates = [HasNEONandIsStreamingSafe, HasFullFP16] in {
+ def v1f16 : BaseSIMDTwoScalar<U, {S,1}, 0b11, opc, FPR16, FPR16, asm,
+ [(set FPR16:$Rd, (i16 (OpN (f16 FPR16:$Rn))))]>;
+ }
+}
+
let mayRaiseFPException = 1, Uses = [FPCR] in
multiclass SIMDFPTwoScalarCVT<bit U, bit S, bits<5> opc, string asm,
SDPatternOperator OpNode> {
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 62b26b5239365..34e55dcafcd06 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -5212,18 +5212,55 @@ defm FCVTZU : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", any_fp_to_uint>;
defm FCVTZS : FPToIntegerScaled<0b11, 0b000, "fcvtzs", any_fp_to_sint>;
defm FCVTZU : FPToIntegerScaled<0b11, 0b001, "fcvtzu", any_fp_to_uint>;
+defm FCVTAS : SIMDFPTwoScalarFCVT< 0, 0, 0b11100, "fcvtas", int_aarch64_neon_fcvtas>;
+defm FCVTAU : SIMDFPTwoScalarFCVT< 1, 0, 0b11100, "fcvtau", int_aarch64_neon_fcvtau>;
+defm FCVTMS : SIMDFPTwoScalarFCVT< 0, 0, 0b11011, "fcvtms", int_aarch64_neon_fcvtms>;
+defm FCVTMU : SIMDFPTwoScalarFCVT< 1, 0, 0b11011, "fcvtmu", int_aarch64_neon_fcvtmu>;
+defm FCVTNS : SIMDFPTwoScalarFCVT< 0, 0, 0b11010, "fcvtns", int_aarch64_neon_fcvtns>;
+defm FCVTNU : SIMDFPTwoScalarFCVT< 1, 0, 0b11010, "fcvtnu", int_aarch64_neon_fcvtnu>;
+defm FCVTPS : SIMDFPTwoScalarFCVT< 0, 1, 0b11010, "fcvtps", int_aarch64_neon_fcvtps>;
+defm FCVTPU : SIMDFPTwoScalarFCVT< 1, 1, 0b11010, "fcvtpu", int_aarch64_neon_fcvtpu>;
+defm FCVTZS : SIMDFPTwoScalarFCVT< 0, 1, 0b11011, "fcvtzs", any_fp_to_sint>;
+defm FCVTZU : SIMDFPTwoScalarFCVT< 1, 1, 0b11011, "fcvtzu", any_fp_to_uint>;
+
let Predicates = [HasNEON, HasFPRCVT] in{
- defm FCVTAS : FPToIntegerSIMDScalar<0b11, 0b010, "fcvtas">;
- defm FCVTAU : FPToIntegerSIMDScalar<0b11, 0b011, "fcvtau">;
- defm FCVTMS : FPToIntegerSIMDScalar<0b10, 0b100, "fcvtms">;
- defm FCVTMU : FPToIntegerSIMDScalar<0b10, 0b101, "fcvtmu">;
- defm FCVTNS : FPToIntegerSIMDScalar<0b01, 0b010, "fcvtns">;
- defm FCVTNU : FPToIntegerSIMDScalar<0b01, 0b011, "fcvtnu">;
- defm FCVTPS : FPToIntegerSIMDScalar<0b10, 0b010, "fcvtps">;
- defm FCVTPU : FPToIntegerSIMDScalar<0b10, 0b011, "fcvtpu">;
- defm FCVTZS : FPToIntegerSIMDScalar<0b10, 0b110, "fcvtzs">;
- defm FCVTZU : FPToIntegerSIMDScalar<0b10, 0b111, "fcvtzu">;
-}
+ defm FCVTAS : FPToIntegerSIMDScalar<0b11, 0b010, "fcvtas", int_aarch64_neon_fcvtas>;
+ defm FCVTAU : FPToIntegerSIMDScalar<0b11, 0b011, "fcvtau", int_aarch64_neon_fcvtau>;
+ defm FCVTMS : FPToIntegerSIMDScalar<0b10, 0b100, "fcvtms", int_aarch64_neon_fcvtms>;
+ defm FCVTMU : FPToIntegerSIMDScalar<0b10, 0b101, "fcvtmu", int_aarch64_neon_fcvtmu>;
+ defm FCVTNS : FPToIntegerSIMDScalar<0b01, 0b010, "fcvtns", int_aarch64_neon_fcvtns>;
+ defm FCVTNU : FPToIntegerSIMDScalar<0b01, 0b011, "fcvtnu", int_aarch64_neon_fcvtnu>;
+ defm FCVTPS : FPToIntegerSIMDScalar<0b10, 0b010, "fcvtps", int_aarch64_neon_fcvtps>;
+ defm FCVTPU : FPToIntegerSIMDScalar<0b10, 0b011, "fcvtpu", int_aarch64_neon_fcvtpu>;
+ defm FCVTZS : FPToIntegerSIMDScalar<0b10, 0b110, "fcvtzs", any_fp_to_sint>;
+ defm FCVTZU : FPToIntegerSIMDScalar<0b10, 0b111, "fcvtzu", any_fp_to_uint>;
+}
+
+multiclass FPToIntegerSIMDScalarPatterns<SDPatternOperator OpN, string INST> {
+ def : Pat<(f32 (bitconvert (i32 (OpN (f64 FPR64:$Rn))))),
+ (!cast<Instruction>(INST # SDr) FPR64:$Rn)>;
+ def : Pat<(f32 (bitconvert (i32 (OpN (f16 FPR16:$Rn))))),
+ (!cast<Instruction>(INST # SHr) FPR16:$Rn)>;
+ def : Pat<(f64 (bitconvert (i64 (OpN (f16 FPR16:$Rn))))),
+ (!cast<Instruction>(INST # DHr) FPR16:$Rn)>;
+ def : Pat<(f64 (bitconvert (i64 (OpN (f32 FPR32:$Rn))))),
+ (!cast<Instruction>(INST # DSr) FPR32:$Rn)>;
+ def : Pat<(f32 (bitconvert (i32 (OpN (f32 FPR32:$Rn))))),
+ (!cast<Instruction>(INST # v1i32) FPR32:$Rn)>;
+ def : Pat<(f64 (bitconvert (i64 (OpN (f64 FPR64:$Rn))))),
+ (!cast<Instruction>(INST # v1i64) FPR64:$Rn)>;
+
+}
+defm: FPToIntegerSIMDScalarPatterns<int_aarch64_neon_fcvtas, "FCVTAS">;
+defm: FPToIntegerSIMDScalarPatterns<int_aarch64_neon_fcvtau, "FCVTAU">;
+defm: FPToIntegerSIMDScalarPatterns<int_aarch64_neon_fcvtms, "FCVTMS">;
+defm: FPToIntegerSIMDScalarPatterns<int_aarch64_neon_fcvtmu, "FCVTMU">;
+defm: FPToIntegerSIMDScalarPatterns<int_aarch64_neon_fcvtns, "FCVTNS">;
+defm: FPToIntegerSIMDScalarPatterns<int_aarch64_neon_fcvtnu, "FCVTNU">;
+defm: FPToIntegerSIMDScalarPatterns<int_aarch64_neon_fcvtps, "FCVTPS">;
+defm: FPToIntegerSIMDScalarPatterns<int_aarch64_neon_fcvtpu, "FCVTPU">;
+defm: FPToIntegerSIMDScalarPatterns<any_fp_to_sint, "FCVTZS">;
+defm: FPToIntegerSIMDScalarPatterns<any_fp_to_uint, "FCVTZU">;
// AArch64's FCVT instructions saturate when out of range.
multiclass FPToIntegerSatPats<SDNode to_int_sat, SDNode to_int_sat_gi, string INST> {
@@ -5257,6 +5294,52 @@ multiclass FPToIntegerSatPats<SDNode to_int_sat, SDNode to_int_sat_gi, string IN
def : Pat<(i64 (to_int_sat_gi f64:$Rn)),
(!cast<Instruction>(INST # UXDr) f64:$Rn)>;
+ // For global-isel we can use register classes to determine
+ // which FCVT instruction to use.
+ let Predicates = [HasFPRCVT] in {
+ def : Pat<(i32 (to_int_sat_gi f16:$Rn)),
+ (!cast<Instruction>(INST # SHr) f16:$Rn)>;
+ def : Pat<(i64 (to_int_sat_gi f16:$Rn)),
+ (!cast<Instruction>(INST # DHr) f16:$Rn)>;
+ def : Pat<(i64 (to_int_sat_gi f32:$Rn)),
+ (!cast<Instruction>(INST # DSr) f32:$Rn)>;
+ def : Pat<(i32 (to_int_sat_gi f64:$Rn)),
+ (!cast<Instruction>(INST # SDr) f64:$Rn)>;
+ }
+ def : Pat<(i32 (to_int_sat_gi f32:$Rn)),
+ (!cast<Instruction>(INST # v1i32) f32:$Rn)>;
+ def : Pat<(i64 (to_int_sat_gi f64:$Rn)),
+ (!cast<Instruction>(INST # v1i64) f64:$Rn)>;
+
+ let Predicates = [HasFPRCVT] in {
+ def : Pat<(f32 (bitconvert (i32 (to_int_sat f16:$Rn, i32)))),
+ (!cast<Instruction>(INST # SHr) f16:$Rn)>;
+ def : Pat<(f64 (bitconvert (i64 (to_int_sat f16:$Rn, i64)))),
+ (!cast<Instruction>(INST # DHr) f16:$Rn)>;
+ def : Pat<(f64 (bitconvert (i64 (to_int_sat f32:$Rn, i64)))),
+ (!cast<Instruction>(INST # DSr) f32:$Rn)>;
+ def : Pat<(f32 (bitconvert (i32 (to_int_sat f64:$Rn, i32)))),
+ (!cast<Instruction>(INST # SDr) f64:$Rn)>;
+
+ def : Pat<(f32 (bitconvert (i32 (to_int_sat_gi f16:$Rn)))),
+ (!cast<Instruction>(INST # SHr) f16:$Rn)>;
+ def : Pat<(f64 (bitconvert (i64 (to_int_sat_gi f16:$Rn)))),
+ (!cast<Instruction>(INST # DHr) f16:$Rn)>;
+ def : Pat<(f64 (bitconvert (i64 (to_int_sat_gi f32:$Rn)))),
+ (!cast<Instruction>(INST # DSr) f32:$Rn)>;
+ def : Pat<(f32 (bitconvert (i32 (to_int_sat_gi f64:$Rn)))),
+ (!cast<Instruction>(INST # SDr) f64:$Rn)>;
+ }
+ def : Pat<(f32 (bitconvert (i32 (to_int_sat f32:$Rn, i32)))),
+ (!cast<Instruction>(INST # v1i32) f32:$Rn)>;
+ def : Pat<(f64 (bitconvert (i64 (to_int_sat f64:$Rn, i64)))),
+ (!cast<Instruction>(INST # v1i64) f64:$Rn)>;
+
+ def : Pat<(f32 (bitconvert (i32 (to_int_sat_gi f32:$Rn)))),
+ (!cast<Instruction>(INST # v1i32) f32:$Rn)>;
+ def : Pat<(f64 (bitconvert (i64 (to_int_sat_gi f64:$Rn)))),
+ (!cast<Instruction>(INST # v1i64) f64:$Rn)>;
+
let Predicates = [HasFullFP16] in {
def : Pat<(i32 (to_int_sat (fmul f16:$Rn, fixedpoint_f16_i32:$scale), i32)),
(!cast<Instruction>(INST # SWHri) $Rn, $scale)>;
@@ -5301,6 +5384,32 @@ multiclass FPToIntegerIntPats<Intrinsic round, string INST> {
def : Pat<(i32 (round f64:$Rn)), (!cast<Instruction>(INST # UWDr) $Rn)>;
def : Pat<(i64 (round f64:$Rn)), (!cast<Instruction>(INST # UXDr) $Rn)>;
+ // For global-isel we can use register classes to determine
+ // which FCVT instruction to use.
+ let Predicates = [HasFPRCVT] in {
+ def : Pat<(i32 (round f16:$Rn)), (!cast<Instruction>(INST # SHr) $Rn)>;
+ def : Pat<(i64 (round f16:$Rn)), (!cast<Instruction>(INST # DHr) $Rn)>;
+ def : Pat<(i64 (round f32:$Rn)), (!cast<Instruction>(INST # DSr) $Rn)>;
+ def : Pat<(i32 (round f64:$Rn)), (!cast<Instruction>(INST # SDr) $Rn)>;
+ }
+ def : Pat<(i32 (round f32:$Rn)), (!cast<Instruction>(INST # v1i32) $Rn)>;
+ def : Pat<(i64 (round f64:$Rn)), (!cast<Instruction>(INST # v1i64) $Rn)>;
+
+ let Predicates = [HasFPRCVT] in {
+ def : Pat<(f32 (bitconvert (i32 (round f16:$Rn)))),
+ (!cast<Instruction>(INST # SHr) $Rn)>;
+ def : Pat<(f64 (bitconvert (i64 (round f16:$Rn)))),
+ (!cast<Instruction>(INST # DHr) $Rn)>;
+ def : Pat<(f64 (bitconvert (i64 (round f32:$Rn)))),
+ (!cast<Instruction>(INST # DSr) $Rn)>;
+ def : Pat<(f32 (bitconvert (i32 (round f64:$Rn)))),
+ (!cast<Instruction>(INST # SDr) $Rn)>;
+ }
+ def : Pat<(f32 (bitconvert (i32 (round f32:$Rn)))),
+ (!cast<Instruction>(INST # v1i32) $Rn)>;
+ def : Pat<(f64 (bitconvert (i64 (round f64:$Rn)))),
+ (!cast<Instruction>(INST # v1i64) $Rn)>;
+
let Predicates = [HasFullFP16] in {
def : Pat<(i32 (round (fmul f16:$Rn, fixedpoint_f16_i32:$scale))),
(!cast<Instruction>(INST # SWHri) $Rn, $scale)>;
@@ -5330,6 +5439,30 @@ multiclass FPToIntegerPats<SDNode to_int, SDNode to_int_sat, SDNode round, strin
def : Pat<(i64 (to_int (round f64:$Rn))),
(!cast<Instruction>(INST # UXDr) f64:$Rn)>;
+ // For global-isel we can use register classes to determine
+ // which FCVT instruction to use.
+ def : Pat<(i32 (to_int (round f32:$Rn))),
+ (!cast<Instruction>(INST # v1i32) f32:$Rn)>;
+ let Predicates = [HasFPRCVT] in {
+ def : Pat<(i64 (to_int (round f32:$Rn))),
+ (!cast<Instruction>(INST # DSr) f32:$Rn)>;
+ def : Pat<(i32 (to_int (round f64:$Rn))),
+ (!cast<Instruction>(INST # SDr) f64:$Rn)>;
+ }
+ def : Pat<(i64 (to_int (round f64:$Rn))),
+ (!cast<Instruction>(INST # v1i64) f64:$Rn)>;
+
+ let Predicates = [HasFPRCVT] in {
+ def : Pat<(f64 (bitconvert (i64 (to_int (round f32:$Rn))))),
+ (!cast<Instruction>(INST # DSr) f32:$Rn)>;
+ def : Pat<(f32 (bitconvert (i32 (to_int (round f64:$Rn))))),
+ (!cast<Instruction>(INST # SDr) f64:$Rn)>;
+ }
+ def : Pat<(f32 (bitconvert (i32 (to_int (round f32:$Rn))))),
+ (!cast<Instruction>(INST # v1i32) f32:$Rn)>;
+ def : Pat<(f64 (bitconvert (i64 (to_int (round f64:$Rn))))),
+ (!cast<Instruction>(INST # v1i64) f64:$Rn)>;
+
// These instructions saturate like fp_to_[su]int_sat.
let Predicates = [HasFullFP16] in {
def : Pat<(i32 (to_int_sat (round f16:$Rn), i32)),
@@ -5345,6 +5478,21 @@ multiclass FPToIntegerPats<SDNode to_int, SDNode to_int_sat, SDNode round, strin
(!cast<Instruction>(INST # UWDr) f64:$Rn)>;
def : Pat<(i64 (to_int_sat (round f64:$Rn), i64)),
(!cast<Instruction>(INST # UXDr) f64:$Rn)>;
+
+ let Predicates = [HasFPRCVT] in {
+ def : Pat<(f32 (bitconvert (i32 (to_int_sat (round f16:$Rn), i32)))),
+ (!cast<Instruction>(INST # SHr) f16:$Rn)>;
+ def : Pat<(f64 (bitconvert (i64 (to_int_sat (round f16:$Rn), i64)))),
+ (!cast<Instruction>(INST # DHr) f16:$Rn)>;
+ def : Pat<(f64 (bitconvert (i64 (to_int_sat (round f32:$Rn), i64)))),
+ (!cast<Instruction>(INST # DSr) f32:$Rn)>;
+ def : Pat<(f32 (bitconvert (i32 (to_int_sat (round f64:$Rn), i32)))),
+ (!cast<Instruction>(INST # SDr) f64:$Rn)>;
+ }
+ def : Pat<(f32 (bitconvert (i32 (to_int_sat (round f32:$Rn), i32)))),
+ (!cast<Instruction>(INST # v1i32) f32:$Rn)>;
+ def : Pat<(f64 (bitconvert (i64 (to_int_sat (round f64:$Rn), i64)))),
+ (!cast<Instruction>(INST # v1i64) f64:$Rn)>;
}
defm : FPToIntegerPats<fp_to_sint, fp_to_sint_sat, fceil, "FCVTPS">;
@@ -5379,6 +5527,39 @@ def : Pat<(i64 (any_llround f32:$Rn)),
def : Pat<(i64 (any_llround f64:$Rn)),
(FCVTASUXDr f64:$Rn)>;
+// For global-isel we can use register classes to determine
+// which FCVT instruction to use.
+let Predicates = [HasFPRCVT] in {
+def : Pat<(i64 (any_lround f32:$Rn)),
+ (FCVTASDSr f32:$Rn)>;
+def : Pat<(i64 (any_llround f32:$Rn)),
+ (FCVTASDSr f32:$Rn)>;
+}
+def : Pat<(i64 (any_lround f64:$Rn)),
+ (FCVTASv1i64 f64:$Rn)>;
+def : Pat<(i64 (any_llround f64:$Rn)),
+ (FCVTASv1i64 f64:$Rn)>;
+
+let Predicates = [HasFPRCVT] in {
+ def : Pat<(f32 (bitconvert (i32 (any_lround f16:$Rn)))),
+ (FCVTASSHr f16:$Rn)>;
+ def : Pat<(f64 (bitconvert (i64 (any_lround f16:$Rn)))),
+ (FCVTASDHr f16:$Rn)>;
+ def : Pat<(f64 (bitconvert (i64 (any_llround f16:$Rn)))),
+ (FCVTASDHr f16:$Rn)>;
+ def : Pat<(f64 (bitconvert (i64 (any_lround f32:$Rn)))),
+ (FCVTASDSr f32:$Rn)>;
+ def : Pat<(f32 (bitconvert (i32 (any_lround f64:$Rn)))),
+ (FCVTASSDr f64:$Rn)>;
+ def : Pat<(f64 (bitconvert (i64 (any_llround f32:$Rn)))),
+ (FCVTASDSr f32:$Rn)>;
+}
+def : Pat<(f32 (bitconvert (i32 (any_lround f32:$Rn)))),
+ (FCVTASv1i32 f32:$Rn)>;
+def : Pat<(f64 (bitconvert (i64 (any_lround f64:$Rn)))),
+ (FCVTASv1i64 f64:$Rn)>;
+def : Pat<(f64 (bitconvert (i64 (any_llround f64:$Rn)))),
+ (FCVTASv1i64 f64:$Rn)>;
//===----------------------------------------------------------------------===//
// Scaled integer to floating point conversion instructions.
//===----------------------------------------------------------------------===//
@@ -5524,6 +5705,44 @@ def : Pat<(i64 (any_llrint f32:$Rn)),
def : Pat<(i64 (any_llrint f64:$Rn)),
(FCVTZSUXDr (FRINTXDr f64:$Rn))>;
+// For global-isel we can use register classes to determine
+// which FCVT instruction to use.
+let Predicates = [HasFPRCVT] in {
+def : Pat<(i64 (any_lrint f16:$Rn)),
+ (FCVTZSDHr (FRINTXHr f16:$Rn))>;
+def : Pat<(i64 (any_llrint f16:$Rn)),
+ (FCVTZSDHr (FRINTXHr f16:$Rn))>;
+def : Pat<(i64 (any_lrint f32:$Rn)),
+ (FCVTZSDSr (FRINTXSr f32:$Rn))>;
+def : Pat<(i64 (any_llrint f32:$Rn)),
+ (FCVTZSDSr (FRINTXSr f32:$Rn))>;
+}
+def : Pat<(i64 (any_lrint f64:$Rn)),
+ (FCVTZSv1i64 (FRINTXDr f64:$Rn))>;
+def : Pat<(i64 (any_llrint f64:$Rn)),
+ (FCVTZSv1i64 (FRINTXDr f64:$Rn))>;
+
+let Predicates = [HasFPRCVT] in {
+ def : Pat<(f32 (bitconvert (i32 (any_lrint f16:$Rn)))),
+ (FCVTZSSHr (FRINTXHr f16:$Rn))>;
+ def : Pat<(f64 (bitconvert (i64 (any_lrint f16:$Rn)))),
+ (FCVTZSDHr (FRINTXHr f16:$Rn))>;
+ def : Pat<(f64 (bitconvert (i64 (any_llrint f16:$Rn)))),
+ (FCVTZSDHr (FRINTXHr f16:$Rn))>;
+ def : Pat<(f64 (bitconvert (i64 (any_lrint f32:$Rn)))),
+ (FCVTZSDSr (FRINTXSr f32:$Rn))>;
+ def : Pat<(f32 (bitconvert (i32 (any_lrint f64:$Rn)))),
+ (FCVTZSSDr (FRINTXDr f64:$Rn))>;
+ def : Pat<(f64 (bitconvert (i64 (any_llrint f32:$Rn)))),
+ (FCVTZSDSr (FRINTXSr f32:$Rn))>;
+}
+def : Pat<(f32 (bitconvert (i32 (any_lrint f32:$Rn)))),
+ (FCVTZSv1i32 (FRINTXSr f32:$Rn))>;
+def : Pat<(f64 (bitconvert (i64 (any_lrint f64:$Rn)))),
+ (FCVTZSv1i64 (FRINTXDr f64:$Rn))>;
+def : Pat<(f64 (bitconvert (i64 (any_llrint f64:$Rn)))),
+ (FCVTZSv1i64 (FRINTXDr f64:$Rn))>;
+
//===----------------------------------------------------------------------===//
// Floating point two operand instructions.
//===----------------------------------------------------------------------===//
@@ -6549,17 +6768,7 @@ defm FCMGE : SIMDFPCmpTwoScalar<1, 1, 0b01100, "fcmge", AArch64fcmgez>;
defm FCMGT : SIMDFPCmpTwoScalar<0, 1, 0b01100, "fcmgt", AArch64fcmgtz>;
defm FCMLE : SIMDFPCmpTwoScalar<1, 1, 0b01101, "fcmle", AArch64fcmlez>;
defm FCMLT : SIMDFPCmpTwoScalar<0, 1, 0b01110, "fcmlt", AArch64fcmltz>;
-defm FCVTAS : SIMDFPTwoScalar< 0, 0, 0b11100, "fcvtas">;
-defm FCVTAU : SIMDFPTwoScalar< 1, 0, 0b11100, "fcvtau">;
-defm FCVTMS : SIMDFPTwoScalar< 0, 0, 0b11011, "fcvtms">;
-defm FCVTMU : SIMDFPTwoScalar< 1, 0, 0b11011, "fcvtmu">;
-defm FCVTNS : SIMDFPTwoScalar< 0, 0, 0b11010, "fcvtns">;
-defm FCVTNU : SIMDFPTwoScalar< 1, 0, 0b11010, "fcvtnu">;
-defm FCVTPS : SIMDFPTwoScalar< 0, 1, 0b11010, "fcvtps">;
-defm FCVTPU : SIMDFPTwoScalar< 1, 1, 0b11010, "fcvtpu">;
def FCVTXNv1i64 : SIMDInexactCvtTwoScalar<0b10110, "fcvtxn">;
-defm FCVTZS : SIMDFPTwoScalar< 0, 1, 0b11011, "fcvtzs">;
-defm FCVTZU : SIMDFPTwoScalar< 1, 1, 0b11011, "fcvtzu">;
defm FRECPE : SIMDFPTwoScalar< 0, 1, 0b11101, "frecpe">;
defm FRECPX : SIMDFPTwoScalar< 0, 1, 0b11111, "frecpx">;
defm FRSQRTE : SIMDFPTwoScalar< 1, 1, 0b11101, "frsqrte">;
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
index cf391c446a955..c75a3c406f60d 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
@@ -568,9 +568,7 @@ bool AArch64RegisterBankInfo::onlyUsesFP(const MachineInstr &MI,
case Intrinsic::aarch64_neon_fcvtnu:
case Intrinsic::aarch64_neon_fcvtps:
case Intrinsic::aarch64_neon_fcvtpu:
- // Force FPR register bank for half types, as those types otherwise
- // don't get legalized correctly resulting in fp16 <-> gpr32 COPY's.
- return MRI.getType(MI.getOperand(2).getReg()) == LLT::float16();
+ return true;
default:
break;
}
@@ -864,10 +862,24 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case TargetOpcode::G_FPTOUI:
case TargetOpcode::G_INTRINSIC_LRINT:
case TargetOpcode::G_INTRINSIC_LLRINT:
+ case TargetOpcode::G_LROUND:
+ case TargetOpcode::G_LLROUND: {
if (MRI.getType(MI.getOperand(0).getReg()).isVector())
break;
- OpRegBankIdx = {PMI_FirstGPR, PMI_FirstFPR};
+ TypeSize DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, TRI);
+ TypeSize SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, TRI);
+ if (((D...
[truncated]
|
|
Thank you Marian for the patch. |
| let mayRaiseFPException = 1, Uses = [FPCR] in | ||
| multiclass SIMDFPTwoScalarFCVT<bit U, bit S, bits<5> opc, string asm, | ||
| SDPatternOperator OpN> { | ||
| let Predicates = [HasNEONandIsStreamingSafe], FastISelShouldIgnore = 1 in { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why do we need to set FastISelShouldIgnore? Can we add a comment here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Another question: I cannot find a description of these instructions on developer.arm.com, i.e. the forms where the input and output of the conversion have the same size.
This one:
https://developer.arm.com/documentation/ddi0602/2025-06/SIMD-FP-Instructions/FCVTAU--vector---Floating-point-convert-to-unsigned-integer--rounding-to-nearest-with-ties-to-away--vector--?lang=en
only has the half-to-half form.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I added it because it fails with FastISel. I didn't really investigate why it fails, as I thought FastISel is not really important.
They are under the "Scalar single-precision and double-precision" section on the webpage.
| define dso_local double @fcvtms_1d1d_simd(double %a) { | ||
| ; CHECK-LABEL: fcvtms_1d1d_simd: | ||
| ; CHECK: // %bb.0: | ||
| ; CHECK-NEXT: fcvtms d0, d0 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I could not find these instructions described on developer.arm.com.
| defm FCVTZU : FPToIntegerSIMDScalar<0b10, 0b111, "fcvtzu", any_fp_to_uint>; | ||
| } | ||
|
|
||
| multiclass FPToIntegerSIMDScalarPatterns<SDPatternOperator OpN, string INST> { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I understand what was done to use the new instructions. However, the instructions suggest that this returns an integer, but that is not the pattern output.
Have you thought about how this will work with clang, so that the optimisation does not remove the bitconvert and we can still generate these instructions?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think there is any work necessary for clang, unless we try to change the lowering of NEON scalar integer intrinsics to use single-element vectors, in which case this whole patch will be irrelevant. The bitconvert will not get removed, because it is only going to be added during (after?) code legalization and there are, as far as I know, no optimizations which would remove it before instruction selection.
| def : Pat<(f32 (bitconvert (i32 (to_int_sat_gi f64:$Rn)))), | ||
| (!cast<Instruction>(INST # SDr) f64:$Rn)>; | ||
| } | ||
| def : Pat<(f32 (bitconvert (i32 (to_int_sat f32:$Rn, i32)))), |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am not sure these instructions exist, i.e. CVT Sx, Sx or CVT Dx, Dx. At least when I grep the tests inside MC/AArch64, I cannot see them being used.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
See my answer above. As for why there are no tests for them, I cannot say. Probably someone forgot to add them when adding the instructions.
davemgreen
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This looks like a nice improvement. RegBankSelect needs some improvement but this looks in line with what I would expect. I have been looking into it a bit recently, to see if we can come up with a better algorithm for it (maybe using mincuts). That will take some time though.
There is quite a bit in this one patch, and with all the variants it can be difficult to follow; it might be easier to review if it was split up a little.
| defm FCMGT : SIMDFPCmpTwoScalar<0, 1, 0b01100, "fcmgt", AArch64fcmgtz>; | ||
| defm FCMLE : SIMDFPCmpTwoScalar<1, 1, 0b01101, "fcmle", AArch64fcmlez>; | ||
| defm FCMLT : SIMDFPCmpTwoScalar<0, 1, 0b01110, "fcmlt", AArch64fcmltz>; | ||
| defm FCVTAS : SIMDFPTwoScalar< 0, 0, 0b11100, "fcvtas">; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would leave these instructions where they were to keep them with similar "SIMD" instructions, if that didn't make the patterns much more difficult.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I wanted to keep the patterns together, and unfortunately in TableGen you need to define records above their usage, so I needed to move these instructions up.
|
Thanks David for the suggestion. I will split the patch up. The first patch can be found here #157680 |
This patch adds codegen support for fcvt instructions that keep the result in 32-bit or 64-bit SIMD&FP registers in both SelectionDAG and GlobalISel. For a long time, LLVM primarily generated fpcvt instructions, which store the result in GPRs, resulting in extra moves when the value was used by NEON instructions that operate on SIMD&FP registers. Although patterns existed for generating the SIMD variants, they relied on single-element vector types (such as v1i32 or v1i64) to decide whether the SIMD variant should be selected. This was not useful, because many NEON intrinsics and other LLVM IR operations use scalar types (i32/i64) even though they expect the result to be stored in SIMD&FP registers.
This patch is part of a series that addresses this and also adds support for generating these instructions in GlobalISel. To fix this in SelectionDAG, bitcasts of the result to a floating-point type serve as a hint that the SIMD variant of the conversion should be used, rather than relying on single-element vector types. These bitcasts are not currently generated by LLVM, but the goal is to add explicit bitcasts to the inputs and outputs of NEON intrinsics operating on integers in follow-up patches.
For GlobalISel, the register bank selection algorithm is used to determine which variant to generate.