diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp index 80efd01391556..da344305f39d9 100644 --- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -1829,8 +1829,8 @@ void AArch64AsmPrinter::emitMOVK(Register Dest, uint64_t Imm, unsigned Shift) { void AArch64AsmPrinter::emitFMov0(const MachineInstr &MI) { Register DestReg = MI.getOperand(0).getReg(); - if (STI->hasZeroCycleZeroingFP() && !STI->hasZeroCycleZeroingFPWorkaround() && - STI->isNeonAvailable()) { + if (STI->hasZeroCycleZeroingFPR64() && + !STI->hasZeroCycleZeroingFPWorkaround() && STI->isNeonAvailable()) { // Convert H/S register to corresponding D register if (AArch64::H0 <= DestReg && DestReg <= AArch64::H31) DestReg = AArch64::D0 + (DestReg - AArch64::H0); diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td index 55aea17d29f55..00cf039096d32 100644 --- a/llvm/lib/Target/AArch64/AArch64Features.td +++ b/llvm/lib/Target/AArch64/AArch64Features.td @@ -630,19 +630,18 @@ def FeatureZCRegMoveFPR64 : SubtargetFeature<"zcm-fpr64", "HasZeroCycleRegMoveFP def FeatureZCRegMoveFPR32 : SubtargetFeature<"zcm-fpr32", "HasZeroCycleRegMoveFPR32", "true", "Has zero-cycle register moves for FPR32 registers">; -def FeatureZCZeroingGP : SubtargetFeature<"zcz-gp", "HasZeroCycleZeroingGP", "true", - "Has zero-cycle zeroing instructions for generic registers">; +def FeatureZCZeroingGPR64 : SubtargetFeature<"zcz-gpr64", "HasZeroCycleZeroingGPR64", "true", + "Has zero-cycle zeroing instructions for GPR64 registers">; + +def FeatureZCZeroingGPR32 : SubtargetFeature<"zcz-gpr32", "HasZeroCycleZeroingGPR32", "true", + "Has zero-cycle zeroing instructions for GPR32 registers">; // It is generally beneficial to rewrite "fmov s0, wzr" to "movi d0, #0". // as movi is more efficient across all cores. Newer cores can eliminate // fmovs early and there is no difference with movi, but this not true for // all implementations. -def FeatureNoZCZeroingFP : SubtargetFeature<"no-zcz-fp", "HasZeroCycleZeroingFP", "false", - "Has no zero-cycle zeroing instructions for FP registers">; - -def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true", - "Has zero-cycle zeroing instructions", - [FeatureZCZeroingGP]>; +def FeatureNoZCZeroingFPR64 : SubtargetFeature<"no-zcz-fpr64", "HasZeroCycleZeroingFPR64", "false", + "Has no zero-cycle zeroing instructions for FPR64 registers">; /// ... but the floating-point version doesn't quite work in rare cases on older /// CPUs. diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 1aa180688dcdd..68f708c25a241 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -5075,7 +5075,7 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, .addImm(0) .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); } - } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) { + } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGPR32()) { BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg) .addImm(0) .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); @@ -5202,7 +5202,7 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, .addReg(SrcReg, getKillRegState(KillSrc)) .addImm(0) .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); - } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) { + } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGPR64()) { BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg) .addImm(0) .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0)); diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td index b7e08dbe7c792..e4a7d83e7f807 100644 --- a/llvm/lib/Target/AArch64/AArch64Processors.td +++ b/llvm/lib/Target/AArch64/AArch64Processors.td @@ -322,7 +322,8 @@ def TuneAppleA7 : SubtargetFeature<"apple-a7", "ARMProcFamily", "AppleA7", FeatureStorePairSuppress, FeatureZCRegMoveGPR64, FeatureZCRegMoveFPR128, - FeatureZCZeroing, + FeatureZCZeroingGPR32, + FeatureZCZeroingGPR64, FeatureZCZeroingFPWorkaround]>; def TuneAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10", @@ -336,7 +337,8 @@ def TuneAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10", FeatureStorePairSuppress, FeatureZCRegMoveGPR64, FeatureZCRegMoveFPR128, - FeatureZCZeroing]>; + FeatureZCZeroingGPR32, + FeatureZCZeroingGPR64]>; def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11", "Apple A11", [ @@ -349,7 +351,8 @@ def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11", FeatureStorePairSuppress, FeatureZCRegMoveGPR64, FeatureZCRegMoveFPR128, - FeatureZCZeroing]>; + FeatureZCZeroingGPR32, + FeatureZCZeroingGPR64]>; def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12", "Apple A12", [ @@ -362,7 +365,8 @@ def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12", FeatureStorePairSuppress, FeatureZCRegMoveGPR64, FeatureZCRegMoveFPR128, - FeatureZCZeroing]>; + FeatureZCZeroingGPR32, + FeatureZCZeroingGPR64]>; def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13", "Apple A13", [ @@ -375,7 +379,8 @@ def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13", FeatureStorePairSuppress, FeatureZCRegMoveGPR64, FeatureZCRegMoveFPR128, - FeatureZCZeroing]>; + FeatureZCZeroingGPR32, + FeatureZCZeroingGPR64]>; def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14", "Apple A14", [ @@ -393,7 +398,8 @@ def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14", FeatureStorePairSuppress, FeatureZCRegMoveGPR64, FeatureZCRegMoveFPR128, - FeatureZCZeroing]>; + FeatureZCZeroingGPR32, + FeatureZCZeroingGPR64]>; def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15", "Apple A15", [ @@ -411,7 +417,8 @@ def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15", FeatureStorePairSuppress, FeatureZCRegMoveGPR64, FeatureZCRegMoveFPR128, - FeatureZCZeroing]>; + FeatureZCZeroingGPR32, + FeatureZCZeroingGPR64]>; def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16", "Apple A16", [ @@ -429,7 +436,8 @@ def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16", FeatureStorePairSuppress, FeatureZCRegMoveGPR64, FeatureZCRegMoveFPR128, - FeatureZCZeroing]>; + FeatureZCZeroingGPR32, + FeatureZCZeroingGPR64]>; def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17", "Apple A17", [ @@ -447,7 +455,8 @@ def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17", FeatureStorePairSuppress, FeatureZCRegMoveGPR64, FeatureZCRegMoveFPR128, - FeatureZCZeroing]>; + FeatureZCZeroingGPR32, + FeatureZCZeroingGPR64]>; def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4", "Apple M4", [ @@ -464,8 +473,8 @@ def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4", FeatureFuseLiterals, FeatureZCRegMoveGPR64, FeatureZCRegMoveFPR128, - FeatureZCZeroing - ]>; + FeatureZCZeroingGPR32, + FeatureZCZeroingGPR64]>; def TuneExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3", "Samsung Exynos-M3 processors", @@ -497,13 +506,15 @@ def TuneExynosM4 : SubtargetFeature<"exynosm4", "ARMProcFamily", "ExynosM3", FeatureStorePairSuppress, FeatureALULSLFast, FeaturePostRAScheduler, - FeatureZCZeroing]>; + FeatureZCZeroingGPR32, + FeatureZCZeroingGPR64]>; def TuneKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo", "Qualcomm Kryo processors", [ FeaturePostRAScheduler, FeaturePredictableSelectIsExpensive, - FeatureZCZeroing, + FeatureZCZeroingGPR32, + FeatureZCZeroingGPR64, FeatureALULSLFast, FeatureStorePairSuppress]>; @@ -511,7 +522,8 @@ def TuneFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor", "Qualcomm Falkor processors", [ FeaturePostRAScheduler, FeaturePredictableSelectIsExpensive, - FeatureZCZeroing, + FeatureZCZeroingGPR32, + FeatureZCZeroingGPR64, FeatureStorePairSuppress, FeatureALULSLFast, FeatureSlowSTRQro]>; @@ -607,7 +619,8 @@ def TuneSaphira : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira", "Qualcomm Saphira processors", [ FeaturePostRAScheduler, FeaturePredictableSelectIsExpensive, - FeatureZCZeroing, + FeatureZCZeroingGPR32, + FeatureZCZeroingGPR64, FeatureStorePairSuppress, FeatureALULSLFast]>; diff --git a/llvm/test/CodeGen/AArch64/arm64-copy-phys-zero-reg.mir b/llvm/test/CodeGen/AArch64/arm64-copy-phys-zero-reg.mir index 76b5b76130657..284d624a4e68f 100644 --- a/llvm/test/CodeGen/AArch64/arm64-copy-phys-zero-reg.mir +++ b/llvm/test/CodeGen/AArch64/arm64-copy-phys-zero-reg.mir @@ -1,15 +1,15 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 -# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,-zcm-gpr64,-zcz" %s \ +# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,-zcm-gpr64,-zcz-gpr32,-zcz-gpr64" %s \ # RUN: | FileCheck --check-prefix=CHECK-NO-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ %s -# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,-zcm-gpr64,-zcz" %s \ +# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,-zcm-gpr64,-zcz-gpr32,-zcz-gpr64" %s \ # RUN: | FileCheck --check-prefix=CHECK-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ %s -# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,+zcm-gpr64,-zcz" %s \ +# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,+zcm-gpr64,-zcz-gpr32,-zcz-gpr64" %s \ # RUN: | FileCheck --check-prefix=CHECK-NO-ZCM-GPR32-ZCM-GPR64-NO-ZCZ %s -# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,+zcm-gpr64,-zcz" %s \ +# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,+zcm-gpr64,-zcz-gpr32,-zcz-gpr64" %s \ # RUN: | FileCheck --check-prefix=CHECK-ZCM-GPR32-ZCM-GPR64-NO-ZCZ %s -# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,-zcm-gpr64,+zcz" %s \ +# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,-zcm-gpr64,+zcz-gpr32,+zcz-gpr64" %s \ # RUN: | FileCheck --check-prefix=CHECK-NO-ZCM-ZCZ %s -# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,+zcm-gpr64,+zcz" %s \ +# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,+zcm-gpr64,+zcz-gpr32,+zcz-gpr64" %s \ # RUN: | FileCheck --check-prefix=CHECK-ZCM-ZCZ %s --- | diff --git a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing-fpr.ll b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing-fpr.ll new file mode 100644 index 0000000000000..2a75976d58549 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing-fpr.ll @@ -0,0 +1,153 @@ +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+no-zcz-fpr64 | FileCheck %s -check-prefixes=ALL,NOZCZ-FPR64 +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+no-zcz-fpr64,+fullfp16 | FileCheck %s -check-prefixes=ALL,NOZCZ-FPR64-FULLFP16 +; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64 +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+fullfp16 | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64 +; RUN: llc < %s -mtriple=arm64-apple-ios -mcpu=cyclone | FileCheck %s -check-prefixes=ALL,FP-WORKAROUND +; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64 +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64 +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=kryo | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64 +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=falkor | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64 + +define half @tf16() { +entry: +; ALL-LABEL: tf16: +; FP-WORKAROUND: mov s0, wzr +; NOZCZ-FPR64: mov s0, wzr +; NOZCZ-FPR64-FULLFP16: mov h0, wzr +; ZCZ-FPR64: movi d0, #0 + ret half 0.0 +} + +define float @tf32() { +entry: +; ALL-LABEL: tf32: +; FP-WORKAROUND: mov s0, wzr +; NOZCZ-FPR64: mov s0, wzr +; ZCZ-FPR64: movi d0, #0 + ret float 0.0 +} + +define double @td64() { +entry: +; ALL-LABEL: td64: +; FP-WORKAROUND: mov d0, xzr +; NOZCZ-FPR64: mov d0, xzr +; ZCZ-FPR64: movi d0, #0 + ret double 0.0 +} + +define <8 x i8> @tv8i8() { +entry: +; ALL-LABEL: tv8i8: +; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0 +; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 + ret <8 x i8> +} + +define <4 x i16> @tv4i16() { +entry: +; ALL-LABEL: tv4i16: +; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0 +; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 + ret <4 x i16> +} + +define <2 x i32> @tv2i32() { +entry: +; ALL-LABEL: tv2i32: +; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0 +; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 + ret <2 x i32> +} + +define <2 x float> @tv2f32() { +entry: +; ALL-LABEL: tv2f32: +; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0 +; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 + ret <2 x float> +} + +define <16 x i8> @tv16i8() { +entry: +; ALL-LABEL: tv16i8: +; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0 +; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 + ret <16 x i8> +} + +define <8 x i16> @tv8i16() { +entry: +; ALL-LABEL: tv8i16: +; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0 +; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 + ret <8 x i16> +} + +define <4 x i32> @tv4i32() { +entry: +; ALL-LABEL: tv4i32: +; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0 +; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 + ret <4 x i32> +} + +define <2 x i64> @tv2i64() { +entry: +; ALL-LABEL: tv2i64: +; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0 +; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 + ret <2 x i64> +} + +define <4 x float> @tv4f32() { +entry: +; ALL-LABEL: tv4f32: +; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0 +; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 + ret <4 x float> +} + +define <2 x double> @tv2d64() { +entry: +; ALL-LABEL: tv2d64: +; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0 +; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 +; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0 + ret <2 x double> +} + +; We used to produce spills+reloads for a Q register with zero cycle zeroing +; enabled. +; ALL-LABEL: foo: +; ALL-NOT: str q{{[0-9]+}} +; ALL-NOT: ldr q{{[0-9]+}} +define double @foo(i32 %n) { +entry: + br label %for.body + +for.body: + %phi0 = phi double [ 1.0, %entry ], [ %v0, %for.body ] + %i.076 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %conv21 = sitofp i32 %i.076 to double + %call = tail call fast double @sin(double %conv21) + %cmp.i = fcmp fast olt double %phi0, %call + %v0 = select i1 %cmp.i, double %call, double %phi0 + %inc = add nuw nsw i32 %i.076, 1 + %cmp = icmp slt i32 %inc, %n + br i1 %cmp, label %for.body, label %for.end + +for.end: + ret double %v0 +} + +declare double @sin(double) diff --git a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing-gpr.ll b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing-gpr.ll new file mode 100644 index 0000000000000..dc643062d8697 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing-gpr.ll @@ -0,0 +1,41 @@ +; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s -check-prefixes=ALL,NOZCZ-GPR +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+zcz-gpr32 | FileCheck %s -check-prefixes=ALL,ZCZ-GPR32 +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+zcz-gpr64 | FileCheck %s -check-prefixes=ALL,ZCZ-GPR64 +; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=generic | FileCheck %s -check-prefixes=ALL,NOZCZ-GPR +; RUN: llc < %s -mtriple=arm64-apple-ios -mcpu=cyclone | FileCheck %s -check-prefixes=ALL,ZCZ-GPR32,ZCZ-GPR64 +; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 | FileCheck %s -check-prefixes=ALL,ZCZ-GPR32,ZCZ-GPR64 +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 | FileCheck %s -check-prefixes=ALL,NOZCZ-GPR +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=kryo | FileCheck %s -check-prefixes=ALL,ZCZ-GPR32,ZCZ-GPR64 +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=falkor | FileCheck %s -check-prefixes=ALL,ZCZ-GPR32,ZCZ-GPR64 + +define i8 @ti8() { +entry: +; ALL-LABEL: ti8: +; NOZCZ-GPR: mov w0, wzr +; ZCZ-GPR32: mov w0, #0 + ret i8 0 +} + +define i16 @ti16() { +entry: +; ALL-LABEL: ti16: +; NOZCZ-GPR: mov w0, wzr +; ZCZ-GPR32: mov w0, #0 + ret i16 0 +} + +define i32 @ti32() { +entry: +; ALL-LABEL: ti32: +; NOZCZ-GPR: mov w0, wzr +; ZCZ-GPR32: mov w0, #0 + ret i32 0 +} + +define i64 @ti64() { +entry: +; ALL-LABEL: ti64: +; NOZCZ-GPR: mov x0, xzr +; ZCZ-GPR64: mov x0, #0 + ret i64 0 +} diff --git a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll deleted file mode 100644 index 6c3cd4766d799..0000000000000 --- a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll +++ /dev/null @@ -1,231 +0,0 @@ -; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=-zcz-gp,+no-zcz-fp | FileCheck %s -check-prefixes=ALL,NONEGP,NONEFP -; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+zcz | FileCheck %s -check-prefixes=ALL,ZEROGP,ZEROFP -; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+zcz -mattr=+fullfp16 | FileCheck %s -check-prefixes=ALL,ZEROGP,ZERO16 -; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+zcz-gp,+no-zcz-fp | FileCheck %s -check-prefixes=ALL,ZEROGP,NONEFP -; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s -check-prefixes=ALL,NONEGP,ZEROFP -; RUN: llc < %s -mtriple=arm64-apple-ios -mcpu=cyclone | FileCheck %s -check-prefixes=ALL,ZEROGP,NONEFP -; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=apple-a10 | FileCheck %s -check-prefixes=ALL,ZEROGP,ZEROFP -; RUN: llc < %s -mtriple=arm64-apple-ios -mcpu=cyclone -mattr=+fullfp16 | FileCheck %s -check-prefixes=ALL,ZEROGP,NONE16 -; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 | FileCheck %s -check-prefixes=ALL,NONEGP,ZEROFP -; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=kryo | FileCheck %s -check-prefixes=ALL,ZEROGP,ZEROFP -; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=falkor | FileCheck %s -check-prefixes=ALL,ZEROGP,ZEROFP - -declare void @bar(half, float, double, <2 x double>) -declare void @bari(i32, i32) -declare void @barl(i64, i64) -declare void @barf(float, float) - -define void @t1() nounwind ssp { -entry: -; ALL-LABEL: t1: -; ALL-NOT: fmov -; NONEFP-DAG: fmov s0, wzr -; NONEFP-DAG: fmov s1, wzr -; NONEFP-DAG: fmov d2, xzr -; NONEFP-DAG: movi{{(.16b)?}} v3{{(.2d)?}}, #0 -; NONE16: fmov h0, wzr -; NONE16: fmov s1, wzr -; NONE16: fmov d2, xzr -; NONE16: movi{{(.16b)?}} v3{{(.2d)?}}, #0 -; ZEROFP-DAG: movi d0, #0 -; ZEROFP-DAG: movi d1, #0 -; ZEROFP-DAG: movi d2, #0 -; ZEROFP-DAG: movi v3.2d, #0 -; ZERO16: movi d0, #0 -; ZERO16: movi d1, #0 -; ZERO16: movi d2, #0 -; ZERO16: movi v3.2d, #0 - tail call void @bar(half 0.000000e+00, float 0.000000e+00, double 0.000000e+00, <2 x double> ) nounwind - ret void -} - -define void @t2() nounwind ssp { -entry: -; ALL-LABEL: t2: -; NONEGP: mov w0, wzr -; NONEGP: mov w1, wzr -; ZEROGP: mov w0, #0 -; ZEROGP: mov w1, #0 - tail call void @bari(i32 0, i32 0) nounwind - ret void -} - -define void @t3() nounwind ssp { -entry: -; ALL-LABEL: t3: -; NONEGP: mov x0, xzr -; NONEGP: mov x1, xzr -; ZEROGP: mov x0, #0 -; ZEROGP: mov x1, #0 - tail call void @barl(i64 0, i64 0) nounwind - ret void -} - -define void @t4() nounwind ssp { -; ALL-LABEL: t4: -; NONEFP: fmov s{{[0-3]+}}, wzr -; NONEFP: fmov s{{[0-3]+}}, wzr -; ZEROFP: movi d0, #0 -; ZEROFP: movi d1, #0 - tail call void @barf(float 0.000000e+00, float 0.000000e+00) nounwind - ret void -} - -declare double @sin(double) - -; We used to produce spills+reloads for a Q register with zero cycle zeroing -; enabled. -; ALL-LABEL: foo: -; ALL-NOT: str q{{[0-9]+}} -; ALL-NOT: ldr q{{[0-9]+}} -define double @foo(i32 %n) { -entry: - br label %for.body - -for.body: - %phi0 = phi double [ 1.0, %entry ], [ %v0, %for.body ] - %i.076 = phi i32 [ 0, %entry ], [ %inc, %for.body ] - %conv21 = sitofp i32 %i.076 to double - %call = tail call fast double @sin(double %conv21) - %cmp.i = fcmp fast olt double %phi0, %call - %v0 = select i1 %cmp.i, double %call, double %phi0 - %inc = add nuw nsw i32 %i.076, 1 - %cmp = icmp slt i32 %inc, %n - br i1 %cmp, label %for.body, label %for.end - -for.end: - ret double %v0 -} - -define <2 x i64> @t6() { -; ALL-LABEL: t6: -; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0 - ret <2 x i64> zeroinitializer -} - -define i1 @ti1() { -entry: -; ALL-LABEL: ti1: -; NONEGP: mov w0, wzr -; ZEROGP: mov w0, #0 - ret i1 false -} - -define i8 @ti8() { -entry: -; ALL-LABEL: ti8: -; NONEGP: mov w0, wzr -; ZEROGP: mov w0, #0 - ret i8 0 -} - -define i16 @ti16() { -entry: -; ALL-LABEL: ti16: -; NONEGP: mov w0, wzr - ; ZEROGP: mov w0, #0 - ret i16 0 -} - -define i32 @ti32() { -entry: -; ALL-LABEL: ti32: -; NONEGP: mov w0, wzr -; ZEROGP: mov w0, #0 - ret i32 0 -} - -define i64 @ti64() { -entry: -; ALL-LABEL: ti64: -; NONEGP: mov x0, xzr -; ZEROGP: mov x0, #0 - ret i64 0 -} - -define float @tf32() { -entry: -; ALL-LABEL: tf32: -; NONEFP: mov s0, wzr -; ZEROFP: movi d0, #0 - ret float 0.0 -} - -define double @td64() { -entry: -; ALL-LABEL: td64: -; NONEFP: mov d0, xzr -; ZEROFP: movi d0, #0 - ret double 0.0 -} - -define <8 x i8> @tv8i8() { -entry: -; ALL-LABEL: tv8i8: -; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0 - ret <8 x i8> -} - -define <4 x i16> @tv4i16() { -entry: -; ALL-LABEL: tv4i16: -; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0 - ret <4 x i16> -} - -define <2 x i32> @tv2i32() { -entry: -; ALL-LABEL: tv2i32: -; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0 - ret <2 x i32> -} - -define <2 x float> @tv2f32() { -entry: -; ALL-LABEL: tv2f32: -; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0 - ret <2 x float> -} - -define <16 x i8> @tv16i8() { -entry: -; ALL-LABEL: tv16i8: -; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0 - ret <16 x i8> -} - -define <8 x i16> @tv8i16() { -entry: -; ALL-LABEL: tv8i16: -; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0 - ret <8 x i16> -} - -define <4 x i32> @tv4i32() { -entry: -; ALL-LABEL: tv4i32: -; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0 - ret <4 x i32> -} - -define <2 x i64> @tv2i64() { -entry: -; ALL-LABEL: tv2i64: -; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0 - ret <2 x i64> -} - -define <4 x float> @tv4f32() { -entry: -; ALL-LABEL: tv4f32: -; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0 - ret <4 x float> -} - -define <2 x double> @tv2d64() { -entry: -; ALL-LABEL: tv2d64: -; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0 - ret <2 x double> -} - diff --git a/llvm/test/CodeGen/AArch64/f16-imm.ll b/llvm/test/CodeGen/AArch64/f16-imm.ll index 58793bf19f3a6..68873f9b7c3de 100644 --- a/llvm/test/CodeGen/AArch64/f16-imm.ll +++ b/llvm/test/CodeGen/AArch64/f16-imm.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16,+no-zcz-fp | FileCheck %s --check-prefixes=CHECK-FP16,CHECK-NOZCZ -; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16,+zcz | FileCheck %s --check-prefixes=CHECK-FP16,CHECK-ZCZ +; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16,+no-zcz-fpr64 | FileCheck %s --check-prefixes=CHECK-FP16,CHECK-NOZCZ +; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16,+zcz-gpr32,+zcz-gpr64 | FileCheck %s --check-prefixes=CHECK-FP16,CHECK-ZCZ ; RUN: llc < %s -mtriple=aarch64 -mattr=-fullfp16 | FileCheck %s --check-prefixes=CHECK-NOFP16 define half @Const0() {