Skip to content

Commit acfa0b7

Browse files
committed
[AArch64] Split zero cycle zeroing per register class
This change improves LLVM's model accuracy by splitting the AArch64 subtarget features for zero cycle zeroing per register class. This aligns with how the uarch is designed (each register bank has unique capabilities), similarly to how we improved ZCM modeling. It splits `HasZeroCycleZeroingGP` into `HasZeroCycleZeroingGPR32` and `HasZeroCycleZeroingGPR64`, removes the opaque `FeatureZCZeroing`, and infers `FeatureNoZCZeroingFP` to be `FeatureNoZCZeroingFPR64` based on its single usage in `AArch64AsmPrinter.cpp`. It also splits `arm64-zero-cycle-zeroing.ll` into two tests, one `-gpr` and one `-fpr`, similarly to ZCM, to make the tests more focused and manageable in correspondence with the new modeling. The test cases are updated as well, exploiting the fact that this is a refactor patch: - remove redundant functions that just mix isolated ones (t1-4) - specialize check prefixes - replace `apple-a10` with `apple-m1` - add a `-mtriple=arm64-apple-macosx -mcpu=generic` test case for GPR - isolate the `mtriple=arm64-apple-ios -mcpu=cyclone` FP workaround test case and move `-fullfp16` to another non-workaround test case
1 parent 6747139 commit acfa0b7

File tree

9 files changed

+241
-266
lines changed

9 files changed

+241
-266
lines changed

llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1829,8 +1829,8 @@ void AArch64AsmPrinter::emitMOVK(Register Dest, uint64_t Imm, unsigned Shift) {
18291829

18301830
void AArch64AsmPrinter::emitFMov0(const MachineInstr &MI) {
18311831
Register DestReg = MI.getOperand(0).getReg();
1832-
if (STI->hasZeroCycleZeroingFP() && !STI->hasZeroCycleZeroingFPWorkaround() &&
1833-
STI->isNeonAvailable()) {
1832+
if (STI->hasZeroCycleZeroingFPR64() &&
1833+
!STI->hasZeroCycleZeroingFPWorkaround() && STI->isNeonAvailable()) {
18341834
// Convert H/S register to corresponding D register
18351835
if (AArch64::H0 <= DestReg && DestReg <= AArch64::H31)
18361836
DestReg = AArch64::D0 + (DestReg - AArch64::H0);

llvm/lib/Target/AArch64/AArch64Features.td

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -627,19 +627,18 @@ def FeatureZCRegMoveFPR64 : SubtargetFeature<"zcm-fpr64", "HasZeroCycleRegMoveFP
627627
def FeatureZCRegMoveFPR32 : SubtargetFeature<"zcm-fpr32", "HasZeroCycleRegMoveFPR32", "true",
628628
"Has zero-cycle register moves for FPR32 registers">;
629629

630-
def FeatureZCZeroingGP : SubtargetFeature<"zcz-gp", "HasZeroCycleZeroingGP", "true",
631-
"Has zero-cycle zeroing instructions for generic registers">;
630+
def FeatureZCZeroingGPR64 : SubtargetFeature<"zcz-gpr64", "HasZeroCycleZeroingGPR64", "true",
631+
"Has zero-cycle zeroing instructions for GPR64 registers">;
632+
633+
def FeatureZCZeroingGPR32 : SubtargetFeature<"zcz-gpr32", "HasZeroCycleZeroingGPR32", "true",
634+
"Has zero-cycle zeroing instructions for GPR32 registers">;
632635

633636
// It is generally beneficial to rewrite "fmov s0, wzr" to "movi d0, #0".
634637
// as movi is more efficient across all cores. Newer cores can eliminate
635638
// fmovs early and there is no difference with movi, but this not true for
636639
// all implementations.
637-
def FeatureNoZCZeroingFP : SubtargetFeature<"no-zcz-fp", "HasZeroCycleZeroingFP", "false",
638-
"Has no zero-cycle zeroing instructions for FP registers">;
639-
640-
def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true",
641-
"Has zero-cycle zeroing instructions",
642-
[FeatureZCZeroingGP]>;
640+
def FeatureNoZCZeroingFPR64 : SubtargetFeature<"no-zcz-fpr64", "HasZeroCycleZeroingFPR64", "false",
641+
"Has no zero-cycle zeroing instructions for FPR64 registers">;
643642

644643
/// ... but the floating-point version doesn't quite work in rare cases on older
645644
/// CPUs.

llvm/lib/Target/AArch64/AArch64InstrInfo.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5075,7 +5075,7 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
50755075
.addImm(0)
50765076
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
50775077
}
5078-
} else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) {
5078+
} else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGPR32()) {
50795079
BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
50805080
.addImm(0)
50815081
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
@@ -5202,7 +5202,7 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
52025202
.addReg(SrcReg, getKillRegState(KillSrc))
52035203
.addImm(0)
52045204
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
5205-
} else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) {
5205+
} else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGPR64()) {
52065206
BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
52075207
.addImm(0)
52085208
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));

llvm/lib/Target/AArch64/AArch64Processors.td

Lines changed: 28 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -321,7 +321,8 @@ def TuneAppleA7 : SubtargetFeature<"apple-a7", "ARMProcFamily", "AppleA7",
321321
FeatureFuseAES, FeatureFuseCryptoEOR,
322322
FeatureStorePairSuppress,
323323
FeatureZCRegMoveGPR64,
324-
FeatureZCZeroing,
324+
FeatureZCZeroingGPR32,
325+
FeatureZCZeroingGPR64,
325326
FeatureZCZeroingFPWorkaround]>;
326327

327328
def TuneAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10",
@@ -334,7 +335,8 @@ def TuneAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10",
334335
FeatureFuseCryptoEOR,
335336
FeatureStorePairSuppress,
336337
FeatureZCRegMoveGPR64,
337-
FeatureZCZeroing]>;
338+
FeatureZCZeroingGPR32,
339+
FeatureZCZeroingGPR64]>;
338340

339341
def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11",
340342
"Apple A11", [
@@ -346,7 +348,8 @@ def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11",
346348
FeatureFuseCryptoEOR,
347349
FeatureStorePairSuppress,
348350
FeatureZCRegMoveGPR64,
349-
FeatureZCZeroing]>;
351+
FeatureZCZeroingGPR32,
352+
FeatureZCZeroingGPR64]>;
350353

351354
def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12",
352355
"Apple A12", [
@@ -358,7 +361,8 @@ def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12",
358361
FeatureFuseCryptoEOR,
359362
FeatureStorePairSuppress,
360363
FeatureZCRegMoveGPR64,
361-
FeatureZCZeroing]>;
364+
FeatureZCZeroingGPR32,
365+
FeatureZCZeroingGPR64]>;
362366

363367
def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13",
364368
"Apple A13", [
@@ -370,7 +374,8 @@ def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13",
370374
FeatureFuseCryptoEOR,
371375
FeatureStorePairSuppress,
372376
FeatureZCRegMoveGPR64,
373-
FeatureZCZeroing]>;
377+
FeatureZCZeroingGPR32,
378+
FeatureZCZeroingGPR64]>;
374379

375380
def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
376381
"Apple A14", [
@@ -387,7 +392,8 @@ def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
387392
FeatureFuseLiterals,
388393
FeatureStorePairSuppress,
389394
FeatureZCRegMoveGPR64,
390-
FeatureZCZeroing]>;
395+
FeatureZCZeroingGPR32,
396+
FeatureZCZeroingGPR64]>;
391397

392398
def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
393399
"Apple A15", [
@@ -404,7 +410,8 @@ def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
404410
FeatureFuseLiterals,
405411
FeatureStorePairSuppress,
406412
FeatureZCRegMoveGPR64,
407-
FeatureZCZeroing]>;
413+
FeatureZCZeroingGPR32,
414+
FeatureZCZeroingGPR64]>;
408415

409416
def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
410417
"Apple A16", [
@@ -421,7 +428,8 @@ def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
421428
FeatureFuseLiterals,
422429
FeatureStorePairSuppress,
423430
FeatureZCRegMoveGPR64,
424-
FeatureZCZeroing]>;
431+
FeatureZCZeroingGPR32,
432+
FeatureZCZeroingGPR64]>;
425433

426434
def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
427435
"Apple A17", [
@@ -438,7 +446,8 @@ def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
438446
FeatureFuseLiterals,
439447
FeatureStorePairSuppress,
440448
FeatureZCRegMoveGPR64,
441-
FeatureZCZeroing]>;
449+
FeatureZCZeroingGPR32,
450+
FeatureZCZeroingGPR64]>;
442451

443452
def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
444453
"Apple M4", [
@@ -454,8 +463,8 @@ def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
454463
FeatureFuseCryptoEOR,
455464
FeatureFuseLiterals,
456465
FeatureZCRegMoveGPR64,
457-
FeatureZCZeroing
458-
]>;
466+
FeatureZCZeroingGPR32,
467+
FeatureZCZeroingGPR64]>;
459468

460469
def TuneExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
461470
"Samsung Exynos-M3 processors",
@@ -487,21 +496,24 @@ def TuneExynosM4 : SubtargetFeature<"exynosm4", "ARMProcFamily", "ExynosM3",
487496
FeatureStorePairSuppress,
488497
FeatureALULSLFast,
489498
FeaturePostRAScheduler,
490-
FeatureZCZeroing]>;
499+
FeatureZCZeroingGPR32,
500+
FeatureZCZeroingGPR64]>;
491501

492502
def TuneKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
493503
"Qualcomm Kryo processors", [
494504
FeaturePostRAScheduler,
495505
FeaturePredictableSelectIsExpensive,
496-
FeatureZCZeroing,
506+
FeatureZCZeroingGPR32,
507+
FeatureZCZeroingGPR64,
497508
FeatureALULSLFast,
498509
FeatureStorePairSuppress]>;
499510

500511
def TuneFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
501512
"Qualcomm Falkor processors", [
502513
FeaturePostRAScheduler,
503514
FeaturePredictableSelectIsExpensive,
504-
FeatureZCZeroing,
515+
FeatureZCZeroingGPR32,
516+
FeatureZCZeroingGPR64,
505517
FeatureStorePairSuppress,
506518
FeatureALULSLFast,
507519
FeatureSlowSTRQro]>;
@@ -597,7 +609,8 @@ def TuneSaphira : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira",
597609
"Qualcomm Saphira processors", [
598610
FeaturePostRAScheduler,
599611
FeaturePredictableSelectIsExpensive,
600-
FeatureZCZeroing,
612+
FeatureZCZeroingGPR32,
613+
FeatureZCZeroingGPR64,
601614
FeatureStorePairSuppress,
602615
FeatureALULSLFast]>;
603616

llvm/test/CodeGen/AArch64/arm64-copy-phys-zero-reg.mir

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
11
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
2-
# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,-zcm-gpr64,-zcz" %s \
2+
# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,-zcm-gpr64,-zcz-gpr32,-zcz-gpr64" %s \
33
# RUN: | FileCheck --check-prefix=CHECK-NO-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ %s
4-
# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,-zcm-gpr64,-zcz" %s \
4+
# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,-zcm-gpr64,-zcz-gpr32,-zcz-gpr64" %s \
55
# RUN: | FileCheck --check-prefix=CHECK-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ %s
6-
# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,+zcm-gpr64,-zcz" %s \
6+
# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,+zcm-gpr64,-zcz-gpr32,-zcz-gpr64" %s \
77
# RUN: | FileCheck --check-prefix=CHECK-NO-ZCM-GPR32-ZCM-GPR64-NO-ZCZ %s
8-
# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,+zcm-gpr64,-zcz" %s \
8+
# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,+zcm-gpr64,-zcz-gpr32,-zcz-gpr64" %s \
99
# RUN: | FileCheck --check-prefix=CHECK-ZCM-GPR32-ZCM-GPR64-NO-ZCZ %s
10-
# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,-zcm-gpr64,+zcz" %s \
10+
# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,-zcm-gpr64,+zcz-gpr32,+zcz-gpr64" %s \
1111
# RUN: | FileCheck --check-prefix=CHECK-NO-ZCM-ZCZ %s
12-
# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,+zcm-gpr64,+zcz" %s \
12+
# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,+zcm-gpr64,+zcz-gpr32,+zcz-gpr64" %s \
1313
# RUN: | FileCheck --check-prefix=CHECK-ZCM-ZCZ %s
1414

1515
--- |
Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+no-zcz-fpr64 | FileCheck %s -check-prefixes=ALL,NOZCZ-FPR64
2+
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+no-zcz-fpr64,+fullfp16 | FileCheck %s -check-prefixes=ALL,NOZCZ-FPR64-FULLFP16
3+
; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64
4+
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+fullfp16 | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64
5+
; RUN: llc < %s -mtriple=arm64-apple-ios -mcpu=cyclone | FileCheck %s -check-prefixes=ALL,FP-WORKAROUND
6+
; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64
7+
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64
8+
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=kryo | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64
9+
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=falkor | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64
10+
11+
define half @tf16() {
12+
entry:
13+
; ALL-LABEL: tf16:
14+
; FP-WORKAROUND: mov s0, wzr
15+
; NOZCZ-FPR64: mov s0, wzr
16+
; NOZCZ-FPR64-FULLFP16: mov h0, wzr
17+
; ZCZ-FPR64: movi d0, #0
18+
ret half 0.0
19+
}
20+
21+
define float @tf32() {
22+
entry:
23+
; ALL-LABEL: tf32:
24+
; FP-WORKAROUND: mov s0, wzr
25+
; NOZCZ-FPR64: mov s0, wzr
26+
; ZCZ-FPR64: movi d0, #0
27+
ret float 0.0
28+
}
29+
30+
define double @td64() {
31+
entry:
32+
; ALL-LABEL: td64:
33+
; FP-WORKAROUND: mov d0, xzr
34+
; NOZCZ-FPR64: mov d0, xzr
35+
; ZCZ-FPR64: movi d0, #0
36+
ret double 0.0
37+
}
38+
39+
define <8 x i8> @tv8i8() {
40+
entry:
41+
; ALL-LABEL: tv8i8:
42+
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
43+
; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
44+
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
45+
ret <8 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
46+
}
47+
48+
define <4 x i16> @tv4i16() {
49+
entry:
50+
; ALL-LABEL: tv4i16:
51+
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
52+
; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
53+
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
54+
ret <4 x i16> <i16 0, i16 0, i16 0, i16 0>
55+
}
56+
57+
define <2 x i32> @tv2i32() {
58+
entry:
59+
; ALL-LABEL: tv2i32:
60+
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
61+
; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
62+
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
63+
ret <2 x i32> <i32 0, i32 0>
64+
}
65+
66+
define <2 x float> @tv2f32() {
67+
entry:
68+
; ALL-LABEL: tv2f32:
69+
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
70+
; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
71+
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
72+
ret <2 x float> <float 0.0, float 0.0>
73+
}
74+
75+
define <16 x i8> @tv16i8() {
76+
entry:
77+
; ALL-LABEL: tv16i8:
78+
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
79+
; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
80+
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
81+
ret <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
82+
}
83+
84+
define <8 x i16> @tv8i16() {
85+
entry:
86+
; ALL-LABEL: tv8i16:
87+
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
88+
; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
89+
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
90+
ret <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
91+
}
92+
93+
define <4 x i32> @tv4i32() {
94+
entry:
95+
; ALL-LABEL: tv4i32:
96+
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
97+
; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
98+
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
99+
ret <4 x i32> <i32 0, i32 0, i32 0, i32 0>
100+
}
101+
102+
define <2 x i64> @tv2i64() {
103+
entry:
104+
; ALL-LABEL: tv2i64:
105+
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
106+
; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
107+
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
108+
ret <2 x i64> <i64 0, i64 0>
109+
}
110+
111+
define <4 x float> @tv4f32() {
112+
entry:
113+
; ALL-LABEL: tv4f32:
114+
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
115+
; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
116+
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
117+
ret <4 x float> <float 0.0, float 0.0, float 0.0, float 0.0>
118+
}
119+
120+
define <2 x double> @tv2d64() {
121+
entry:
122+
; ALL-LABEL: tv2d64:
123+
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
124+
; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
125+
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
126+
ret <2 x double> <double 0.0, double 0.0>
127+
}
128+
129+
; We used to produce spills+reloads for a Q register with zero cycle zeroing
130+
; enabled.
131+
; ALL-LABEL: foo:
132+
; ALL-NOT: str q{{[0-9]+}}
133+
; ALL-NOT: ldr q{{[0-9]+}}
134+
define double @foo(i32 %n) {
135+
entry:
136+
br label %for.body
137+
138+
for.body:
139+
%phi0 = phi double [ 1.0, %entry ], [ %v0, %for.body ]
140+
%i.076 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
141+
%conv21 = sitofp i32 %i.076 to double
142+
%call = tail call fast double @sin(double %conv21)
143+
%cmp.i = fcmp fast olt double %phi0, %call
144+
%v0 = select i1 %cmp.i, double %call, double %phi0
145+
%inc = add nuw nsw i32 %i.076, 1
146+
%cmp = icmp slt i32 %inc, %n
147+
br i1 %cmp, label %for.body, label %for.end
148+
149+
for.end:
150+
ret double %v0
151+
}
152+
153+
declare double @sin(double)

0 commit comments

Comments
 (0)