diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index d9d3569affa39..3e68bb71c45c4 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -5222,19 +5222,13 @@ LegalizerHelper::fewerElementsVectorExtractInsertVectorElt(MachineInstr &MI, InsertVal = MI.getOperand(2).getReg(); Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg(); - - // TODO: Handle total scalarization case. - if (!NarrowVecTy.isVector()) - return UnableToLegalize; - LLT VecTy = MRI.getType(SrcVec); // If the index is a constant, we can really break this down as you would // expect, and index into the target size pieces. - int64_t IdxVal; auto MaybeCst = getIConstantVRegValWithLookThrough(Idx, MRI); if (MaybeCst) { - IdxVal = MaybeCst->Value.getSExtValue(); + uint64_t IdxVal = MaybeCst->Value.getZExtValue(); // Avoid out of bounds indexing the pieces. if (IdxVal >= VecTy.getNumElements()) { MIRBuilder.buildUndef(DstReg); @@ -5242,33 +5236,45 @@ LegalizerHelper::fewerElementsVectorExtractInsertVectorElt(MachineInstr &MI, return Legalized; } - SmallVector VecParts; - LLT GCDTy = extractGCDType(VecParts, VecTy, NarrowVecTy, SrcVec); + if (!NarrowVecTy.isVector()) { + SmallVector SplitPieces; + extractParts(MI.getOperand(1).getReg(), NarrowVecTy, + VecTy.getNumElements(), SplitPieces, MIRBuilder, MRI); + if (IsInsert) { + SplitPieces[IdxVal] = InsertVal; + MIRBuilder.buildMergeLikeInstr(MI.getOperand(0).getReg(), SplitPieces); + } else { + MIRBuilder.buildCopy(MI.getOperand(0).getReg(), SplitPieces[IdxVal]); + } + } else { + SmallVector VecParts; + LLT GCDTy = extractGCDType(VecParts, VecTy, NarrowVecTy, SrcVec); - // Build a sequence of NarrowTy pieces in VecParts for this operand. - LLT LCMTy = buildLCMMergePieces(VecTy, NarrowVecTy, GCDTy, VecParts, - TargetOpcode::G_ANYEXT); + // Build a sequence of NarrowTy pieces in VecParts for this operand. + LLT LCMTy = buildLCMMergePieces(VecTy, NarrowVecTy, GCDTy, VecParts, + TargetOpcode::G_ANYEXT); - unsigned NewNumElts = NarrowVecTy.getNumElements(); + unsigned NewNumElts = NarrowVecTy.getNumElements(); - LLT IdxTy = MRI.getType(Idx); - int64_t PartIdx = IdxVal / NewNumElts; - auto NewIdx = - MIRBuilder.buildConstant(IdxTy, IdxVal - NewNumElts * PartIdx); + LLT IdxTy = MRI.getType(Idx); + int64_t PartIdx = IdxVal / NewNumElts; + auto NewIdx = + MIRBuilder.buildConstant(IdxTy, IdxVal - NewNumElts * PartIdx); - if (IsInsert) { - LLT PartTy = MRI.getType(VecParts[PartIdx]); + if (IsInsert) { + LLT PartTy = MRI.getType(VecParts[PartIdx]); - // Use the adjusted index to insert into one of the subvectors. - auto InsertPart = MIRBuilder.buildInsertVectorElement( - PartTy, VecParts[PartIdx], InsertVal, NewIdx); - VecParts[PartIdx] = InsertPart.getReg(0); + // Use the adjusted index to insert into one of the subvectors. + auto InsertPart = MIRBuilder.buildInsertVectorElement( + PartTy, VecParts[PartIdx], InsertVal, NewIdx); + VecParts[PartIdx] = InsertPart.getReg(0); - // Recombine the inserted subvector with the others to reform the result - // vector. - buildWidenedRemergeToDst(DstReg, LCMTy, VecParts); - } else { - MIRBuilder.buildExtractVectorElement(DstReg, VecParts[PartIdx], NewIdx); + // Recombine the inserted subvector with the others to reform the result + // vector. + buildWidenedRemergeToDst(DstReg, LCMTy, VecParts); + } else { + MIRBuilder.buildExtractVectorElement(DstReg, VecParts[PartIdx], NewIdx); + } } MI.eraseFromParent(); diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index e0e1af78770de..1b98fecc40ceb 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -1146,7 +1146,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .clampMaxNumElements(1, s32, 4) .clampMaxNumElements(1, s16, 8) .clampMaxNumElements(1, s8, 16) - .clampMaxNumElements(1, p0, 2); + .clampMaxNumElements(1, p0, 2) + .scalarizeIf(scalarOrEltWiderThan(1, 64), 1); getActionDefinitionsBuilder(G_INSERT_VECTOR_ELT) .legalIf( @@ -1161,7 +1162,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .clampNumElements(0, v4s16, v8s16) .clampNumElements(0, v2s32, v4s32) .clampMaxNumElements(0, s64, 2) - .clampMaxNumElements(0, p0, 2); + .clampMaxNumElements(0, p0, 2) + .scalarizeIf(scalarOrEltWiderThan(0, 64), 0); getActionDefinitionsBuilder(G_BUILD_VECTOR) .legalFor({{v8s8, s8}, diff --git a/llvm/test/CodeGen/AArch64/insertextract.ll b/llvm/test/CodeGen/AArch64/insertextract.ll index 13a43d6d35239..a9167ad6ebb70 100644 --- a/llvm/test/CodeGen/AArch64/insertextract.ll +++ b/llvm/test/CodeGen/AArch64/insertextract.ll @@ -1,17 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD -; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI - -; CHECK-GI: warning: Instruction selection used fallback path for insert_v2i128_0 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v2i128_1 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v2i128_c -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v2fp128_0 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v2fp128_1 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for insert_v2fp128_c -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for extract_v2i128_0 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for extract_v2i128_1 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for extract_v2i128_c -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for extract_v2fp128_c +; RUN: llc -mtriple=aarch64 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI define <2 x double> @insert_v2f64_0(<2 x double> %a, double %b, i32 %c) { ; CHECK-LABEL: insert_v2f64_0: @@ -1324,13 +1313,21 @@ entry: } define <2 x i128> @insert_v2i128_0(<2 x i128> %a, i128 %b, i32 %c) { -; CHECK-LABEL: insert_v2i128_0: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: adds x2, x2, x2 -; CHECK-NEXT: mov x1, x5 -; CHECK-NEXT: mov x0, x4 -; CHECK-NEXT: adc x3, x3, x3 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: insert_v2i128_0: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: adds x2, x2, x2 +; CHECK-SD-NEXT: mov x1, x5 +; CHECK-SD-NEXT: mov x0, x4 +; CHECK-SD-NEXT: adc x3, x3, x3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: insert_v2i128_0: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: adds x2, x2, x2 +; CHECK-GI-NEXT: mov x0, x4 +; CHECK-GI-NEXT: mov x1, x5 +; CHECK-GI-NEXT: adc x3, x3, x3 +; CHECK-GI-NEXT: ret entry: %aa = add <2 x i128> %a, %a %d = insertelement <2 x i128> %aa, i128 %b, i32 0 @@ -1338,13 +1335,21 @@ entry: } define <2 x i128> @insert_v2i128_1(<2 x i128> %a, i128 %b, i32 %c) { -; CHECK-LABEL: insert_v2i128_1: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: adds x0, x0, x0 -; CHECK-NEXT: mov x3, x5 -; CHECK-NEXT: mov x2, x4 -; CHECK-NEXT: adc x1, x1, x1 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: insert_v2i128_1: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: adds x0, x0, x0 +; CHECK-SD-NEXT: mov x3, x5 +; CHECK-SD-NEXT: mov x2, x4 +; CHECK-SD-NEXT: adc x1, x1, x1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: insert_v2i128_1: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: adds x0, x0, x0 +; CHECK-GI-NEXT: mov x2, x4 +; CHECK-GI-NEXT: mov x3, x5 +; CHECK-GI-NEXT: adc x1, x1, x1 +; CHECK-GI-NEXT: ret entry: %aa = add <2 x i128> %a, %a %d = insertelement <2 x i128> %aa, i128 %b, i32 1 @@ -1352,28 +1357,63 @@ entry: } define <2 x i128> @insert_v2i128_c(<2 x i128> %a, i128 %b, i32 %c) { -; CHECK-LABEL: insert_v2i128_c: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub sp, sp, #32 -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: // kill: def $w6 killed $w6 def $x6 -; CHECK-NEXT: adds x8, x0, x0 -; CHECK-NEXT: and x11, x6, #0x1 -; CHECK-NEXT: mov x12, sp -; CHECK-NEXT: adc x9, x1, x1 -; CHECK-NEXT: adds x10, x2, x2 -; CHECK-NEXT: add x11, x12, x11, lsl #4 -; CHECK-NEXT: str x8, [sp] -; CHECK-NEXT: adc x8, x3, x3 -; CHECK-NEXT: str x10, [sp, #16] -; CHECK-NEXT: str x4, [x11] -; CHECK-NEXT: str x8, [sp, #24] -; CHECK-NEXT: str x9, [sp, #8] -; CHECK-NEXT: str x5, [x11, #8] -; CHECK-NEXT: ldp x0, x1, [sp] -; CHECK-NEXT: ldp x2, x3, [sp, #16] -; CHECK-NEXT: add sp, sp, #32 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: insert_v2i128_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #32 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 32 +; CHECK-SD-NEXT: // kill: def $w6 killed $w6 def $x6 +; CHECK-SD-NEXT: adds x8, x0, x0 +; CHECK-SD-NEXT: and x11, x6, #0x1 +; CHECK-SD-NEXT: mov x12, sp +; CHECK-SD-NEXT: adc x9, x1, x1 +; CHECK-SD-NEXT: adds x10, x2, x2 +; CHECK-SD-NEXT: add x11, x12, x11, lsl #4 +; CHECK-SD-NEXT: str x8, [sp] +; CHECK-SD-NEXT: adc x8, x3, x3 +; CHECK-SD-NEXT: str x10, [sp, #16] +; CHECK-SD-NEXT: str x4, [x11] +; CHECK-SD-NEXT: str x8, [sp, #24] +; CHECK-SD-NEXT: str x9, [sp, #8] +; CHECK-SD-NEXT: str x5, [x11, #8] +; CHECK-SD-NEXT: ldp x0, x1, [sp] +; CHECK-SD-NEXT: ldp x2, x3, [sp, #16] +; CHECK-SD-NEXT: add sp, sp, #32 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: insert_v2i128_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-GI-NEXT: sub x9, sp, #48 +; CHECK-GI-NEXT: mov x29, sp +; CHECK-GI-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK-GI-NEXT: .cfi_def_cfa w29, 16 +; CHECK-GI-NEXT: .cfi_offset w30, -8 +; CHECK-GI-NEXT: .cfi_offset w29, -16 +; CHECK-GI-NEXT: adds x8, x0, x0 +; CHECK-GI-NEXT: mov v2.d[0], x4 +; CHECK-GI-NEXT: adc x9, x1, x1 +; CHECK-GI-NEXT: mov v0.d[0], x8 +; CHECK-GI-NEXT: adds x8, x2, x2 +; CHECK-GI-NEXT: mov v1.d[0], x8 +; CHECK-GI-NEXT: adc x8, x3, x3 +; CHECK-GI-NEXT: mov v2.d[1], x5 +; CHECK-GI-NEXT: mov v0.d[1], x9 +; CHECK-GI-NEXT: mov x9, sp +; CHECK-GI-NEXT: mov v1.d[1], x8 +; CHECK-GI-NEXT: mov w8, w6 +; CHECK-GI-NEXT: and x8, x8, #0x1 +; CHECK-GI-NEXT: stp q0, q1, [sp] +; CHECK-GI-NEXT: str q2, [x9, x8, lsl #4] +; CHECK-GI-NEXT: ldp q0, q1, [sp] +; CHECK-GI-NEXT: mov d2, v0.d[1] +; CHECK-GI-NEXT: mov d3, v1.d[1] +; CHECK-GI-NEXT: fmov x0, d0 +; CHECK-GI-NEXT: fmov x2, d1 +; CHECK-GI-NEXT: fmov x1, d2 +; CHECK-GI-NEXT: fmov x3, d3 +; CHECK-GI-NEXT: mov sp, x29 +; CHECK-GI-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-GI-NEXT: ret entry: %aa = add <2 x i128> %a, %a %d = insertelement <2 x i128> %aa, i128 %b, i32 %c @@ -1381,20 +1421,38 @@ entry: } define <2 x fp128> @insert_v2fp128_0(<2 x fp128> %a, fp128 %b, i32 %c) { -; CHECK-LABEL: insert_v2fp128_0: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub sp, sp, #32 -; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: str q2, [sp] // 16-byte Folded Spill -; CHECK-NEXT: bl __addtf3 -; CHECK-NEXT: mov v1.16b, v0.16b -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload -; CHECK-NEXT: add sp, sp, #32 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: insert_v2fp128_0: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #32 +; CHECK-SD-NEXT: str x30, [sp, #16] // 8-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 32 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: mov v0.16b, v1.16b +; CHECK-SD-NEXT: str q2, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: bl __addtf3 +; CHECK-SD-NEXT: mov v1.16b, v0.16b +; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-SD-NEXT: add sp, sp, #32 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: insert_v2fp128_0: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #48 +; CHECK-GI-NEXT: str x30, [sp, #32] // 8-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 48 +; CHECK-GI-NEXT: .cfi_offset w30, -16 +; CHECK-GI-NEXT: stp q1, q2, [sp] // 32-byte Folded Spill +; CHECK-GI-NEXT: mov v1.16b, v0.16b +; CHECK-GI-NEXT: bl __addtf3 +; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.16b, v0.16b +; CHECK-GI-NEXT: bl __addtf3 +; CHECK-GI-NEXT: mov v1.16b, v0.16b +; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload +; CHECK-GI-NEXT: add sp, sp, #48 +; CHECK-GI-NEXT: ret entry: %aa = fadd <2 x fp128> %a, %a %d = insertelement <2 x fp128> %aa, fp128 %b, i32 0 @@ -1402,19 +1460,38 @@ entry: } define <2 x fp128> @insert_v2fp128_1(<2 x fp128> %a, fp128 %b, i32 %c) { -; CHECK-LABEL: insert_v2fp128_1: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub sp, sp, #32 -; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: mov v1.16b, v0.16b -; CHECK-NEXT: str q2, [sp] // 16-byte Folded Spill -; CHECK-NEXT: bl __addtf3 -; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload -; CHECK-NEXT: add sp, sp, #32 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: insert_v2fp128_1: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #32 +; CHECK-SD-NEXT: str x30, [sp, #16] // 8-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 32 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: mov v1.16b, v0.16b +; CHECK-SD-NEXT: str q2, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: bl __addtf3 +; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-SD-NEXT: add sp, sp, #32 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: insert_v2fp128_1: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #64 +; CHECK-GI-NEXT: str x30, [sp, #48] // 8-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 64 +; CHECK-GI-NEXT: .cfi_offset w30, -16 +; CHECK-GI-NEXT: stp q1, q2, [sp, #16] // 32-byte Folded Spill +; CHECK-GI-NEXT: mov v1.16b, v0.16b +; CHECK-GI-NEXT: bl __addtf3 +; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill +; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.16b, v0.16b +; CHECK-GI-NEXT: bl __addtf3 +; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload +; CHECK-GI-NEXT: add sp, sp, #64 +; CHECK-GI-NEXT: ret entry: %aa = fadd <2 x fp128> %a, %a %d = insertelement <2 x fp128> %aa, fp128 %b, i32 1 @@ -1422,32 +1499,65 @@ entry: } define <2 x fp128> @insert_v2fp128_c(<2 x fp128> %a, fp128 %b, i32 %c) { -; CHECK-LABEL: insert_v2fp128_c: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub sp, sp, #96 -; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 96 -; CHECK-NEXT: .cfi_offset w19, -8 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: str q1, [sp] // 16-byte Folded Spill -; CHECK-NEXT: mov v1.16b, v0.16b -; CHECK-NEXT: mov w19, w0 -; CHECK-NEXT: str q2, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: bl __addtf3 -; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: mov v1.16b, v0.16b -; CHECK-NEXT: bl __addtf3 -; CHECK-NEXT: str q0, [sp, #64] -; CHECK-NEXT: ldp q3, q0, [sp, #16] // 32-byte Folded Reload -; CHECK-NEXT: and x8, x19, #0x1 -; CHECK-NEXT: add x9, sp, #48 -; CHECK-NEXT: str q3, [sp, #48] -; CHECK-NEXT: str q0, [x9, x8, lsl #4] -; CHECK-NEXT: ldp q0, q1, [sp, #48] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: add sp, sp, #96 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: insert_v2fp128_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #96 +; CHECK-SD-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 96 +; CHECK-SD-NEXT: .cfi_offset w19, -8 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: str q1, [sp] // 16-byte Folded Spill +; CHECK-SD-NEXT: mov v1.16b, v0.16b +; CHECK-SD-NEXT: mov w19, w0 +; CHECK-SD-NEXT: str q2, [sp, #32] // 16-byte Folded Spill +; CHECK-SD-NEXT: bl __addtf3 +; CHECK-SD-NEXT: str q0, [sp, #16] // 16-byte Folded Spill +; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: mov v1.16b, v0.16b +; CHECK-SD-NEXT: bl __addtf3 +; CHECK-SD-NEXT: str q0, [sp, #64] +; CHECK-SD-NEXT: ldp q3, q0, [sp, #16] // 32-byte Folded Reload +; CHECK-SD-NEXT: and x8, x19, #0x1 +; CHECK-SD-NEXT: add x9, sp, #48 +; CHECK-SD-NEXT: str q3, [sp, #48] +; CHECK-SD-NEXT: str q0, [x9, x8, lsl #4] +; CHECK-SD-NEXT: ldp q0, q1, [sp, #48] +; CHECK-SD-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload +; CHECK-SD-NEXT: add sp, sp, #96 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: insert_v2fp128_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-GI-NEXT: sub x9, sp, #96 +; CHECK-GI-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-GI-NEXT: mov x29, sp +; CHECK-GI-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK-GI-NEXT: .cfi_def_cfa w29, 32 +; CHECK-GI-NEXT: .cfi_offset w19, -16 +; CHECK-GI-NEXT: .cfi_offset w30, -24 +; CHECK-GI-NEXT: .cfi_offset w29, -32 +; CHECK-GI-NEXT: str q1, [sp, #16] // 16-byte Folded Spill +; CHECK-GI-NEXT: mov v1.16b, v0.16b +; CHECK-GI-NEXT: mov w19, w0 +; CHECK-GI-NEXT: str q2, [sp, #48] // 16-byte Folded Spill +; CHECK-GI-NEXT: bl __addtf3 +; CHECK-GI-NEXT: str q0, [sp, #32] // 16-byte Folded Spill +; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov v1.16b, v0.16b +; CHECK-GI-NEXT: bl __addtf3 +; CHECK-GI-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: mov w8, w19 +; CHECK-GI-NEXT: add x9, sp, #64 +; CHECK-GI-NEXT: and x8, x8, #0x1 +; CHECK-GI-NEXT: stp q1, q0, [sp, #64] +; CHECK-GI-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload +; CHECK-GI-NEXT: str q0, [x9, x8, lsl #4] +; CHECK-GI-NEXT: ldp q0, q1, [sp, #64] +; CHECK-GI-NEXT: mov sp, x29 +; CHECK-GI-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-GI-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-GI-NEXT: ret entry: %aa = fadd <2 x fp128> %a, %a %d = insertelement <2 x fp128> %aa, fp128 %b, i32 %c @@ -2741,31 +2851,60 @@ entry: } define i128 @extract_v2i128_c(<2 x i128> %a, i32 %c) { -; CHECK-LABEL: extract_v2i128_c: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub sp, sp, #64 -; CHECK-NEXT: .cfi_def_cfa_offset 64 -; CHECK-NEXT: adds x9, x0, x0 -; CHECK-NEXT: mov w8, w4 -; CHECK-NEXT: adc x10, x1, x1 -; CHECK-NEXT: adds x11, x2, x2 -; CHECK-NEXT: fmov d1, x9 -; CHECK-NEXT: fmov d0, x11 -; CHECK-NEXT: adc x12, x3, x3 -; CHECK-NEXT: add x8, x8, x8 -; CHECK-NEXT: and x9, x8, #0x3 -; CHECK-NEXT: add w8, w8, #1 -; CHECK-NEXT: mov x11, sp -; CHECK-NEXT: mov v1.d[1], x10 -; CHECK-NEXT: add x10, sp, #32 -; CHECK-NEXT: and x8, x8, #0x3 -; CHECK-NEXT: mov v0.d[1], x12 -; CHECK-NEXT: stp q1, q0, [sp] -; CHECK-NEXT: stp q1, q0, [sp, #32] -; CHECK-NEXT: ldr x0, [x10, x9, lsl #3] -; CHECK-NEXT: ldr x1, [x11, x8, lsl #3] -; CHECK-NEXT: add sp, sp, #64 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: extract_v2i128_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #64 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 64 +; CHECK-SD-NEXT: adds x9, x0, x0 +; CHECK-SD-NEXT: mov w8, w4 +; CHECK-SD-NEXT: adc x10, x1, x1 +; CHECK-SD-NEXT: adds x11, x2, x2 +; CHECK-SD-NEXT: fmov d1, x9 +; CHECK-SD-NEXT: fmov d0, x11 +; CHECK-SD-NEXT: adc x12, x3, x3 +; CHECK-SD-NEXT: add x8, x8, x8 +; CHECK-SD-NEXT: and x9, x8, #0x3 +; CHECK-SD-NEXT: add w8, w8, #1 +; CHECK-SD-NEXT: mov x11, sp +; CHECK-SD-NEXT: mov v1.d[1], x10 +; CHECK-SD-NEXT: add x10, sp, #32 +; CHECK-SD-NEXT: and x8, x8, #0x3 +; CHECK-SD-NEXT: mov v0.d[1], x12 +; CHECK-SD-NEXT: stp q1, q0, [sp] +; CHECK-SD-NEXT: stp q1, q0, [sp, #32] +; CHECK-SD-NEXT: ldr x0, [x10, x9, lsl #3] +; CHECK-SD-NEXT: ldr x1, [x11, x8, lsl #3] +; CHECK-SD-NEXT: add sp, sp, #64 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v2i128_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-GI-NEXT: sub x9, sp, #48 +; CHECK-GI-NEXT: mov x29, sp +; CHECK-GI-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK-GI-NEXT: .cfi_def_cfa w29, 16 +; CHECK-GI-NEXT: .cfi_offset w30, -8 +; CHECK-GI-NEXT: .cfi_offset w29, -16 +; CHECK-GI-NEXT: adds x8, x0, x0 +; CHECK-GI-NEXT: adc x9, x1, x1 +; CHECK-GI-NEXT: mov v0.d[0], x8 +; CHECK-GI-NEXT: adds x8, x2, x2 +; CHECK-GI-NEXT: mov v1.d[0], x8 +; CHECK-GI-NEXT: adc x8, x3, x3 +; CHECK-GI-NEXT: mov v0.d[1], x9 +; CHECK-GI-NEXT: mov x9, sp +; CHECK-GI-NEXT: mov v1.d[1], x8 +; CHECK-GI-NEXT: mov w8, w4 +; CHECK-GI-NEXT: and x8, x8, #0x1 +; CHECK-GI-NEXT: stp q0, q1, [sp] +; CHECK-GI-NEXT: ldr q0, [x9, x8, lsl #4] +; CHECK-GI-NEXT: mov d1, v0.d[1] +; CHECK-GI-NEXT: fmov x0, d0 +; CHECK-GI-NEXT: fmov x1, d1 +; CHECK-GI-NEXT: mov sp, x29 +; CHECK-GI-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-GI-NEXT: ret entry: %b = add <2 x i128> %a, %a %d = extractelement <2 x i128> %b, i32 %c @@ -2792,16 +2931,34 @@ entry: } define fp128 @extract_v2fp128_c(<2 x fp128> %a, i32 %c) { -; CHECK-LABEL: extract_v2fp128_c: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: stp q0, q1, [sp, #-32]! -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK-NEXT: and x8, x0, #0x1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: ldr q0, [x9, x8, lsl #4] -; CHECK-NEXT: add sp, sp, #32 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: extract_v2fp128_c: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: stp q0, q1, [sp, #-32]! +; CHECK-SD-NEXT: .cfi_def_cfa_offset 32 +; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-SD-NEXT: and x8, x0, #0x1 +; CHECK-SD-NEXT: mov x9, sp +; CHECK-SD-NEXT: ldr q0, [x9, x8, lsl #4] +; CHECK-SD-NEXT: add sp, sp, #32 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v2fp128_c: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-GI-NEXT: sub x9, sp, #48 +; CHECK-GI-NEXT: mov x29, sp +; CHECK-GI-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK-GI-NEXT: .cfi_def_cfa w29, 16 +; CHECK-GI-NEXT: .cfi_offset w30, -8 +; CHECK-GI-NEXT: .cfi_offset w29, -16 +; CHECK-GI-NEXT: mov w8, w0 +; CHECK-GI-NEXT: stp q0, q1, [sp] +; CHECK-GI-NEXT: mov x9, sp +; CHECK-GI-NEXT: and x8, x8, #0x1 +; CHECK-GI-NEXT: ldr q0, [x9, x8, lsl #4] +; CHECK-GI-NEXT: mov sp, x29 +; CHECK-GI-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-GI-NEXT: ret entry: %d = extractelement <2 x fp128> %a, i32 %c ret fp128 %d