From 7d055ed9b9db19ac232e791d3ef4587e3a7ede75 Mon Sep 17 00:00:00 2001 From: Lucas Ste <38472950+LucasSte@users.noreply.github.com> Date: Thu, 20 Jun 2024 15:59:56 -0300 Subject: [PATCH] [SOL] Correctly copy 16-byte aligned memory (#97) * Fix issue with copying 16-byte aligned memory --- llvm/lib/Target/SBF/SBFInstrInfo.cpp | 72 +++++++++------ llvm/lib/Target/SBF/SBFSelectionDAGInfo.cpp | 6 +- .../CodeGen/SBF/memcpy-expand-in-order.ll | 6 +- llvm/test/CodeGen/SBF/memcpy_16.ll | 92 +++++++++++++++++++ 4 files changed, 142 insertions(+), 34 deletions(-) create mode 100644 llvm/test/CodeGen/SBF/memcpy_16.ll diff --git a/llvm/lib/Target/SBF/SBFInstrInfo.cpp b/llvm/lib/Target/SBF/SBFInstrInfo.cpp index c0047291f4398b..b222513ae2f112 100644 --- a/llvm/lib/Target/SBF/SBFInstrInfo.cpp +++ b/llvm/lib/Target/SBF/SBFInstrInfo.cpp @@ -52,6 +52,7 @@ void SBFInstrInfo::expandMEMCPY(MachineBasicBlock::iterator MI) const { DebugLoc dl = MI->getDebugLoc(); unsigned LdOpc, StOpc; + unsigned BytesPerOp = std::min(static_cast<unsigned>(Alignment), 8u); switch (Alignment) { case 1: LdOpc = SBF::LDB; @@ -66,6 +67,7 @@ StOpc = SBF::STW; break; case 8: + case 16: LdOpc = SBF::LDD; StOpc = SBF::STD; break; @@ -73,42 +75,54 @@ llvm_unreachable("unsupported memcpy alignment"); } - unsigned IterationNum = CopyLen >> Log2_64(Alignment); - for(unsigned I = 0; I < IterationNum; ++I) { + unsigned IterationNum = (CopyLen >> Log2_64(BytesPerOp)); + for (unsigned I = 0; I < IterationNum; ++I) { BuildMI(*BB, MI, dl, get(LdOpc)) - .addReg(ScratchReg, RegState::Define).addReg(SrcReg) - .addImm(I * Alignment); + .addReg(ScratchReg, RegState::Define) + .addReg(SrcReg) + .addImm(I * BytesPerOp); BuildMI(*BB, MI, dl, get(StOpc)) - .addReg(ScratchReg, RegState::Kill).addReg(DstReg) - .addImm(I * Alignment); + .addReg(ScratchReg, RegState::Kill) + .addReg(DstReg) + .addImm(I 
* BytesPerOp); } - unsigned BytesLeft = CopyLen & (Alignment - 1); - unsigned Offset = IterationNum * Alignment; - bool Hanging4Byte = BytesLeft & 0x4; - bool Hanging2Byte = BytesLeft & 0x2; - bool Hanging1Byte = BytesLeft & 0x1; - if (Hanging4Byte) { - BuildMI(*BB, MI, dl, get(SBF::LDW)) - .addReg(ScratchReg, RegState::Define).addReg(SrcReg).addImm(Offset); - BuildMI(*BB, MI, dl, get(SBF::STW)) - .addReg(ScratchReg, RegState::Kill).addReg(DstReg).addImm(Offset); - Offset += 4; + unsigned BytesLeft = CopyLen - IterationNum * BytesPerOp; + unsigned Offset; + if (BytesLeft == 0) { + BB->erase(MI); + return; } - if (Hanging2Byte) { - BuildMI(*BB, MI, dl, get(SBF::LDH)) - .addReg(ScratchReg, RegState::Define).addReg(SrcReg).addImm(Offset); - BuildMI(*BB, MI, dl, get(SBF::STH)) - .addReg(ScratchReg, RegState::Kill).addReg(DstReg).addImm(Offset); - Offset += 2; - } - if (Hanging1Byte) { - BuildMI(*BB, MI, dl, get(SBF::LDB)) - .addReg(ScratchReg, RegState::Define).addReg(SrcReg).addImm(Offset); - BuildMI(*BB, MI, dl, get(SBF::STB)) - .addReg(ScratchReg, RegState::Kill).addReg(DstReg).addImm(Offset); + + if (BytesLeft < 2) { + Offset = CopyLen - 1; + LdOpc = SBF::LDB; + StOpc = SBF::STB; + } else if (BytesLeft <= 2) { + Offset = CopyLen - 2; + LdOpc = SBF::LDH; + StOpc = SBF::STH; + } else if (BytesLeft <= 4) { + Offset = CopyLen - 4; + LdOpc = SBF::LDW; + StOpc = SBF::STW; + } else if (BytesLeft <= 8) { + Offset = CopyLen - 8; + LdOpc = SBF::LDD; + StOpc = SBF::STD; + } else { + llvm_unreachable("There cannot be more than 8 bytes left"); } + BuildMI(*BB, MI, dl, get(LdOpc)) + .addReg(ScratchReg, RegState::Define) + .addReg(SrcReg) + .addImm(Offset); + BuildMI(*BB, MI, dl, get(StOpc)) + .addReg(ScratchReg, RegState::Kill) + .addReg(DstReg) + .addImm(Offset); + BB->erase(MI); } diff --git a/llvm/lib/Target/SBF/SBFSelectionDAGInfo.cpp b/llvm/lib/Target/SBF/SBFSelectionDAGInfo.cpp index 0a5d3aab104628..53f4e42b25e126 100644 --- a/llvm/lib/Target/SBF/SBFSelectionDAGInfo.cpp 
+++ b/llvm/lib/Target/SBF/SBFSelectionDAGInfo.cpp @@ -27,7 +27,11 @@ SDValue SBFSelectionDAGInfo::EmitTargetCodeForMemcpy( return SDValue(); unsigned CopyLen = ConstantSize->getZExtValue(); - unsigned StoresNumEstimate = alignTo(CopyLen, Alignment) >> Log2(Alignment); + // If the alignment is greater than 8, we can only store and load 8 bytes at a + // time. + uint64_t BytesPerOp = std::min(Alignment.value(), static_cast<uint64_t>(8)); + unsigned StoresNumEstimate = + alignTo(CopyLen, Alignment) >> Log2_64(BytesPerOp); // Impose the same copy length limit as MaxStoresPerMemcpy. if (StoresNumEstimate > getCommonMaxStoresPerMemFunc()) return SDValue(); diff --git a/llvm/test/CodeGen/SBF/memcpy-expand-in-order.ll b/llvm/test/CodeGen/SBF/memcpy-expand-in-order.ll index 2c59a27aab5967..e081396d3fc49c 100644 --- a/llvm/test/CodeGen/SBF/memcpy-expand-in-order.ll +++ b/llvm/test/CodeGen/SBF/memcpy-expand-in-order.ll @@ -72,7 +72,5 @@ entry: ; CHECK: stxdw [[[DST_REG]] + 8], [[SCRATCH_REG]] ; CHECK: ldxdw [[SCRATCH_REG]], [[[SRC_REG]] + 16] ; CHECK: stxdw [[[DST_REG]] + 16], [[SCRATCH_REG]] -; CHECK: ldxh [[SCRATCH_REG]], [[[SRC_REG]] + 24] -; CHECK: stxh [[[DST_REG]] + 24], [[SCRATCH_REG]] -; CHECK: ldxb [[SCRATCH_REG]], [[[SRC_REG]] + 26] -; CHECK: stxb [[[DST_REG]] + 26], [[SCRATCH_REG]] +; CHECK: ldxw [[SCRATCH_REG]], [[[SRC_REG]] + 23] +; CHECK: stxw [[[DST_REG]] + 23], [[SCRATCH_REG]] diff --git a/llvm/test/CodeGen/SBF/memcpy_16.ll b/llvm/test/CodeGen/SBF/memcpy_16.ll new file mode 100644 index 00000000000000..4cfa911aab1911 --- /dev/null +++ b/llvm/test/CodeGen/SBF/memcpy_16.ll @@ -0,0 +1,92 @@ +; RUN: llc < %s -march=sbf -sbf-expand-memcpy-in-order | FileCheck %s + +; Function Attrs: mustprogress nocallback nofree nounwind willreturn memory(argmem: readwrite) +declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #1 + +define void @memcpy_test_1(ptr align 16 %a, ptr align 16 %b) local_unnamed_addr #0 { +entry: + 
call void @llvm.memcpy.p0.p0.i64(ptr align 16 %a, ptr align 16 %b, i64 32, i1 0) + +; 4 pairs of loads and stores +; CHECK: memcpy_test_1 +; CHECK: ldxdw [[SCRATCH_REG:r[0-9]]], [[[SRC_REG:r[0-9]]] + 0] +; CHECK: stxdw [[[DST_REG:r[0-9]]] + 0], [[SCRATCH_REG:r[0-9]]] +; CHECK: ldxdw [[SCRATCH_REG:r[0-9]]], [[[SRC_REG:r[0-9]]] + 8] +; CHECK: stxdw [[[DST_REG:r[0-9]]] + 8], [[SCRATCH_REG:r[0-9]]] +; CHECK: ldxdw [[SCRATCH_REG:r[0-9]]], [[[SRC_REG:r[0-9]]] + 16] +; CHECK: stxdw [[[DST_REG:r[0-9]]] + 16], [[SCRATCH_REG:r[0-9]]] +; CHECK: ldxdw [[SCRATCH_REG:r[0-9]]], [[[SRC_REG:r[0-9]]] + 24] +; CHECK: stxdw [[[DST_REG:r[0-9]]] + 24], [[SCRATCH_REG:r[0-9]]] + ret void +} + +define void @memcpy_test_2(ptr align 16 %a, ptr align 16 %b) local_unnamed_addr #0 { +entry: + call void @llvm.memcpy.p0.p0.i64(ptr align 16 %a, ptr align 16 %b, i64 17, i1 0) + +; 2 pairs of loads and stores + 1 pair for the byte +; CHECK: memcpy_test_2 +; CHECK: ldxdw [[SCRATCH_REG:r[0-9]]], [[[SRC_REG:r[0-9]]] + 0] +; CHECK: stxdw [[[DST_REG:r[0-9]]] + 0], [[SCRATCH_REG:r[0-9]]] +; CHECK: ldxdw [[SCRATCH_REG:r[0-9]]], [[[SRC_REG:r[0-9]]] + 8] +; CHECK: stxdw [[[DST_REG:r[0-9]]] + 8], [[SCRATCH_REG:r[0-9]]] +; CHECK: ldxb [[SCRATCH_REG:r[0-9]]], [[[SRC_REG:r[0-9]]] + 16] +; CHECK: stxb [[[DST_REG:r[0-9]]] + 16], [[SCRATCH_REG:r[0-9]]] + ret void +} + +define void @memcpy_test_3(ptr align 16 %a, ptr align 16 %b) local_unnamed_addr #0 { +entry: + call void @llvm.memcpy.p0.p0.i64(ptr align 16 %a, ptr align 16 %b, i64 18, i1 0) + +; 2 pairs of loads and stores + 1 pair for the 2 bytes +; CHECK: memcpy_test_3 +; CHECK: ldxdw [[SCRATCH_REG:r[0-9]]], [[[SRC_REG:r[0-9]]] + 0] +; CHECK: stxdw [[[DST_REG:r[0-9]]] + 0], [[SCRATCH_REG:r[0-9]]] +; CHECK: ldxdw [[SCRATCH_REG:r[0-9]]], [[[SRC_REG:r[0-9]]] + 8] +; CHECK: stxdw [[[DST_REG:r[0-9]]] + 8], [[SCRATCH_REG:r[0-9]]] +; CHECK: ldxh [[SCRATCH_REG:r[0-9]]], [[[SRC_REG:r[0-9]]] + 16] +; CHECK: stxh [[[DST_REG:r[0-9]]] + 16], [[SCRATCH_REG:r[0-9]]] + ret void 
+} + +define void @memcpy_test_4(ptr align 16 %a, ptr align 16 %b) local_unnamed_addr #0 { +entry: + call void @llvm.memcpy.p0.p0.i64(ptr align 16 %a, ptr align 16 %b, i64 19, i1 0) + +; 2 pairs of loads and stores + 1 pair for the 3 bytes +; CHECK: memcpy_test_4 +; CHECK: ldxdw [[SCRATCH_REG:r[0-9]]], [[[SRC_REG:r[0-9]]] + 0] +; CHECK: stxdw [[[DST_REG:r[0-9]]] + 0], [[SCRATCH_REG:r[0-9]]] +; CHECK: ldxdw [[SCRATCH_REG:r[0-9]]], [[[SRC_REG:r[0-9]]] + 8] +; CHECK: stxdw [[[DST_REG:r[0-9]]] + 8], [[SCRATCH_REG:r[0-9]]] +; CHECK: ldxw [[SCRATCH_REG:r[0-9]]], [[[SRC_REG:r[0-9]]] + 15] +; CHECK: stxw [[[DST_REG:r[0-9]]] + 15], [[SCRATCH_REG:r[0-9]]] + ret void +} + +define void @memcpy_test_5(ptr align 16 %a, ptr align 16 %b) local_unnamed_addr #0 { +entry: + call void @llvm.memcpy.p0.p0.i64(ptr align 16 %a, ptr align 16 %b, i64 21, i1 0) + +; 2 pairs of loads and stores + 1 pair for the 5 bytes +; CHECK: memcpy_test_5 +; CHECK: ldxdw [[SCRATCH_REG:r[0-9]]], [[[SRC_REG:r[0-9]]] + 0] +; CHECK: stxdw [[[DST_REG:r[0-9]]] + 0], [[SCRATCH_REG:r[0-9]]] +; CHECK: ldxdw [[SCRATCH_REG:r[0-9]]], [[[SRC_REG:r[0-9]]] + 8] +; CHECK: stxdw [[[DST_REG:r[0-9]]] + 8], [[SCRATCH_REG:r[0-9]]] +; CHECK: ldxdw [[SCRATCH_REG:r[0-9]]], [[[SRC_REG:r[0-9]]] + 13] +; CHECK: stxdw [[[DST_REG:r[0-9]]] + 13], [[SCRATCH_REG:r[0-9]]] + ret void +} + +define void @memcpy_test_6(ptr align 16 %a, ptr align 16 %b) local_unnamed_addr #0 { +entry: + call void @llvm.memcpy.p0.p0.i64(ptr align 16 %a, ptr align 16 %b, i64 33, i1 0) + +; More than 32 bytes, call memcpy +; CHECK: memcpy_test_6 +; CHECK: mov64 r3, 33 +; CHECK: call memcpy + ret void +} \ No newline at end of file