Fix interaction between stack alignment and inline-asm stack clash protection

As reported in rust-lang/rust#70143, alignment is not taken into account
when doing the stack probing. Fix that by adjusting the first probe if the
stack alignment is small, or by extending the dynamic probing if the
alignment is large.

Differential Revision: https://reviews.llvm.org/D84419

(cherry picked from commit 387e2c2)
serge-sans-paille authored and tstellar committed Nov 25, 2020
1 parent d842e9b commit 8278599
Showing 6 changed files with 512 additions and 51 deletions.
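
For illustration, here is a minimal C++ reproducer of the situation the commit message describes; the function and buffer names are ours, not part of the commit. An over-aligned local forces a realigning AND on the stack pointer, and before this fix the bytes skipped by that realignment were not covered by the inline probes. Compiled with something like `clang++ -O2 -fstack-clash-protection`, which attaches the same `"probe-stack"="inline-asm"` attribute the new test below uses directly:

// Hypothetical C++ reduction of rust-lang/rust#70143. The 4096-byte
// alignment exceeds the default stack alignment, so the prologue must emit
// an AND on the stack pointer; the probing now has to account for the
// bytes that AND may skip.
extern void use_buffer(char *p);

int big_aligned_frame() {
  alignas(4096) char buf[72000]; // ~72000 bytes, mirroring the .ll test below
  use_buffer(buf);
  return buf[0];
}
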
222 changes: 202 additions & 20 deletions llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -586,29 +586,55 @@ void X86FrameLowering::emitStackProbeInlineGeneric(
const uint64_t StackProbeSize = TLI.getStackProbeSize(MF);
uint64_t ProbeChunk = StackProbeSize * 8;

uint64_t MaxAlign =
TRI->needsStackRealignment(MF) ? calculateMaxStackAlign(MF) : 0;

// Synthesize a loop or unroll it, depending on the number of iterations.
// BuildStackAlignAND ensures that only MaxAlign % StackProbeSize bytes are
// left between the unaligned rsp and the current rsp.
if (Offset > ProbeChunk) {
emitStackProbeInlineGenericLoop(MF, MBB, MBBI, DL, Offset);
emitStackProbeInlineGenericLoop(MF, MBB, MBBI, DL, Offset,
MaxAlign % StackProbeSize);
} else {
emitStackProbeInlineGenericBlock(MF, MBB, MBBI, DL, Offset);
emitStackProbeInlineGenericBlock(MF, MBB, MBBI, DL, Offset,
MaxAlign % StackProbeSize);
}
}

void X86FrameLowering::emitStackProbeInlineGenericBlock(
MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
uint64_t Offset) const {
MachineBasicBlock::iterator MBBI, const DebugLoc &DL, uint64_t Offset,
uint64_t AlignOffset) const {

const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
const X86TargetLowering &TLI = *STI.getTargetLowering();
const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, Offset);
const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi;
const uint64_t StackProbeSize = TLI.getStackProbeSize(MF);

uint64_t CurrentOffset = 0;
// 0 Thanks to return address being saved on the stack
uint64_t CurrentProbeOffset = 0;

// For the first N - 1 pages, just probe. I tried to take advantage of
assert(AlignOffset < StackProbeSize);

// If the offset is so small it fits within a page, there's nothing to do.
if (StackProbeSize < Offset + AlignOffset) {

MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
.addReg(StackPtr)
.addImm(StackProbeSize - AlignOffset)
.setMIFlag(MachineInstr::FrameSetup);
MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.

addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MovMIOpc))
.setMIFlag(MachineInstr::FrameSetup),
StackPtr, false, 0)
.addImm(0)
.setMIFlag(MachineInstr::FrameSetup);
NumFrameExtraProbe++;
CurrentOffset = StackProbeSize - AlignOffset;
}

// For the next N - 1 pages, just probe. I tried to take advantage of
// natural probes but it implies much more logic and there were very few
// interesting natural probes to interleave.
while (CurrentOffset + StackProbeSize < Offset) {
@@ -626,9 +652,9 @@ void X86FrameLowering::emitStackProbeInlineGenericBlock(
.setMIFlag(MachineInstr::FrameSetup);
NumFrameExtraProbe++;
CurrentOffset += StackProbeSize;
CurrentProbeOffset += StackProbeSize;
}

// No need to probe the tail, it is smaller than a Page.
uint64_t ChunkSize = Offset - CurrentOffset;
MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
.addReg(StackPtr)
@@ -639,15 +665,35 @@ void X86FrameLowering::emitStackProbeInlineGenericBlock(

void X86FrameLowering::emitStackProbeInlineGenericLoop(
MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
uint64_t Offset) const {
MachineBasicBlock::iterator MBBI, const DebugLoc &DL, uint64_t Offset,
uint64_t AlignOffset) const {
assert(Offset && "null offset");

const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
const X86TargetLowering &TLI = *STI.getTargetLowering();
const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi;
const uint64_t StackProbeSize = TLI.getStackProbeSize(MF);

if (AlignOffset) {
if (AlignOffset < StackProbeSize) {
// Perform a first smaller allocation followed by a probe.
const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr, AlignOffset);
MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(SUBOpc), StackPtr)
.addReg(StackPtr)
.addImm(AlignOffset)
.setMIFlag(MachineInstr::FrameSetup);
MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.

addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MovMIOpc))
.setMIFlag(MachineInstr::FrameSetup),
StackPtr, false, 0)
.addImm(0)
.setMIFlag(MachineInstr::FrameSetup);
NumFrameExtraProbe++;
Offset -= AlignOffset;
}
}

// Synthesize a loop
NumFrameLoopProbe++;
const BasicBlock *LLVM_BB = MBB.getBasicBlock();
@@ -666,17 +712,17 @@ void X86FrameLowering::emitStackProbeInlineGenericLoop(

// save loop bound
{
const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, Offset);
BuildMI(MBB, MBBI, DL, TII.get(Opc), FinalStackProbed)
const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr, Offset);
BuildMI(MBB, MBBI, DL, TII.get(SUBOpc), FinalStackProbed)
.addReg(FinalStackProbed)
.addImm(Offset / StackProbeSize * StackProbeSize)
.setMIFlag(MachineInstr::FrameSetup);
}

// allocate a page
{
const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, StackProbeSize);
BuildMI(testMBB, DL, TII.get(Opc), StackPtr)
const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr, StackProbeSize);
BuildMI(testMBB, DL, TII.get(SUBOpc), StackPtr)
.addReg(StackPtr)
.addImm(StackProbeSize)
.setMIFlag(MachineInstr::FrameSetup);
@@ -1052,13 +1098,149 @@ void X86FrameLowering::BuildStackAlignAND(MachineBasicBlock &MBB,
uint64_t MaxAlign) const {
uint64_t Val = -MaxAlign;
unsigned AndOp = getANDriOpcode(Uses64BitFramePtr, Val);
MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AndOp), Reg)
.addReg(Reg)
.addImm(Val)
.setMIFlag(MachineInstr::FrameSetup);

// The EFLAGS implicit def is dead.
MI->getOperand(3).setIsDead();
MachineFunction &MF = *MBB.getParent();
const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
const X86TargetLowering &TLI = *STI.getTargetLowering();
const uint64_t StackProbeSize = TLI.getStackProbeSize(MF);
const bool EmitInlineStackProbe = TLI.hasInlineStackProbe(MF);

// We want to make sure that (in the worst case) fewer than StackProbeSize
// bytes are left unprobed after the AND. This assumption is used in
// emitStackProbeInlineGeneric.
if (Reg == StackPtr && EmitInlineStackProbe && MaxAlign >= StackProbeSize) {
{
NumFrameLoopProbe++;
MachineBasicBlock *entryMBB =
MF.CreateMachineBasicBlock(MBB.getBasicBlock());
MachineBasicBlock *headMBB =
MF.CreateMachineBasicBlock(MBB.getBasicBlock());
MachineBasicBlock *bodyMBB =
MF.CreateMachineBasicBlock(MBB.getBasicBlock());
MachineBasicBlock *footMBB =
MF.CreateMachineBasicBlock(MBB.getBasicBlock());

MachineFunction::iterator MBBIter = MBB.getIterator();
MF.insert(MBBIter, entryMBB);
MF.insert(MBBIter, headMBB);
MF.insert(MBBIter, bodyMBB);
MF.insert(MBBIter, footMBB);
const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi;
Register FinalStackProbed = Uses64BitFramePtr ? X86::R11 : X86::R11D;

// Setup entry block
{

entryMBB->splice(entryMBB->end(), &MBB, MBB.begin(), MBBI);
BuildMI(entryMBB, DL, TII.get(TargetOpcode::COPY), FinalStackProbed)
.addReg(StackPtr)
.setMIFlag(MachineInstr::FrameSetup);
MachineInstr *MI =
BuildMI(entryMBB, DL, TII.get(AndOp), FinalStackProbed)
.addReg(FinalStackProbed)
.addImm(Val)
.setMIFlag(MachineInstr::FrameSetup);

// The EFLAGS implicit def is dead.
MI->getOperand(3).setIsDead();

BuildMI(entryMBB, DL,
TII.get(Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
.addReg(FinalStackProbed)
.addReg(StackPtr)
.setMIFlag(MachineInstr::FrameSetup);
BuildMI(entryMBB, DL, TII.get(X86::JCC_1))
.addMBB(&MBB)
.addImm(X86::COND_E)
.setMIFlag(MachineInstr::FrameSetup);
entryMBB->addSuccessor(headMBB);
entryMBB->addSuccessor(&MBB);
}

// Loop entry block

{
const unsigned SUBOpc =
getSUBriOpcode(Uses64BitFramePtr, StackProbeSize);
BuildMI(headMBB, DL, TII.get(SUBOpc), StackPtr)
.addReg(StackPtr)
.addImm(StackProbeSize)
.setMIFlag(MachineInstr::FrameSetup);

BuildMI(headMBB, DL,
TII.get(Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
.addReg(FinalStackProbed)
.addReg(StackPtr)
.setMIFlag(MachineInstr::FrameSetup);

// jump
BuildMI(headMBB, DL, TII.get(X86::JCC_1))
.addMBB(footMBB)
.addImm(X86::COND_B)
.setMIFlag(MachineInstr::FrameSetup);

headMBB->addSuccessor(bodyMBB);
headMBB->addSuccessor(footMBB);
}

// setup loop body
{
addRegOffset(BuildMI(bodyMBB, DL, TII.get(MovMIOpc))
.setMIFlag(MachineInstr::FrameSetup),
StackPtr, false, 0)
.addImm(0)
.setMIFlag(MachineInstr::FrameSetup);

const unsigned SUBOpc =
getSUBriOpcode(Uses64BitFramePtr, StackProbeSize);
BuildMI(bodyMBB, DL, TII.get(SUBOpc), StackPtr)
.addReg(StackPtr)
.addImm(StackProbeSize)
.setMIFlag(MachineInstr::FrameSetup);

// cmp with stack pointer bound
BuildMI(bodyMBB, DL,
TII.get(Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
.addReg(FinalStackProbed)
.addReg(StackPtr)
.setMIFlag(MachineInstr::FrameSetup);

// jump
BuildMI(bodyMBB, DL, TII.get(X86::JCC_1))
.addMBB(bodyMBB)
.addImm(X86::COND_B)
.setMIFlag(MachineInstr::FrameSetup);
bodyMBB->addSuccessor(bodyMBB);
bodyMBB->addSuccessor(footMBB);
}

// setup loop footer
{
BuildMI(footMBB, DL, TII.get(TargetOpcode::COPY), StackPtr)
.addReg(FinalStackProbed)
.setMIFlag(MachineInstr::FrameSetup);
addRegOffset(BuildMI(footMBB, DL, TII.get(MovMIOpc))
.setMIFlag(MachineInstr::FrameSetup),
StackPtr, false, 0)
.addImm(0)
.setMIFlag(MachineInstr::FrameSetup);
footMBB->addSuccessor(&MBB);
}

recomputeLiveIns(*headMBB);
recomputeLiveIns(*bodyMBB);
recomputeLiveIns(*footMBB);
recomputeLiveIns(MBB);
}
} else {
MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AndOp), Reg)
.addReg(Reg)
.addImm(Val)
.setMIFlag(MachineInstr::FrameSetup);

// The EFLAGS implicit def is dead.
MI->getOperand(3).setIsDead();
}
}

bool X86FrameLowering::has128ByteRedZone(const MachineFunction& MF) const {
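
To make the new byte accounting concrete: with StackProbeSize = 4096 and a small MaxAlign of, say, 64, the caller passes AlignOffset = MaxAlign % StackProbeSize = 64, so the first allocation shrinks to 4032 bytes and a later realigning AND can never leave more than one page unprobed. The standalone sketch below is our own code, not part of LLVM (the name probeSchedule is invented); it only mirrors the chunking logic that emitStackProbeInlineGenericBlock now emits.

#include <cassert>
#include <cstdint>
#include <vector>

// Sizes of the successive stack-pointer decrements the block path emits;
// every chunk except possibly the last is followed by a probing store.
std::vector<uint64_t> probeSchedule(uint64_t Offset, uint64_t StackProbeSize,
                                    uint64_t AlignOffset) {
  assert(AlignOffset < StackProbeSize);
  std::vector<uint64_t> Chunks;
  uint64_t CurrentOffset = 0;
  // The first chunk is shrunk by AlignOffset so that a later realigning AND
  // cannot leave more than StackProbeSize unprobed bytes.
  if (StackProbeSize < Offset + AlignOffset) {
    Chunks.push_back(StackProbeSize - AlignOffset);
    CurrentOffset = StackProbeSize - AlignOffset;
  }
  // Whole pages, each probed.
  while (CurrentOffset + StackProbeSize < Offset) {
    Chunks.push_back(StackProbeSize);
    CurrentOffset += StackProbeSize;
  }
  // Tail, smaller than a page, no probe needed.
  Chunks.push_back(Offset - CurrentOffset);
  return Chunks;
}
// Example: probeSchedule(9000, 4096, 64) yields {4032, 4096, 872}.
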
8 changes: 4 additions & 4 deletions llvm/lib/Target/X86/X86FrameLowering.h
@@ -213,14 +213,14 @@ class X86FrameLowering : public TargetFrameLowering {
void emitStackProbeInlineGenericBlock(MachineFunction &MF,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
const DebugLoc &DL,
uint64_t Offset) const;
const DebugLoc &DL, uint64_t Offset,
uint64_t Align) const;

void emitStackProbeInlineGenericLoop(MachineFunction &MF,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
const DebugLoc &DL,
uint64_t Offset) const;
const DebugLoc &DL, uint64_t Offset,
uint64_t Align) const;

/// Emit a stub to later inline the target stack probe.
MachineInstr *emitStackProbeInlineStub(MachineFunction &MF,
88 changes: 88 additions & 0 deletions llvm/test/CodeGen/X86/stack-clash-large-large-align.ll
@@ -0,0 +1,88 @@
; RUN: llc < %s | FileCheck %s


target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

define i32 @foo_noprotect() local_unnamed_addr {
; CHECK-LABEL: foo_noprotect:
; CHECK: # %bb.0:
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset %rbp, -16
; CHECK-NEXT: movq %rsp, %rbp
; CHECK-NEXT: .cfi_def_cfa_register %rbp
; CHECK-NEXT: andq $-4096, %rsp # imm = 0xF000
; CHECK-NEXT: subq $73728, %rsp # imm = 0x12000
; CHECK-NEXT: movl $1, 392(%rsp)
; CHECK-NEXT: movl $1, 28792(%rsp)
; CHECK-NEXT: movl (%rsp), %eax
; CHECK-NEXT: movq %rbp, %rsp
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: .cfi_def_cfa %rsp, 8
; CHECK-NEXT: retq


%a = alloca i32, i64 18000, align 4096
%b0 = getelementptr inbounds i32, i32* %a, i64 98
%b1 = getelementptr inbounds i32, i32* %a, i64 7198
store volatile i32 1, i32* %b0
store volatile i32 1, i32* %b1
%c = load volatile i32, i32* %a
ret i32 %c
}

define i32 @foo_protect() local_unnamed_addr #0 {
; CHECK-LABEL: foo_protect:
; CHECK: # %bb.0:
; CHECK-NEXT: pushq %rbp
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset %rbp, -16
; CHECK-NEXT: movq %rsp, %rbp
; CHECK-NEXT: .cfi_def_cfa_register %rbp
; CHECK-NEXT: movq %rsp, %r11
; CHECK-NEXT: andq $-4096, %r11 # imm = 0xF000
; CHECK-NEXT: cmpq %rsp, %r11
; CHECK-NEXT: je .LBB1_4
; CHECK-NEXT:# %bb.1:
; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000
; CHECK-NEXT: cmpq %rsp, %r11
; CHECK-NEXT: jb .LBB1_3
; CHECK-NEXT:.LBB1_2: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: movq $0, (%rsp)
; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000
; CHECK-NEXT: cmpq %rsp, %r11
; CHECK-NEXT: jb .LBB1_2
; CHECK-NEXT:.LBB1_3:
; CHECK-NEXT: movq %r11, %rsp
; CHECK-NEXT: movq $0, (%rsp)
; CHECK-NEXT:.LBB1_4:
; CHECK-NEXT: movq %rsp, %r11
; CHECK-NEXT: subq $73728, %r11 # imm = 0x12000
; CHECK-NEXT:.LBB1_5: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000
; CHECK-NEXT: movq $0, (%rsp)
; CHECK-NEXT: cmpq %r11, %rsp
; CHECK-NEXT: jne .LBB1_5
; CHECK-NEXT:# %bb.6:
; CHECK-NEXT: movl $1, 392(%rsp)
; CHECK-NEXT: movl $1, 28792(%rsp)
; CHECK-NEXT: movl (%rsp), %eax
; CHECK-NEXT: movq %rbp, %rsp
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: .cfi_def_cfa %rsp, 8
; CHECK-NEXT: retq




%a = alloca i32, i64 18000, align 4096
%b0 = getelementptr inbounds i32, i32* %a, i64 98
%b1 = getelementptr inbounds i32, i32* %a, i64 7198
store volatile i32 1, i32* %b0
store volatile i32 1, i32* %b1
%c = load volatile i32, i32* %a
ret i32 %c
}

attributes #0 = {"probe-stack"="inline-asm"}
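
The test above exercises the large-alignment branch, where BuildStackAlignAND now emits its own probing loop. The small-alignment branch described in the commit message, where only the first probe is adjusted, can be exercised at the source level with a variant like the following sketch (our own example, not taken from the commit's test files), compiled as in the earlier reproducer:

// With alignas(64), MaxAlign % StackProbeSize == 64, so the first inline
// probe should allocate 4096 - 64 = 4032 bytes instead of a full page.
extern void use_buffer(char *p);

int small_aligned_frame() {
  alignas(64) char buf[72000];
  use_buffer(buf);
  return buf[0];
}
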
(Diffs for the remaining three changed files in this commit are not shown.)
