From 7b3e40920fbef51a581993213365948208b736fa Mon Sep 17 00:00:00 2001 From: Filip Navara Date: Sun, 4 Feb 2024 03:48:45 +0100 Subject: [PATCH] [NativeAOT/ARM] Save R9 (REG_SAVED_LOCALLOC_SP) in PInvoke frames (#97919) * Save R9 (REG_SAVED_LOCALLOC_SP) in PInvoke frames * Handle 'mov r9, sp' as part of prolog * Remove m_ChainPointer from PInvokeTransitionFrame and update comments --- .../nativeaot/Runtime/arm/AsmOffsetsCpu.h | 8 ++++---- src/coreclr/nativeaot/Runtime/arm/GcProbe.S | 12 ++++-------- src/coreclr/nativeaot/Runtime/arm/PInvoke.S | 6 ++++-- src/coreclr/nativeaot/Runtime/inc/rhbinder.h | 7 ++----- .../Runtime/unix/UnixNativeCodeManager.cpp | 6 +++++- .../Runtime/unix/unixasmmacrosarm.inc | 19 +++++++++---------- .../JitInterface/CorInfoImpl.RyuJit.cs | 16 +++++++++------- 7 files changed, 37 insertions(+), 37 deletions(-) diff --git a/src/coreclr/nativeaot/Runtime/arm/AsmOffsetsCpu.h b/src/coreclr/nativeaot/Runtime/arm/AsmOffsetsCpu.h index f5b6631f510b2..4d40c0ecb27f6 100644 --- a/src/coreclr/nativeaot/Runtime/arm/AsmOffsetsCpu.h +++ b/src/coreclr/nativeaot/Runtime/arm/AsmOffsetsCpu.h @@ -17,11 +17,11 @@ PLAT_ASM_OFFSET(10, ExInfo, m_idxCurClause) PLAT_ASM_OFFSET(18, ExInfo, m_frameIter) PLAT_ASM_OFFSET(130, ExInfo, m_notifyDebuggerSP) +PLAT_ASM_OFFSET(0, PInvokeTransitionFrame, m_FramePointer) PLAT_ASM_OFFSET(4, PInvokeTransitionFrame, m_RIP) -PLAT_ASM_OFFSET(8, PInvokeTransitionFrame, m_FramePointer) -PLAT_ASM_OFFSET(0c, PInvokeTransitionFrame, m_pThread) -PLAT_ASM_OFFSET(10, PInvokeTransitionFrame, m_Flags) -PLAT_ASM_OFFSET(14, PInvokeTransitionFrame, m_PreservedRegs) +PLAT_ASM_OFFSET(8, PInvokeTransitionFrame, m_pThread) +PLAT_ASM_OFFSET(c, PInvokeTransitionFrame, m_Flags) +PLAT_ASM_OFFSET(10, PInvokeTransitionFrame, m_PreservedRegs) PLAT_ASM_SIZEOF(118, StackFrameIterator) PLAT_ASM_OFFSET(08, StackFrameIterator, m_FramePointer) diff --git a/src/coreclr/nativeaot/Runtime/arm/GcProbe.S b/src/coreclr/nativeaot/Runtime/arm/GcProbe.S index c202591d7fcd1..8277d9035b0d0 100644 --- a/src/coreclr/nativeaot/Runtime/arm/GcProbe.S +++ b/src/coreclr/nativeaot/Runtime/arm/GcProbe.S @@ -21,21 +21,19 @@ // Define the method prolog, allocating enough stack space for the PInvokeTransitionFrame and saving // incoming register values into it. PROLOG_VPUSH "{d0-d3}" // Save d0-d3 which can have the floating point return value - PROLOG_STACK_ALLOC 4 // Padding for 8-byte alignment PROLOG_PUSH "{r0,r1}" // Save return registers PROLOG_STACK_ALLOC 4 // Space for caller's SP PROLOG_PUSH "{r4-r10}" // Save non-volatile registers PROLOG_STACK_ALLOC 8 // Space for flags and Thread* - PROLOG_PUSH "{r11}" // Save caller's frame pointer - PROLOG_PUSH "{r11,lr}" // Save frame-chain pointer and return address + PROLOG_PUSH "{r11,lr}" // Save caller's frame pointer and return address str \threadReg, [sp, #OFFSETOF__PInvokeTransitionFrame__m_pThread] mov \trashReg, \BITMASK str \trashReg, [sp, #OFFSETOF__PInvokeTransitionFrame__m_Flags] // Compute SP value at entry to this method and save it in slot of the frame. - add \trashReg, sp, #(16 * 4 + 4 * 8) - str \trashReg, [sp, #(12 * 4)] + add \trashReg, sp, #(14 * 4 + 4 * 8) + str \trashReg, [sp, #(11 * 4)] // Link the frame into the Thread str sp, [\threadReg, #OFFSETOF__Thread__m_pDeferredTransitionFrame] @@ -47,13 +45,11 @@ // object refs or byrefs). // .macro POP_PROBE_FRAME - EPILOG_POP "{r11,lr}" // Restore frame-chain pointer and return address - EPILOG_POP "{r11}" // Restore caller's frame pointer + EPILOG_POP "{r11,lr}" // Restore caller's frame pointer and return address EPILOG_STACK_FREE 8 // Discard flags and Thread* EPILOG_POP "{r4-r10}" // Restore non-volatile registers EPILOG_STACK_FREE 4 // Discard caller's SP EPILOG_POP "{r0,r1}" // Restore return registers - EPILOG_STACK_FREE 4 // Discard padding for 8-byte alignment EPILOG_VPOP "{d0-d3}" // Restore d0-d3 which can have the floating point return value .endm diff --git a/src/coreclr/nativeaot/Runtime/arm/PInvoke.S b/src/coreclr/nativeaot/Runtime/arm/PInvoke.S index 39136c17935ea..72e434e4c21e4 100644 --- a/src/coreclr/nativeaot/Runtime/arm/PInvoke.S +++ b/src/coreclr/nativeaot/Runtime/arm/PInvoke.S @@ -20,8 +20,10 @@ NESTED_ENTRY RhpPInvoke, _TEXT, NoHandler str lr, [r0, #OFFSETOF__PInvokeTransitionFrame__m_RIP] str r11, [r0, #OFFSETOF__PInvokeTransitionFrame__m_FramePointer] - str sp, [r0, #OFFSETOF__PInvokeTransitionFrame__m_PreservedRegs] - mov r3, #PTFF_SAVE_SP + // We need to save R9 which could be frame pointer if the caller method uses stackalloc (REG_SAVED_LOCALLOC_SP) + str r9, [r0, #OFFSETOF__PInvokeTransitionFrame__m_PreservedRegs] + str sp, [r0, #OFFSETOF__PInvokeTransitionFrame__m_PreservedRegs + 4] + mov r3, #(PTFF_SAVE_R9 + PTFF_SAVE_SP) str r3, [r0, #OFFSETOF__PInvokeTransitionFrame__m_Flags] PROLOG_PUSH "{r5,lr}" diff --git a/src/coreclr/nativeaot/Runtime/inc/rhbinder.h b/src/coreclr/nativeaot/Runtime/inc/rhbinder.h index 7d4d2a149e4e8..f0ebc5b7a7e50 100644 --- a/src/coreclr/nativeaot/Runtime/inc/rhbinder.h +++ b/src/coreclr/nativeaot/Runtime/inc/rhbinder.h @@ -400,11 +400,8 @@ struct PInvokeTransitionFrame #else // USE_PORTABLE_HELPERS struct PInvokeTransitionFrame { -#ifdef TARGET_ARM - TgtPTR_Void m_ChainPointer; // R11, used by OS to walk stack quickly -#endif -#ifdef TARGET_ARM64 - // On arm64, the FP and LR registers are pushed in that order when setting up frames +#if defined(TARGET_ARM64) || defined(TARGET_ARM) + // On arm32/arm64, the FP and LR registers are pushed in that order when setting up frames TgtPTR_Void m_FramePointer; TgtPTR_Void m_RIP; #else diff --git a/src/coreclr/nativeaot/Runtime/unix/UnixNativeCodeManager.cpp b/src/coreclr/nativeaot/Runtime/unix/UnixNativeCodeManager.cpp index ee0097b7c0dd6..42766563f1a14 100644 --- a/src/coreclr/nativeaot/Runtime/unix/UnixNativeCodeManager.cpp +++ b/src/coreclr/nativeaot/Runtime/unix/UnixNativeCodeManager.cpp @@ -532,11 +532,15 @@ int UnixNativeCodeManager::IsInProlog(MethodInfo * pMethodInfo, PTR_VOID pvAddre // MOV SP, R4 #define MOV_SP_R4 0x46A5 +// MOV R9, SP +#define MOV_R9_SP 0x46E9 + uint16_t* pInstr = (uint16_t*)pvAddress; uint32_t instr = *pInstr; if ((instr & SUB_SP_IMM_MASK) == SUB_SP_IMM_BITS || - (instr & PUSH_MASK) == PUSH_BITS) + (instr & PUSH_MASK) == PUSH_BITS || + instr == MOV_R9_SP) { return 1; } diff --git a/src/coreclr/nativeaot/Runtime/unix/unixasmmacrosarm.inc b/src/coreclr/nativeaot/Runtime/unix/unixasmmacrosarm.inc index 0c5fe62c5b199..8b4b426361897 100644 --- a/src/coreclr/nativeaot/Runtime/unix/unixasmmacrosarm.inc +++ b/src/coreclr/nativeaot/Runtime/unix/unixasmmacrosarm.inc @@ -15,6 +15,7 @@ #define TSF_DoNotTriggerGc 0x10 #define PTFF_SAVE_ALL_PRESERVED 0x0000007F // NOTE: R11 is not included in this set! +#define PTFF_SAVE_R9 0x00000020 #define PTFF_SAVE_SP 0x00000100 #define PTFF_SAVE_R0 0x00000200 #define PTFF_THREAD_ABORT 0x00100000 @@ -244,30 +245,28 @@ C_FUNC(\Name): // .macro PUSH_COOP_PINVOKE_FRAME trashReg - PROLOG_STACK_ALLOC 8 // Save space for caller's SP and 8-byte alignment padding + PROLOG_STACK_ALLOC 4 // Save space for caller's SP PROLOG_PUSH "{r4-r10}" // Save preserved registers PROLOG_STACK_ALLOC 8 // Save space for flags and Thread* - PROLOG_PUSH "{r11}" // Save caller's FP - PROLOG_PUSH "{r11,lr}" // Save caller's frame-chain pointer and PC + PROLOG_PUSH "{r11,lr}" // Save caller's frame pointer and PC // Compute SP value at entry to this method and save it in the last slot of the frame (slot #12). - add \trashReg, sp, #(14 * 4) - str \trashReg, [sp, #(12 * 4)] + add \trashReg, sp, #(12 * 4) + str \trashReg, [sp, #(11 * 4)] - // Record the bitmask of saved registers in the frame (slot #4). + // Record the bitmask of saved registers in the frame (slot #3). mov \trashReg, #DEFAULT_FRAME_SAVE_FLAGS - str \trashReg, [sp, #(4 * 4)] + str \trashReg, [sp, #(3 * 4)] mov \trashReg, sp .endm // Pop the frame and restore register state preserved by PUSH_COOP_PINVOKE_FRAME .macro POP_COOP_PINVOKE_FRAME - EPILOG_POP "{r11,lr}" // Restore caller's frame-chain pointer and PC (return address) - EPILOG_POP "{r11}" // Restore caller's FP + EPILOG_POP "{r11,lr}" // Restore caller's frame pointer and PC (return address) EPILOG_STACK_FREE 8 // Discard flags and Thread* EPILOG_POP "{r4-r10}" // Restore preserved registers - EPILOG_STACK_FREE 8 // Discard caller's SP and 8-byte alignment padding + EPILOG_STACK_FREE 4 // Discard caller's SP .endm // thumb with PIC version diff --git a/src/coreclr/tools/aot/ILCompiler.RyuJit/JitInterface/CorInfoImpl.RyuJit.cs b/src/coreclr/tools/aot/ILCompiler.RyuJit/JitInterface/CorInfoImpl.RyuJit.cs index 0e6dd63b49fc1..7da670a1d6aa3 100644 --- a/src/coreclr/tools/aot/ILCompiler.RyuJit/JitInterface/CorInfoImpl.RyuJit.cs +++ b/src/coreclr/tools/aot/ILCompiler.RyuJit/JitInterface/CorInfoImpl.RyuJit.cs @@ -2006,20 +2006,22 @@ private int SizeOfPInvokeTransitionFrame get { // struct PInvokeTransitionFrame: - // #ifdef _TARGET_ARM_ - // m_ChainPointer - // #endif - // m_RIP - // m_FramePointer + // m_RIP (1) + // m_FramePointer (1) // m_pThread // m_Flags + align (no align for ARM64 that has 64 bit m_Flags) - // m_PreserverRegs - RSP + // m_PreservedRegs - RSP / R9 (2) // No need to save other preserved regs because of the JIT ensures that there are // no live GC references in callee saved registers around the PInvoke callsite. + // + // (1) On ARM32/ARM64 the order of m_RIP and m_FramePointer is reverse + // (2) R9 is saved for ARM32 because it needs to be preserved for methods with stackalloc int size = 5 * this.PointerSize; if (_compilation.TypeSystemContext.Target.Architecture == TargetArchitecture.ARM) - size += this.PointerSize; // m_ChainPointer + { + size += this.PointerSize; // R9 (REG_SAVED_LOCALLOC_SP) + } return size; }