From 0552936138525ee4ac2a2df427d6269bab1fdcc6 Mon Sep 17 00:00:00 2001 From: Tymoteusz Wenerski Date: Mon, 22 Jan 2024 10:56:34 +0100 Subject: [PATCH] [RISC-V] Enable On Stack Replacement (#96558) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * [RISC-V] Implement On Stack Replacement Note: Pinned local test is failing. * [RISC-V] Apply suggestions from code review Co-authored-by: Tomasz Sowiński * [RISC-V] apply jit-format * [RISC-V] Cosmetic changes after code review * [RISC-V] Changes assuming memory page is always equal 4KiB * [RISC-V] Remove stack probing * [RISC-V] Replace GetEmitter() with emit * [RISC-V] Sync frame type 1 in genPushCalleeSavedRegisters with genPopCalleeSavedRegisters * [RISC-V] Fix assembly emmited by genStackProbe * [RISC-V] Apply jit-formatter --------- Co-authored-by: Tomasz Sowiński --- src/coreclr/clrdefinitions.cmake | 4 +- src/coreclr/inc/clrconfigvalues.h | 6 +- src/coreclr/jit/codegen.h | 12 +- src/coreclr/jit/codegencommon.cpp | 30 +- src/coreclr/jit/codegenriscv64.cpp | 1436 ++++++++++++++++++---------- src/coreclr/jit/compiler.cpp | 4 +- src/coreclr/jit/gcencode.cpp | 4 +- src/coreclr/jit/jitconfigvalues.h | 4 +- src/coreclr/jit/lclvars.cpp | 33 +- src/coreclr/jit/targetriscv64.h | 3 + 10 files changed, 989 insertions(+), 547 deletions(-) diff --git a/src/coreclr/clrdefinitions.cmake b/src/coreclr/clrdefinitions.cmake index c180198a4db4a..fb8d095b5606d 100644 --- a/src/coreclr/clrdefinitions.cmake +++ b/src/coreclr/clrdefinitions.cmake @@ -184,9 +184,9 @@ endif(FEATURE_ENABLE_NO_ADDRESS_SPACE_RANDOMIZATION) add_definitions(-DFEATURE_SVR_GC) add_definitions(-DFEATURE_SYMDIFF) add_compile_definitions(FEATURE_TIERED_COMPILATION) -if (CLR_CMAKE_TARGET_ARCH_AMD64 OR CLR_CMAKE_TARGET_ARCH_ARM64 OR CLR_CMAKE_TARGET_ARCH_LOONGARCH64) +if (CLR_CMAKE_TARGET_ARCH_AMD64 OR CLR_CMAKE_TARGET_ARCH_ARM64 OR CLR_CMAKE_TARGET_ARCH_LOONGARCH64 OR CLR_CMAKE_TARGET_ARCH_RISCV64) 
add_compile_definitions(FEATURE_ON_STACK_REPLACEMENT) -endif (CLR_CMAKE_TARGET_ARCH_AMD64 OR CLR_CMAKE_TARGET_ARCH_ARM64 OR CLR_CMAKE_TARGET_ARCH_LOONGARCH64) +endif (CLR_CMAKE_TARGET_ARCH_AMD64 OR CLR_CMAKE_TARGET_ARCH_ARM64 OR CLR_CMAKE_TARGET_ARCH_LOONGARCH64 OR CLR_CMAKE_TARGET_ARCH_RISCV64) add_compile_definitions(FEATURE_PGO) if (CLR_CMAKE_TARGET_WIN32) add_definitions(-DFEATURE_TYPEEQUIVALENCE) diff --git a/src/coreclr/inc/clrconfigvalues.h b/src/coreclr/inc/clrconfigvalues.h index 6035bcd509425..12a1e706602cc 100644 --- a/src/coreclr/inc/clrconfigvalues.h +++ b/src/coreclr/inc/clrconfigvalues.h @@ -569,11 +569,11 @@ RETAIL_CONFIG_DWORD_INFO(INTERNAL_HillClimbing_GainExponent, #endif // _DEBUG RETAIL_CONFIG_DWORD_INFO(EXTERNAL_TieredCompilation, W("TieredCompilation"), 1, "Enables tiered compilation") RETAIL_CONFIG_DWORD_INFO(EXTERNAL_TC_QuickJit, W("TC_QuickJit"), 1, "For methods that would be jitted, enable using quick JIT when appropriate.") -#if defined(TARGET_AMD64) || defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) +#if defined(TARGET_AMD64) || defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64) RETAIL_CONFIG_DWORD_INFO(UNSUPPORTED_TC_QuickJitForLoops, W("TC_QuickJitForLoops"), 1, "When quick JIT is enabled, quick JIT may also be used for methods that contain loops.") -#else // !(defined(TARGET_AMD64) || defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64)) +#else // !(defined(TARGET_AMD64) || defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64)) RETAIL_CONFIG_DWORD_INFO(UNSUPPORTED_TC_QuickJitForLoops, W("TC_QuickJitForLoops"), 0, "When quick JIT is enabled, quick JIT may also be used for methods that contain loops.") -#endif // defined(TARGET_AMD64) || defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) +#endif // defined(TARGET_AMD64) || defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64) RETAIL_CONFIG_DWORD_INFO(EXTERNAL_TC_AggressiveTiering, 
W("TC_AggressiveTiering"), 0, "Transition through tiers aggressively.") RETAIL_CONFIG_DWORD_INFO(EXTERNAL_TC_CallCountThreshold, W("TC_CallCountThreshold"), TC_CallCountThreshold, "Number of times a method must be called in tier 0 after which it is promoted to the next tier.") RETAIL_CONFIG_DWORD_INFO(EXTERNAL_TC_CallCountingDelayMs, W("TC_CallCountingDelayMs"), TC_CallCountingDelayMs, "A perpetual delay in milliseconds that is applied to call counting in tier 0 and jitting at higher tiers, while there is startup-like activity.") diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index e1af485a0eb4f..386183737d760 100644 --- a/src/coreclr/jit/codegen.h +++ b/src/coreclr/jit/codegen.h @@ -265,7 +265,7 @@ class CodeGen final : public CodeGenInterface void genFnPrologCalleeRegArgs(regNumber xtraReg, bool* pXtraRegClobbered, RegState* regState); #endif void genEnregisterIncomingStackArgs(); -#if defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) +#if defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64) void genEnregisterOSRArgsAndLocals(regNumber initReg, bool* pInitRegZeroed); #else void genEnregisterOSRArgsAndLocals(); @@ -345,6 +345,10 @@ class CodeGen final : public CodeGenInterface void genOSRSaveRemainingCalleeSavedRegisters(); #endif // TARGET_AMD64 +#if defined(TARGET_RISCV64) + void genStackProbe(ssize_t frameSize, regNumber rOffset, regNumber rLimit, regNumber rPageSize); +#endif + void genAllocLclFrame(unsigned frameSize, regNumber initReg, bool* pInitRegZeroed, regMaskTP maskArgRegsLiveIn); void genPoisonFrame(regMaskTP bbRegLiveIn); @@ -450,11 +454,11 @@ class CodeGen final : public CodeGenInterface regMaskTP fiSaveRegs; // Set of callee-saved registers saved in the funclet prolog (includes RA) int fiFunction_CallerSP_to_FP_delta; // Delta between caller SP and the frame pointer in the parent function // (negative) - int fiSP_to_FPRA_save_delta; // FP/RA register save offset from SP (positive) + int 
fiSP_to_CalleeSaved_delta; // CalleeSaved register save offset from SP (positive) + int fiCalleeSavedPadding; // CalleeSaved offset padding (positive) int fiSP_to_PSP_slot_delta; // PSP slot offset from SP (positive) int fiCallerSP_to_PSP_slot_delta; // PSP slot offset from Caller SP (negative) - int fiFrameType; // Funclet frame types are numbered. See genFuncletProlog() for details. - int fiSpDelta1; // Stack pointer delta 1 (negative) + int fiSpDelta; // Stack pointer delta (negative) }; FuncletFrameInfoDsc genFuncletInfo; diff --git a/src/coreclr/jit/codegencommon.cpp b/src/coreclr/jit/codegencommon.cpp index b12b584038c53..133403e2f9d2b 100644 --- a/src/coreclr/jit/codegencommon.cpp +++ b/src/coreclr/jit/codegencommon.cpp @@ -4750,7 +4750,7 @@ void CodeGen::genZeroInitFrame(int untrLclHi, int untrLclLo, regNumber initReg, // initReg -- scratch register to use if needed // pInitRegZeroed -- [IN,OUT] if init reg is zero (on entry/exit) // -#if defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) +#if defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64) void CodeGen::genEnregisterOSRArgsAndLocals(regNumber initReg, bool* pInitRegZeroed) #else void CodeGen::genEnregisterOSRArgsAndLocals() @@ -4891,7 +4891,7 @@ void CodeGen::genEnregisterOSRArgsAndLocals() GetEmitter()->emitIns_R_AR(ins_Load(lclTyp), size, varDsc->GetRegNum(), genFramePointerReg(), offset); -#elif defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) +#elif defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64) // Patchpoint offset is from top of Tier0 frame // @@ -4923,7 +4923,7 @@ void CodeGen::genEnregisterOSRArgsAndLocals() genInstrWithConstant(ins_Load(lclTyp), size, varDsc->GetRegNum(), genFramePointerReg(), offset, initReg); *pInitRegZeroed = false; -#endif +#endif // TARGET_ARM64 || TARGET_LOONGARCH64 || TARGET_RISCV64 } } @@ -5530,7 +5530,7 @@ void CodeGen::genFnProlog() psiBegProlog(); } -#if defined(TARGET_ARM64) || 
defined(TARGET_LOONGARCH64) +#if defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64) // For arm64 OSR, emit a "phantom prolog" to account for the actions taken // in the tier0 frame that impact FP and SP on entry to the OSR method. // @@ -5545,7 +5545,7 @@ void CodeGen::genFnProlog() // SP is tier0 method's SP. compiler->unwindAllocStack(tier0FrameSize); } -#endif // defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) +#endif // defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64) #ifdef DEBUG @@ -5875,13 +5875,25 @@ void CodeGen::genFnProlog() { initReg = REG_SCRATCH; } +#elif defined(TARGET_RISCV64) + // For RISC-V64 OSR root frames, we may need a scratch register for large + // offset addresses. Use a register that won't be allocated. + if (isRoot && compiler->opts.IsOSR()) + { + initReg = REG_SCRATCH; // REG_T0 + } #endif -#ifndef TARGET_LOONGARCH64 +#if !defined(TARGET_LOONGARCH64) && !defined(TARGET_RISCV64) // For LoongArch64's OSR root frames, we may need a scratch register for large // offset addresses. But this does not conflict with the REG_PINVOKE_FRAME. + // + // RISC-V64's OSR root frames are similar to LoongArch64's. In this case + // REG_SCRATCH also shouldn't conflict with REG_PINVOKE_FRAME, even if + // technically they are the same register - REG_T0. + // noway_assert(!compiler->compMethodRequiresPInvokeFrame() || (initReg != REG_PINVOKE_FRAME)); -#endif +#endif // !TARGET_LOONGARCH64 && !TARGET_RISCV64 #if defined(TARGET_AMD64) // If we are a varargs call, in order to set up the arguments correctly this @@ -6192,7 +6204,7 @@ void CodeGen::genFnProlog() // Otherwise we'll do some of these fetches twice. 
// CLANG_FORMAT_COMMENT_ANCHOR; -#if defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) +#if defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64) genEnregisterOSRArgsAndLocals(initReg, &initRegZeroed); #else genEnregisterOSRArgsAndLocals(); @@ -6250,7 +6262,7 @@ void CodeGen::genFnProlog() assignIncomingRegisterArgs(&intRegState); #else assignIncomingRegisterArgs(&intRegState); -#endif +#endif // TARGET_ARM64 || TARGET_LOONGARCH64 || TARGET_RISCV64 #endif // TARGET_LOONGARCH64 || TARGET_RISCV64 diff --git a/src/coreclr/jit/codegenriscv64.cpp b/src/coreclr/jit/codegenriscv64.cpp index 3e2788c385a73..05f7a2c11f443 100644 --- a/src/coreclr/jit/codegenriscv64.cpp +++ b/src/coreclr/jit/codegenriscv64.cpp @@ -20,7 +20,34 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #include "lower.h" #include "gcinfo.h" #include "gcinfoencoder.h" +#include "patchpointinfo.h" +//------------------------------------------------------------------------ +// genInstrWithConstant: we will typically generate one instruction +// +// ins reg1, reg2, imm +// +// However the imm might not fit as a directly encodable immediate, +// when it doesn't fit we generate extra instruction(s) that sets up +// the 'regTmp' with the proper immediate value. +// +// mov regTmp, imm +// ins reg1, reg2, regTmp +// +// Arguments: +// ins - instruction +// attr - operation size and GC attribute +// reg1, reg2 - first and second register operands +// imm - immediate value (third operand when it fits) +// tmpReg - temp register to use when the 'imm' doesn't fit. Can be REG_NA +// if caller knows for certain the constant will fit. +// inUnwindRegion - true if we are in a prolog/epilog region with unwind codes. +// Default: false. +// +// Return Value: +// returns true if the immediate was small enough to be encoded inside instruction. If not, +// returns false meaning the immediate was too large and tmpReg was used and modified. 
+// bool CodeGen::genInstrWithConstant(instruction ins, emitAttr attr, regNumber reg1, @@ -99,10 +126,25 @@ bool CodeGen::genInstrWithConstant(instruction ins, return immFitsInIns; } +//------------------------------------------------------------------------ +// genStackPointerAdjustment: add a specified constant value to the stack pointer in either the prolog +// or the epilog. The unwind codes for the generated instructions are produced. An available temporary +// register is required to be specified, in case the constant is too large to encode in an "add" +// instruction, such that we need to load the constant +// into a register first, before using it. +// +// Arguments: +// spDelta - the value to add to SP (can be negative) +// tmpReg - an available temporary register +// pTmpRegIsZero - If we use tmpReg, and pTmpRegIsZero is non-null, we set *pTmpRegIsZero to 'false'. +// Otherwise, we don't touch it. +// reportUnwindData - If true, report the change in unwind data. Otherwise, do not report it. +// +// Return Value: +// None. void CodeGen::genStackPointerAdjustment(ssize_t spDelta, regNumber tmpReg, bool* pTmpRegIsZero, bool reportUnwindData) { - // Even though INS_addi is specified here, the encoder will choose either - // an INS_add_d or an INS_addi_d and encode the immediate as a positive value + // Even though INS_addi is specified here, the encoder will replace it with INS_add // bool wasTempRegisterUsedForImm = !genInstrWithConstant(INS_addi, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, spDelta, tmpReg, true); @@ -126,6 +168,26 @@ void CodeGen::genStackPointerAdjustment(ssize_t spDelta, regNumber tmpReg, bool* } } +//------------------------------------------------------------------------ +// genPrologSaveRegPair: Save a pair of general-purpose or floating-point/SIMD registers in a function or funclet +// prolog. If possible, we use pre-indexed addressing to adjust SP and store the registers with a single instruction. 
+// The caller must ensure that we can use the STP instruction, and that spOffset will be in the legal range for that +// instruction. +// +// Arguments: +// reg1 - First register of pair to save. +// reg2 - Second register of pair to save. +// spOffset - The offset from SP to store reg1 (must be positive or zero). +// spDelta - If non-zero, the amount to add to SP before the register saves (must be negative or +// zero). +// useSaveNextPair - True if the last prolog instruction was to save the previous register pair. This +// allows us to emit the "save_next" unwind code. +// tmpReg - An available temporary register. Needed for the case of large frames. +// pTmpRegIsZero - If we use tmpReg, and pTmpRegIsZero is non-null, we set *pTmpRegIsZero to 'false'. +// Otherwise, we don't touch it. +// +// Return Value: +// None. void CodeGen::genPrologSaveRegPair(regNumber reg1, regNumber reg2, int spOffset, @@ -148,7 +210,7 @@ void CodeGen::genPrologSaveRegPair(regNumber reg1, if (spDelta != 0) { - // generate addi.d SP,SP,-imm + // generate addi SP,SP,-imm genStackPointerAdjustment(spDelta, tmpReg, pTmpRegIsZero, /* reportUnwindData */ true); assert((spDelta + spOffset + 16) <= 0); @@ -156,13 +218,36 @@ void CodeGen::genPrologSaveRegPair(regNumber reg1, assert(spOffset <= 2031); // 2047-16 } - GetEmitter()->emitIns_R_R_I(ins, EA_PTRSIZE, reg1, REG_SPBASE, spOffset); + emitter* emit = GetEmitter(); + + // sd reg1, #spOffset(sp) + emit->emitIns_R_R_I(ins, EA_PTRSIZE, reg1, REG_SPBASE, spOffset); compiler->unwindSaveReg(reg1, spOffset); - GetEmitter()->emitIns_R_R_I(ins, EA_PTRSIZE, reg2, REG_SPBASE, spOffset + 8); + // sd reg2, #(spOffset + 8)(sp) + emit->emitIns_R_R_I(ins, EA_PTRSIZE, reg2, REG_SPBASE, spOffset + 8); compiler->unwindSaveReg(reg2, spOffset + 8); } +//------------------------------------------------------------------------ +// genPrologSaveReg: Like genPrologSaveRegPair, but for a single register. 
Save a single general-purpose or +// floating-point/SIMD register in a function or funclet prolog. Note that if we wish to change SP (i.e., spDelta != 0), +// then spOffset must be 8. This is because otherwise we would create an alignment hole above the saved register, not +// below it, which we currently don't support. This restriction could be loosened if the callers change to handle it +// (and this function changes to support using pre-indexed SD addressing). The caller must ensure that we can use the +// SD instruction, and that spOffset will be in the legal range for that instruction. +// +// Arguments: +// reg1 - Register to save. +// spOffset - The offset from SP to store reg1 (must be positive or zero). +// spDelta - If non-zero, the amount to add to SP before the register saves (must be negative or +// zero). +// tmpReg - An available temporary register. Needed for the case of large frames. +// pTmpRegIsZero - If we use tmpReg, and pTmpRegIsZero is non-null, we set *pTmpRegIsZero to 'false'. +// Otherwise, we don't touch it. +// +// Return Value: +// None. void CodeGen::genPrologSaveReg(regNumber reg1, int spOffset, int spDelta, regNumber tmpReg, bool* pTmpRegIsZero) { assert(spOffset >= 0); @@ -177,14 +262,37 @@ void CodeGen::genPrologSaveReg(regNumber reg1, int spOffset, int spDelta, regNum if (spDelta != 0) { - // generate daddiu SP,SP,-imm + // generate addi SP,SP,-imm genStackPointerAdjustment(spDelta, tmpReg, pTmpRegIsZero, /* reportUnwindData */ true); } - GetEmitter()->emitIns_R_R_I(ins, EA_PTRSIZE, reg1, REG_SPBASE, spOffset); + emitter* emit = GetEmitter(); + + // sd reg1, #spOffset(sp) + emit->emitIns_R_R_I(ins, EA_PTRSIZE, reg1, REG_SPBASE, spOffset); compiler->unwindSaveReg(reg1, spOffset); } +//------------------------------------------------------------------------ +// genEpilogRestoreRegPair: This is the opposite of genPrologSaveRegPair(), run in the epilog instead of the prolog. 
+// The stack pointer adjustment, if requested, is done after the register restore, using post-index addressing. +// The caller must ensure that we can use the LDP instruction, and that spOffset will be in the legal range for that +// instruction. +// +// Arguments: +// reg1 - First register of pair to restore. +// reg2 - Second register of pair to restore. +// spOffset - The offset from SP to load reg1 (must be positive or zero). +// spDelta - If non-zero, the amount to add to SP after the register restores (must be positive or +// zero). +// useSaveNextPair - True if the last prolog instruction was to save the previous register pair. This +// allows us to emit the "save_next" unwind code. +// tmpReg - An available temporary register. Needed for the case of large frames. +// pTmpRegIsZero - If we use tmpReg, and pTmpRegIsZero is non-null, we set *pTmpRegIsZero to 'false'. +// Otherwise, we don't touch it. +// +// Return Value: +// None. void CodeGen::genEpilogRestoreRegPair(regNumber reg1, regNumber reg2, int spOffset, @@ -205,29 +313,49 @@ void CodeGen::genEpilogRestoreRegPair(regNumber reg1, ins = INS_fld; } + emitter* emit = GetEmitter(); + if (spDelta != 0) { assert(!useSaveNextPair); - GetEmitter()->emitIns_R_R_I(ins, EA_PTRSIZE, reg2, REG_SPBASE, spOffset + 8); + // ld reg2, #(spOffset + 8)(SP) + emit->emitIns_R_R_I(ins, EA_PTRSIZE, reg2, REG_SPBASE, spOffset + 8); compiler->unwindSaveReg(reg2, spOffset + 8); - GetEmitter()->emitIns_R_R_I(ins, EA_PTRSIZE, reg1, REG_SPBASE, spOffset); + // ld reg1, #spOffset(SP) + emit->emitIns_R_R_I(ins, EA_PTRSIZE, reg1, REG_SPBASE, spOffset); compiler->unwindSaveReg(reg1, spOffset); - // generate daddiu SP,SP,imm + // generate addi SP,SP,imm genStackPointerAdjustment(spDelta, tmpReg, pTmpRegIsZero, /* reportUnwindData */ true); } else { - GetEmitter()->emitIns_R_R_I(ins, EA_PTRSIZE, reg2, REG_SPBASE, spOffset + 8); + // ld reg2, #(spOffset + 8)(SP) + emit->emitIns_R_R_I(ins, EA_PTRSIZE, reg2, REG_SPBASE, spOffset + 8); 
compiler->unwindSaveReg(reg2, spOffset + 8); - GetEmitter()->emitIns_R_R_I(ins, EA_PTRSIZE, reg1, REG_SPBASE, spOffset); + // ld reg1, #spOffset(SP) + emit->emitIns_R_R_I(ins, EA_PTRSIZE, reg1, REG_SPBASE, spOffset); compiler->unwindSaveReg(reg1, spOffset); } } +//------------------------------------------------------------------------ +// genEpilogRestoreReg: The opposite of genPrologSaveReg(), run in the epilog instead of the prolog. +// +// Arguments: +// reg1 - Register to restore. +// spOffset - The offset from SP to restore reg1 (must be positive or zero). +// spDelta - If non-zero, the amount to add to SP after the register restores (must be positive or +// zero). +// tmpReg - An available temporary register. Needed for the case of large frames. +// pTmpRegIsZero - If we use tmpReg, and pTmpRegIsZero is non-null, we set *pTmpRegIsZero to 'false'. +// Otherwise, we don't touch it. +// +// Return Value: +// None. void CodeGen::genEpilogRestoreReg(regNumber reg1, int spOffset, int spDelta, regNumber tmpReg, bool* pTmpRegIsZero) { assert(spOffset >= 0); @@ -240,22 +368,38 @@ void CodeGen::genEpilogRestoreReg(regNumber reg1, int spOffset, int spDelta, reg ins = INS_fld; } + emitter* emit = GetEmitter(); + if (spDelta != 0) { - // ld reg1, offset(SP) - GetEmitter()->emitIns_R_R_I(ins, EA_PTRSIZE, reg1, REG_SPBASE, spOffset); + // ld reg1, #spOffset(SP) + emit->emitIns_R_R_I(ins, EA_PTRSIZE, reg1, REG_SPBASE, spOffset); compiler->unwindSaveReg(reg1, spOffset); - // generate add SP,SP,imm + // generate addi SP,SP,imm genStackPointerAdjustment(spDelta, tmpReg, pTmpRegIsZero, /* reportUnwindData */ true); } else { - GetEmitter()->emitIns_R_R_I(ins, EA_PTRSIZE, reg1, REG_SPBASE, spOffset); + // ld reg1 #spOffset(SP) + emit->emitIns_R_R_I(ins, EA_PTRSIZE, reg1, REG_SPBASE, spOffset); compiler->unwindSaveReg(reg1, spOffset); } } +//------------------------------------------------------------------------ +// genBuildRegPairsStack: Build a stack of register pairs for 
prolog/epilog save/restore for the given mask. +// The first register pair will contain the lowest register. Register pairs will combine neighbor +// registers in pairs. If it can't be done (for example if we have a hole or this is the last reg in a mask with +// odd number of regs) then the second element of that RegPair will be REG_NA. +// +// Arguments: +// regsMask - a mask of registers for prolog/epilog generation; +// regStack - a regStack instance to build the stack in, used to save temporary copies. +// +// Return value: +// no return value; the regStack argument is modified. +// // static void CodeGen::genBuildRegPairsStack(regMaskTP regsMask, ArrayStack<RegPair>* regStack) { @@ -310,6 +454,19 @@ void CodeGen::genBuildRegPairsStack(regMaskTP regsMask, ArrayStack<RegPair>* reg genSetUseSaveNextPairs(regStack); } +//------------------------------------------------------------------------ +// genSetUseSaveNextPairs: Set useSaveNextPair for each RegPair on the stack which unwind info can be encoded as +// save_next code. +// +// Arguments: +// regStack - a regStack instance to set useSaveNextPair. +// +// Notes: +// We can use save_next for RegPair(N, N+1) only when we have sequence like (N-2, N-1), (N, N+1). +// In this case in the prolog save_next for (N, N+1) refers to save_pair(N-2, N-1); +// in the epilog the unwinder will search for the first save_pair (N-2, N-1) +// and then go back to the first save_next (N, N+1) to restore it first. +// // static void CodeGen::genSetUseSaveNextPairs(ArrayStack<RegPair>* regStack) { @@ -338,6 +495,18 @@ void CodeGen::genSetUseSaveNextPairs(ArrayStack<RegPair>* regStack) } } +//------------------------------------------------------------------------ +// genGetSlotSizeForRegsInMask: Get the stack slot size appropriate for the register type from the mask. +// +// Arguments: +// regsMask - a mask of registers for prolog/epilog generation. +// +// Return value: +// stack slot size in bytes. 
+// +// Note: Because int and float register type sizes match we can call this function with a mask that includes both. +// +// static int CodeGen::genGetSlotSizeForRegsInMask(regMaskTP regsMask) { assert((regsMask & (RBM_CALLEE_SAVED | RBM_FP | RBM_RA)) == regsMask); // Do not expect anything else. @@ -346,6 +515,14 @@ int CodeGen::genGetSlotSizeForRegsInMask(regMaskTP regsMask) return REGSIZE_BYTES; } +//------------------------------------------------------------------------ +// genSaveCalleeSavedRegisterGroup: Saves the group of registers described by the mask. +// +// Arguments: +// regsMask - a mask of registers for prolog generation; +// spDelta - if non-zero, the amount to add to SP before the first register save (or together with it); +// spOffset - the offset from SP that is the beginning of the callee-saved register area; +// void CodeGen::genSaveCalleeSavedRegisterGroup(regMaskTP regsMask, int spDelta, int spOffset) { const int slotSize = genGetSlotSizeForRegsInMask(regsMask); @@ -353,21 +530,23 @@ void CodeGen::genSaveCalleeSavedRegisterGroup(regMaskTP regsMask, int spDelta, i ArrayStack<RegPair> regStack(compiler->getAllocator(CMK_Codegen)); genBuildRegPairsStack(regsMask, &regStack); + regNumber tempReg = rsGetRsvdReg(); + for (int i = 0; i < regStack.Height(); ++i) { RegPair regPair = regStack.Bottom(i); if (regPair.reg2 != REG_NA) { // We can use two SD instructions. - genPrologSaveRegPair(regPair.reg1, regPair.reg2, spOffset, spDelta, regPair.useSaveNextPair, rsGetRsvdReg(), + genPrologSaveRegPair(regPair.reg1, regPair.reg2, spOffset, spDelta, regPair.useSaveNextPair, tempReg, nullptr); - spOffset += 2 * slotSize; + spOffset += slotSize << 1; } else { // No register pair; we use a SD instruction. 
- genPrologSaveReg(regPair.reg1, spOffset, spDelta, rsGetRsvdReg(), nullptr); + genPrologSaveReg(regPair.reg1, spOffset, spDelta, tempReg, nullptr); spOffset += slotSize; } @@ -375,6 +554,37 @@ void CodeGen::genSaveCalleeSavedRegisterGroup(regMaskTP regsMask, int spDelta, i } } +//------------------------------------------------------------------------ +// genSaveCalleeSavedRegistersHelp: Save the callee-saved registers in 'regsToSaveMask' to the stack frame +// in the function or funclet prolog. Registers are saved in register number order from low addresses +// to high addresses. This means that integer registers are saved at lower addresses than floating-point/SIMD +// registers. +// +// If establishing frame pointer chaining, it must be done after saving the callee-saved registers. +// +// We can only use the instructions that are allowed by the unwind codes. The caller ensures that +// there is enough space on the frame to store these registers, and that the store instructions +// we need to use (SD) are encodable with the stack-pointer immediate offsets we need to use. +// +// The caller can tell us to fold in a stack pointer adjustment, which we will do with the first instruction. +// Note that the stack pointer adjustment must be by a multiple of 16 to preserve the invariant that the +// stack pointer is always 16 byte aligned. If we are saving an odd number of callee-saved +// registers, though, we will have an empty alignment slot somewhere. It turns out we will put +// it below (at a lower address) the callee-saved registers, as that is currently how we +// do frame layout. This means that the first stack offset will be 8 and the stack pointer +// adjustment must be done by an ADDI (or ADD), and not folded in to a pre-indexed store. +// +// Arguments: +// regsToSaveMask - The mask of callee-saved registers to save. If empty, this function does nothing. 
+// lowestCalleeSavedOffset - The offset from SP that is the beginning of the callee-saved register area. Note that +// if non-zero spDelta, then this is the offset of the first save *after* that +// SP adjustment. +// spDelta - If non-zero, the amount to add to SP before the register saves (must be negative or +// zero). +// +// Notes: +// The save set can not contain FP/RA in which case FP/RA is saved along with the other callee-saved registers. +// void CodeGen::genSaveCalleeSavedRegistersHelp(regMaskTP regsToSaveMask, int lowestCalleeSavedOffset, int spDelta) { assert(spDelta <= 0); @@ -386,17 +596,17 @@ void CodeGen::genSaveCalleeSavedRegistersHelp(regMaskTP regsToSaveMask, int lowe { // Currently this is the case for varargs only // whose size is MAX_REG_ARG * REGSIZE_BYTES = 64 bytes. + // addi sp, sp, #spDelta genStackPointerAdjustment(spDelta, rsGetRsvdReg(), nullptr, /* reportUnwindData */ true); } return; } - assert((spDelta % 16) == 0); + assert((spDelta % STACK_ALIGN) == 0); assert(regsToSaveCount <= genCountBits(RBM_CALLEE_SAVED)); // Save integer registers at higher addresses than floating-point registers. - regMaskTP maskSaveRegsFloat = regsToSaveMask & RBM_ALLFLOAT; regMaskTP maskSaveRegsInt = regsToSaveMask & ~maskSaveRegsFloat; @@ -414,6 +624,14 @@ void CodeGen::genSaveCalleeSavedRegistersHelp(regMaskTP regsToSaveMask, int lowe } } +//------------------------------------------------------------------------ +// genRestoreCalleeSavedRegisterGroup: Restores the group of registers described by the mask. 
+// +// Arguments: +// regsMask - a mask of registers for epilog generation; +// spDelta - if non-zero, the amount to add to SP after the last register restore (or together with it); +// spOffset - the offset from SP that is the beginning of the callee-saved register area; +// void CodeGen::genRestoreCalleeSavedRegisterGroup(regMaskTP regsMask, int spDelta, int spOffset) { const int slotSize = genGetSlotSizeForRegsInMask(regsMask); @@ -421,6 +639,8 @@ void CodeGen::genRestoreCalleeSavedRegisterGroup(regMaskTP regsMask, int spDelta ArrayStack<RegPair> regStack(compiler->getAllocator(CMK_Codegen)); genBuildRegPairsStack(regsMask, &regStack); + regNumber tempReg = rsGetRsvdReg(); + int stackDelta = 0; for (int i = 0; i < regStack.Height(); ++i) { @@ -436,19 +656,47 @@ void CodeGen::genRestoreCalleeSavedRegisterGroup(regMaskTP regsMask, int spDelta RegPair regPair = regStack.Top(i); if (regPair.reg2 != REG_NA) { - spOffset -= 2 * slotSize; + spOffset -= slotSize << 1; - genEpilogRestoreRegPair(regPair.reg1, regPair.reg2, spOffset, stackDelta, regPair.useSaveNextPair, - rsGetRsvdReg(), nullptr); + genEpilogRestoreRegPair(regPair.reg1, regPair.reg2, spOffset, stackDelta, regPair.useSaveNextPair, tempReg, + nullptr); } else { spOffset -= slotSize; - genEpilogRestoreReg(regPair.reg1, spOffset, stackDelta, rsGetRsvdReg(), nullptr); + genEpilogRestoreReg(regPair.reg1, spOffset, stackDelta, tempReg, nullptr); } } } +//------------------------------------------------------------------------ +// genRestoreCalleeSavedRegistersHelp: Restore the callee-saved registers in 'regsToRestoreMask' from the stack frame +// in the function or funclet epilog. This exactly reverses the actions of genSaveCalleeSavedRegistersHelp(). +// +// Arguments: +// regsToRestoreMask - The mask of callee-saved registers to restore. If empty, this function does nothing. +// lowestCalleeSavedOffset - The offset from SP that is the beginning of the callee-saved register area. 
+// spDelta - If non-zero, the amount to add to SP after the register restores (must be positive or +// zero). +// +// Here's an example restore sequence: +// ld s11, #xxx(sp) +// ld s10, #xxx(sp) +// ld s9, #xxx(sp) +// ld s8, #xxx(sp) +// ld s7, #xxx(sp) +// ld s6, #xxx(sp) +// ld s5, #xxx(sp) +// ld s4, #xxx(sp) +// ld s3, #xxx(sp) +// ld s2, #xxx(sp) +// ld s1, #xxx(sp) +// +// Note you call the unwind functions specifying the prolog operation that is being un-done. So, for example, when +// generating a post-indexed load, you call the unwind function for specifying the corresponding preindexed store. +// +// Return Value: +// None. void CodeGen::genRestoreCalleeSavedRegistersHelp(regMaskTP regsToRestoreMask, int lowestCalleeSavedOffset, int spDelta) { assert(spDelta >= 0); @@ -464,7 +712,7 @@ void CodeGen::genRestoreCalleeSavedRegistersHelp(regMaskTP regsToRestoreMask, in return; } - assert((spDelta % 16) == 0); + assert((spDelta % STACK_ALIGN) == 0); // We also can restore FP and RA, even though they are not in RBM_CALLEE_SAVED. assert(regsToRestoreCount <= genCountBits(RBM_CALLEE_SAVED | RBM_FP | RBM_RA)); @@ -495,14 +743,102 @@ void CodeGen::genRestoreCalleeSavedRegistersHelp(regMaskTP regsToRestoreMask, in } } +// clang-format off +/***************************************************************************** + * + * Generates code for an EH funclet prolog. 
+ * + * Funclets have the following incoming arguments: + * + * catch: a0 = the exception object that was caught (see GT_CATCH_ARG) + * filter: a0 = the exception object to filter (see GT_CATCH_ARG), a1 = CallerSP of the containing function + * finally/fault: none + * + * Funclets set the following registers on exit: + * + * catch: a0 = the address at which execution should resume (see BBJ_EHCATCHRET) + * filter: a0 = non-zero if the handler should handle the exception, zero otherwise (see GT_RETFILT) + * finally/fault: none + * + * The RISC-V64 funclet prolog is the following (Note: #framesz is total funclet frame size, + * including everything; #outsz is outgoing argument space. #framesz must be a multiple of 16): + * + * The prolog looks like: + * addi sp, sp, -#framesz ; establish the frame + * sd s1, #outsz(sp) ; save callee-saved registers, as necessary + * sd s2, #(outsz+8)(sp) + * sd ra, #(outsz+?)(sp) ; save RA (8 bytes) + * sd fp, #(outsz+?+8)(sp) ; save FP (8 bytes) + * + * The funclet frame layout: + * + * | | + * |-----------------------| + * | incoming arguments | + * +=======================+ <---- Caller's SP + * | Arguments Or | // if needed + * | Varargs regs space | // Only for varargs functions; NYI on RV64 + * |-----------------------| + * | MonitorAcquired | // 8 bytes; for synchronized methods + * |-----------------------| + * | PSP slot | // 8 bytes (omitted in NativeAOT ABI) + * |-----------------------| + * ~ alignment padding ~ // To make the whole frame 16 byte aligned + * |-----------------------| + * | Saved FP | // 8 bytes + * |-----------------------| + * | Saved RA | // 8 bytes + * |-----------------------| + * |Callee saved registers | // multiple of 8 bytes, not including RA/FP + * |-----------------------| + * | Outgoing arg space | // multiple of 8 bytes; if required (i.e., #outsz != 0) + * |-----------------------| <---- Ambient SP + * | | | + * ~ | Stack grows ~ + * | | downward | + * V + * + * Note that SP only changes once. 
That means, there will be a maximum of one alignment slot needed. + * Also remember, the stack pointer needs to be 16 byte aligned at all times. + * The size of the PSP slot plus callee-saved registers space is a maximum of 280 bytes: + * + * RA,FP registers + * 11 int callee-saved register s1-s11 + * 12 float callee-saved registers f8-f9, f18-f27 + * 8 saved integer argument registers a0-a7, if varargs function support. + * 1 PSP slot + * 1 alignment slot or monitor acquired slot + * == 35 slots * 8 bytes = 280 bytes. + * + * The outgoing argument size, however, can be very large, if we call a function that takes a large number of + * arguments (note that we currently use the same outgoing argument space size in the funclet as for the main + * function, even if the funclet doesn't have any calls, or has a much smaller, or larger, maximum number of + * outgoing arguments for any call). In that case, we need to 16-byte align the initial change to SP, before + * saving off the callee-saved registers and establishing the PSPsym, so we can use the limited immediate offset + * encodings we have available, before doing another 16-byte aligned SP adjustment to create the outgoing argument + * space. Both changes to SP might need to add alignment padding. 
+ * + * An example epilog sequence: + * addi sp, sp, #outsz ; if any outgoing argument space + * ld s1, #(xxx-8)(sp) ; restore callee-saved registers + * ld s2, #xxx(sp) + * ld ra, #(xxx+?-8)(sp) ; restore RA + * ld fp, #(xxx+?)(sp) ; restore FP + * addi sp, sp, #framesz + * jarl zero, ra + */ // clang-format on void CodeGen::genFuncletProlog(BasicBlock* block) { #ifdef DEBUG if (verbose) + { printf("*************** In genFuncletProlog()\n"); + } #endif + // TODO-RISCV64: Implement varargs (NYI_RISCV64) + // TODO-RISCV64-CQ: We can use C extension for optimization assert(block != NULL); assert(block->HasFlag(BBF_FUNCLET_BEG)); @@ -513,15 +849,10 @@ void CodeGen::genFuncletProlog(BasicBlock* block) compiler->unwindBegProlog(); - regMaskTP maskSaveRegsFloat = genFuncletInfo.fiSaveRegs & RBM_ALLFLOAT; - regMaskTP maskSaveRegsInt = genFuncletInfo.fiSaveRegs & ~maskSaveRegsFloat; - - // Funclets must always save RA and FP, since when we have funclets we must have an FP frame. - assert((maskSaveRegsInt & RBM_RA) != 0); - assert((maskSaveRegsInt & RBM_FP) != 0); + const bool isFilter = (block->bbCatchTyp == BBCT_FILTER); + const int frameSize = genFuncletInfo.fiSpDelta; - bool isFilter = (block->bbCatchTyp == BBCT_FILTER); - int frameSize = genFuncletInfo.fiSpDelta1; + assert(frameSize < 0); regMaskTP maskArgRegsLiveIn; if (isFilter) @@ -537,62 +868,53 @@ void CodeGen::genFuncletProlog(BasicBlock* block) maskArgRegsLiveIn = RBM_A0; } -#ifdef DEBUG - if (compiler->opts.disAsm) - { - printf("DEBUG: CodeGen::genFuncletProlog, frameType:%d\n\n", genFuncletInfo.fiFrameType); - } -#endif + regMaskTP maskSaveRegs = genFuncletInfo.fiSaveRegs & RBM_CALLEE_SAVED; + int regsSavedSize = (compiler->compCalleeRegsPushed - 2) << 3; - int offset = 0; - if (genFuncletInfo.fiFrameType == 1) - { - // fiFrameType constraints: - assert(frameSize < 0); - assert(frameSize >= -2048); + int calleeSavedDelta = genFuncletInfo.fiSP_to_CalleeSaved_delta; - 
assert(genFuncletInfo.fiSP_to_FPRA_save_delta < 2040); - genStackPointerAdjustment(frameSize, rsGetRsvdReg(), nullptr, /* reportUnwindData */ true); + emitter* emit = GetEmitter(); - GetEmitter()->emitIns_R_R_I(INS_sd, EA_PTRSIZE, REG_FP, REG_SPBASE, genFuncletInfo.fiSP_to_FPRA_save_delta); - compiler->unwindSaveReg(REG_FP, genFuncletInfo.fiSP_to_FPRA_save_delta); + if (calleeSavedDelta + regsSavedSize + genFuncletInfo.fiCalleeSavedPadding <= 2040) + { + calleeSavedDelta += genFuncletInfo.fiCalleeSavedPadding; + + // addi sp, sp, #frameSize + genStackPointerAdjustment(frameSize, REG_SCRATCH, nullptr, /* reportUnwindData */ true); - GetEmitter()->emitIns_R_R_I(INS_sd, EA_PTRSIZE, REG_RA, REG_SPBASE, genFuncletInfo.fiSP_to_FPRA_save_delta + 8); - compiler->unwindSaveReg(REG_RA, genFuncletInfo.fiSP_to_FPRA_save_delta + 8); + genSaveCalleeSavedRegistersHelp(maskSaveRegs, calleeSavedDelta, 0); + calleeSavedDelta += regsSavedSize; - maskSaveRegsInt &= ~(RBM_RA | RBM_FP); // We've saved these now + // sd ra, #calleeSavedDelta(sp) + emit->emitIns_R_R_I(INS_sd, EA_PTRSIZE, REG_RA, REG_SPBASE, calleeSavedDelta); + compiler->unwindSaveReg(REG_RA, calleeSavedDelta); - genSaveCalleeSavedRegistersHelp(maskSaveRegsInt | maskSaveRegsFloat, genFuncletInfo.fiSP_to_PSP_slot_delta + 8, - 0); + // sd fp, #(calleeSavedDelta+8)(sp) + emit->emitIns_R_R_I(INS_sd, EA_PTRSIZE, REG_FP, REG_SPBASE, calleeSavedDelta + 8); + compiler->unwindSaveReg(REG_FP, calleeSavedDelta + 8); } - else if (genFuncletInfo.fiFrameType == 2) + else { - // fiFrameType constraints: - assert(frameSize < -2048); - - offset = -frameSize - genFuncletInfo.fiSP_to_FPRA_save_delta; - int spDelta = roundUp((UINT)offset, STACK_ALIGN); - offset = spDelta - offset; + assert(frameSize < -2040); - genStackPointerAdjustment(-spDelta, rsGetRsvdReg(), nullptr, /* reportUnwindData */ true); + int spDelta = frameSize + calleeSavedDelta; - GetEmitter()->emitIns_R_R_I(INS_sd, EA_PTRSIZE, REG_FP, REG_SPBASE, offset); - 
compiler->unwindSaveReg(REG_FP, offset); + // addi sp, sp, #spDelta + genStackPointerAdjustment(spDelta, REG_SCRATCH, nullptr, /* reportUnwindData */ true); - GetEmitter()->emitIns_R_R_I(INS_sd, EA_PTRSIZE, REG_RA, REG_SPBASE, offset + 8); - compiler->unwindSaveReg(REG_RA, offset + 8); + genSaveCalleeSavedRegistersHelp(maskSaveRegs, genFuncletInfo.fiCalleeSavedPadding, 0); + regsSavedSize += genFuncletInfo.fiCalleeSavedPadding; - maskSaveRegsInt &= ~(RBM_RA | RBM_FP); // We've saved these now + // sd ra, #regsSavedSize(sp) + emit->emitIns_R_R_I(INS_sd, EA_PTRSIZE, REG_RA, REG_SPBASE, regsSavedSize); + compiler->unwindSaveReg(REG_RA, regsSavedSize); - offset = frameSize + spDelta + genFuncletInfo.fiSP_to_PSP_slot_delta + 8; - genSaveCalleeSavedRegistersHelp(maskSaveRegsInt | maskSaveRegsFloat, offset, 0); + // sd fp, #(regsSavedSize+8)(sp) + emit->emitIns_R_R_I(INS_sd, EA_PTRSIZE, REG_FP, REG_SPBASE, regsSavedSize + 8); + compiler->unwindSaveReg(REG_FP, regsSavedSize + 8); - genStackPointerAdjustment(frameSize + spDelta, rsGetRsvdReg(), nullptr, - /* reportUnwindData */ true); - } - else - { - unreached(); + // addi sp, sp -#calleeSavedDelta + genStackPointerAdjustment(-calleeSavedDelta, REG_SCRATCH, nullptr, /* reportUnwindData */ true); } // This is the end of the OS-reported prolog for purposes of unwinding @@ -638,6 +960,12 @@ void CodeGen::genFuncletProlog(BasicBlock* block) } } +/***************************************************************************** + * + * Generates code for an EH funclet epilog. + * + * See the description of frame shapes at genFuncletProlog(). 
+ */ void CodeGen::genFuncletEpilog() { #ifdef DEBUG @@ -646,93 +974,80 @@ void CodeGen::genFuncletEpilog() printf("*************** In genFuncletEpilog()\n"); } #endif + // TODO-RISCV64: Implement varargs (NYI_RISCV64) + // TODO-RISCV64-CQ: We can use C extension for optimization ScopedSetVariable _setGeneratingEpilog(&compiler->compGeneratingEpilog, true); - bool unwindStarted = false; - int frameSize = genFuncletInfo.fiSpDelta1; + compiler->unwindBegEpilog(); - if (!unwindStarted) - { - // We can delay this until we know we'll generate an unwindable instruction, if necessary. - compiler->unwindBegEpilog(); - unwindStarted = true; - } + const int frameSize = genFuncletInfo.fiSpDelta; - regMaskTP maskRestoreRegsFloat = genFuncletInfo.fiSaveRegs & RBM_ALLFLOAT; - regMaskTP maskRestoreRegsInt = genFuncletInfo.fiSaveRegs & ~maskRestoreRegsFloat; + assert(frameSize < 0); - // Funclets must always save RA and FP, since when we have funclets we must have an FP frame. - assert((maskRestoreRegsInt & RBM_RA) != 0); - assert((maskRestoreRegsInt & RBM_FP) != 0); + regMaskTP maskRestoreRegs = genFuncletInfo.fiSaveRegs & RBM_CALLEE_SAVED; + int regsRestoreSize = (compiler->compCalleeRegsPushed - 2) << 3; -#ifdef DEBUG - if (compiler->opts.disAsm) - { - printf("DEBUG: CodeGen::genFuncletEpilog, frameType:%d\n\n", genFuncletInfo.fiFrameType); - } -#endif + int calleeSavedDelta = genFuncletInfo.fiSP_to_CalleeSaved_delta; - regMaskTP regsToRestoreMask = maskRestoreRegsInt | maskRestoreRegsFloat; + emitter* emit = GetEmitter(); + regNumber tempReg = rsGetRsvdReg(); - assert(frameSize < 0); - if (genFuncletInfo.fiFrameType == 1) + if (calleeSavedDelta + regsRestoreSize + genFuncletInfo.fiCalleeSavedPadding <= 2040) { - // fiFrameType constraints: - assert(frameSize >= -2048); - assert(genFuncletInfo.fiSP_to_FPRA_save_delta < 2040); - - regsToRestoreMask &= ~(RBM_RA | RBM_FP); // We restore FP/RA at the end - - genRestoreCalleeSavedRegistersHelp(regsToRestoreMask, 
genFuncletInfo.fiSP_to_PSP_slot_delta + 8, 0); + calleeSavedDelta += genFuncletInfo.fiCalleeSavedPadding; + genRestoreCalleeSavedRegistersHelp(maskRestoreRegs, calleeSavedDelta, 0); + calleeSavedDelta += regsRestoreSize; - GetEmitter()->emitIns_R_R_I(INS_ld, EA_PTRSIZE, REG_RA, REG_SPBASE, genFuncletInfo.fiSP_to_FPRA_save_delta + 8); - compiler->unwindSaveReg(REG_RA, genFuncletInfo.fiSP_to_FPRA_save_delta + 8); + // ld ra, #calleeSavedDelta(sp) + emit->emitIns_R_R_I(INS_ld, EA_PTRSIZE, REG_RA, REG_SPBASE, calleeSavedDelta); + compiler->unwindSaveReg(REG_RA, calleeSavedDelta); - GetEmitter()->emitIns_R_R_I(INS_ld, EA_PTRSIZE, REG_FP, REG_SPBASE, genFuncletInfo.fiSP_to_FPRA_save_delta); - compiler->unwindSaveReg(REG_FP, genFuncletInfo.fiSP_to_FPRA_save_delta); + // ld fp, #(calleeSavedDelta+8)(sp) + emit->emitIns_R_R_I(INS_ld, EA_PTRSIZE, REG_FP, REG_SPBASE, calleeSavedDelta + 8); + compiler->unwindSaveReg(REG_FP, calleeSavedDelta + 8); - // generate daddiu SP,SP,imm - genStackPointerAdjustment(-frameSize, rsGetRsvdReg(), nullptr, /* reportUnwindData */ true); + // addi sp, sp, -#frameSize + genStackPointerAdjustment(-frameSize, tempReg, nullptr, /* reportUnwindData */ true); } - else if (genFuncletInfo.fiFrameType == 2) + else { - // fiFrameType constraints: - assert(frameSize < -2048); - - int offset = -frameSize - genFuncletInfo.fiSP_to_FPRA_save_delta; - int spDelta = roundUp((UINT)offset, STACK_ALIGN); - offset = spDelta - offset; - - // first, generate daddiu SP,SP,imm - genStackPointerAdjustment(-frameSize - spDelta, rsGetRsvdReg(), nullptr, - /* reportUnwindData */ true); + assert(frameSize < -2040); - int offset2 = frameSize + spDelta + genFuncletInfo.fiSP_to_PSP_slot_delta + 8; - assert(offset2 < 2040); // can amend. 
+ // addi sp, sp, #calleeSavedDelta + genStackPointerAdjustment(calleeSavedDelta, tempReg, nullptr, /* reportUnwindData */ true); - regsToRestoreMask &= ~(RBM_RA | RBM_FP); // We restore FP/RA at the end - genRestoreCalleeSavedRegistersHelp(regsToRestoreMask, offset2, 0); + genRestoreCalleeSavedRegistersHelp(maskRestoreRegs, genFuncletInfo.fiCalleeSavedPadding, 0); + regsRestoreSize += genFuncletInfo.fiCalleeSavedPadding; - GetEmitter()->emitIns_R_R_I(INS_ld, EA_PTRSIZE, REG_RA, REG_SPBASE, offset + 8); - compiler->unwindSaveReg(REG_RA, offset + 8); + // ld ra, #regsRestoreSize(sp) + emit->emitIns_R_R_I(INS_ld, EA_PTRSIZE, REG_RA, REG_SPBASE, regsRestoreSize); + compiler->unwindSaveReg(REG_RA, regsRestoreSize); - GetEmitter()->emitIns_R_R_I(INS_ld, EA_PTRSIZE, REG_FP, REG_SPBASE, offset); - compiler->unwindSaveReg(REG_FP, offset); + // ld fp, #(regsRestoreSize+8)(sp) + emit->emitIns_R_R_I(INS_ld, EA_PTRSIZE, REG_FP, REG_SPBASE, regsRestoreSize + 8); + compiler->unwindSaveReg(REG_FP, regsRestoreSize + 8); - // second, generate daddiu SP,SP,imm for remaine space. - genStackPointerAdjustment(spDelta, rsGetRsvdReg(), nullptr, /* reportUnwindData */ true); + // addi sp, sp, -#(frameSize + calleeSavedDelta) + genStackPointerAdjustment(-(frameSize + calleeSavedDelta), tempReg, nullptr, /* reportUnwindData */ true); } - else - { - unreached(); - } - GetEmitter()->emitIns_R_R_I(INS_jalr, emitActualTypeSize(TYP_I_IMPL), REG_R0, REG_RA, 0); + + // jarl zero, ra + emit->emitIns_R_R_I(INS_jalr, emitActualTypeSize(TYP_I_IMPL), REG_R0, REG_RA, 0); compiler->unwindReturn(REG_RA); compiler->unwindEndEpilog(); } +/***************************************************************************** + * + * Capture the information used to generate the funclet prologs and epilogs. + * Note that all funclet prologs are identical, and all funclet epilogs are + * identical (per type: filters are identical, and non-filters are identical). + * Thus, we compute the data used for these just once. 
+ * + * See genFuncletProlog() for more information about the prolog/epilog sequences. + */ void CodeGen::genCaptureFuncletPrologEpilogInfo() { if (!compiler->ehAnyFunclets()) @@ -745,87 +1060,87 @@ void CodeGen::genCaptureFuncletPrologEpilogInfo() // The frame size and offsets must be finalized assert(compiler->lvaDoneFrameLayout == Compiler::FINAL_FRAME_LAYOUT); - genFuncletInfo.fiFunction_CallerSP_to_FP_delta = genCallerSPtoFPdelta(); - regMaskTP rsMaskSaveRegs = regSet.rsMaskCalleeSaved; assert((rsMaskSaveRegs & RBM_RA) != 0); assert((rsMaskSaveRegs & RBM_FP) != 0); unsigned pspSize = (compiler->lvaPSPSym != BAD_VAR_NUM) ? 8 : 0; - unsigned saveRegsCount = genCountBits(rsMaskSaveRegs); - assert((saveRegsCount == compiler->compCalleeRegsPushed) || (saveRegsCount == compiler->compCalleeRegsPushed - 1)); + // If there is a PSP slot, we have to pad the funclet frame size for OSR. + // For more details see CodeGen::genFuncletProlog + // + unsigned osrPad = 0; + if (compiler->opts.IsOSR() && (pspSize != 0)) + { + osrPad = compiler->info.compPatchpointInfo->TotalFrameSize(); - unsigned saveRegsPlusPSPSize = - roundUp((UINT)genTotalFrameSize(), STACK_ALIGN) - compiler->compLclFrameSize + pspSize; + // osrPad must be aligned to stackSize + assert(osrPad % STACK_ALIGN == 0); + } - unsigned saveRegsPlusPSPSizeAligned = roundUp(saveRegsPlusPSPSize, STACK_ALIGN); + genFuncletInfo.fiCalleeSavedPadding = 0; + genFuncletInfo.fiFunction_CallerSP_to_FP_delta = genCallerSPtoFPdelta() - osrPad; - assert(compiler->lvaOutgoingArgSpaceSize % REGSIZE_BYTES == 0); - unsigned outgoingArgSpaceAligned = roundUp(compiler->lvaOutgoingArgSpaceSize, STACK_ALIGN); + unsigned savedRegsSize = genCountBits(rsMaskSaveRegs); + assert(savedRegsSize == compiler->compCalleeRegsPushed); + savedRegsSize <<= 3; - unsigned maxFuncletFrameSizeAligned = saveRegsPlusPSPSizeAligned + outgoingArgSpaceAligned; - assert((maxFuncletFrameSizeAligned % STACK_ALIGN) == 0); + unsigned saveRegsPlusPSPSize = 
savedRegsSize + pspSize; - int spToFpraSaveDelta = compiler->lvaOutgoingArgSpaceSize; + assert(compiler->lvaOutgoingArgSpaceSize % REGSIZE_BYTES == 0); + unsigned outgoingArgSpaceAligned = roundUp(compiler->lvaOutgoingArgSpaceSize, STACK_ALIGN); - unsigned funcletFrameSize = saveRegsPlusPSPSize + compiler->lvaOutgoingArgSpaceSize; + unsigned funcletFrameSize = osrPad + saveRegsPlusPSPSize + compiler->lvaOutgoingArgSpaceSize; unsigned funcletFrameSizeAligned = roundUp(funcletFrameSize, STACK_ALIGN); - assert(funcletFrameSizeAligned <= maxFuncletFrameSizeAligned); - - unsigned funcletFrameAlignmentPad = funcletFrameSizeAligned - funcletFrameSize; - assert((funcletFrameAlignmentPad == 0) || (funcletFrameAlignmentPad == REGSIZE_BYTES)); - if (maxFuncletFrameSizeAligned <= (2048 - 8)) - { - genFuncletInfo.fiFrameType = 1; - saveRegsPlusPSPSize -= 2 * 8; // FP/RA - } - else + int SP_to_CalleeSaved_delta = compiler->lvaOutgoingArgSpaceSize; + if ((SP_to_CalleeSaved_delta + savedRegsSize) >= 2040) { - unsigned saveRegsPlusPSPAlignmentPad = saveRegsPlusPSPSizeAligned - saveRegsPlusPSPSize; - assert((saveRegsPlusPSPAlignmentPad == 0) || (saveRegsPlusPSPAlignmentPad == REGSIZE_BYTES)); + int offset = funcletFrameSizeAligned - SP_to_CalleeSaved_delta; + SP_to_CalleeSaved_delta = AlignUp((UINT)offset, STACK_ALIGN); - genFuncletInfo.fiFrameType = 2; - saveRegsPlusPSPSize -= 2 * 8; // FP/RA + genFuncletInfo.fiCalleeSavedPadding = SP_to_CalleeSaved_delta - offset; } - int callerSpToPspSlotDelta = -(int)saveRegsPlusPSPSize; - genFuncletInfo.fiSpDelta1 = -(int)funcletFrameSizeAligned; - int spToPspSlotDelta = funcletFrameSizeAligned - saveRegsPlusPSPSize; + if (compiler->lvaMonAcquired != BAD_VAR_NUM && !compiler->opts.IsOSR()) + { + // We furthermore allocate the "monitor acquired" bool between PSP and + // the saved registers because this is part of the EnC header. + // Note that OSR methods reuse the monitor bool created by tier 0. 
+ osrPad += compiler->lvaLclSize(compiler->lvaMonAcquired); + } /* Now save it for future use */ - genFuncletInfo.fiSaveRegs = rsMaskSaveRegs; - genFuncletInfo.fiSP_to_FPRA_save_delta = spToFpraSaveDelta; - - genFuncletInfo.fiSP_to_PSP_slot_delta = spToPspSlotDelta; - genFuncletInfo.fiCallerSP_to_PSP_slot_delta = callerSpToPspSlotDelta; + genFuncletInfo.fiSpDelta = -(int)funcletFrameSizeAligned; + genFuncletInfo.fiSaveRegs = rsMaskSaveRegs; + genFuncletInfo.fiSP_to_CalleeSaved_delta = SP_to_CalleeSaved_delta; + genFuncletInfo.fiSP_to_PSP_slot_delta = funcletFrameSizeAligned - osrPad - 8; + genFuncletInfo.fiCallerSP_to_PSP_slot_delta = -(int)osrPad - 8; #ifdef DEBUG if (verbose) { printf("\n"); printf("Funclet prolog / epilog info\n"); - printf(" Save regs: "); + printf(" Save regs: "); dspRegMask(genFuncletInfo.fiSaveRegs); printf("\n"); - printf(" Function CallerSP-to-FP delta: %d\n", genFuncletInfo.fiFunction_CallerSP_to_FP_delta); - printf(" SP to FP/RA save location delta: %d\n", genFuncletInfo.fiSP_to_FPRA_save_delta); - printf(" Frame type: %d\n", genFuncletInfo.fiFrameType); - printf(" SP delta 1: %d\n", genFuncletInfo.fiSpDelta1); - - if (compiler->lvaPSPSym != BAD_VAR_NUM) + if (compiler->opts.IsOSR()) { - if (callerSpToPspSlotDelta != compiler->lvaGetCallerSPRelativeOffset(compiler->lvaPSPSym)) // for - // debugging - { - printf("lvaGetCallerSPRelativeOffset(lvaPSPSym): %d\n", - compiler->lvaGetCallerSPRelativeOffset(compiler->lvaPSPSym)); - } + printf(" OSR Pad: %d\n", osrPad); } + printf(" Function CallerSP-to-FP delta: %d\n", genFuncletInfo.fiFunction_CallerSP_to_FP_delta); + printf(" SP to CalleeSaved location delta: %d\n", genFuncletInfo.fiSP_to_CalleeSaved_delta); + printf(" SP delta: %d\n", genFuncletInfo.fiSpDelta); } + assert(genFuncletInfo.fiSP_to_CalleeSaved_delta >= 0); - assert(genFuncletInfo.fiSP_to_FPRA_save_delta >= 0); + if (compiler->lvaPSPSym != BAD_VAR_NUM) + { + assert(genFuncletInfo.fiCallerSP_to_PSP_slot_delta == + 
compiler->lvaGetCallerSPRelativeOffset(compiler->lvaPSPSym)); // same offset used in main function and + // funclet! + } #endif // DEBUG } @@ -1008,14 +1323,19 @@ void CodeGen::genSetPSPSym(regNumber initReg, bool* pInitRegZeroed) noway_assert(isFramePointerUsed()); // We need an explicit frame pointer - int spToCallerSpDelta = -genCallerSPtoInitialSPdelta(); + int SPtoCallerSPdelta = -genCallerSPtoInitialSPdelta(); + + if (compiler->opts.IsOSR()) + { + SPtoCallerSPdelta += compiler->info.compPatchpointInfo->TotalFrameSize(); + } // We will just use the initReg since it is an available register // and we are probably done using it anyway... regNumber regTmp = initReg; *pInitRegZeroed = false; - genInstrWithConstant(INS_addi, EA_PTRSIZE, regTmp, REG_SPBASE, spToCallerSpDelta, rsGetRsvdReg(), false); + genInstrWithConstant(INS_addi, EA_PTRSIZE, regTmp, REG_SPBASE, SPtoCallerSPdelta, regTmp, false); GetEmitter()->emitIns_S_R(INS_sd, EA_PTRSIZE, regTmp, compiler->lvaPSPSym, 0); } @@ -1272,7 +1592,7 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTre emitAttr size = emitActualTypeSize(tree); double constValue = tree->AsDblCon()->DconValue(); - // Make sure we use "daddiu reg, zero, 0x00" only for positive zero (0.0) + // Make sure we use "fmv.w.x reg, zero" only for positive zero (0.0) // and not for negative zero (-0.0) if (FloatingPointUtils::isPositiveZero(constValue)) { @@ -1290,12 +1610,10 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTre CORINFO_FIELD_HANDLE hnd = emit->emitFltOrDblConst(constValue, size); // Load the FP constant. - assert(targetReg >= REG_F0); - - instruction ins = size == EA_4BYTE ? INS_flw : INS_fld; + assert(emit->isFloatReg(targetReg)); // Compute the address of the FP constant and load the data. - emit->emitIns_R_C(ins, size, targetReg, REG_NA, hnd, 0); + emit->emitIns_R_C(size == EA_4BYTE ? 
INS_flw : INS_fld, size, targetReg, REG_NA, hnd, 0); } } break; @@ -1685,12 +2003,18 @@ void CodeGen::genLclHeap(GenTree* tree) BasicBlock* endLabel = nullptr; // can optimize for riscv64. unsigned stackAdjustment = 0; const target_ssize_t ILLEGAL_LAST_TOUCH_DELTA = (target_ssize_t)-1; - target_ssize_t lastTouchDelta = - ILLEGAL_LAST_TOUCH_DELTA; // The number of bytes from SP to the last stack address probed. + + // The number of bytes from SP to the last stack address probed. + target_ssize_t lastTouchDelta = ILLEGAL_LAST_TOUCH_DELTA; noway_assert(isFramePointerUsed()); // localloc requires Frame Pointer to be established since SP changes noway_assert(genStackLevel == 0); // Can't have anything on the stack + const target_ssize_t pageSize = compiler->eeGetPageSize(); + + // According to RISC-V Privileged ISA page size is 4KiB + noway_assert(pageSize == 0x1000); + // compute the amount of memory to allocate to properly STACK_ALIGN. size_t amount = 0; if (size->IsCnsIntOrI()) @@ -1792,7 +2116,7 @@ void CodeGen::genLclHeap(GenTree* tree) goto ALLOC_DONE; } } - else if (amount < compiler->eeGetPageSize()) // must be < not <= + else if (amount < pageSize) // must be < not <= { // Since the size is less than a page, simply adjust the SP value. 
// The SP might already be in the guard page, so we must touch it BEFORE @@ -1882,8 +2206,6 @@ void CodeGen::genLclHeap(GenTree* tree) // addi regCnt, REG_R0, 0 // // Skip: - // sub regCnt, SP, regCnt - // // lui regTmp, eeGetPageSize()>>12 // Loop: // lw r0, 0(SP) // tickle the page - read from the page @@ -1899,20 +2221,19 @@ void CodeGen::genLclHeap(GenTree* tree) if (tempReg == REG_NA) tempReg = tree->ExtractTempReg(); - regNumber regTmp = tree->GetSingleTempReg(); + regNumber rPageSize = tree->GetSingleTempReg(); assert(regCnt != tempReg); emit->emitIns_R_R_R(INS_sltu, EA_PTRSIZE, tempReg, REG_SPBASE, regCnt); - //// subu regCnt, SP, regCnt // regCnt now holds ultimate SP + // sub regCnt, SP, regCnt // regCnt now holds ultimate SP emit->emitIns_R_R_R(INS_sub, EA_PTRSIZE, regCnt, REG_SPBASE, regCnt); // Overflow, set regCnt to lowest possible value emit->emitIns_R_R_I(INS_beq, EA_PTRSIZE, tempReg, REG_R0, 2 << 2); emit->emitIns_R_R_I(INS_addi, EA_PTRSIZE, regCnt, REG_R0, 0); - assert(compiler->eeGetPageSize() == ((compiler->eeGetPageSize() >> 12) << 12)); - emit->emitIns_R_I(INS_lui, EA_PTRSIZE, regTmp, compiler->eeGetPageSize() >> 12); + emit->emitIns_R_I(INS_lui, EA_PTRSIZE, rPageSize, pageSize >> 12); // genDefineTempLabel(loop); @@ -1920,14 +2241,14 @@ void CodeGen::genLclHeap(GenTree* tree) emit->emitIns_R_R_I(INS_lw, EA_4BYTE, REG_R0, REG_SPBASE, 0); // decrement SP by eeGetPageSize() - emit->emitIns_R_R_R(INS_sub, EA_PTRSIZE, tempReg, REG_SPBASE, regTmp); + emit->emitIns_R_R_R(INS_sub, EA_PTRSIZE, tempReg, REG_SPBASE, rPageSize); - assert(regTmp != tempReg); + assert(rPageSize != tempReg); ssize_t imm = 3 << 2; // goto done. 
emit->emitIns_R_R_I(INS_bltu, EA_PTRSIZE, tempReg, regCnt, imm); - emit->emitIns_R_R_R(INS_sub, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, regTmp); + emit->emitIns_R_R_R(INS_sub, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, rPageSize); imm = -4 << 2; // Jump to loop and tickle new stack address @@ -1951,8 +2272,7 @@ void CodeGen::genLclHeap(GenTree* tree) assert((lastTouchDelta == ILLEGAL_LAST_TOUCH_DELTA) || (lastTouchDelta >= 0)); if ((lastTouchDelta == ILLEGAL_LAST_TOUCH_DELTA) || - (stackAdjustment + (unsigned)lastTouchDelta + STACK_PROBE_BOUNDARY_THRESHOLD_BYTES > - compiler->eeGetPageSize())) + (stackAdjustment + (unsigned)lastTouchDelta + STACK_PROBE_BOUNDARY_THRESHOLD_BYTES > pageSize)) { genStackPointerConstantAdjustmentLoopWithProbe(-(ssize_t)stackAdjustment, tempReg); } @@ -1969,7 +2289,7 @@ void CodeGen::genLclHeap(GenTree* tree) else // stackAdjustment == 0 { // Move the final value of SP to targetReg - GetEmitter()->emitIns_R_R_I(INS_ori, EA_PTRSIZE, targetReg, REG_SPBASE, 0); + emit->emitIns_R_R_I(INS_ori, EA_PTRSIZE, targetReg, REG_SPBASE, 0); } BAILOUT: @@ -2306,6 +2626,27 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node) } } +// Generate code for CpObj nodes which copy structs that have interleaved +// GC pointers. +// For this case we'll generate a sequence of loads/stores in the case of struct +// slots that don't contain GC pointers. 
The generated code will look like: +// ld tempReg, 8(a5) +// sd tempReg, 8(a6) +// +// In the case of a GC-Pointer we'll call the ByRef write barrier helper +// who happens to use the same registers as the previous call to maintain +// the same register requirements and register killsets: +// call CORINFO_HELP_ASSIGN_BYREF +// +// So finally an example would look like this: +// ld tempReg, 8(a5) +// sd tempReg 8(a6) +// call CORINFO_HELP_ASSIGN_BYREF +// ld tempReg, 8(a5) +// sd tempReg, 8(a6) +// call CORINFO_HELP_ASSIGN_BYREF +// ld tempReg, 8(a5) +// sd tempReg, 8(a6) void CodeGen::genCodeForCpObj(GenTreeBlk* cpObjNode) { GenTree* dstAddr = cpObjNode->Addr(); @@ -3499,28 +3840,7 @@ void CodeGen::genCodeForCompare(GenTreeOp* tree) { assert(targetReg != REG_NA); assert(tree->TypeGet() != TYP_VOID); - if (op1->isContainedIntOrIImmed()) - { - op1 = tree->gtOp2; - op2 = tree->gtOp1; - switch (tree->OperGet()) - { - case GT_LT: - tree->SetOper(GT_GT); - break; - case GT_LE: - tree->SetOper(GT_GE); - break; - case GT_GT: - tree->SetOper(GT_LT); - break; - case GT_GE: - tree->SetOper(GT_LE); - break; - default: - break; - } - } + assert(!op1->isContainedIntOrIImmed()); assert(tree->OperIs(GT_LT, GT_LE, GT_EQ, GT_NE, GT_GT, GT_GE)); @@ -3936,8 +4256,9 @@ void CodeGen::genCodeForJumpCompare(GenTreeOpCC* tree) int CodeGenInterface::genSPtoFPdelta() const { assert(isFramePointerUsed()); + assert(compiler->compCalleeRegsPushed >= 2); - int delta = compiler->lvaOutgoingArgSpaceSize; + int delta = compiler->lvaOutgoingArgSpaceSize + (compiler->compCalleeRegsPushed << 3) - 8; assert(delta >= 0); return delta; @@ -3991,9 +4312,7 @@ int CodeGenInterface::genCallerSPtoFPdelta() const int CodeGenInterface::genCallerSPtoInitialSPdelta() const { - int callerSPtoSPdelta = 0; - - callerSPtoSPdelta -= genTotalFrameSize(); + int callerSPtoSPdelta = -genTotalFrameSize(); assert(callerSPtoSPdelta <= 0); return callerSPtoSPdelta; @@ -4831,27 +5150,34 @@ void 
CodeGen::genSetGSSecurityCookie(regNumber initReg, bool* pInitRegZeroed) return; } + if (compiler->opts.IsOSR() && compiler->info.compPatchpointInfo->HasSecurityCookie()) + { + // Security cookie is on original frame and was initialized there. + return; + } + + emitter* emit = GetEmitter(); + if (compiler->gsGlobalSecurityCookieAddr == nullptr) { noway_assert(compiler->gsGlobalSecurityCookieVal != 0); instGen_Set_Reg_To_Imm(EA_PTRSIZE, initReg, compiler->gsGlobalSecurityCookieVal); - GetEmitter()->emitIns_S_R(INS_sd, EA_PTRSIZE, initReg, compiler->lvaGSSecurityCookie, 0); + emit->emitIns_S_R(INS_sd, EA_PTRSIZE, initReg, compiler->lvaGSSecurityCookie, 0); } else { if (compiler->opts.compReloc) { - GetEmitter()->emitIns_R_AI(INS_jalr, EA_PTR_DSP_RELOC, initReg, - (ssize_t)compiler->gsGlobalSecurityCookieAddr); + emit->emitIns_R_AI(INS_jalr, EA_PTR_DSP_RELOC, initReg, (ssize_t)compiler->gsGlobalSecurityCookieAddr); } else { - GetEmitter()->emitLoadImmediate(EA_PTRSIZE, initReg, ((size_t)compiler->gsGlobalSecurityCookieAddr)); - GetEmitter()->emitIns_R_R_I(INS_ld, EA_PTRSIZE, initReg, initReg, 0); + emit->emitLoadImmediate(EA_PTRSIZE, initReg, ((size_t)compiler->gsGlobalSecurityCookieAddr)); + emit->emitIns_R_R_I(INS_ld, EA_PTRSIZE, initReg, initReg, 0); } regSet.verifyRegUsed(initReg); - GetEmitter()->emitIns_S_R(INS_sd, EA_PTRSIZE, initReg, compiler->lvaGSSecurityCookie, 0); + emit->emitIns_S_R(INS_sd, EA_PTRSIZE, initReg, compiler->lvaGSSecurityCookie, 0); } *pInitRegZeroed = false; @@ -7156,11 +7482,172 @@ void CodeGen::genEstablishFramePointer(int delta, bool reportUnwindData) } //------------------------------------------------------------------------ -// genAllocLclFrame: Probe the stack and allocate the local stack frame: subtract from SP. +// genStackProbe: Probe the stack without changing it +// +// Notes: +// This function is using loop to probe each memory page. 
+// +// Arguments: +// frameSize - total frame size +// rOffset - usually initial register number +// rLimit - an extra register for comparison +// rPageSize - register for storing page size +// +void CodeGen::genStackProbe(ssize_t frameSize, regNumber rOffset, regNumber rLimit, regNumber rPageSize) +{ + // make sure frameSize safely fits within 4 bytes + noway_assert((ssize_t)(int)frameSize == (ssize_t)frameSize); + + const target_size_t pageSize = compiler->eeGetPageSize(); + + // According to RISC-V Privileged ISA page size should be equal 4KiB + noway_assert(pageSize == 0x1000); + emitter* emit = GetEmitter(); + + emit->emitLoadImmediate(EA_PTRSIZE, rLimit, -frameSize); + regSet.verifyRegUsed(rLimit); + + emit->emitIns_R_R_R(INS_add, EA_PTRSIZE, rLimit, rLimit, REG_SPBASE); + + emit->emitIns_R_I(INS_lui, EA_PTRSIZE, rPageSize, pageSize >> 12); + regSet.verifyRegUsed(rPageSize); + + emit->emitIns_R_R_R(INS_sub, EA_PTRSIZE, rOffset, REG_SPBASE, rPageSize); + + // Loop: + // tickle the page - Read from the updated SP - this triggers a page fault when on the guard page + emit->emitIns_R_R_I(INS_lw, EA_4BYTE, REG_R0, rOffset, 0); + emit->emitIns_R_R_R(INS_sub, EA_PTRSIZE, rOffset, rOffset, rPageSize); + + // each instr is 4 bytes + // if (rOffset >= rLimit) goto Loop; + emit->emitIns_R_R_I(INS_bge, EA_PTRSIZE, rOffset, rLimit, -2 << 2); +} + +//------------------------------------------------------------------------ +// genAllocLclFrame: Probe the stack. +// +// Notes: +// This only does the probing; allocating the frame is done when callee-saved registers are saved. +// This is done before anything has been pushed. The previous frame might have a large outgoing argument +// space that has been allocated, but the lowest addresses have not been touched. Our frame setup might +// not touch up to the first 504 bytes. This means we could miss a guard page. On Windows, however, +// there are always three guard pages, so we will not miss them all. 
On Linux, there is only one guard +// page by default, so we need to be more careful. We do an extra probe if we might not have probed +// recently enough. That is, if a call and prolog establishment might lead to missing a page. We do this +// on Windows as well just to be consistent, even though it should not be necessary. +// +// Arguments: +// frameSize - the size of the stack frame being allocated. +// initReg - register to use as a scratch register. +// pInitRegZeroed - OUT parameter. *pInitRegZeroed is set to 'false' if and only if +// this call sets 'initReg' to a non-zero value. Otherwise, it is unchanged. +// maskArgRegsLiveIn - incoming argument registers that are currently live. +// +// Return value: +// None +// void CodeGen::genAllocLclFrame(unsigned frameSize, regNumber initReg, bool* pInitRegZeroed, regMaskTP maskArgRegsLiveIn) { - NYI_RISCV64("genAllocLclFrame-----unimplemented/unused on RISCV64 yet----"); + assert(compiler->compGeneratingProlog); + + if (frameSize == 0) + { + return; + } + + // According to RISC-V Privileged ISA page size should be equal 4KiB + const target_size_t pageSize = compiler->eeGetPageSize(); + + assert(!compiler->info.compPublishStubParam || (REG_SECRET_STUB_PARAM != initReg)); + + target_size_t lastTouchDelta = 0; + + emitter* emit = GetEmitter(); + + // Emit the following sequence to 'tickle' the pages. + // Note it is important that stack pointer not change until this is complete since the tickles + // could cause a stack overflow, and we need to be able to crawl the stack afterward + // (which means the stack pointer needs to be known). 
+ + if (frameSize < pageSize) + { + // no probe needed + lastTouchDelta = frameSize; + } + else if (frameSize < 3 * pageSize) + { + // between 1 and 3 pages we will probe each page without a loop, + // because it is faster that way and doesn't cost us much + lastTouchDelta = frameSize; + + for (target_size_t probeOffset = pageSize; probeOffset <= frameSize; probeOffset += pageSize) + { + emit->emitIns_R_I(INS_lui, EA_PTRSIZE, initReg, probeOffset >> 12); + regSet.verifyRegUsed(initReg); + + emit->emitIns_R_R_R(INS_sub, EA_PTRSIZE, initReg, REG_SPBASE, initReg); + emit->emitIns_R_R_I(INS_lw, EA_4BYTE, REG_R0, initReg, 0); + + lastTouchDelta -= pageSize; + } + + assert(pInitRegZeroed != nullptr); + *pInitRegZeroed = false; // The initReg does not contain zero + + assert(lastTouchDelta == frameSize % pageSize); + compiler->unwindPadding(); + } + else + { + // probe each page, that we need to allocate large stack frame + assert(frameSize >= 3 * pageSize); + + regMaskTP availMask = RBM_ALLINT & (regSet.rsGetModifiedRegsMask() | ~RBM_INT_CALLEE_SAVED); + availMask &= ~maskArgRegsLiveIn; // Remove all of the incoming argument registers + // as they are currently live + availMask &= ~genRegMask(initReg); // Remove the pre-calculated initReg + + noway_assert(availMask != RBM_NONE); + + regMaskTP regMask = genFindLowestBit(availMask); + regNumber rLimit = genRegNumFromMask(regMask); + + availMask &= ~regMask; // Remove rLimit register + + noway_assert(availMask != RBM_NONE); + + regMask = genFindLowestBit(availMask); + regNumber rPageSize = genRegNumFromMask(regMask); + + genStackProbe((ssize_t)frameSize, initReg, rLimit, rPageSize); + + assert(pInitRegZeroed != nullptr); + *pInitRegZeroed = false; // The initReg does not contain zero + + lastTouchDelta = frameSize % pageSize; + compiler->unwindPadding(); + } + +#if STACK_PROBE_BOUNDARY_THRESHOLD_BYTES != 0 + // if the last page was too far, we will make an extra probe at the bottom + if (lastTouchDelta + 
STACK_PROBE_BOUNDARY_THRESHOLD_BYTES > pageSize) + { + assert(lastTouchDelta + STACK_PROBE_BOUNDARY_THRESHOLD_BYTES < pageSize << 1); + + emit->emitIns_R_R_I(INS_addi, EA_PTRSIZE, initReg, REG_R0, frameSize); + regSet.verifyRegUsed(initReg); + + emit->emitIns_R_R_R(INS_sub, EA_PTRSIZE, initReg, REG_SPBASE, initReg); + emit->emitIns_R_R_I(INS_lw, EA_4BYTE, REG_R0, initReg, 0); + + assert(pInitRegZeroed != nullptr); + *pInitRegZeroed = false; // The initReg does not contain zero + + compiler->unwindPadding(); + } +#endif } inline void CodeGen::genJumpToThrowHlpBlk_la( @@ -7319,12 +7806,77 @@ void CodeGen::instGen_MemoryBarrier(BarrierKind barrierKind) /*----------------------------------------------------------------------------- * - * Push/Pop any callee-saved registers we have used + * Push/Pop any callee-saved registers we have used. + * For most frames, we generate code like: + * addi sp, sp, -#framesz ; establish the frame + * + * ; save float regs + * fsd f8, #offset(sp) + * fsd f9, #(offset+8)(sp) + * fsd f18, #(offset+16)(sp) + * ; ... + * fsd f27, #(offset+8*11)(sp) + * + * ; save int regs + * sd s1, #offset2(sp) + * sd s2, #(offset2+8)(sp) + * ; ... + * sd s11, #(offset2+8*10)(sp) + * + * ; save ra, fp + * sd ra, #offset3(sp) ; save RA (8 bytes) + * sd fp, #(offset3+8)(sp) ; save FP (8 bytes) + * + * Notes: + * 1. FP and RA are always saved; they are stored last, RA first and then FP. + * 2. General-purpose registers are 8 bytes, floating-point registers are 8 bytes. + * 3. Frames with varargs are not completely implemented and not tested! + * 4. We allocate the frame here; no further changes to SP are allowed (except in the body, for localloc). + * + * For functions with GS and localloc, we change the frame so the frame pointer and RA are saved at the top + * of the frame, just under the varargs registers (if any). Note that the funclet frames must follow the same + * rule, and both main frame and funclet frames (if any) must put PSPSym at the same offset from Caller-SP. 
+ * Since this frame type is relatively rare, we force using it via stress modes, for additional coverage. + * + * The frames look like the following (simplified to only include components that matter for establishing the + * frames). See also Compiler::lvaAssignFrameOffsets(). + * + * The RISC-V frame layout looks like this: + * + * | | + * |-----------------------| + * | incoming arguments | + * +=======================+ <---- Caller's SP + * | Arguments Or | // if needed + * | Varargs regs space | // Only for varargs functions; NYI on RV64 + * |-----------------------| + * | MonitorAcquired | // 8 bytes; for synchronized methods + * |-----------------------| + * | PSP slot | // 8 bytes (omitted in NativeAOT ABI) + * |-----------------------| + * | locals, temps, etc. | + * |-----------------------| + * | possible GS cookie | + * |-----------------------| + * | Saved FP | // 8 bytes + * |-----------------------| + * | Saved RA | // 8 bytes + * |-----------------------| + * |Callee saved registers | // not including FP/RA; multiple of 8 bytes + * |-----------------------| + * | Outgoing arg space | // multiple of 8 bytes; if required (i.e., #outsz != 0) + * |-----------------------| <---- Ambient SP + * | | | + * ~ | Stack grows ~ + * | | downward | + * V + * */ void CodeGen::genPushCalleeSavedRegisters(regNumber initReg, bool* pInitRegZeroed) { assert(compiler->compGeneratingProlog); + // Unlike on x86/x64, we can also push float registers to stack regMaskTP rsPushRegs = regSet.rsGetModifiedRegsMask() & RBM_CALLEE_SAVED; #if ETW_EBP_FRAMED @@ -7334,11 +7886,8 @@ void CodeGen::genPushCalleeSavedRegisters(regNumber initReg, bool* pInitRegZeroe } #endif - // On RISCV64 we push the FP (frame-pointer) here along with all other callee saved registers - if (isFramePointerUsed()) - { - rsPushRegs |= RBM_FPBASE; - } + // On RV64 we always use the FP (frame-pointer) + assert(isFramePointerUsed()); // // It may be possible to skip pushing/popping ra for leaf methods. 
However, such optimization would require @@ -7360,29 +7909,25 @@ void CodeGen::genPushCalleeSavedRegisters(regNumber initReg, bool* pInitRegZeroe // is not worth it. // - rsPushRegs |= RBM_RA; // We must save the return address (in the RA register). - regSet.rsMaskCalleeSaved = rsPushRegs; - regMaskTP maskSaveRegsFloat = rsPushRegs & RBM_ALLFLOAT; - regMaskTP maskSaveRegsInt = rsPushRegs & ~maskSaveRegsFloat; + // we will push callee-saved registers along with fp and ra registers to stack + regMaskTP rsPushRegsMask = rsPushRegs | RBM_FP | RBM_RA; + regSet.rsMaskCalleeSaved = rsPushRegsMask; #ifdef DEBUG - if (compiler->compCalleeRegsPushed != genCountBits(rsPushRegs)) + if (compiler->compCalleeRegsPushed != genCountBits(rsPushRegsMask)) { printf("Error: unexpected number of callee-saved registers to push. Expected: %d. Got: %d ", - compiler->compCalleeRegsPushed, genCountBits(rsPushRegs)); - dspRegMask(rsPushRegs); + compiler->compCalleeRegsPushed, genCountBits(rsPushRegsMask)); + dspRegMask(rsPushRegsMask); printf("\n"); - assert(compiler->compCalleeRegsPushed == genCountBits(rsPushRegs)); + assert(compiler->compCalleeRegsPushed == genCountBits(rsPushRegsMask)); } -#endif // DEBUG - - int totalFrameSize = genTotalFrameSize(); - int offset; // This will be the starting place for saving the callee-saved registers, in increasing order. - -#ifdef DEBUG if (verbose) { + regMaskTP maskSaveRegsFloat = rsPushRegs & RBM_FLT_CALLEE_SAVED; + regMaskTP maskSaveRegsInt = rsPushRegs & RBM_INT_CALLEE_SAVED; + printf("Save float regs: "); dspRegMask(maskSaveRegsFloat); printf("\n"); @@ -7400,96 +7945,70 @@ void CodeGen::genPushCalleeSavedRegisters(regNumber initReg, bool* pInitRegZeroe // first save instruction as a "predecrement" amount, if possible. int calleeSaveSPDelta = 0; - // By default, we'll establish the frame pointer chain. (Note that currently frames without FP are NYI.) 
- bool establishFramePointer = true; - - // If we do establish the frame pointer, what is the amount we add to SP to do so? - unsigned offsetSpToSavedFp = 0; - - if (isFramePointerUsed()) - { - // We need to save both FP and RA. - - assert((maskSaveRegsInt & RBM_FP) != 0); - assert((maskSaveRegsInt & RBM_RA) != 0); - - // If we need to generate a GS cookie, we need to make sure the saved frame pointer and return address - // (FP and RA) are protected from buffer overrun by the GS cookie. If FP/RA are at the lowest addresses, - // then they are safe, since they are lower than any unsafe buffers. And the GS cookie we add will - // protect our caller's frame. If we have a localloc, however, that is dynamically placed lower than our - // saved FP/RA. In that case, we save FP/RA along with the rest of the callee-saved registers, above - // the GS cookie. - // - // After the frame is allocated, the frame pointer is established, pointing at the saved frame pointer to - // create a frame pointer chain. - // + // If we need to generate a GS cookie, we need to make sure the saved frame pointer and return address + // (FP and RA) are protected from buffer overrun by the GS cookie. If FP/RA are at the lowest addresses, + // then they are safe, since they are lower than any unsafe buffers. And the GS cookie we add will + // protect our caller's frame. If we have a localloc, however, that is dynamically placed lower than our + // saved FP/RA. In that case, we save FP/RA along with the rest of the callee-saved registers, above + // the GS cookie. + // + // After the frame is allocated, the frame pointer is established, pointing at the saved frame pointer to + // create a frame pointer chain. + // - if (totalFrameSize < 2048) - { - GetEmitter()->emitIns_R_R_I(INS_addi, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, -totalFrameSize); - compiler->unwindAllocStack(totalFrameSize); + // This will be the starting place for saving the callee-saved registers, in increasing order. 
+ int offset = compiler->lvaOutgoingArgSpaceSize; - // Case #1. - // - // Generate: - // addi sp, sp, -framesz - // sd fp, outsz(sp) - // sd ra, outsz+8(sp) - // - // The (totalFrameSize <= 2047) condition ensures the offsets of sd/ld. - // - // After saving callee-saved registers, we establish the frame pointer with: - // daddiu fp, sp, offset-fp - // We do this *after* saving callee-saved registers, so the prolog/epilog unwind codes mostly match. + int totalFrameSize = genTotalFrameSize(); - JITDUMP("Frame type 1. #outsz=%d; #framesz=%d; LclFrameSize=%d\n", - unsigned(compiler->lvaOutgoingArgSpaceSize), totalFrameSize, compiler->compLclFrameSize); + emitter* emit = GetEmitter(); - frameType = 1; + // ensure offset of sd/ld + if (totalFrameSize <= 2040) + { + frameType = 1; - offsetSpToSavedFp = compiler->lvaOutgoingArgSpaceSize; + emit->emitIns_R_R_I(INS_addi, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, -totalFrameSize); + compiler->unwindAllocStack(totalFrameSize); - GetEmitter()->emitIns_R_R_I(INS_sd, EA_PTRSIZE, REG_FP, REG_SPBASE, offsetSpToSavedFp); - compiler->unwindSaveReg(REG_FP, offsetSpToSavedFp); + JITDUMP("Frame type 1. #outsz=%d; #framesz=%d; LclFrameSize=%d\n", unsigned(compiler->lvaOutgoingArgSpaceSize), + totalFrameSize, compiler->compLclFrameSize); + } + else + { + frameType = 2; + // we have to adjust stack pointer; probably using add instead of addi - GetEmitter()->emitIns_R_R_I(INS_sd, EA_PTRSIZE, REG_RA, REG_SPBASE, offsetSpToSavedFp + 8); - compiler->unwindSaveReg(REG_RA, offsetSpToSavedFp + 8); + JITDUMP("Frame type 2. 
#outsz=%d; #framesz=%d; LclFrameSize=%d\n", unsigned(compiler->lvaOutgoingArgSpaceSize), + totalFrameSize, compiler->compLclFrameSize); - maskSaveRegsInt &= ~(RBM_FP | RBM_RA); // We've already saved FP/RA + if ((offset + (compiler->compCalleeRegsPushed << 3)) >= 2040) + { + offset = totalFrameSize - compiler->lvaOutgoingArgSpaceSize; + calleeSaveSPDelta = AlignUp((UINT)offset, STACK_ALIGN); + offset = calleeSaveSPDelta - offset; - offset = compiler->compLclFrameSize + 2 * REGSIZE_BYTES; // FP/RA + genStackPointerAdjustment(-calleeSaveSPDelta, initReg, pInitRegZeroed, /* reportUnwindData */ true); } else { - JITDUMP("Frame type 2. #outsz=%d; #framesz=%d; LclFrameSize=%d\n", - unsigned(compiler->lvaOutgoingArgSpaceSize), totalFrameSize, compiler->compLclFrameSize); - - frameType = 2; - - maskSaveRegsInt &= ~(RBM_FP | RBM_RA); // We've already saved FP/RA - - offset = totalFrameSize - compiler->compLclFrameSize - 2 * REGSIZE_BYTES; - calleeSaveSPDelta = AlignUp((UINT)offset, STACK_ALIGN); - offset = calleeSaveSPDelta - offset; + genStackPointerAdjustment(-totalFrameSize, initReg, pInitRegZeroed, /* reportUnwindData */ true); } } - else - { - // No frame pointer (no chaining). - assert((maskSaveRegsInt & RBM_FP) == 0); - assert((maskSaveRegsInt & RBM_RA) != 0); - // Note that there is no pre-indexed save_lrpair unwind code variant, so we can't allocate the frame using - // 'sd' if we only have one callee-saved register plus RA to save. 
+ JITDUMP(" offset=%d, calleeSaveSPDelta=%d\n", offset, calleeSaveSPDelta); - NYI_RISCV64("Frame without frame pointer"); - offset = 0; - } + genSaveCalleeSavedRegistersHelp(rsPushRegs, offset, 0); + offset += (int)(genCountBits(rsPushRegs) << 3); // each reg has 8 bytes - assert(frameType != 0); + emit->emitIns_R_R_I(INS_sd, EA_PTRSIZE, REG_RA, REG_SPBASE, offset); + compiler->unwindSaveReg(REG_RA, offset); - JITDUMP(" offset=%d, calleeSaveSPDelta=%d\n", offset, calleeSaveSPDelta); - genSaveCalleeSavedRegistersHelp(maskSaveRegsInt | maskSaveRegsFloat, offset, -calleeSaveSPDelta); + emit->emitIns_R_R_I(INS_sd, EA_PTRSIZE, REG_FP, REG_SPBASE, offset + 8); + compiler->unwindSaveReg(REG_FP, offset + 8); + + JITDUMP(" offsetSpToSavedFp=%d\n", offset + 8); + genEstablishFramePointer(offset + 8, /* reportUnwindData */ true); // For varargs, home the incoming arg registers last. Note that there is nothing to unwind here, // so we just report "NOP" unwind codes. If there's no more frame setup after this, we don't @@ -7506,60 +8025,12 @@ void CodeGen::genPushCalleeSavedRegisters(regNumber initReg, bool* pInitRegZeroe printf("DEBUG: RISCV64, frameType:%d\n\n", frameType); } #endif - if (frameType == 1) - { - // offsetSpToSavedFp = genSPtoFPdelta(); - } - else if (frameType == 2) - { - if (compiler->lvaOutgoingArgSpaceSize >= 2040) - { - offset = totalFrameSize - calleeSaveSPDelta - compiler->lvaOutgoingArgSpaceSize; - calleeSaveSPDelta = AlignUp((UINT)offset, STACK_ALIGN); - offset = calleeSaveSPDelta - offset; - - genStackPointerAdjustment(-calleeSaveSPDelta, initReg, pInitRegZeroed, /* reportUnwindData */ true); - - offsetSpToSavedFp = offset; - GetEmitter()->emitIns_R_R_I(INS_sd, EA_PTRSIZE, REG_FP, REG_SPBASE, offset); - compiler->unwindSaveReg(REG_FP, offset); - - GetEmitter()->emitIns_R_R_I(INS_sd, EA_PTRSIZE, REG_RA, REG_SPBASE, offset + 8); - compiler->unwindSaveReg(REG_RA, offset + 8); - - genEstablishFramePointer(offset, /* reportUnwindData */ true); - - 
calleeSaveSPDelta = compiler->lvaOutgoingArgSpaceSize & ~0xf; - genStackPointerAdjustment(-calleeSaveSPDelta, initReg, pInitRegZeroed, /* reportUnwindData */ true); - } - else - { - calleeSaveSPDelta = totalFrameSize - calleeSaveSPDelta; - genStackPointerAdjustment(-calleeSaveSPDelta, initReg, pInitRegZeroed, /* reportUnwindData */ true); - - offset = compiler->lvaOutgoingArgSpaceSize; - - GetEmitter()->emitIns_R_R_I(INS_sd, EA_PTRSIZE, REG_FP, REG_SPBASE, offset); - compiler->unwindSaveReg(REG_FP, offset); - - GetEmitter()->emitIns_R_R_I(INS_sd, EA_PTRSIZE, REG_RA, REG_SPBASE, offset + 8); - compiler->unwindSaveReg(REG_RA, offset + 8); - - genEstablishFramePointer(offset, /* reportUnwindData */ true); - } - - establishFramePointer = false; - } - else - { - unreached(); - } - - if (establishFramePointer) + if (calleeSaveSPDelta != 0) { - JITDUMP(" offsetSpToSavedFp=%d\n", offsetSpToSavedFp); - genEstablishFramePointer(offsetSpToSavedFp, /* reportUnwindData */ true); + assert(frameType == 2); + calleeSaveSPDelta = totalFrameSize - calleeSaveSPDelta; + genStackPointerAdjustment(-calleeSaveSPDelta, initReg, pInitRegZeroed, /* reportUnwindData */ true); } } @@ -7567,149 +8038,112 @@ void CodeGen::genPopCalleeSavedRegisters(bool jmpEpilog) { assert(compiler->compGeneratingEpilog); - regMaskTP rsRestoreRegs = regSet.rsGetModifiedRegsMask() & RBM_CALLEE_SAVED; + regMaskTP regsToRestoreMask = regSet.rsGetModifiedRegsMask() & RBM_CALLEE_SAVED; - if (isFramePointerUsed()) - { - rsRestoreRegs |= RBM_FPBASE; - } - - rsRestoreRegs |= RBM_RA; // We must save/restore the return address. 
- - regMaskTP regsToRestoreMask = rsRestoreRegs; - - int totalFrameSize = genTotalFrameSize(); + // On RV64 we always use the FP (frame-pointer) + assert(isFramePointerUsed()); + int totalFrameSize = genTotalFrameSize(); + int remainingSPSize = totalFrameSize; + int callerSPtoFPdelta = 0; int calleeSaveSPOffset = 0; // This will be the starting place for restoring // the callee-saved registers, in decreasing order. - int frameType = 0; // An indicator of what type of frame we are popping. - int calleeSaveSPDelta = 0; // Amount to add to SP after callee-saved registers have been restored. - - if (isFramePointerUsed()) - { - if (totalFrameSize <= 2047) - { - if (compiler->compLocallocUsed) - { - int spToFpDelta = genSPtoFPdelta(); - // Restore sp from fp - GetEmitter()->emitIns_R_R_I(INS_addi, EA_PTRSIZE, REG_SPBASE, REG_FPBASE, -spToFpDelta); - compiler->unwindSetFrameReg(REG_FPBASE, spToFpDelta); - } - JITDUMP("Frame type 1(save FP/RA at bottom). #outsz=%d; #framesz=%d; localloc? %s\n", - unsigned(compiler->lvaOutgoingArgSpaceSize), totalFrameSize, dspBool(compiler->compLocallocUsed)); - - frameType = 1; + emitter* emit = GetEmitter(); - regsToRestoreMask &= ~(RBM_FP | RBM_RA); // We'll restore FP/RA at the end. + // ensure offset of sd/ld + if (totalFrameSize <= 2040) + { + JITDUMP("Frame type 1. #outsz=%d; #framesz=%d; localloc? %s\n", unsigned(compiler->lvaOutgoingArgSpaceSize), + totalFrameSize, dspBool(compiler->compLocallocUsed)); - calleeSaveSPOffset = compiler->compLclFrameSize + 2 * REGSIZE_BYTES; - } - else + if (compiler->compLocallocUsed) { - JITDUMP("Frame type 2(save FP/RA at bottom). #outsz=%d; #framesz=%d; #calleeSaveRegsPushed:%d; " - "localloc? 
%s\n", - unsigned(compiler->lvaOutgoingArgSpaceSize), totalFrameSize, compiler->compCalleeRegsPushed, - dspBool(compiler->compLocallocUsed)); + callerSPtoFPdelta = (compiler->compCalleeRegsPushed << 3) - 8 + compiler->lvaOutgoingArgSpaceSize; + } + calleeSaveSPOffset = compiler->lvaOutgoingArgSpaceSize; + // remainingSPSize = totalFrameSize; + } + else + { + JITDUMP("Frame type 2. #outsz=%d; #framesz=%d; calleeSaveRegsPushed: %d; localloc? %s\n", + unsigned(compiler->lvaOutgoingArgSpaceSize), totalFrameSize, compiler->compCalleeRegsPushed, + dspBool(compiler->compLocallocUsed)); - frameType = 2; + if ((compiler->lvaOutgoingArgSpaceSize + (compiler->compCalleeRegsPushed << 3)) >= 2040) + { + calleeSaveSPOffset = compiler->lvaOutgoingArgSpaceSize & 0xfffffff0; - int outSzAligned; - if (compiler->lvaOutgoingArgSpaceSize >= 2040) + if (compiler->compLocallocUsed) { - int offset = totalFrameSize - compiler->compLclFrameSize - 2 * REGSIZE_BYTES; - calleeSaveSPDelta = AlignUp((UINT)offset, STACK_ALIGN); - calleeSaveSPOffset = calleeSaveSPDelta - offset; - - int offset2 = totalFrameSize - calleeSaveSPDelta - compiler->lvaOutgoingArgSpaceSize; - calleeSaveSPDelta = AlignUp((UINT)offset2, STACK_ALIGN); - offset2 = calleeSaveSPDelta - offset2; - - if (compiler->compLocallocUsed) - { - // Restore sp from fp - GetEmitter()->emitIns_R_R_I(INS_addi, EA_PTRSIZE, REG_SPBASE, REG_FPBASE, -offset2); - compiler->unwindSetFrameReg(REG_FPBASE, offset2); - } - else - { - outSzAligned = compiler->lvaOutgoingArgSpaceSize & ~0xf; - genStackPointerAdjustment(outSzAligned, rsGetRsvdReg(), nullptr, - /* reportUnwindData */ true); - } - - regsToRestoreMask &= ~(RBM_FP | RBM_RA); // We'll restore FP/RA at the end. 
- - GetEmitter()->emitIns_R_R_I(INS_ld, EA_PTRSIZE, REG_RA, REG_SPBASE, offset2 + 8); - compiler->unwindSaveReg(REG_RA, offset2 + 8); - - GetEmitter()->emitIns_R_R_I(INS_ld, EA_PTRSIZE, REG_FP, REG_SPBASE, offset2); - compiler->unwindSaveReg(REG_FP, offset2); - - genStackPointerAdjustment(calleeSaveSPDelta, rsGetRsvdReg(), nullptr, - /* reportUnwindData */ true); - - calleeSaveSPDelta = totalFrameSize - compiler->compLclFrameSize - 2 * REGSIZE_BYTES; - calleeSaveSPDelta = AlignUp((UINT)calleeSaveSPDelta, STACK_ALIGN); + callerSPtoFPdelta = (compiler->compCalleeRegsPushed << 3) - 8; } else { - int offset2 = compiler->lvaOutgoingArgSpaceSize; - if (compiler->compLocallocUsed) - { - // Restore sp from fp - GetEmitter()->emitIns_R_R_I(INS_addi, EA_PTRSIZE, REG_SPBASE, REG_FPBASE, -offset2); - compiler->unwindSetFrameReg(REG_FPBASE, offset2); - } - - regsToRestoreMask &= ~(RBM_FP | RBM_RA); // We'll restore FP/RA at the end. - - GetEmitter()->emitIns_R_R_I(INS_ld, EA_PTRSIZE, REG_RA, REG_SPBASE, offset2 + 8); - compiler->unwindSaveReg(REG_RA, offset2 + 8); - - GetEmitter()->emitIns_R_R_I(INS_ld, EA_PTRSIZE, REG_FP, REG_SPBASE, offset2); - compiler->unwindSaveReg(REG_FP, offset2); - - calleeSaveSPOffset = totalFrameSize - compiler->compLclFrameSize - 2 * REGSIZE_BYTES; - calleeSaveSPDelta = AlignUp((UINT)calleeSaveSPOffset, STACK_ALIGN); - calleeSaveSPOffset = calleeSaveSPDelta - calleeSaveSPOffset; - - genStackPointerAdjustment(totalFrameSize - calleeSaveSPDelta, rsGetRsvdReg(), nullptr, - /* reportUnwindData */ true); + genStackPointerAdjustment(calleeSaveSPOffset, REG_RA, nullptr, /* reportUnwindData */ true); } + calleeSaveSPOffset = compiler->lvaOutgoingArgSpaceSize - calleeSaveSPOffset; + remainingSPSize = remainingSPSize - calleeSaveSPOffset; + } + else + { + if (compiler->compLocallocUsed) + { + callerSPtoFPdelta = (compiler->compCalleeRegsPushed << 3) - 8 + compiler->lvaOutgoingArgSpaceSize; + } + calleeSaveSPOffset = compiler->lvaOutgoingArgSpaceSize; + // 
remainingSPSize = totalFrameSize; + } } - else + + if (compiler->compLocallocUsed) { - // No frame pointer (no chaining). - NYI_RISCV64("Frame without frame pointer"); - calleeSaveSPOffset = 0; + // restore sp from fp: addi sp, fp, -#callerSPtoFPdelta + emit->emitIns_R_R_I(INS_addi, EA_PTRSIZE, REG_SPBASE, REG_FPBASE, -callerSPtoFPdelta); + compiler->unwindSetFrameReg(REG_FPBASE, callerSPtoFPdelta); } - JITDUMP(" calleeSaveSPOffset=%d, calleeSaveSPDelta=%d\n", calleeSaveSPOffset, calleeSaveSPDelta); - genRestoreCalleeSavedRegistersHelp(regsToRestoreMask, calleeSaveSPOffset, calleeSaveSPDelta); + JITDUMP(" calleeSaveSPOffset=%d, callerSPtoFPdelta=%d\n", calleeSaveSPOffset, callerSPtoFPdelta); + genRestoreCalleeSavedRegistersHelp(regsToRestoreMask, calleeSaveSPOffset, 0); - if (frameType == 1) - { - calleeSaveSPOffset = compiler->lvaOutgoingArgSpaceSize; + // restore ra/fp regs + calleeSaveSPOffset += (compiler->compCalleeRegsPushed - 2) << 3; - GetEmitter()->emitIns_R_R_I(INS_ld, EA_PTRSIZE, REG_RA, REG_SPBASE, calleeSaveSPOffset + 8); - compiler->unwindSaveReg(REG_RA, calleeSaveSPOffset + 8); + emit->emitIns_R_R_I(INS_ld, EA_PTRSIZE, REG_RA, REG_SPBASE, calleeSaveSPOffset); + compiler->unwindSaveReg(REG_RA, calleeSaveSPOffset); - GetEmitter()->emitIns_R_R_I(INS_ld, EA_PTRSIZE, REG_FP, REG_SPBASE, calleeSaveSPOffset); - compiler->unwindSaveReg(REG_FP, calleeSaveSPOffset); + emit->emitIns_R_R_I(INS_ld, EA_PTRSIZE, REG_FP, REG_SPBASE, calleeSaveSPOffset + 8); + compiler->unwindSaveReg(REG_FP, calleeSaveSPOffset + 8); - GetEmitter()->emitIns_R_R_I(INS_addi, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, totalFrameSize); - compiler->unwindAllocStack(totalFrameSize); - } - else if (frameType == 2) + if (emitter::isValidUimm11(remainingSPSize)) { - // had done. 
+ emit->emitIns_R_R_I(INS_addi, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, remainingSPSize); } else { - unreached(); + regNumber tempReg = rsGetRsvdReg(); + emit->emitLoadImmediate(EA_PTRSIZE, tempReg, remainingSPSize); + emit->emitIns_R_R_R(INS_add, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, tempReg); + } + compiler->unwindAllocStack(remainingSPSize); + + // for OSR we have to adjust SP to remove tier0 frame + if (compiler->opts.IsOSR()) + { + const int tier0FrameSize = compiler->info.compPatchpointInfo->TotalFrameSize(); + JITDUMP("Extra SP adjust for OSR to pop off Tier0 frame: %d bytes\n", tier0FrameSize); + + if (emitter::isValidUimm11(tier0FrameSize)) + { + emit->emitIns_R_R_I(INS_addi, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, tier0FrameSize); + } + else + { + regNumber tempReg = rsGetRsvdReg(); + emit->emitLoadImmediate(EA_PTRSIZE, tempReg, tier0FrameSize); + emit->emitIns_R_R_R(INS_add, EA_PTRSIZE, REG_SPBASE, REG_SPBASE, tempReg); + } + compiler->unwindAllocStack(tier0FrameSize); } } diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index 7fed233ed8cce..bd8cd590eea53 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -5682,7 +5682,7 @@ void Compiler::generatePatchpointInfo() // const int totalFrameSize = codeGen->genTotalFrameSize() + TARGET_POINTER_SIZE; const int offsetAdjust = 0; -#elif defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) +#elif defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64) // SP is not manipulated by calls so no frame size adjustment needed. // Local Offsets may need adjusting, if FP is at bottom of frame. 
// @@ -6947,7 +6947,7 @@ int Compiler::compCompileHelper(CORINFO_MODULE_HANDLE classPtr, { frameSizeUpdate = 8; } -#elif defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) +#elif defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64) if ((totalFrameSize & 0xf) != 0) { frameSizeUpdate = 8; diff --git a/src/coreclr/jit/gcencode.cpp b/src/coreclr/jit/gcencode.cpp index ad75dbf627099..96005e6057662 100644 --- a/src/coreclr/jit/gcencode.cpp +++ b/src/coreclr/jit/gcencode.cpp @@ -3888,7 +3888,7 @@ void GCInfo::gcInfoBlockHdrSave(GcInfoEncoder* gcInfoEncoder, unsigned methodSiz // const int osrOffset = ppInfo->GenericContextArgOffset() - 2 * REGSIZE_BYTES; assert(offset == osrOffset); -#elif defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) +#elif defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64) // PP info has virtual offset. This is also the caller SP offset. // const int osrOffset = ppInfo->GenericContextArgOffset(); @@ -3931,7 +3931,7 @@ void GCInfo::gcInfoBlockHdrSave(GcInfoEncoder* gcInfoEncoder, unsigned methodSiz // const int osrOffset = ppInfo->KeptAliveThisOffset() - 2 * REGSIZE_BYTES; assert(offset == osrOffset); -#elif defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) +#elif defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64) // PP info has virtual offset. This is also the caller SP offset. // const int osrOffset = ppInfo->KeptAliveThisOffset(); diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h index 50afc5c7cf1c0..16622e227de63 100644 --- a/src/coreclr/jit/jitconfigvalues.h +++ b/src/coreclr/jit/jitconfigvalues.h @@ -574,11 +574,11 @@ CONFIG_INTEGER(JitRandomGuardedDevirtualization, W("JitRandomGuardedDevirtualiza #endif // DEBUG // Enable insertion of patchpoints into Tier0 methods, switching to optimized where needed. 
-#if defined(TARGET_AMD64) || defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) +#if defined(TARGET_AMD64) || defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64) CONFIG_INTEGER(TC_OnStackReplacement, W("TC_OnStackReplacement"), 1) #else CONFIG_INTEGER(TC_OnStackReplacement, W("TC_OnStackReplacement"), 0) -#endif // defined(TARGET_AMD64) || defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) +#endif // defined(TARGET_AMD64) || defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64) // Initial patchpoint counter value used by jitted code CONFIG_INTEGER(TC_OnStackReplacement_InitialCounter, W("TC_OnStackReplacement_InitialCounter"), 1000) // Enable partial compilation for Tier0 methods diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp index 0eaf5f3313c6f..11b452493c270 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -5229,7 +5229,7 @@ void Compiler::lvaFixVirtualFrameOffsets() if (opts.IsOSR()) { -#if defined(TARGET_AMD64) || defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) +#if defined(TARGET_AMD64) || defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64) // Stack offset includes Tier0 frame. // JITDUMP("--- delta bump %d for OSR + Tier0 frame\n", info.compPatchpointInfo->TotalFrameSize()); @@ -5334,7 +5334,7 @@ void Compiler::lvaFixVirtualFrameOffsets() #endif // FEATURE_FIXED_OUT_ARGS -#if defined(TARGET_ARM64) || defined(TARGET_RISCV64) +#if defined(TARGET_ARM64) // We normally add alignment below the locals between them and the outgoing // arg space area. When we store fp/lr(ra) at the bottom, however, this will // be below the alignment. 
So we should not apply the alignment adjustment to @@ -5346,11 +5346,11 @@ void Compiler::lvaFixVirtualFrameOffsets() { lvaTable[lvaRetAddrVar].SetStackOffset(REGSIZE_BYTES); } -#elif defined(TARGET_LOONGARCH64) +#elif defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64) assert(codeGen->isFramePointerUsed()); if (lvaRetAddrVar != BAD_VAR_NUM) { - // For LoongArch64, the RA is below the fp. see the `genPushCalleeSavedRegisters` + // For LoongArch64 and RISCV64, the RA is below the fp. see the `genPushCalleeSavedRegisters` lvaTable[lvaRetAddrVar].SetStackOffset(-REGSIZE_BYTES); } #endif // !TARGET_LOONGARCH64 @@ -6139,17 +6139,11 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() stkOffs -= (compCalleeRegsPushed - 2) * REGSIZE_BYTES; } -#elif defined(TARGET_LOONGARCH64) - - assert(compCalleeRegsPushed >= 2); - -#elif defined(TARGET_RISCV64) +#elif defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64) - // Subtract off FP and RA. assert(compCalleeRegsPushed >= 2); - stkOffs -= (compCalleeRegsPushed - 2) * REGSIZE_BYTES; -#else // !TARGET_RISCV64 +#else // !TARGET_LOONGARCH64 && !TARGET_RISCV64 #ifdef TARGET_ARM // On ARM32 LR is part of the pushed registers and is always stored at the // top. 
@@ -6160,7 +6154,7 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() #endif stkOffs -= compCalleeRegsPushed * REGSIZE_BYTES; -#endif // !TARGET_RISCV64 +#endif // !TARGET_LOONGARCH64 && !TARGET_RISCV64 // (2) Account for the remainder of the frame // @@ -6882,11 +6876,6 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() } #endif // TARGET_ARM64 -#if defined(TARGET_RISCV64) - assert(isFramePointerUsed()); // Note that currently we always have a frame pointer - stkOffs -= 2 * REGSIZE_BYTES; -#endif // TARGET_RISCV64 - #if FEATURE_FIXED_OUT_ARGS if (lvaOutgoingArgSpaceSize > 0) { @@ -6903,8 +6892,8 @@ void Compiler::lvaAssignVirtualFrameOffsetsToLocals() } #endif // FEATURE_FIXED_OUT_ARGS -#ifdef TARGET_LOONGARCH64 - // For LoongArch64, CalleeSavedRegs are at bottom. +#if defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64) + // For LoongArch64 and RISCV64, CalleeSavedRegs are at bottom. int pushedCount = 0; #else // compLclFrameSize equals our negated virtual stack offset minus the pushed registers and return address @@ -7891,7 +7880,7 @@ int Compiler::lvaToCallerSPRelativeOffset(int offset, bool isFpBased, bool forRo offset += codeGen->genCallerSPtoInitialSPdelta(); } -#if defined(TARGET_AMD64) || defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) +#if defined(TARGET_AMD64) || defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64) if (forRootFrame && opts.IsOSR()) { const PatchpointInfo* const ppInfo = info.compPatchpointInfo; @@ -7910,7 +7899,7 @@ int Compiler::lvaToCallerSPRelativeOffset(int offset, bool isFpBased, bool forRo // const int adjustment = ppInfo->TotalFrameSize() + REGSIZE_BYTES; -#elif defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) +#elif defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64) const int adjustment = ppInfo->TotalFrameSize(); #endif diff --git a/src/coreclr/jit/targetriscv64.h b/src/coreclr/jit/targetriscv64.h index a12bcc0498640..9cf0185a56935 100644 --- 
a/src/coreclr/jit/targetriscv64.h +++ b/src/coreclr/jit/targetriscv64.h @@ -298,6 +298,9 @@ #define B_DIST_SMALL_MAX_NEG (-4096) #define B_DIST_SMALL_MAX_POS (+4095) + // The number of bytes from the end of the last probed page that must also be probed, to allow for some + // small SP adjustments without probes. If zero, then the stack pointer can point to the last byte/word + // on the stack guard page, and must be touched before any further "SUB SP". #define STACK_PROBE_BOUNDARY_THRESHOLD_BYTES 0 // clang-format on