From 988e2bb20b4498427c8a5b9c092b2cadf3b0c69c Mon Sep 17 00:00:00 2001 From: Tymoteusz Wenerski Date: Tue, 20 May 2025 15:55:36 +0200 Subject: [PATCH 1/4] simplify genZeroInitFrameUsingBlockInit --- src/coreclr/jit/codegenriscv64.cpp | 95 +++++++++++++----------------- 1 file changed, 40 insertions(+), 55 deletions(-) diff --git a/src/coreclr/jit/codegenriscv64.cpp b/src/coreclr/jit/codegenriscv64.cpp index beb805eb8a12aa..91b396e32784ea 100644 --- a/src/coreclr/jit/codegenriscv64.cpp +++ b/src/coreclr/jit/codegenriscv64.cpp @@ -720,7 +720,6 @@ void CodeGen::genFnEpilog(BasicBlock* block) void CodeGen::genZeroInitFrameUsingBlockInit(int untrLclHi, int untrLclLo, regNumber initReg, bool* pInitRegZeroed) { regNumber rAddr; - regNumber rCnt = REG_NA; // Invalid regMaskTP regMask; regMaskTP availMask = regSet.rsGetModifiedRegsMask() | RBM_INT_CALLEE_TRASH; // Set of available registers @@ -749,84 +748,70 @@ void CodeGen::genZeroInitFrameUsingBlockInit(int untrLclHi, int untrLclLo, regNu *pInitRegZeroed = false; } - bool useLoop = false; - unsigned uCntBytes = untrLclHi - untrLclLo; - assert((uCntBytes % sizeof(int)) == 0); // The smallest stack slot is always 4 bytes. - unsigned int padding = untrLclLo & 0x7; + unsigned uLclBytes = untrLclHi - untrLclLo; + assert((uLclBytes % 4) == 0); // The smallest stack slot is always 4 bytes. + unsigned padding = untrLclLo & 0x7; if (padding) { assert(padding == 4); GetEmitter()->emitIns_R_R_I(INS_sw, EA_4BYTE, REG_R0, rAddr, 0); - uCntBytes -= 4; + uLclBytes -= 4; } - unsigned uCntSlots = uCntBytes / REGSIZE_BYTES; // How many register sized stack slots we're going to use. + unsigned uRegSlots = uLclBytes / REGSIZE_BYTES; + unsigned uAddrCurr = 0; - // When uCntSlots is 9 or less, we will emit a sequence of sd instructions inline. - // When it is 10 or greater, we will emit a loop containing a sd instruction. - // In both of these cases the sd instruction will write two zeros to memory - // and we will use a single str instruction at the end whenever we have an odd count. - if (uCntSlots >= 10) - useLoop = true; - - if (useLoop) + if (uRegSlots >= 12) { - // We pick the next lowest register number for rCnt + regNumber rEndAddr; noway_assert(availMask != RBM_NONE); regMask = genFindLowestBit(availMask); - rCnt = genRegNumFromMask(regMask); + rEndAddr = genRegNumFromMask(regMask); availMask &= ~regMask; - noway_assert(uCntSlots >= 2); - assert((genRegMask(rCnt) & intRegState.rsCalleeRegArgMaskLiveIn) == 0); // rCnt is not a live incoming - // argument reg - instGen_Set_Reg_To_Imm(EA_PTRSIZE, rCnt, (ssize_t)uCntSlots / 2); + // rEndAddr is not a live incoming argument reg + assert((genRegMask(rEndAddr) & intRegState.rsCalleeRegArgMaskLiveIn) == 0); - // TODO-RISCV64: maybe optimize further - GetEmitter()->emitIns_R_R_I(INS_sd, EA_PTRSIZE, REG_R0, rAddr, 8 + padding); - GetEmitter()->emitIns_R_R_I(INS_sd, EA_PTRSIZE, REG_R0, rAddr, 0 + padding); - GetEmitter()->emitIns_R_R_I(INS_addi, EA_PTRSIZE, rCnt, rCnt, -1); + ssize_t uLoopBytes = (uRegSlots & ~0x3) * REGSIZE_BYTES; - // bne rCnt, zero, -4 * 4 - ssize_t imm = -16; - GetEmitter()->emitIns_R_R_I(INS_addi, EA_PTRSIZE, rAddr, rAddr, 2 * REGSIZE_BYTES); - GetEmitter()->emitIns_R_R_I(INS_bne, EA_PTRSIZE, rCnt, REG_R0, imm); + if (uLoopBytes) + { + instGen_Set_Reg_To_Imm(EA_PTRSIZE, rEndAddr, uLoopBytes); + GetEmitter()->emitIns_R_R_R(INS_add, EA_PTRSIZE, rEndAddr, rEndAddr, rAddr); + + GetEmitter()->emitIns_R_R_I(INS_sd, EA_PTRSIZE, REG_R0, rAddr, padding); + GetEmitter()->emitIns_R_R_I(INS_sd, EA_PTRSIZE, REG_R0, rAddr, padding + REGSIZE_BYTES); + GetEmitter()->emitIns_R_R_I(INS_sd, EA_PTRSIZE, REG_R0, rAddr, padding + 2 * REGSIZE_BYTES); + GetEmitter()->emitIns_R_R_I(INS_sd, EA_PTRSIZE, REG_R0, rAddr, padding + 3 * REGSIZE_BYTES); - uCntBytes %= REGSIZE_BYTES * 2; + GetEmitter()->emitIns_R_R_I(INS_addi, EA_PTRSIZE, rAddr, rAddr, 4 * REGSIZE_BYTES); + GetEmitter()->emitIns_R_R_I(INS_bne, EA_PTRSIZE, rAddr, rEndAddr, -5 << 2); + + uLclBytes -= uLoopBytes; + uAddrCurr = 0; + } } - else + + while (uLclBytes >= REGSIZE_BYTES) { - while (uCntBytes >= REGSIZE_BYTES * 2) - { - GetEmitter()->emitIns_R_R_I(INS_sd, EA_PTRSIZE, REG_R0, rAddr, 8 + padding); - GetEmitter()->emitIns_R_R_I(INS_sd, EA_PTRSIZE, REG_R0, rAddr, 0 + padding); - GetEmitter()->emitIns_R_R_I(INS_addi, EA_PTRSIZE, rAddr, rAddr, 2 * REGSIZE_BYTES + padding); - uCntBytes -= REGSIZE_BYTES * 2; - padding = 0; - } + GetEmitter()->emitIns_R_R_I(INS_sd, EA_PTRSIZE, REG_R0, rAddr, uAddrCurr + padding); + uLclBytes -= REGSIZE_BYTES; + uAddrCurr += REGSIZE_BYTES; } - if (uCntBytes >= REGSIZE_BYTES) // check and zero the last register-sized stack slot (odd number) + if (uAddrCurr != 0) { - if ((uCntBytes - REGSIZE_BYTES) == 0) - { - GetEmitter()->emitIns_R_R_I(INS_sd, EA_PTRSIZE, REG_R0, rAddr, padding); - } - else - { - GetEmitter()->emitIns_R_R_I(INS_sd, EA_PTRSIZE, REG_R0, rAddr, padding); - GetEmitter()->emitIns_R_R_I(INS_addi, EA_PTRSIZE, rAddr, rAddr, REGSIZE_BYTES); - } - uCntBytes -= REGSIZE_BYTES; + uAddrCurr -= REGSIZE_BYTES; } - if (uCntBytes > 0) + + if (uLclBytes != 0) { - assert(uCntBytes == sizeof(int)); - GetEmitter()->emitIns_R_R_I(INS_sw, EA_4BYTE, REG_R0, rAddr, padding); - uCntBytes -= sizeof(int); + assert(uLclBytes == 4); + GetEmitter()->emitIns_R_R_I(INS_sw, EA_4BYTE, REG_R0, rAddr, uAddrCurr + padding); + uLclBytes -= 4; } - noway_assert(uCntBytes == 0); + noway_assert(uLclBytes == 0); } void CodeGen::inst_JMP(emitJumpKind jmp, BasicBlock* tgtBlock) From 52e7c3de32f3282348b7ee59c09ae3ec062b630e Mon Sep 17 00:00:00 2001 From: Tymoteusz Wenerski Date: Tue, 27 May 2025 15:47:10 +0200 Subject: [PATCH 2/4] apply format.patch --- src/coreclr/jit/codegenriscv64.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/codegenriscv64.cpp b/src/coreclr/jit/codegenriscv64.cpp index ea508ce6a87116..490aa408f0697c 100644 --- a/src/coreclr/jit/codegenriscv64.cpp +++ b/src/coreclr/jit/codegenriscv64.cpp @@ -766,7 +766,7 @@ void CodeGen::genZeroInitFrameUsingBlockInit(int untrLclHi, int untrLclLo, regNu { regNumber rEndAddr; noway_assert(availMask != RBM_NONE); - regMask = genFindLowestBit(availMask); + regMask = genFindLowestBit(availMask); rEndAddr = genRegNumFromMask(regMask); availMask &= ~regMask; From e9d34a7206b7db9362d7d0330827c26b91727e84 Mon Sep 17 00:00:00 2001 From: Tymoteusz Wenerski Date: Mon, 2 Jun 2025 08:43:03 +0200 Subject: [PATCH 3/4] fix type mismatch --- src/coreclr/jit/codegenriscv64.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/coreclr/jit/codegenriscv64.cpp b/src/coreclr/jit/codegenriscv64.cpp index 1aa8405494ad66..c9a163d8924836 100644 --- a/src/coreclr/jit/codegenriscv64.cpp +++ b/src/coreclr/jit/codegenriscv64.cpp @@ -748,9 +748,9 @@ void CodeGen::genZeroInitFrameUsingBlockInit(int untrLclHi, int untrLclLo, regNu *pInitRegZeroed = false; } - unsigned uLclBytes = untrLclHi - untrLclLo; + ssize_t uLclBytes = untrLclHi - untrLclLo; assert((uLclBytes % 4) == 0); // The smallest stack slot is always 4 bytes. - unsigned padding = untrLclLo & 0x7; + ssize_t padding = untrLclLo & 0x7; if (padding) { @@ -759,8 +759,8 @@ void CodeGen::genZeroInitFrameUsingBlockInit(int untrLclHi, int untrLclLo, regNu uLclBytes -= 4; } - unsigned uRegSlots = uLclBytes / REGSIZE_BYTES; - unsigned uAddrCurr = 0; + ssize_t uRegSlots = uLclBytes / REGSIZE_BYTES; + ssize_t uAddrCurr = 0; if (uRegSlots >= 12) { From 28c7311377a65174c04cad12720d5d85afe53d67 Mon Sep 17 00:00:00 2001 From: Tymoteusz Wenerski Date: Mon, 2 Jun 2025 15:58:50 +0200 Subject: [PATCH 4/4] apply review suggestions --- src/coreclr/jit/codegenriscv64.cpp | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/coreclr/jit/codegenriscv64.cpp b/src/coreclr/jit/codegenriscv64.cpp index c9a163d8924836..730cb2cb8427bc 100644 --- a/src/coreclr/jit/codegenriscv64.cpp +++ b/src/coreclr/jit/codegenriscv64.cpp @@ -777,8 +777,15 @@ void CodeGen::genZeroInitFrameUsingBlockInit(int untrLclHi, int untrLclLo, regNu if (uLoopBytes) { - instGen_Set_Reg_To_Imm(EA_PTRSIZE, rEndAddr, uLoopBytes); - GetEmitter()->emitIns_R_R_R(INS_add, EA_PTRSIZE, rEndAddr, rEndAddr, rAddr); + if (emitter::isValidSimm12(uLoopBytes)) + { + GetEmitter()->emitIns_R_R_I(INS_addi, EA_PTRSIZE, rEndAddr, rAddr, uLoopBytes); + } + else + { + instGen_Set_Reg_To_Imm(EA_PTRSIZE, rEndAddr, uLoopBytes); + GetEmitter()->emitIns_R_R_R(INS_add, EA_PTRSIZE, rEndAddr, rEndAddr, rAddr); + } GetEmitter()->emitIns_R_R_I(INS_sd, EA_PTRSIZE, REG_R0, rAddr, padding); GetEmitter()->emitIns_R_R_I(INS_sd, EA_PTRSIZE, REG_R0, rAddr, padding + REGSIZE_BYTES); @@ -786,7 +793,7 @@ void CodeGen::genZeroInitFrameUsingBlockInit(int untrLclHi, int untrLclLo, regNu GetEmitter()->emitIns_R_R_I(INS_sd, EA_PTRSIZE, REG_R0, rAddr, padding + 3 * REGSIZE_BYTES); GetEmitter()->emitIns_R_R_I(INS_addi, EA_PTRSIZE, rAddr, rAddr, 4 * REGSIZE_BYTES); - GetEmitter()->emitIns_R_R_I(INS_bne, EA_PTRSIZE, rAddr, rEndAddr, -5 << 2); + GetEmitter()->emitIns_R_R_I(INS_bltu, EA_PTRSIZE, rAddr, rEndAddr, -5 << 2); uLclBytes -= uLoopBytes; uAddrCurr = 0;