From 1ed67a5c4799c099cf613bf034e2ed7186e54bfd Mon Sep 17 00:00:00 2001 From: Jakob Botsch Nielsen Date: Mon, 5 Feb 2024 16:28:40 +0100 Subject: [PATCH] JIT: Support more scaled addressing modes on arm64 (#97921) We currently support scaled addressing modes when the index also needs an extension through contained `BFIZ` nodes. However, we did not support scaled addressing modes if the index was 64 bits. This adds that support as a natural extension to the `GT_LEA`. --- src/coreclr/jit/codegen.h | 10 ++- src/coreclr/jit/codegencommon.cpp | 100 +++++++++++++++-------------- src/coreclr/jit/codegeninterface.h | 1 + src/coreclr/jit/compiler.hpp | 44 ++++++++++++- src/coreclr/jit/gentree.cpp | 27 ++++---- src/coreclr/jit/lower.cpp | 43 ++++++------- 6 files changed, 137 insertions(+), 88 deletions(-) diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index 18a27b41aff49..0f91f798bb946 100644 --- a/src/coreclr/jit/codegen.h +++ b/src/coreclr/jit/codegen.h @@ -31,8 +31,14 @@ class CodeGen final : public CodeGenInterface // TODO-Cleanup: Abstract out the part of this that finds the addressing mode, and // move it to Lower - virtual bool genCreateAddrMode( - GenTree* addr, bool fold, bool* revPtr, GenTree** rv1Ptr, GenTree** rv2Ptr, unsigned* mulPtr, ssize_t* cnsPtr); + virtual bool genCreateAddrMode(GenTree* addr, + bool fold, + unsigned naturalMul, + bool* revPtr, + GenTree** rv1Ptr, + GenTree** rv2Ptr, + unsigned* mulPtr, + ssize_t* cnsPtr); #ifdef LATE_DISASM virtual const char* siStackVarName(size_t offs, size_t size, unsigned reg, unsigned stkOffs); diff --git a/src/coreclr/jit/codegencommon.cpp b/src/coreclr/jit/codegencommon.cpp index 44de5c336ad51..c468473067b6e 100644 --- a/src/coreclr/jit/codegencommon.cpp +++ b/src/coreclr/jit/codegencommon.cpp @@ -965,7 +965,7 @@ void CodeGen::genDefineInlineTempLabel(BasicBlock* label) // Notes: // This only makes an adjustment if !FEATURE_FIXED_OUT_ARGS, if there is no frame pointer, // and if 'block' is a 
throw helper block with a non-zero stack level. - +// void CodeGen::genAdjustStackLevel(BasicBlock* block) { #if !FEATURE_FIXED_OUT_ARGS @@ -1003,35 +1003,33 @@ void CodeGen::genAdjustStackLevel(BasicBlock* block) #endif // !FEATURE_FIXED_OUT_ARGS } -/***************************************************************************** - * - * Take an address expression and try to find the best set of components to - * form an address mode; returns non-zero if this is successful. - * - * TODO-Cleanup: The RyuJIT backend never uses this to actually generate code. - * Refactor this code so that the underlying analysis can be used in - * the RyuJIT Backend to do lowering, instead of having to call this method with the - * option to not generate the code. - * - * 'fold' specifies if it is OK to fold the array index which hangs off - * a GT_NOP node. - * - * If successful, the parameters will be set to the following values: - * - * *rv1Ptr ... base operand - * *rv2Ptr ... optional operand - * *revPtr ... true if rv2 is before rv1 in the evaluation order - * *mulPtr ... optional multiplier (2/4/8) for rv2 - * Note that for [reg1 + reg2] and [reg1 + reg2 + icon], *mulPtr == 0. - * *cnsPtr ... integer constant [optional] - * - * IMPORTANT NOTE: This routine doesn't generate any code, it merely - * identifies the components that might be used to - * form an address mode later on. - */ - -bool CodeGen::genCreateAddrMode( - GenTree* addr, bool fold, bool* revPtr, GenTree** rv1Ptr, GenTree** rv2Ptr, unsigned* mulPtr, ssize_t* cnsPtr) +//------------------------------------------------------------------------ +// genCreateAddrMode: +// Take an address expression and try to find the best set of components to +// form an address mode; returns true if this is successful. +// +// Parameters: +// addr - Tree that potentially computes an address +// fold - Specifies if it is OK to fold the array index which hangs off a GT_NOP node. 
+// naturalMul - For arm64 specifies the natural multiplier for the address mode (i.e. the size of the parent +// indirection). +// revPtr - [out] True if rv2 is before rv1 in the evaluation order +// rv1Ptr - [out] Base operand +// rv2Ptr - [out] Optional operand +// mulPtr - [out] Optional multiplier for rv2. If non-zero and naturalMul is non-zero, it must match naturalMul. +// cnsPtr - [out] Integer constant [optional] +// +// Returns: +// True if some address mode components were extracted. +// +bool CodeGen::genCreateAddrMode(GenTree* addr, + bool fold, + unsigned naturalMul, + bool* revPtr, + GenTree** rv1Ptr, + GenTree** rv2Ptr, + unsigned* mulPtr, + ssize_t* cnsPtr) { /* The following indirections are valid address modes on x86/x64: @@ -1171,8 +1169,7 @@ bool CodeGen::genCreateAddrMode( goto AGAIN; -#if !defined(TARGET_ARMARCH) && !defined(TARGET_LOONGARCH64) && !defined(TARGET_RISCV64) - // TODO-ARM64-CQ, TODO-ARM-CQ: For now we don't try to create a scaled index. + // TODO-ARM-CQ: For now we don't try to create a scaled index. case GT_MUL: if (op1->gtOverflow()) { @@ -1182,10 +1179,11 @@ bool CodeGen::genCreateAddrMode( FALLTHROUGH; case GT_LSH: - - mul = op1->GetScaledIndex(); - if (mul) + { + unsigned mulCandidate = op1->GetScaledIndex(); + if (jitIsScaleIndexMul(mulCandidate, naturalMul)) { + mul = mulCandidate; /* We can use "[mul*rv2 + icon]" */ rv1 = nullptr; @@ -1194,7 +1192,7 @@ bool CodeGen::genCreateAddrMode( goto FOUND_AM; } break; -#endif // !defined(TARGET_ARMARCH) && !defined(TARGET_LOONGARCH64) && !defined(TARGET_RISCV64) + } default: break; @@ -1215,8 +1213,8 @@ bool CodeGen::genCreateAddrMode( switch (op1->gtOper) { -#if !defined(TARGET_ARMARCH) && !defined(TARGET_LOONGARCH64) && !defined(TARGET_RISCV64) - // TODO-ARM64-CQ, TODO-ARM-CQ: For now we don't try to create a scaled index. +#ifdef TARGET_XARCH + // TODO-ARM-CQ: For now we don't try to create a scaled index. 
case GT_ADD: if (op1->gtOverflow()) @@ -1237,6 +1235,7 @@ bool CodeGen::genCreateAddrMode( } } break; +#endif // TARGET_XARCH case GT_MUL: @@ -1248,11 +1247,12 @@ bool CodeGen::genCreateAddrMode( FALLTHROUGH; case GT_LSH: - - mul = op1->GetScaledIndex(); - if (mul) + { + unsigned mulCandidate = op1->GetScaledIndex(); + if (jitIsScaleIndexMul(mulCandidate, naturalMul)) { /* 'op1' is a scaled value */ + mul = mulCandidate; rv1 = op2; rv2 = op1->AsOp()->gtOp1; @@ -1260,7 +1260,7 @@ bool CodeGen::genCreateAddrMode( int argScale; while ((rv2->gtOper == GT_MUL || rv2->gtOper == GT_LSH) && (argScale = rv2->GetScaledIndex()) != 0) { - if (jitIsScaleIndexMul(argScale * mul)) + if (jitIsScaleIndexMul(argScale * mul, naturalMul)) { mul = mul * argScale; rv2 = rv2->AsOp()->gtOp1; @@ -1277,7 +1277,7 @@ bool CodeGen::genCreateAddrMode( goto FOUND_AM; } break; -#endif // !TARGET_ARMARCH && !TARGET_LOONGARCH64 && !TARGET_RISCV64 + } case GT_COMMA: @@ -1291,7 +1291,7 @@ bool CodeGen::genCreateAddrMode( noway_assert(op2); switch (op2->gtOper) { -#if !defined(TARGET_ARMARCH) && !defined(TARGET_LOONGARCH64) && !defined(TARGET_RISCV64) +#ifdef TARGET_XARCH // TODO-ARM64-CQ, TODO-ARM-CQ: For now we only handle MUL and LSH because // arm doesn't support both scale and offset at the same. Offset is handled // at the emitter as a peephole optimization. @@ -1315,6 +1315,7 @@ bool CodeGen::genCreateAddrMode( goto AGAIN; } break; +#endif // TARGET_XARCH case GT_MUL: @@ -1326,16 +1327,17 @@ bool CodeGen::genCreateAddrMode( FALLTHROUGH; case GT_LSH: - - mul = op2->GetScaledIndex(); - if (mul) + { + unsigned mulCandidate = op2->GetScaledIndex(); + if (jitIsScaleIndexMul(mulCandidate, naturalMul)) { + mul = mulCandidate; // 'op2' is a scaled value...is it's argument also scaled? 
int argScale; rv2 = op2->AsOp()->gtOp1; while ((rv2->gtOper == GT_MUL || rv2->gtOper == GT_LSH) && (argScale = rv2->GetScaledIndex()) != 0) { - if (jitIsScaleIndexMul(argScale * mul)) + if (jitIsScaleIndexMul(argScale * mul, naturalMul)) { mul = mul * argScale; rv2 = rv2->AsOp()->gtOp1; @@ -1351,7 +1353,7 @@ bool CodeGen::genCreateAddrMode( goto FOUND_AM; } break; -#endif // TARGET_ARMARCH || TARGET_LOONGARCH64 || TARGET_RISCV64 + } case GT_COMMA: diff --git a/src/coreclr/jit/codegeninterface.h b/src/coreclr/jit/codegeninterface.h index c42ddd7d36d4d..a28a60b50ca8c 100644 --- a/src/coreclr/jit/codegeninterface.h +++ b/src/coreclr/jit/codegeninterface.h @@ -113,6 +113,7 @@ class CodeGenInterface // move it to Lower virtual bool genCreateAddrMode(GenTree* addr, bool fold, + unsigned naturalMul, bool* revPtr, GenTree** rv1Ptr, GenTree** rv2Ptr, diff --git a/src/coreclr/jit/compiler.hpp b/src/coreclr/jit/compiler.hpp index 22a36940820ee..d664a57bd2414 100644 --- a/src/coreclr/jit/compiler.hpp +++ b/src/coreclr/jit/compiler.hpp @@ -196,10 +196,45 @@ inline unsigned __int64 BitsBetween(unsigned __int64 value, unsigned __int64 end (end - 1); // Ones to the right of set bit in the end mask. } -/*****************************************************************************/ - +//------------------------------------------------------------------------------ +// jitIsScaleIndexMul: Check if the value is a valid addressing mode multiplier +// amount for x64. +// +// Parameters: +// val - The multiplier +// +// Returns: +// True if value is 1, 2, 4 or 8. +// inline bool jitIsScaleIndexMul(size_t val) { + // TODO-Cleanup: On arm64 the scale that can be used in addressing modes + // depends on the containing indirection, so this function should be reevaluated. 
+ switch (val) + { + case 1: + case 2: + case 4: + case 8: + return true; + default: + return false; + } +} + +//------------------------------------------------------------------------------ +// jitIsScaleIndexMul: Check if the value is a valid addressing mode multiplier amount. +// +// Parameters: +// val - The multiplier +// naturalMul - For arm64, the natural multiplier (size of containing indirection) +// +// Returns: +// True if the multiplier can be used in an address mode. +// +inline bool jitIsScaleIndexMul(size_t val, unsigned naturalMul) +{ +#if defined(TARGET_XARCH) switch (val) { case 1: @@ -211,6 +246,11 @@ inline bool jitIsScaleIndexMul(size_t val) default: return false; } +#elif defined(TARGET_ARM64) + return val == naturalMul; +#else + return false; +#endif } // Returns "tree" iff "val" is a valid addressing mode scale shift amount on diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 2ef6833701ae5..e5593d67c560b 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -4538,20 +4538,21 @@ bool Compiler::gtMarkAddrMode(GenTree* addr, int* pCostEx, int* pCostSz, var_typ GenTree* base; // This is the base of the address. GenTree* idx; // This is the index. - if (codeGen->genCreateAddrMode(addr, false /*fold*/, &rev, &base, &idx, &mul, &cns)) - { + unsigned naturalMul = 0; +#ifdef TARGET_ARM64 + // Multiplier should be a "natural-scale" power of two number which is equal to target's width. + // + // *(ulong*)(data + index * 8); - can be optimized + // *(ulong*)(data + index * 7); - can not be optimized + // *(int*)(data + index * 2); - can not be optimized + // + naturalMul = genTypeSize(type); +#endif -#ifdef TARGET_ARMARCH - // Multiplier should be a "natural-scale" power of two number which is equal to target's width. 
- // - // *(ulong*)(data + index * 8); - can be optimized - // *(ulong*)(data + index * 7); - can not be optimized - // *(int*)(data + index * 2); - can not be optimized - // - if ((mul > 0) && (genTypeSize(type) != mul)) - { - return false; - } + if (codeGen->genCreateAddrMode(addr, false /*fold*/, naturalMul, &rev, &base, &idx, &mul, &cns)) + { +#ifdef TARGET_ARM64 + assert((mul == 0) || (mul == 1) || (mul == naturalMul)); #endif // We can form a complex addressing mode, so mark each of the interior diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index c859bbe876135..1cec2935f7922 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -6135,14 +6135,28 @@ bool Lowering::TryCreateAddrMode(GenTree* addr, bool isContainable, GenTree* par ssize_t offset = 0; bool rev = false; + var_types targetType = parent->OperIsIndir() ? parent->TypeGet() : TYP_UNDEF; + + unsigned naturalMul = 0; +#ifdef TARGET_ARM64 + // Multiplier should be a "natural-scale" power of two number which is equal to target's width. 
+ // + // *(ulong*)(data + index * 8); - can be optimized + // *(ulong*)(data + index * 7); - can not be optimized + // *(int*)(data + index * 2); - can not be optimized + // + naturalMul = genTypeSize(targetType); +#endif + // Find out if an addressing mode can be constructed - bool doAddrMode = comp->codeGen->genCreateAddrMode(addr, // address - true, // fold - &rev, // reverse ops - &base, // base addr - &index, // index val - &scale, // scaling - &offset); // displacement + bool doAddrMode = comp->codeGen->genCreateAddrMode(addr, // address + true, // fold + naturalMul, // natural multiplier + &rev, // reverse ops + &base, // base addr + &index, // index val + &scale, // scaling + &offset); // displacement #ifdef TARGET_ARM64 if (parent->OperIsIndir() && parent->AsIndir()->IsVolatile()) @@ -6158,21 +6172,6 @@ bool Lowering::TryCreateAddrMode(GenTree* addr, bool isContainable, GenTree* par } #endif - var_types targetType = parent->OperIsIndir() ? parent->TypeGet() : TYP_UNDEF; - -#ifdef TARGET_ARMARCH - // Multiplier should be a "natural-scale" power of two number which is equal to target's width. - // - // *(ulong*)(data + index * 8); - can be optimized - // *(ulong*)(data + index * 7); - can not be optimized - // *(int*)(data + index * 2); - can not be optimized - // - if ((scale > 0) && (genTypeSize(targetType) != scale)) - { - return false; - } -#endif - if (scale == 0) { scale = 1;