From b0b1dcd59f1cc0db8d164538c1e3b8ace69a3132 Mon Sep 17 00:00:00 2001 From: Jakob Botsch Nielsen Date: Thu, 6 Jun 2024 00:40:43 +0200 Subject: [PATCH 1/4] JIT: Build and consume FMA node operands in standard order The building and consumption of these operands can happen in op1, op2, op3 order regardless of whether the codegen uses the registers in a different order. Fix #102773 --- src/coreclr/jit/hwintrinsiccodegenxarch.cpp | 22 +----- src/coreclr/jit/lsraxarch.cpp | 83 ++++----------------- 2 files changed, 15 insertions(+), 90 deletions(-) diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp index 4aabaeda47cdd9..8b8bbac6280a2e 100644 --- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp @@ -3071,6 +3071,8 @@ void CodeGen::genFMAIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions) regNumber targetReg = node->GetRegNum(); + genConsumeMultiOpOperands(node); + regNumber op1NodeReg = op1->GetRegNum(); regNumber op2NodeReg = op2->GetRegNum(); regNumber op3NodeReg = op3->GetRegNum(); @@ -3084,11 +3086,6 @@ void CodeGen::genFMAIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions) // Intrinsics with CopyUpperBits semantics cannot have op1 be contained assert(!copiesUpperBits || !op1->isContained()); - // We need to keep this in sync with lsraxarch.cpp - // Ideally we'd actually swap the operands in lsra and simplify codegen - // but its a bit more complicated to do so for many operands as well - // as being complicated to tell codegen how to pick the right instruction - instruction ins = INS_invalid; if (op1->isContained() || op1->isUsedFromSpillTemp()) @@ -3159,21 +3156,6 @@ void CodeGen::genFMAIntrinsic(GenTreeHWIntrinsic* node, insOpts instOptions) } } -#ifdef DEBUG - // Use nums are assigned in LIR order but this node is special and doesn't - // actually use operands. Fix up the use nums here to avoid asserts. - unsigned useNum1 = op1->gtUseNum; - unsigned useNum2 = op2->gtUseNum; - unsigned useNum3 = op3->gtUseNum; - emitOp1->gtUseNum = useNum1; - emitOp2->gtUseNum = useNum2; - emitOp3->gtUseNum = useNum3; -#endif - - genConsumeRegs(emitOp1); - genConsumeRegs(emitOp2); - genConsumeRegs(emitOp3); - assert(ins != INS_invalid); genHWIntrinsic_R_R_R_RM(ins, attr, targetReg, emitOp1->GetRegNum(), emitOp2->GetRegNum(), emitOp3, instOptions); genProduceReg(node); diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp index d3a7f075fdd08c..b7cb11b1fdb174 100644 --- a/src/coreclr/jit/lsraxarch.cpp +++ b/src/coreclr/jit/lsraxarch.cpp @@ -2467,88 +2467,31 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou } unsigned resultOpNum = intrinsicTree->GetResultOpNumForRmwIntrinsic(user, op1, op2, op3); - unsigned containedOpNum = 0; - - // containedOpNum remains 0 when no operand is contained or regOptional - if (op1->isContained() || op1->IsRegOptional()) - { - containedOpNum = 1; - } - else if (op2->isContained() || op2->IsRegOptional()) - { - containedOpNum = 2; - } - else if (op3->isContained() || op3->IsRegOptional()) + // Intrinsics with CopyUpperBits semantics must have op1 as target + if (copiesUpperBits) { - containedOpNum = 3; + resultOpNum = 1; } - GenTree* emitOp1 = op1; - GenTree* emitOp2 = op2; - GenTree* emitOp3 = op3; - - // Intrinsics with CopyUpperBits semantics must have op1 as target - assert(containedOpNum != 1 || !copiesUpperBits); - - // We need to keep this in sync with hwintrinsiccodegenxarch.cpp - // Ideally we'd actually swap the operands here and simplify codegen - // but its a bit more complicated to do so for many operands as well - // as being complicated to tell codegen how to pick the right instruction + GenTree* ops[] = {op1, op2, op3}; + GenTree* resultOp = ops[resultOpNum - 1]; - if (containedOpNum == 1) - { - // https://github.com/dotnet/runtime/issues/62215 - // resultOpNum might change between lowering and lsra, comment out assertion for now. - // assert(containedOpNum != resultOpNum); - // resultOpNum is 3 or 0: op3/? = ([op1] * op2) + op3 - std::swap(emitOp1, emitOp3); - - if (resultOpNum == 2) - { - // op2 = ([op1] * op2) + op3 - std::swap(emitOp1, emitOp2); - } - } - else if (containedOpNum == 3) + for (GenTree* op : ops) { - // assert(containedOpNum != resultOpNum); - if (resultOpNum == 2 && !copiesUpperBits) + if (op == resultOp) { - // op2 = (op1 * op2) + [op3] - std::swap(emitOp1, emitOp2); + tgtPrefUse = BuildUse(op); + srcCount++; } - // else: op1/? = (op1 * op2) + [op3] - } - else if (containedOpNum == 2) - { - // assert(containedOpNum != resultOpNum); - - // op1/? = (op1 * [op2]) + op3 - std::swap(emitOp2, emitOp3); - if (resultOpNum == 3 && !copiesUpperBits) + else if (op->isContained() || op->IsRegOptional()) { - // op3 = (op1 * [op2]) + op3 - std::swap(emitOp1, emitOp2); + srcCount += op->isContained() ? BuildOperandUses(op) : BuildDelayFreeUses(op, resultOp); } - } - else - { - // containedOpNum == 0 - // no extra work when resultOpNum is 0 or 1 - if (resultOpNum == 2) - { - std::swap(emitOp1, emitOp2); - } - else if (resultOpNum == 3) + else { - std::swap(emitOp1, emitOp3); + srcCount += BuildDelayFreeUses(op, resultOp); } } - tgtPrefUse = BuildUse(emitOp1); - - srcCount += 1; - srcCount += BuildDelayFreeUses(emitOp2, emitOp1); - srcCount += emitOp3->isContained() ? BuildOperandUses(emitOp3) : BuildDelayFreeUses(emitOp3, emitOp1); if (intrinsicTree->OperIsEmbRoundingEnabled() && !intrinsicTree->Op(4)->IsCnsIntOrI()) { From f4b6fd9ffc6caf94a014c21c8414c0820a2bdcf1 Mon Sep 17 00:00:00 2001 From: Jakob Botsch Nielsen Date: Thu, 6 Jun 2024 01:28:57 +0200 Subject: [PATCH 2/4] Handle resultOpNum == 0 --- src/coreclr/jit/lsraxarch.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp index b7cb11b1fdb174..fcf49542e2203c 100644 --- a/src/coreclr/jit/lsraxarch.cpp +++ b/src/coreclr/jit/lsraxarch.cpp @@ -2474,7 +2474,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou } GenTree* ops[] = {op1, op2, op3}; - GenTree* resultOp = ops[resultOpNum - 1]; + GenTree* resultOp = resultOpNum == 0 ? nullptr : ops[resultOpNum - 1]; for (GenTree* op : ops) { From c36103c918af8e47b9a0bf8b698d0410a5ca1fd8 Mon Sep 17 00:00:00 2001 From: Jakob Botsch Nielsen Date: Thu, 6 Jun 2024 01:52:15 +0200 Subject: [PATCH 3/4] Base it on the old code --- src/coreclr/jit/lsraxarch.cpp | 88 +++++++++++++++++++++++++++++++---- 1 file changed, 79 insertions(+), 9 deletions(-) diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp index fcf49542e2203c..4a7e08e96c2e77 100644 --- a/src/coreclr/jit/lsraxarch.cpp +++ b/src/coreclr/jit/lsraxarch.cpp @@ -2467,29 +2467,99 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou } unsigned resultOpNum = intrinsicTree->GetResultOpNumForRmwIntrinsic(user, op1, op2, op3); + unsigned containedOpNum = 0; + + // containedOpNum remains 0 when no operand is contained or regOptional + if (op1->isContained() || op1->IsRegOptional()) + { + containedOpNum = 1; + } + else if (op2->isContained() || op2->IsRegOptional()) + { + containedOpNum = 2; + } + else if (op3->isContained() || op3->IsRegOptional()) + { + containedOpNum = 3; + } + + GenTree* emitOp1 = op1; + GenTree* emitOp2 = op2; + GenTree* emitOp3 = op3; + // Intrinsics with CopyUpperBits semantics must have op1 as target - if (copiesUpperBits) + assert(containedOpNum != 1 || !copiesUpperBits); + + // We need to keep this in sync with hwintrinsiccodegenxarch.cpp + // Ideally we'd actually swap the operands here and simplify codegen + // but its a bit more complicated to do so for many operands as well + // as being complicated to tell codegen how to pick the right instruction + + if (containedOpNum == 1) { - resultOpNum = 1; + // https://github.com/dotnet/runtime/issues/62215 + // resultOpNum might change between lowering and lsra, comment out assertion for now. + // assert(containedOpNum != resultOpNum); + // resultOpNum is 3 or 0: op3/? = ([op1] * op2) + op3 + std::swap(emitOp1, emitOp3); + + if (resultOpNum == 2) + { + // op2 = ([op1] * op2) + op3 + std::swap(emitOp1, emitOp2); + } } + else if (containedOpNum == 3) + { + // assert(containedOpNum != resultOpNum); + if (resultOpNum == 2 && !copiesUpperBits) + { + // op2 = (op1 * op2) + [op3] + std::swap(emitOp1, emitOp2); + } + // else: op1/? = (op1 * op2) + [op3] + } + else if (containedOpNum == 2) + { + // assert(containedOpNum != resultOpNum); - GenTree* ops[] = {op1, op2, op3}; - GenTree* resultOp = resultOpNum == 0 ? nullptr : ops[resultOpNum - 1]; + // op1/? = (op1 * [op2]) + op3 + std::swap(emitOp2, emitOp3); + if (resultOpNum == 3 && !copiesUpperBits) + { + // op3 = (op1 * [op2]) + op3 + std::swap(emitOp1, emitOp2); + } + } + else + { + // containedOpNum == 0 + // no extra work when resultOpNum is 0 or 1 + if (resultOpNum == 2) + { + std::swap(emitOp1, emitOp2); + } + else if (resultOpNum == 3) + { + std::swap(emitOp1, emitOp3); + } + } + GenTree* ops[] = {op1, op2, op3}; for (GenTree* op : ops) { - if (op == resultOp) + if (op == emitOp1) { tgtPrefUse = BuildUse(op); srcCount++; } - else if (op->isContained() || op->IsRegOptional()) + else if (op == emitOp2) { - srcCount += op->isContained() ? BuildOperandUses(op) : BuildDelayFreeUses(op, resultOp); + srcCount += BuildDelayFreeUses(op, emitOp1); } - else + else if (op == emitOp3) { - srcCount += BuildDelayFreeUses(op, resultOp); + srcCount += op->isContained() ? BuildOperandUses(op) : BuildDelayFreeUses(op, emitOp1); } } From d6bbc456bf931216ed1576930c410e160d607249 Mon Sep 17 00:00:00 2001 From: Jakob Botsch Nielsen Date: Thu, 6 Jun 2024 08:13:51 -0700 Subject: [PATCH 4/4] Apply same fix to Permute intrinsics --- src/coreclr/jit/lsraxarch.cpp | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp index 4a7e08e96c2e77..d538f43ed3b83e 100644 --- a/src/coreclr/jit/lsraxarch.cpp +++ b/src/coreclr/jit/lsraxarch.cpp @@ -2656,11 +2656,23 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou std::swap(emitOp1, emitOp2); } - tgtPrefUse = BuildUse(emitOp1); - - srcCount += 1; - srcCount += BuildDelayFreeUses(emitOp2, emitOp1); - srcCount += op3->isContained() ? BuildOperandUses(emitOp3) : BuildDelayFreeUses(emitOp3, emitOp1); + GenTree* ops[] = {op1, op2, op3}; + for (GenTree* op : ops) + { + if (op == emitOp1) + { + tgtPrefUse = BuildUse(op); + srcCount++; + } + else if (op == emitOp2) + { + srcCount += BuildDelayFreeUses(op, emitOp1); + } + else if (op == emitOp3) + { + srcCount += op->isContained() ? BuildOperandUses(op) : BuildDelayFreeUses(op, emitOp1); + } + } buildUses = false; break;