-
Notifications
You must be signed in to change notification settings - Fork 4.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ARM64: Fix lsra for AdvSimd_LoadAndInsertScalar #107786
Changes from all commits
7ebea21
9636970
067a311
f467f77
2d9392d
6f5b8f5
5476323
b1b063c
a8ee462
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3846,9 +3846,25 @@ int LinearScan::BuildDelayFreeUses(GenTree* node, | |
return 0; | ||
} | ||
} | ||
|
||
// Don't mark as delay free if there is a mismatch in register types | ||
bool addDelayFreeUses = false; | ||
// Multi register nodes should not go via this route. | ||
assert(!node->IsMultiRegNode()); | ||
// Multi register nodes should always use fp registers (this includes vectors). | ||
assert(varTypeUsesFloatReg(node->TypeGet()) || !node->IsMultiRegNode()); | ||
if (rmwNode == nullptr || varTypeUsesSameRegType(rmwNode->TypeGet(), node->TypeGet()) || | ||
(rmwNode->IsMultiRegNode() && varTypeUsesFloatReg(node->TypeGet()))) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. did not understand the There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If the RMW node is a multi-register node, then it will always use vector registers. The normal node could use either a general register or a vector register. We want to exclude the case where the node uses a general register. For example:
The RMW node is the multi register There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I am wondering whether we should have an assert that if it is a multiRegNode, then There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Added some asserts. That then broke things because for
Which is passing the entire |
||
{ | ||
addDelayFreeUses = true; | ||
} | ||
|
||
if (use != nullptr) | ||
{ | ||
AddDelayFreeUses(use, rmwNode); | ||
if (addDelayFreeUses) | ||
{ | ||
AddDelayFreeUses(use, rmwNode); | ||
} | ||
if (useRefPositionRef != nullptr) | ||
{ | ||
*useRefPositionRef = use; | ||
|
@@ -3864,15 +3880,20 @@ int LinearScan::BuildDelayFreeUses(GenTree* node, | |
if (addrMode->HasBase() && !addrMode->Base()->isContained()) | ||
{ | ||
use = BuildUse(addrMode->Base(), candidates); | ||
AddDelayFreeUses(use, rmwNode); | ||
|
||
if (addDelayFreeUses) | ||
{ | ||
AddDelayFreeUses(use, rmwNode); | ||
} | ||
srcCount++; | ||
} | ||
|
||
if (addrMode->HasIndex() && !addrMode->Index()->isContained()) | ||
{ | ||
use = BuildUse(addrMode->Index(), candidates); | ||
AddDelayFreeUses(use, rmwNode); | ||
|
||
if (addDelayFreeUses) | ||
{ | ||
AddDelayFreeUses(use, rmwNode); | ||
} | ||
srcCount++; | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -45,6 +45,9 @@ namespace JIT.HardwareIntrinsics.Arm | |
// Validates passing an instance member of a class works | ||
test.RunClassFldScenario(); | ||
|
||
// Validates passing a non-constant value works | |
test.RunClassFldScenario_NotConstant(); | ||
|
||
// Validates passing the field of a local struct works | ||
test.RunStructLclFldScenario(); | ||
|
||
|
@@ -150,6 +153,7 @@ namespace JIT.HardwareIntrinsics.Arm | |
private static {Op1BaseType}[] _data1 = new {Op1BaseType}[Op1ElementCount]; | ||
|
||
private {Op1VectorType}<{Op1BaseType}> _fld1; | ||
private byte _fld2; | ||
private {Op1BaseType} _fld3; | ||
|
||
private DataTable _dataTable; | ||
|
@@ -161,6 +165,7 @@ namespace JIT.HardwareIntrinsics.Arm | |
for (var i = 0; i < Op1ElementCount; i++) { _data1[i] = {NextValueOp1}; } | ||
Unsafe.CopyBlockUnaligned(ref Unsafe.As<{Op1VectorType}<{Op1BaseType}>, byte>(ref _fld1), ref Unsafe.As<{Op1BaseType}, byte>(ref _data1[0]), (uint)Unsafe.SizeOf<{Op1VectorType}<{Op1BaseType}>>()); | ||
|
||
_fld2 = {ElementIndex}; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. A lot of the tests that take an immediate value are missing this coverage. We should fix it some day. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Added an issue: #108060 |
||
_fld3 = {NextValueOp3}; | ||
|
||
for (var i = 0; i < Op1ElementCount; i++) { _data1[i] = {NextValueOp1}; } | ||
|
@@ -247,6 +252,20 @@ namespace JIT.HardwareIntrinsics.Arm | |
ValidateResult(_fld1, _fld3, _dataTable.outArrayPtr); | ||
} | ||
|
||
// Scenario: calls {Isa}.{Method} with the instance field _fld2 as its second argument | |
// (instead of a literal written at the call site), stores the result in the output | |
// buffer, and validates it against _fld1 and _fld3. | |
public void RunClassFldScenario_NotConstant() | |
{ | |
TestLibrary.TestFramework.BeginScenario(nameof(RunClassFldScenario_NotConstant)); | |
|
||
// Pin _fld3 so that its address can be passed as the pointer argument. | |
fixed ({Op1BaseType}* pFld3 = &_fld3) | |
{ | |
var result = {Isa}.{Method}(_fld1, _fld2, pFld3); | |
|
||
Unsafe.Write(_dataTable.outArrayPtr, result); | |
} | |
|
||
ValidateResult(_fld1, _fld3, _dataTable.outArrayPtr); | |
} | |
|
||
public void RunStructLclFldScenario() | ||
{ | ||
TestLibrary.TestFramework.BeginScenario(nameof(RunStructLclFldScenario)); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@kunalspathak: As suggested offline, I added some additional checks in the delay slot logic. If the register type of the register does not match that of the delay slot register, then do not add the delay free use. This fixes the case where we call `BuildDelayFreeUses()` for `op2`, which is an integer value.
Instead, we could do these checks in `BuildHWIntrinsic()` to avoid calling `BuildDelayFreeUses()`, but then `BuildDelayFreeUses()` would still need the same checks turned into asserts. So it seemed simpler to do it inside.
This will work nicely with the `BuildHWIntrinsic()` rewrite PR too.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ideally this should be an assert, and the callers should not be calling delay free uses for a different register type. Did you turn it into an assert and see how many places are hit?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Difficult to test with an assert because 1) all the hwintrinsic tests will fail at the first assert, and 2) many of these issues will occur in `ConstantExpected` APIs, and for a lot of those we are only testing using hardcoded constants, meaning the assert will never be hit.
Scanning the SVE API, all the methods we have that are `RMW` and have a `ConstantExpected` are:
Then there are AdvSimd ones. Then there are possibly others.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is much easier to fix properly in the LSRA rewrite PR, as the check can just be added into the main for loop in `BuildHWIntrinsic()`