From 03b5226775da64e154130534c53cd056ccda095a Mon Sep 17 00:00:00 2001 From: Daniel Svensson Date: Sun, 18 Feb 2024 11:39:33 +0100 Subject: [PATCH 01/23] Improve performance of decimal division --- .../src/System/Decimal.DecCalc.cs | 134 ++++++++++++++---- 1 file changed, 108 insertions(+), 26 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs index ca15b1df6ea40..c530b8ccdfe37 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs @@ -194,27 +194,66 @@ private static void UInt64x64To128(ulong a, ulong b, ref DecCalc result) /// Returns remainder. Quotient overwrites dividend. private static uint Div96By32(ref Buf12 bufNum, uint den) { - // TODO: https://github.com/dotnet/runtime/issues/5213 - ulong tmp, div; + if (X86.X86Base.X64.IsSupported) + { + uint hiRes = 0; + ulong remainder = bufNum.U2; + + if (remainder < den) + goto Div164bit; + + (hiRes, remainder) = X86.X86Base.DivRem(bufNum.U2, 0u, den); + + Div164bit: + bufNum.U2 = hiRes; + (bufNum.Low64, remainder) = X86.X86Base.X64.DivRem(bufNum.Low64, remainder, (ulong)den); + return (uint)remainder; + } + else if (X86.X86Base.IsSupported) + { + uint remainder = 0; + if (bufNum.U2 != 0) + goto Div3Word; + if (bufNum.U1 >= den) + goto Div2Word; + + remainder = bufNum.U1; + bufNum.U1 = 0; + goto Div1Word; + + Div3Word: + (bufNum.U2, remainder) = X86.X86Base.DivRem(bufNum.U2, 0, den); + Div2Word: + (bufNum.U1, remainder) = X86.X86Base.DivRem(bufNum.U1, remainder, den); + Div1Word: + (bufNum.U0, remainder) = X86.X86Base.DivRem(bufNum.U0, remainder, den); + return remainder; + } + else { + ulong tmp, div, rem; + if (bufNum.U2 != 0) + { tmp = bufNum.High64; - div = tmp / den; + + (div, rem) = Math.DivRem(tmp, den); bufNum.High64 = div; - tmp = ((tmp - (uint)div * den) << 32) | bufNum.U0; + tmp = (rem << 32) | 
bufNum.U0; if (tmp == 0) return 0; - uint div32 = (uint)(tmp / den); - bufNum.U0 = div32; - return (uint)tmp - div32 * den; + (div, rem) = Math.DivRem(tmp, den); + bufNum.U0 = (uint)div; + return (uint)rem; } tmp = bufNum.Low64; if (tmp == 0) return 0; - div = tmp / den; + (div, rem) = Math.DivRem(tmp, den); bufNum.Low64 = div; - return (uint)(tmp - div * den); + return (uint)rem; + } } [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -322,6 +361,14 @@ private static void Unscale(ref uint low, ref ulong high64, ref int scale) private static uint Div96By64(ref Buf12 bufNum, ulong den) { Debug.Assert(den > bufNum.High64); + + if (X86.X86Base.X64.IsSupported) + { + // Assert above states: den > bufNum.High64 so den > bufNum.U2 and we can be sure we will not overflow + (ulong quotient, bufNum.Low64) = X86.X86Base.X64.DivRem(bufNum.Low64, bufNum.U2, den); + return (uint)quotient; + } + ulong num; uint num2 = bufNum.U2; if (num2 == 0) @@ -392,6 +439,26 @@ private static uint Div96By64(ref Buf12 bufNum, ulong den) return quo; } + private static uint BigMul64By32(ulong a, uint b, out ulong low) + { + if (IntPtr.Size == 8) + { + return (uint)Math.BigMul(a, b, out low); + } + else + { + uint al = (uint)a; + uint ah = (uint)(a >> 32); + uint bl = (uint)b; + + ulong mull = ((ulong)al) * bl; + ulong t = ((ulong)ah) * bl + (mull >> 32); + + low = (t << 32 | mull); + return (uint)(t >> 32); + } + } + /// /// Do partial divide, yielding 32-bit result and 96-bit remainder. /// Top divisor uint must be larger than top dividend uint. 
This is @@ -413,20 +480,25 @@ private static uint Div128By96(ref Buf16 bufNum, ref Buf12 bufDen) // return 0; + + uint quo; + uint remainder; + if (X86.X86Base.IsSupported) + { + (quo, remainder) = X86.X86Base.DivRem(bufNum.U2, bufNum.U3, den); + } + else + { // TODO: https://github.com/dotnet/runtime/issues/5213 - uint quo = (uint)(dividend / den); - uint remainder = (uint)dividend - quo * den; + quo = (uint)(dividend / den); + remainder = (uint)dividend - quo * den; + } // Compute full remainder, rem = dividend - (quo * divisor). // - ulong prod1 = Math.BigMul(quo, bufDen.U0); // quo * lo divisor - ulong prod2 = Math.BigMul(quo, bufDen.U1); // quo * mid divisor - prod2 += prod1 >> 32; - prod1 = (uint)prod1 | (prod2 << 32); - prod2 >>= 32; - - ulong num = bufNum.Low64; - num -= prod1; + ulong prod1; + uint prod2 = BigMul64By32(bufDen.Low64, quo, out prod1); + ulong num = bufNum.Low64 - prod1; remainder -= (uint)prod2; // Propagate carries @@ -479,25 +551,35 @@ private static uint Div128By96(ref Buf16 bufNum, ref Buf12 bufDen) /// Returns highest 32 bits of product private static uint IncreaseScale(ref Buf12 bufNum, uint power) { - ulong tmp = Math.BigMul(bufNum.U0, power); - bufNum.U0 = (uint)tmp; - tmp >>= 32; - tmp += Math.BigMul(bufNum.U1, power); - bufNum.U1 = (uint)tmp; - tmp >>= 32; - tmp += Math.BigMul(bufNum.U2, power); + ulong tmp = BigMul64By32(bufNum.Low64, power, out ulong low); + bufNum.Low64 = low; + tmp = Math.BigMul(bufNum.U2, power) + tmp; bufNum.U2 = (uint)tmp; return (uint)(tmp >> 32); } + /// + /// Multiply the two numbers 64bit * 32bit. + /// The 96 bits of the result overwrite the input. 
+ /// + /// 64-bit number as array of uints, least-sig first + /// Scale factor to multiply by private static void IncreaseScale64(ref Buf12 bufNum, uint power) { + if (IntPtr.Size == 8) + { + bufNum.U2 = (uint)Math.BigMul(bufNum.Low64, power, out ulong low); + bufNum.Low64 = low; + } + else + { ulong tmp = Math.BigMul(bufNum.U0, power); bufNum.U0 = (uint)tmp; tmp >>= 32; tmp += Math.BigMul(bufNum.U1, power); bufNum.High64 = tmp; } + } /// /// See if we need to scale the result to fit it in 96 bits. From f593c2ed21a8c56c2d329a4bc6237b4c960293f6 Mon Sep 17 00:00:00 2001 From: Daniel Svensson Date: Sun, 18 Feb 2024 11:42:25 +0100 Subject: [PATCH 02/23] Remove usage of mulx via intrinsic on 32bit x86 since it produces worse code than a plain multiplication Job=ShortRun IterationCount=3 LaunchCount=1 WarmupCount=3 | Method | a | b | Mean | Error | StdDev | Allocated | |--------------------------- |-- |----------- |---------:|----------:|----------:|----------:| | Mul64By32_New | 3 | 4294967295 | 2.068 ns | 0.0459 ns | 0.0383 ns | - | | Mul64By32_Ori | 3 | 4294967295 | 2.916 ns | 0.0231 ns | 0.0193 ns | - | --- .../FallbackInterfaceMethodAttribute.cs | 20 +++++++++ .../src/System/Decimal.DecCalc.cs | 41 ++++++++++--------- .../System.Private.CoreLib/src/System/Math.cs | 10 +---- 3 files changed, 42 insertions(+), 29 deletions(-) create mode 100644 src/coreclr/System.Private.CoreLib/src/System/Runtime/CompilerServices/Internal/FallbackInterfaceMethodAttribute.cs diff --git a/src/coreclr/System.Private.CoreLib/src/System/Runtime/CompilerServices/Internal/FallbackInterfaceMethodAttribute.cs b/src/coreclr/System.Private.CoreLib/src/System/Runtime/CompilerServices/Internal/FallbackInterfaceMethodAttribute.cs new file mode 100644 index 0000000000000..5877e0c6bf334 --- /dev/null +++ b/src/coreclr/System.Private.CoreLib/src/System/Runtime/CompilerServices/Internal/FallbackInterfaceMethodAttribute.cs @@ -0,0 +1,20 @@ +namespace System.Runtime.CompilerServices.Internal +{ + /// + /// INTERNAL: Make 
default Interface methods have "low priority" in case there are multiple + /// possible implementations (the "Diamond dependency problem"), + /// ensuring that any other conflicting implementaion will be selected at runtime. + /// + /// + /// This allows adding default method implementations for existing interfaces without + /// making it a binary breaking change. (It can still be a source breaking change) + /// + /// Should preferably only be used in the same assembly which defines + /// the interface method beeing overridden. + /// + /// + [AttributeUsage(AttributeTargets.Method, AllowMultiple = false, Inherited = false)] + sealed class FallbackInterfaceMethodAttribute : Attribute + { + } +} diff --git a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs index c530b8ccdfe37..f21da60126629 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs @@ -213,7 +213,7 @@ private static uint Div96By32(ref Buf12 bufNum, uint den) { uint remainder = 0; - if (bufNum.U2 != 0) + if (bufNum.U2 != 0) goto Div3Word; if (bufNum.U1 >= den) goto Div2Word; @@ -235,25 +235,25 @@ private static uint Div96By32(ref Buf12 bufNum, uint den) ulong tmp, div, rem; if (bufNum.U2 != 0) { - tmp = bufNum.High64; + tmp = bufNum.High64; (div, rem) = Math.DivRem(tmp, den); - bufNum.High64 = div; + bufNum.High64 = div; tmp = (rem << 32) | bufNum.U0; - if (tmp == 0) - return 0; + if (tmp == 0) + return 0; (div, rem) = Math.DivRem(tmp, den); bufNum.U0 = (uint)div; return (uint)rem; - } + } - tmp = bufNum.Low64; - if (tmp == 0) - return 0; + tmp = bufNum.Low64; + if (tmp == 0) + return 0; (div, rem) = Math.DivRem(tmp, den); - bufNum.Low64 = div; + bufNum.Low64 = div; return (uint)rem; - } + } } [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -456,7 +456,7 @@ private static uint BigMul64By32(ulong a, uint b, out ulong 
low) low = (t << 32 | mull); return (uint)(t >> 32); - } + } } /// @@ -489,7 +489,7 @@ private static uint Div128By96(ref Buf16 bufNum, ref Buf12 bufDen) } else { - // TODO: https://github.com/dotnet/runtime/issues/5213 + // TODO: https://github.com/dotnet/runtime/issues/5213 quo = (uint)(dividend / den); remainder = (uint)dividend - quo * den; } @@ -560,7 +560,7 @@ private static uint IncreaseScale(ref Buf12 bufNum, uint power) /// /// Multiply the two numbers 64bit * 32bit. - /// The 96 bits of the result overwrite the input. + /// The 96 bits of the result overwrite the input. /// /// 64-bit number as array of uints, least-sig first /// Scale factor to multiply by @@ -573,12 +573,12 @@ private static void IncreaseScale64(ref Buf12 bufNum, uint power) } else { - ulong tmp = Math.BigMul(bufNum.U0, power); - bufNum.U0 = (uint)tmp; - tmp >>= 32; - tmp += Math.BigMul(bufNum.U1, power); - bufNum.High64 = tmp; - } + ulong tmp = Math.BigMul(bufNum.U0, power); + bufNum.U0 = (uint)tmp; + tmp >>= 32; + tmp += Math.BigMul(bufNum.U1, power); + bufNum.High64 = tmp; + } } /// @@ -2084,6 +2084,7 @@ internal static unsafe void VarDecDiv(ref DecCalc d1, ref DecCalc d2) { if (scale < 0) { + // TODO: consider 64bit powers curScale = Math.Min(9, -scale); goto HaveScale64; } diff --git a/src/libraries/System.Private.CoreLib/src/System/Math.cs b/src/libraries/System.Private.CoreLib/src/System/Math.cs index 1d1c50a4e2b55..7e99bb0366edf 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Math.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Math.cs @@ -151,16 +151,8 @@ internal static void ThrowNegateTwosCompOverflow() throw new OverflowException(SR.Overflow_NegateTwosCompNum); } - internal static unsafe ulong BigMul(uint a, uint b) + internal static ulong BigMul(uint a, uint b) { -#if TARGET_32BIT - if (Bmi2.IsSupported) - { - uint low; - uint high = Bmi2.MultiplyNoFlags(a, b, &low); - return ((ulong)high << 32) | low; - } -#endif return ((ulong)a) * b; } From 
218b373823a7b8a67ca09e786f1b799d6821b98e Mon Sep 17 00:00:00 2001 From: Daniel Svensson Date: Tue, 20 Feb 2024 11:30:39 +0100 Subject: [PATCH 03/23] update part of Multiply - Add comment to BigMul64By32 and make it return nuint to avoid clearing upper 32 bits - Simplify IncreaseScale --- .../src/System/Decimal.DecCalc.cs | 176 +++++++----------- 1 file changed, 66 insertions(+), 110 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs index f21da60126629..8ec88d5b631b9 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs @@ -192,6 +192,7 @@ private static void UInt64x64To128(ulong a, ulong b, ref DecCalc result) /// 96-bit dividend as array of uints, least-sig first /// 32-bit divisor /// Returns remainder. Quotient overwrites dividend. + [MethodImpl(MethodImplOptions.AggressiveInlining)] private static uint Div96By32(ref Buf12 bufNum, uint den) { if (X86.X86Base.X64.IsSupported) @@ -200,11 +201,11 @@ private static uint Div96By32(ref Buf12 bufNum, uint den) ulong remainder = bufNum.U2; if (remainder < den) - goto Div164bit; + goto DivOne64Bit; (hiRes, remainder) = X86.X86Base.DivRem(bufNum.U2, 0u, den); - Div164bit: + DivOne64Bit: bufNum.U2 = hiRes; (bufNum.Low64, remainder) = X86.X86Base.X64.DivRem(bufNum.Low64, remainder, (ulong)den); return (uint)remainder; @@ -439,24 +440,28 @@ private static uint Div96By64(ref Buf12 bufNum, ulong den) return quo; } - private static uint BigMul64By32(ulong a, uint b, out ulong low) + + /// + /// Perform multiplication between 64 and 32 bit numbers, returning lower 64 bits in + /// + /// hi bits of the result + /// returns nuint instead of uint to skip clearing upper 32bits on 64bit platforms + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static nuint BigMul64By32(ulong a, uint b, out ulong low) { - if 
(IntPtr.Size == 8) - { - return (uint)Math.BigMul(a, b, out low); - } - else - { - uint al = (uint)a; - uint ah = (uint)(a >> 32); - uint bl = (uint)b; +#if TARGET_64BIT + return (nuint)Math.BigMul(a, b, out low); +#else + uint al = (uint)a; + uint ah = (uint)(a >> 32); + uint bl = b; - ulong mull = ((ulong)al) * bl; - ulong t = ((ulong)ah) * bl + (mull >> 32); + ulong prodL = ((ulong)al) * bl; + ulong prodH = ((ulong)ah) * bl + (prodL >> 32); - low = (t << 32 | mull); - return (uint)(t >> 32); - } + low = (prodH << 32 | (uint)prodL); + return (nuint)(prodH >> 32); +#endif } /// @@ -497,11 +502,12 @@ private static uint Div128By96(ref Buf16 bufNum, ref Buf12 bufDen) // Compute full remainder, rem = dividend - (quo * divisor). // ulong prod1; - uint prod2 = BigMul64By32(bufDen.Low64, quo, out prod1); + uint prod2 = (uint)BigMul64By32(bufDen.Low64, quo, out prod1); ulong num = bufNum.Low64 - prod1; remainder -= (uint)prod2; // Propagate carries + // can be simplified if https://github.com/dotnet/runtime/issues/48247 is done // if (num > ~prod1) { @@ -551,11 +557,11 @@ private static uint Div128By96(ref Buf16 bufNum, ref Buf12 bufDen) /// Returns highest 32 bits of product private static uint IncreaseScale(ref Buf12 bufNum, uint power) { - ulong tmp = BigMul64By32(bufNum.Low64, power, out ulong low); - bufNum.Low64 = low; - tmp = Math.BigMul(bufNum.U2, power) + tmp; - bufNum.U2 = (uint)tmp; - return (uint)(tmp >> 32); + ulong hi64 = BigMul64By32(bufNum.Low64, power, out ulong low64); + bufNum.Low64 = low64; + hi64 = Math.BigMul(bufNum.U2, power) + hi64; + bufNum.U2 = (uint)hi64; + return (uint)(hi64 >> 32); } /// @@ -566,19 +572,8 @@ private static uint IncreaseScale(ref Buf12 bufNum, uint power) /// Scale factor to multiply by private static void IncreaseScale64(ref Buf12 bufNum, uint power) { - if (IntPtr.Size == 8) - { - bufNum.U2 = (uint)Math.BigMul(bufNum.Low64, power, out ulong low); - bufNum.Low64 = low; - } - else - { - ulong tmp = Math.BigMul(bufNum.U0, 
power); - bufNum.U0 = (uint)tmp; - tmp >>= 32; - tmp += Math.BigMul(bufNum.U1, power); - bufNum.High64 = tmp; - } + bufNum.U2 = (uint)BigMul64By32(bufNum.Low64, power, out ulong low64); + bufNum.Low64 = low64; } /// @@ -1427,12 +1422,8 @@ internal static unsafe void VarDecMul(ref DecCalc d1, ref DecCalc d2) else { // Left value is 32-bit, result fits in 4 uints - tmp = Math.BigMul(d1.Low, d2.Low); - bufProd.U0 = (uint)tmp; - - tmp = Math.BigMul(d1.Low, d2.Mid) + (tmp >> 32); - bufProd.U1 = (uint)tmp; - tmp >>= 32; + tmp = BigMul64By32(d2.Low64, d1.Low, out ulong low); + bufProd.Low64 = low; if (d2.High != 0) { @@ -1451,12 +1442,8 @@ internal static unsafe void VarDecMul(ref DecCalc d1, ref DecCalc d2) else if ((d2.High | d2.Mid) == 0) { // Right value is 32-bit, result fits in 4 uints - tmp = Math.BigMul(d2.Low, d1.Low); - bufProd.U0 = (uint)tmp; - - tmp = Math.BigMul(d2.Low, d1.Mid) + (tmp >> 32); - bufProd.U1 = (uint)tmp; - tmp >>= 32; + tmp = BigMul64By32(d1.Low64, d2.Low, out ulong low); + bufProd.Low64 = low; if (d1.High != 0) { @@ -1473,80 +1460,50 @@ internal static unsafe void VarDecMul(ref DecCalc d1, ref DecCalc d2) } else { - // Both operands have bits set in the upper 64 bits. + // At least one operand has bits set in the upper 64 bits. // // Compute and accumulate the 9 partial products into a - // 192-bit (24-byte) result. + // 192-bit (3*64bit) result. 
// - // [l-h][l-m][l-l] left high, middle, low - // x [r-h][r-m][r-l] right high, middle, low - // ------------------------------ + // [l-hi][l-lo] left high32, low64 + // x [r-hi][r-lo] right high32, low64 + // ------------------------------- // - // [0-h][0-l] l-l * r-l - // [1ah][1al] l-l * r-m - // [1bh][1bl] l-m * r-l - // [2ah][2al] l-m * r-m - // [2bh][2bl] l-l * r-h - // [2ch][2cl] l-h * r-l - // [3ah][3al] l-m * r-h - // [3bh][3bl] l-h * r-m - // [4-h][4-l] l-h * r-h + // [ 0-h][0-l ] l-lo * r-lo => 64 + 64 bit result + // [ h*l][h*l ] l-lo * r-hi => 32 + 64 bit result + // [ l*h][l*h ] l-hi * r-lo => 32 + 64 bit result + // [ h*h] l-hi * r-hi => 32 + 32 bit result // ------------------------------ - // [p-5][p-4][p-3][p-2][p-1][p-0] prod[] array + // [Hi64][Mid64][Low64] bufProd "array" // - tmp = Math.BigMul(d1.Low, d2.Low); - bufProd.U0 = (uint)tmp; - - ulong tmp2 = Math.BigMul(d1.Low, d2.Mid) + (tmp >> 32); - - tmp = Math.BigMul(d1.Mid, d2.Low); - tmp += tmp2; // this could generate carry - bufProd.U1 = (uint)tmp; - if (tmp < tmp2) // detect carry - tmp2 = (tmp >> 32) | (1UL << 32); - else - tmp2 = tmp >> 32; - - tmp = Math.BigMul(d1.Mid, d2.Mid) + tmp2; + ulong mid64 = Math.BigMul(d1.Low64, d2.Low64, out tmp); + bufProd.Low64 = tmp; - if ((d1.High | d2.High) > 0) + if ((d1.High | d2.High) != 0) { - // Highest 32 bits is non-zero. Calculate 5 more partial products. 
- // - tmp2 = Math.BigMul(d1.Low, d2.High); - tmp += tmp2; // this could generate carry - uint tmp3 = 0; - if (tmp < tmp2) // detect carry - tmp3 = 1; - - tmp2 = Math.BigMul(d1.High, d2.Low); - tmp += tmp2; // this could generate carry - bufProd.U2 = (uint)tmp; - if (tmp < tmp2) // detect carry - tmp3++; - tmp2 = ((ulong)tmp3 << 32) | (tmp >> 32); - - tmp = Math.BigMul(d1.Mid, d2.High); - tmp += tmp2; // this could generate carry - tmp3 = 0; - if (tmp < tmp2) // detect carry - tmp3 = 1; - - tmp2 = Math.BigMul(d1.High, d2.Mid); - tmp += tmp2; // this could generate carry - bufProd.U3 = (uint)tmp; - if (tmp < tmp2) // detect carry - tmp3++; - tmp = ((ulong)tmp3 << 32) | (tmp >> 32); - - bufProd.High64 = Math.BigMul(d1.High, d2.High) + tmp; - + // hi64 will never overflow since the result will always fit in 192 (2*96) bits + ulong hi64 = Math.BigMul(d1.High, d2.High); + + // Do crosswise multiplications between upper 32bit and lower 64 bits + hi64 += BigMul64By32(d1.Low64, d2.High, out tmp); + mid64 += tmp; + // propagate carry, can be simplified if https://github.com/dotnet/runtime/issues/48247 is done + if (mid64 < tmp) + ++hi64; + + hi64 += BigMul64By32(d2.Low64, d1.High, out tmp); + mid64 += tmp; + if (mid64 < tmp) + ++hi64; + + bufProd.Mid64 = mid64; + bufProd.High64 = hi64; hiProd = 5; } else { - bufProd.Mid64 = tmp; + bufProd.Mid64 = mid64; hiProd = 3; } } @@ -2084,7 +2041,6 @@ internal static unsafe void VarDecDiv(ref DecCalc d1, ref DecCalc d2) { if (scale < 0) { - // TODO: consider 64bit powers curScale = Math.Min(9, -scale); goto HaveScale64; } From f19ef92d693c52ff1e404c941126c3031eb34ece Mon Sep 17 00:00:00 2001 From: Daniel Svensson Date: Thu, 22 Feb 2024 10:31:49 +0100 Subject: [PATCH 04/23] remove unintentional file --- .../FallbackInterfaceMethodAttribute.cs | 20 ------------------- 1 file changed, 20 deletions(-) delete mode 100644 src/coreclr/System.Private.CoreLib/src/System/Runtime/CompilerServices/Internal/FallbackInterfaceMethodAttribute.cs diff 
--git a/src/coreclr/System.Private.CoreLib/src/System/Runtime/CompilerServices/Internal/FallbackInterfaceMethodAttribute.cs b/src/coreclr/System.Private.CoreLib/src/System/Runtime/CompilerServices/Internal/FallbackInterfaceMethodAttribute.cs deleted file mode 100644 index 5877e0c6bf334..0000000000000 --- a/src/coreclr/System.Private.CoreLib/src/System/Runtime/CompilerServices/Internal/FallbackInterfaceMethodAttribute.cs +++ /dev/null @@ -1,20 +0,0 @@ -namespace System.Runtime.CompilerServices.Internal -{ - /// - /// INTERNAL: Make default Interface methods have "low priority" in case there are multiple - /// possible implementations (the "Diamond dependency problem"), - /// ensuring that any other conflicting implementaion will be selected at runtime. - /// - /// - /// This allows adding default method implementations for existing interfaces without - /// making it a binary breaking change. (It can still be a source breaking change) - /// - /// Should preferably only be used in the same assembly which defines - /// the interface method beeing overridden. 
- /// - /// - [AttributeUsage(AttributeTargets.Method, AllowMultiple = false, Inherited = false)] - sealed class FallbackInterfaceMethodAttribute : Attribute - { - } -} From e984fc53651c5ae7f342829eca5c70e323688bf5 Mon Sep 17 00:00:00 2001 From: Daniel Svensson Date: Thu, 22 Feb 2024 18:31:08 +0100 Subject: [PATCH 05/23] Improve division by 64bit value on x64 --- .../src/System/Decimal.DecCalc.cs | 31 ++++++++++++++++--- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs index 8ec88d5b631b9..bbbfd4c05e639 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs @@ -352,6 +352,32 @@ private static void Unscale(ref uint low, ref ulong high64, ref int scale) scale--; } + /// + /// Do partial divide, yielding 64-bit result and 64-bit remainder. + /// Divisor must be larger than upper 64 bits of dividend. + /// + /// 128-bit dividend as array of uints, least-sig first + /// 64-bit divisor + /// Returns quotient. Remainder overwrites lower 64-bits of dividend. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static unsafe ulong Div128By64(Buf16* bufNum, ulong den) + { + Debug.Assert(den > bufNum->High64); + + if (X86.X86Base.X64.IsSupported) + { + // Assert above states: den > bufNum.High64 so den > bufNum.U2 and we can be sure we will not overflow + (ulong quotient, bufNum->Low64) = X86.X86Base.X64.DivRem(bufNum->Low64, bufNum->High64, den); + return quotient; + } + else + { + uint hiBits = Div96By64(ref *(Buf12*)&bufNum->U1, den); + uint loBits = Div96By64(ref *(Buf12*)bufNum, den); + return ((ulong)hiBits << 32 | loBits); + } + } + /// /// Do partial divide, yielding 32-bit result and 64-bit remainder. /// Divisor must be larger than upper 64 bits of dividend. 
@@ -2031,10 +2057,7 @@ internal static unsafe void VarDecDiv(ref DecCalc d1, ref DecCalc d2) // Have a 64-bit divisor in sdlDivisor. The remainder // (currently 96 bits spread over 4 uints) will be < divisor. // - bufQuo.U2 = 0; - bufQuo.U1 = Div96By64(ref *(Buf12*)&bufRem.U1, divisor); - bufQuo.U0 = Div96By64(ref *(Buf12*)&bufRem, divisor); - + bufQuo.Low64 = Div128By64(&bufRem, divisor); while (true) { if (bufRem.Low64 == 0) From f0d62fdba95c93e8e413aac0d21300b6639186f1 Mon Sep 17 00:00:00 2001 From: Daniel Svensson Date: Sat, 2 Mar 2024 14:06:47 +0100 Subject: [PATCH 06/23] Remove some more 64bit divides for x86 --- .../src/System/Decimal.DecCalc.cs | 70 +++++++++++++------ 1 file changed, 49 insertions(+), 21 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs index bbbfd4c05e639..8542ce612a2c6 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs @@ -200,12 +200,11 @@ private static uint Div96By32(ref Buf12 bufNum, uint den) uint hiRes = 0; ulong remainder = bufNum.U2; - if (remainder < den) - goto DivOne64Bit; - - (hiRes, remainder) = X86.X86Base.DivRem(bufNum.U2, 0u, den); + if (remainder >= den) + { + (hiRes, remainder) = X86.X86Base.DivRem(bufNum.U2, 0u, den); + } - DivOne64Bit: bufNum.U2 = hiRes; (bufNum.Low64, remainder) = X86.X86Base.X64.DivRem(bufNum.Low64, remainder, (ulong)den); return (uint)remainder; @@ -222,12 +221,14 @@ private static uint Div96By32(ref Buf12 bufNum, uint den) remainder = bufNum.U1; bufNum.U1 = 0; goto Div1Word; - - Div3Word: - (bufNum.U2, remainder) = X86.X86Base.DivRem(bufNum.U2, 0, den); - Div2Word: +Div3Word: + if (bufNum.U2 < den) + (bufNum.U2, remainder) = (0, bufNum.U2); + else + (bufNum.U2, remainder) = X86.X86Base.DivRem(bufNum.U2, remainder, den); +Div2Word: (bufNum.U1, remainder) = X86.X86Base.DivRem(bufNum.U1, 
remainder, den); - Div1Word: +Div1Word: (bufNum.U0, remainder) = X86.X86Base.DivRem(bufNum.U0, remainder, den); return remainder; } @@ -815,13 +816,28 @@ private static int OverflowUnscale(ref Buf12 bufQuo, int scale, bool sticky) // We have overflown, so load the high bit with a one. const ulong highbit = 1UL << 32; bufQuo.U2 = (uint)(highbit / 10); - ulong tmp = ((highbit % 10) << 32) + bufQuo.U1; - uint div = (uint)(tmp / 10); - bufQuo.U1 = div; - tmp = ((tmp - div * 10) << 32) + bufQuo.U0; - div = (uint)(tmp / 10); - bufQuo.U0 = div; - uint remainder = (uint)(tmp - div * 10); + + uint remainder; +#if TARGET_32BIT + if (X86.X86Base.IsSupported) + { + // 32-bit RyuJIT doesn't convert 64-bit division by constant into multiplication by reciprocal. + // Do "32bit" divides instead of calling full 64bit helper + (bufQuo.U1, remainder) = X86.X86Base.DivRem(bufQuo.U1, (uint)(highbit % 10), 10); + (bufQuo.U0, remainder) = X86.X86Base.DivRem(bufQuo.U0, remainder, 10); + } + else +#endif + { + ulong tmp = ((highbit % 10) << 32) + bufQuo.U1; + uint div = (uint)(tmp / 10); + bufQuo.U1 = div; + tmp = ((tmp - div * 10) << 32) + bufQuo.U0; + div = (uint)(tmp / 10); + bufQuo.U0 = div; + remainder = (uint)(tmp - div * 10); + } + // The remainder is the last digit that does not fit, so we can use it to work out if we need to round up if (remainder > 5 || remainder == 5 && (sticky || (bufQuo.U0 & 1) != 0)) Add32To96(ref bufQuo, 1); @@ -1537,7 +1553,7 @@ internal static unsafe void VarDecMul(ref DecCalc d1, ref DecCalc d2) // Check for leading zero uints on the product // uint* product = (uint*)&bufProd; - while (product[(int)hiProd] == 0) + while (product[hiProd] == 0) { if (hiProd == 0) goto ReturnZero; @@ -2017,9 +2033,20 @@ internal static unsafe void VarDecDiv(ref DecCalc d1, ref DecCalc d2) goto ThrowOverflow; ulong num = Math.BigMul(remainder, power); - // TODO: https://github.com/dotnet/runtime/issues/5213 - uint div = (uint)(num / den); - remainder = (uint)num - div * den; 
+ uint div; +#if TARGET_32BIT + if (X86.X86Base.IsSupported) + { + (div, remainder) = X86.X86Base.DivRem((uint)num, (uint)(num >> 32), den); + } + else +#endif + { + // Do full 64bit divide and cast result to 32bit + var divRes = X86.X86Base.X64.IsSupported ? X86.X86Base.X64.DivRem(num, 0, (ulong)den) : Math.DivRem(num, den); + div = (uint)divRes.Quotient; + remainder = (uint)divRes.Remainder; + } if (!Add32To96(ref bufQuo, div)) { @@ -2057,6 +2084,7 @@ internal static unsafe void VarDecDiv(ref DecCalc d1, ref DecCalc d2) // Have a 64-bit divisor in sdlDivisor. The remainder // (currently 96 bits spread over 4 uints) will be < divisor. // + bufQuo.U2 = 0; bufQuo.Low64 = Div128By64(&bufRem, divisor); while (true) { From 792c3ebf52e6c90a8cacab22a411ae5cb9a220ac Mon Sep 17 00:00:00 2001 From: Daniel Svensson Date: Sat, 2 Mar 2024 14:14:09 +0100 Subject: [PATCH 07/23] Call IncreaseScale in one more place --- .../System.Private.CoreLib/src/System/Decimal.DecCalc.cs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs index 8542ce612a2c6..04c4fffd91597 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs @@ -2301,10 +2301,7 @@ internal static void VarDecMod(ref DecCalc d1, ref DecCalc d2) break; uint power = iCurScale >= MaxInt32Scale ? 
TenToPowerNine : UInt32Powers10[iCurScale]; scale += iCurScale; - ulong tmp = Math.BigMul(bufQuo.U0, power); - bufQuo.U0 = (uint)tmp; - tmp >>= 32; - bufQuo.High64 = tmp + bufQuo.High64 * power; + IncreaseScale(ref bufQuo, power); if (power != TenToPowerNine) break; } From 6dab5f92a07e59defb9507e0757f0a91d83704c0 Mon Sep 17 00:00:00 2001 From: Daniel Svensson Date: Sun, 3 Mar 2024 23:21:00 +0100 Subject: [PATCH 08/23] add extra parentheses --- .../System.Private.CoreLib/src/System/Decimal.DecCalc.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs index 04c4fffd91597..db09733da2a81 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs @@ -486,7 +486,7 @@ private static nuint BigMul64By32(ulong a, uint b, out ulong low) ulong prodL = ((ulong)al) * bl; ulong prodH = ((ulong)ah) * bl + (prodL >> 32); - low = (prodH << 32 | (uint)prodL); + low = ((prodH << 32) | (uint)prodL); return (nuint)(prodH >> 32); #endif } From 83efb423c377f1608168f391feb6d1d94692e67a Mon Sep 17 00:00:00 2001 From: Daniel Svensson Date: Mon, 4 Mar 2024 13:22:35 +0100 Subject: [PATCH 09/23] review: remove X86.X86Base.X64.DivRem --- .../System.Private.CoreLib/src/System/Decimal.DecCalc.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs index db09733da2a81..7da824c419373 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs @@ -2043,7 +2043,7 @@ internal static unsafe void VarDecDiv(ref DecCalc d1, ref DecCalc d2) #endif { // Do full 64bit divide and cast result to 32bit - var divRes = 
X86.X86Base.X64.IsSupported ? X86.X86Base.X64.DivRem(num, 0, (ulong)den) : Math.DivRem(num, den); + var divRes = Math.DivRem(num, den); div = (uint)divRes.Quotient; remainder = (uint)divRes.Remainder; } From a20f37f6ed0e613333da0737449663bf08403d10 Mon Sep 17 00:00:00 2001 From: Daniel Svensson Date: Mon, 4 Mar 2024 22:11:21 +0100 Subject: [PATCH 10/23] Remove BigMul64By32 and add overloads of Math.BigMul instead so that it can easily be removed once JIT recognize and optimize "ulong * uint" --- .../src/System/Decimal.DecCalc.cs | 38 ++++--------------- .../System.Private.CoreLib/src/System/Math.cs | 26 +++++++++++++ 2 files changed, 33 insertions(+), 31 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs index 7da824c419373..d995d8145748d 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs @@ -467,30 +467,6 @@ private static uint Div96By64(ref Buf12 bufNum, ulong den) return quo; } - - /// - /// Perform multiplication between 64 and 32 bit numbers, returning lower 64 bits in - /// - /// hi bits of the result - /// returns nuint instead of uint to skip clearing upper 32bits on 64bit platforms - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static nuint BigMul64By32(ulong a, uint b, out ulong low) - { -#if TARGET_64BIT - return (nuint)Math.BigMul(a, b, out low); -#else - uint al = (uint)a; - uint ah = (uint)(a >> 32); - uint bl = b; - - ulong prodL = ((ulong)al) * bl; - ulong prodH = ((ulong)ah) * bl + (prodL >> 32); - - low = ((prodH << 32) | (uint)prodL); - return (nuint)(prodH >> 32); -#endif - } - /// /// Do partial divide, yielding 32-bit result and 96-bit remainder. /// Top divisor uint must be larger than top dividend uint. 
This is @@ -529,7 +505,7 @@ private static uint Div128By96(ref Buf16 bufNum, ref Buf12 bufDen) // Compute full remainder, rem = dividend - (quo * divisor). // ulong prod1; - uint prod2 = (uint)BigMul64By32(bufDen.Low64, quo, out prod1); + uint prod2 = (uint)Math.BigMul(bufDen.Low64, quo, out prod1); ulong num = bufNum.Low64 - prod1; remainder -= (uint)prod2; @@ -584,7 +560,7 @@ private static uint Div128By96(ref Buf16 bufNum, ref Buf12 bufDen) /// Returns highest 32 bits of product private static uint IncreaseScale(ref Buf12 bufNum, uint power) { - ulong hi64 = BigMul64By32(bufNum.Low64, power, out ulong low64); + ulong hi64 = Math.BigMul(bufNum.Low64, power, out ulong low64); bufNum.Low64 = low64; hi64 = Math.BigMul(bufNum.U2, power) + hi64; bufNum.U2 = (uint)hi64; @@ -599,7 +575,7 @@ private static uint IncreaseScale(ref Buf12 bufNum, uint power) /// Scale factor to multiply by private static void IncreaseScale64(ref Buf12 bufNum, uint power) { - bufNum.U2 = (uint)BigMul64By32(bufNum.Low64, power, out ulong low64); + bufNum.U2 = (uint)Math.BigMul(bufNum.Low64, power, out ulong low64); bufNum.Low64 = low64; } @@ -1464,7 +1440,7 @@ internal static unsafe void VarDecMul(ref DecCalc d1, ref DecCalc d2) else { // Left value is 32-bit, result fits in 4 uints - tmp = BigMul64By32(d2.Low64, d1.Low, out ulong low); + tmp = Math.BigMul(d1.Low, d2.Low64, out ulong low); bufProd.Low64 = low; if (d2.High != 0) @@ -1484,7 +1460,7 @@ internal static unsafe void VarDecMul(ref DecCalc d1, ref DecCalc d2) else if ((d2.High | d2.Mid) == 0) { // Right value is 32-bit, result fits in 4 uints - tmp = BigMul64By32(d1.Low64, d2.Low, out ulong low); + tmp = Math.BigMul(d1.Low64, d2.Low, out ulong low); bufProd.Low64 = low; if (d1.High != 0) @@ -1528,13 +1504,13 @@ internal static unsafe void VarDecMul(ref DecCalc d1, ref DecCalc d2) ulong hi64 = Math.BigMul(d1.High, d2.High); // Do crosswise multiplications between upper 32bit and lower 64 bits - hi64 += BigMul64By32(d1.Low64, d2.High, 
out tmp); + hi64 += Math.BigMul(d1.Low64, d2.High, out tmp); mid64 += tmp; // propagate carry, can be simplified if https://github.com/dotnet/runtime/issues/48247 is done if (mid64 < tmp) ++hi64; - hi64 += BigMul64By32(d2.Low64, d1.High, out tmp); + hi64 += Math.BigMul(d2.Low64, d1.High, out tmp); mid64 += tmp; if (mid64 < tmp) ++hi64; diff --git a/src/libraries/System.Private.CoreLib/src/System/Math.cs b/src/libraries/System.Private.CoreLib/src/System/Math.cs index 7e99bb0366edf..3d7952c51fddf 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Math.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Math.cs @@ -161,6 +161,32 @@ public static long BigMul(int a, int b) return ((long)a) * b; } + + /// + /// Perform multiplication between 64 and 32 bit numbers, returning lower 64 bits in + /// + /// hi bits of the result + /// REMOVE once BigMul(ulong, ulong) is treated as intrinsics and optimizes 32 by 64 multiplications + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static ulong BigMul(ulong a, uint b, out ulong low) + { +#if TARGET_64BIT + return Math.BigMul(ulong)a, (ulong)b, out low); +#else + ulong prodH = (((ulong)(uint)(a >> 32)) * b); + ulong prodL = ((ulong)(uint)a) * b; + prodH += (prodL >> 32); + + low = ((prodH << 32) | (uint)prodL); + return (prodH >> 32); +#endif + } + + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static ulong BigMul(uint a, ulong b, out ulong low) + => BigMul(a, b, out low); + /// Produces the full product of two unsigned 64-bit numbers. /// The first number to multiply. /// The second number to multiply. 
From 2f6e107c9041909740c131a410120c0e22f0e5c8 Mon Sep 17 00:00:00 2001 From: Daniel Svensson Date: Mon, 4 Mar 2024 22:44:03 +0100 Subject: [PATCH 11/23] Simplify Div96By32 --- .../src/System/Decimal.DecCalc.cs | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs index d995d8145748d..3fedf3450dcd6 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs @@ -197,15 +197,9 @@ private static uint Div96By32(ref Buf12 bufNum, uint den) { if (X86.X86Base.X64.IsSupported) { - uint hiRes = 0; ulong remainder = bufNum.U2; - if (remainder >= den) - { - (hiRes, remainder) = X86.X86Base.DivRem(bufNum.U2, 0u, den); - } - - bufNum.U2 = hiRes; + (bufNum.U2, remainder) = (remainder >= den) ? X86.X86Base.DivRem(bufNum.U2, 0u, den) : (0u, remainder); (bufNum.Low64, remainder) = X86.X86Base.X64.DivRem(bufNum.Low64, remainder, (ulong)den); return (uint)remainder; } @@ -222,10 +216,7 @@ private static uint Div96By32(ref Buf12 bufNum, uint den) bufNum.U1 = 0; goto Div1Word; Div3Word: - if (bufNum.U2 < den) - (bufNum.U2, remainder) = (0, bufNum.U2); - else - (bufNum.U2, remainder) = X86.X86Base.DivRem(bufNum.U2, remainder, den); + (bufNum.U2, remainder) = X86.X86Base.DivRem(bufNum.U2, remainder, den); Div2Word: (bufNum.U1, remainder) = X86.X86Base.DivRem(bufNum.U1, remainder, den); Div1Word: From 14a662b485d460a825899fafd4ebf3fe12a4a2c6 Mon Sep 17 00:00:00 2001 From: Daniel Svensson Date: Mon, 4 Mar 2024 22:45:41 +0100 Subject: [PATCH 12/23] Remove 64 bit path from Div96By32 --- .../src/System/Decimal.DecCalc.cs | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs index 
3fedf3450dcd6..e09ab579ca402 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs @@ -195,15 +195,7 @@ private static void UInt64x64To128(ulong a, ulong b, ref DecCalc result) [MethodImpl(MethodImplOptions.AggressiveInlining)] private static uint Div96By32(ref Buf12 bufNum, uint den) { - if (X86.X86Base.X64.IsSupported) - { - ulong remainder = bufNum.U2; - - (bufNum.U2, remainder) = (remainder >= den) ? X86.X86Base.DivRem(bufNum.U2, 0u, den) : (0u, remainder); - (bufNum.Low64, remainder) = X86.X86Base.X64.DivRem(bufNum.Low64, remainder, (ulong)den); - return (uint)remainder; - } - else if (X86.X86Base.IsSupported) + if (X86.X86Base.IsSupported) { uint remainder = 0; From cfcd3794a5282edd4ce91978ba81d39f2019579d Mon Sep 17 00:00:00 2001 From: Daniel Svensson Date: Mon, 11 Mar 2024 22:38:09 +0100 Subject: [PATCH 13/23] Add Div64By32 helper to avoid check for X86 in multiple places --- .../src/System/Decimal.DecCalc.cs | 54 ++++++++----------- 1 file changed, 22 insertions(+), 32 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs index e09ab579ca402..9a45fb75bced8 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs @@ -186,6 +186,22 @@ private static void UInt64x64To128(ulong a, ulong b, ref DecCalc result) result.High = (uint)high; } + // Do partial divide for the case where (left >> 32) < den + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static (uint Quotient, uint Remainder) Div64By32(ulong dividend, uint den) + { + if (X86.X86Base.IsSupported) + { + return X86.X86Base.DivRem((uint)dividend, (uint)(dividend >> 32), den); + } + else + { + // TODO: https://github.com/dotnet/runtime/issues/5213 + uint quo = (uint)(dividend / den); + return (quo, 
(uint)dividend - quo * den); + } + } + /// /// Do full divide, yielding 96-bit result and 32-bit remainder. /// @@ -235,8 +251,7 @@ private static uint Div96By32(ref Buf12 bufNum, uint den) tmp = bufNum.Low64; if (tmp == 0) return 0; - (div, rem) = Math.DivRem(tmp, den); - bufNum.Low64 = div; + (bufNum.Low64, rem) = Math.DivRem(tmp, den); return (uint)rem; } } @@ -425,9 +440,9 @@ private static uint Div96By64(ref Buf12 bufNum, ulong den) // return 0; - // TODO: https://github.com/dotnet/runtime/issues/5213 - quo = (uint)(num64 / denHigh32); - num = bufNum.U0 | ((num64 - quo * denHigh32) << 32); // remainder + + (quo, uint rem) = Div64By32(num64, denHigh32); + num = bufNum.U0 | ((ulong)rem << 32); // remainder // Compute full remainder, rem = dividend - (quo * divisor). // @@ -471,19 +486,7 @@ private static uint Div128By96(ref Buf16 bufNum, ref Buf12 bufDen) // return 0; - - uint quo; - uint remainder; - if (X86.X86Base.IsSupported) - { - (quo, remainder) = X86.X86Base.DivRem(bufNum.U2, bufNum.U3, den); - } - else - { - // TODO: https://github.com/dotnet/runtime/issues/5213 - quo = (uint)(dividend / den); - remainder = (uint)dividend - quo * den; - } + (uint quo, uint remainder) = Div64By32(dividend, den); // Compute full remainder, rem = dividend - (quo * divisor). 
// @@ -1992,20 +1995,7 @@ internal static unsafe void VarDecDiv(ref DecCalc d1, ref DecCalc d2) goto ThrowOverflow; ulong num = Math.BigMul(remainder, power); - uint div; -#if TARGET_32BIT - if (X86.X86Base.IsSupported) - { - (div, remainder) = X86.X86Base.DivRem((uint)num, (uint)(num >> 32), den); - } - else -#endif - { - // Do full 64bit divide and cast result to 32bit - var divRes = Math.DivRem(num, den); - div = (uint)divRes.Quotient; - remainder = (uint)divRes.Remainder; - } + (uint div, remainder) = Div64By32(num, den); if (!Add32To96(ref bufQuo, div)) { From 0bd0cc35662c39dcfc7fca19961197e4c3b7ae6e Mon Sep 17 00:00:00 2001 From: Daniel Svensson Date: Mon, 11 Mar 2024 22:49:48 +0100 Subject: [PATCH 14/23] Use 64*32 multiply in more places --- .../src/System/Decimal.DecCalc.cs | 42 +++++-------------- 1 file changed, 11 insertions(+), 31 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs index 9a45fb75bced8..0c24990ce6403 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs @@ -964,7 +964,7 @@ internal static unsafe void DecAddSub(ref DecCalc d1, ref DecCalc d2, bool sign) } uint power; - ulong tmp64, tmpLow; + ulong tmp64; // d1 will need to be multiplied by 10^scale so // it will have the same scale as d2. 
We could be @@ -1005,10 +1005,7 @@ internal static unsafe void DecAddSub(ref DecCalc d1, ref DecCalc d2, bool sign) power = TenToPowerNine; if (scale < MaxInt32Scale) power = UInt32Powers10[scale]; - tmpLow = Math.BigMul((uint)low64, power); - tmp64 = Math.BigMul((uint)(low64 >> 32), power) + (tmpLow >> 32); - low64 = (uint)tmpLow + (tmp64 << 32); - high = (uint)(tmp64 >> 32); + high = (uint)Math.BigMul(low64, power, out low64); if ((scale -= MaxInt32Scale) <= 0) goto AlignedAdd; } while (high == 0); @@ -1021,10 +1018,7 @@ internal static unsafe void DecAddSub(ref DecCalc d1, ref DecCalc d2, bool sign) power = TenToPowerNine; if (scale < MaxInt32Scale) power = UInt32Powers10[scale]; - tmpLow = Math.BigMul((uint)low64, power); - tmp64 = Math.BigMul((uint)(low64 >> 32), power) + (tmpLow >> 32); - low64 = (uint)tmpLow + (tmp64 << 32); - tmp64 >>= 32; + tmp64 = Math.BigMul(low64, power, out low64); tmp64 += Math.BigMul(high, power); scale -= MaxInt32Scale; @@ -1260,12 +1254,8 @@ internal static long VarCyFromDec(ref DecCalc pdecIn) if (pdecIn.High != 0) goto ThrowOverflow; uint pwr = UInt32Powers10[-scale]; - ulong high = Math.BigMul(pwr, pdecIn.Mid); - if (high > uint.MaxValue) - goto ThrowOverflow; - ulong low = Math.BigMul(pwr, pdecIn.Low); - low += high <<= 32; - if (low < high) + ulong high = Math.BigMul(pdecIn.Low64, pwr, out ulong low); + if (high != 0) goto ThrowOverflow; value = (long)low; } @@ -1348,10 +1338,7 @@ private static int VarDecCmpSub(in decimal d1, in decimal d2) do { uint power = scale >= MaxInt32Scale ? 
TenToPowerNine : UInt32Powers10[scale]; - ulong tmpLow = Math.BigMul((uint)low64, power); - ulong tmp = Math.BigMul((uint)(low64 >> 32), power) + (tmpLow >> 32); - low64 = (uint)tmpLow + (tmp << 32); - tmp >>= 32; + ulong tmp = Math.BigMul(low64, power, out low64); tmp += Math.BigMul(high, power); // If the scaled value has more than 96 significant bits then it's greater than d2 if (tmp > uint.MaxValue) @@ -1638,12 +1625,8 @@ internal static void VarDecFromR4(float input, out DecCalc result) else { ulong low64 = Math.BigMul(mant, UInt32Powers10[power - 9]); - ulong hi64 = Math.BigMul(TenToPowerNine, (uint)(low64 >> 32)); - low64 = Math.BigMul(TenToPowerNine, (uint)low64); - result.Low = (uint)low64; - hi64 += low64 >> 32; - result.Mid = (uint)hi64; - hi64 >>= 32; + ulong hi64 = Math.BigMul(TenToPowerNine, low64, out low64); + result.Low64 = low64; result.High = (uint)hi64; } } @@ -2223,12 +2206,9 @@ internal static void VarDecMod(ref DecCalc d1, ref DecCalc d2) do { uint power = scale >= MaxInt32Scale ? 
TenToPowerNine : UInt32Powers10[scale]; - ulong tmp = Math.BigMul(d2.Low, power); - d2.Low = (uint)tmp; - tmp >>= 32; - tmp += (d2.Mid + ((ulong)d2.High << 32)) * power; - d2.Mid = (uint)tmp; - d2.High = (uint)(tmp >> 32); + uint hi32 = (uint)Math.BigMul(d2.Low64, power, out ulong low64); + d2.Low64 = low64; + d2.High = hi32 + d2.High * power; } while ((scale -= MaxInt32Scale) > 0); scale = 0; } From 00ca6c7c4ca1f67ab09b0ea8eff0cfde66bd0bbd Mon Sep 17 00:00:00 2001 From: Daniel Svensson Date: Mon, 11 Mar 2024 22:58:28 +0100 Subject: [PATCH 15/23] Add new IncreaseScale overload to fix issue with Store-To-Load forwarding * Gives around 20% faster perf for 64bit for full 96bit division * removes cast --- .../src/System/Decimal.DecCalc.cs | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs index 0c24990ce6403..9379c4c1384d6 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs @@ -553,6 +553,19 @@ private static uint IncreaseScale(ref Buf12 bufNum, uint power) return (uint)(hi64 >> 32); } + /// + /// Multiply the two numbers. The result overwrite the input. + /// + /// buffer + /// Scale factor to multiply by + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void IncreaseScale(ref Buf16 bufNum, uint power) + { + ulong hi64 = Math.BigMul(bufNum.Low64, power, out ulong low64); + bufNum.Low64 = low64; + bufNum.High64 = Math.BigMul(bufNum.U2, power) + (nuint)hi64; + } + /// /// Multiply the two numbers 64bit * 32bit. /// The 96 bits of the result overwrite the input. 
@@ -2126,7 +2139,7 @@ internal static unsafe void VarDecDiv(ref DecCalc d1, ref DecCalc d2) if (IncreaseScale(ref bufQuo, power) != 0) goto ThrowOverflow; - bufRem.U3 = IncreaseScale(ref *(Buf12*)&bufRem, power); + IncreaseScale(ref bufRem, power); tmp = Div128By96(ref bufRem, ref bufDivisor); if (!Add32To96(ref bufQuo, tmp)) { From 0659dc284b6d7cdfb145ed0c52bc23b2bda914da Mon Sep 17 00:00:00 2001 From: Daniel Svensson Date: Mon, 11 Mar 2024 23:23:52 +0100 Subject: [PATCH 16/23] Add back old 32bit code for IncreaseScale * makes the 5-10% regression 20% faster (25% speedup) for 1/3 --- .../src/System/Decimal.DecCalc.cs | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs index 9379c4c1384d6..9b7c6e7dcd3ae 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs @@ -546,11 +546,23 @@ private static uint Div128By96(ref Buf16 bufNum, ref Buf12 bufDen) /// Returns highest 32 bits of product private static uint IncreaseScale(ref Buf12 bufNum, uint power) { +#if TARGET_64BIT ulong hi64 = Math.BigMul(bufNum.Low64, power, out ulong low64); bufNum.Low64 = low64; hi64 = Math.BigMul(bufNum.U2, power) + hi64; bufNum.U2 = (uint)hi64; return (uint)(hi64 >> 32); +#else + ulong tmp = Math.BigMul(bufNum.U0, power); + bufNum.U0 = (uint)tmp; + tmp >>= 32; + tmp += Math.BigMul(bufNum.U1, power); + bufNum.U1 = (uint)tmp; + tmp >>= 32; + tmp += Math.BigMul(bufNum.U2, power); + bufNum.U2 = (uint)tmp; + return (uint)(tmp >> 32); +#endif } /// @@ -559,11 +571,15 @@ private static uint IncreaseScale(ref Buf12 bufNum, uint power) /// buffer /// Scale factor to multiply by [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static void IncreaseScale(ref Buf16 bufNum, uint power) + private unsafe static void IncreaseScale(ref 
Buf16 bufNum, uint power) { +#if TARGET_64BIT ulong hi64 = Math.BigMul(bufNum.Low64, power, out ulong low64); bufNum.Low64 = low64; bufNum.High64 = Math.BigMul(bufNum.U2, power) + (nuint)hi64; +#else + bufNum.U3 = IncreaseScale(ref *(Buf12*)Unsafe.AsPointer(ref bufNum), power); +#endif } /// From 83191db705e26c32158a55ed9395498385c624ec Mon Sep 17 00:00:00 2001 From: Daniel Svensson Date: Mon, 11 Mar 2024 23:38:43 +0100 Subject: [PATCH 17/23] 6% faster with longer increasescale --- .../System.Private.CoreLib/src/System/Decimal.DecCalc.cs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs index 9b7c6e7dcd3ae..aa1729d37d6a5 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs @@ -578,7 +578,14 @@ private unsafe static void IncreaseScale(ref Buf16 bufNum, uint power) bufNum.Low64 = low64; bufNum.High64 = Math.BigMul(bufNum.U2, power) + (nuint)hi64; #else - bufNum.U3 = IncreaseScale(ref *(Buf12*)Unsafe.AsPointer(ref bufNum), power); + ulong tmp = Math.BigMul(bufNum.U0, power); + bufNum.U0 = (uint)tmp; + tmp >>= 32; + tmp += Math.BigMul(bufNum.U1, power); + bufNum.U1 = (uint)tmp; + tmp >>= 32; + tmp += Math.BigMul(bufNum.U2, power); + bufNum.High64 = tmp; #endif } From f2ada41d6646a0655fa1e48012a9e5a11db2b833 Mon Sep 17 00:00:00 2001 From: Daniel Svensson Date: Tue, 12 Mar 2024 11:32:46 +0100 Subject: [PATCH 18/23] Add back Bmi2.MultiplyNoFlags in Math.BigMul --- .../System.Private.CoreLib/src/System/Math.cs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Math.cs b/src/libraries/System.Private.CoreLib/src/System/Math.cs index 3d7952c51fddf..44ef0875634de 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Math.cs +++ 
b/src/libraries/System.Private.CoreLib/src/System/Math.cs @@ -151,8 +151,16 @@ internal static void ThrowNegateTwosCompOverflow() throw new OverflowException(SR.Overflow_NegateTwosCompNum); } - internal static ulong BigMul(uint a, uint b) + internal static unsafe ulong BigMul(uint a, uint b) { +#if TARGET_32BIT + if (Bmi2.IsSupported) + { + uint low; + uint hi = Bmi2.MultiplyNoFlags(a, b, &low); + return ((ulong)hi << 32) | low; + } +#endif return ((ulong)a) * b; } From 46aca48a208b5c65eb23e3e9ab42fef0a5b08d21 Mon Sep 17 00:00:00 2001 From: Daniel Svensson Date: Tue, 12 Mar 2024 14:00:08 +0100 Subject: [PATCH 19/23] fix Math.Bigmul compilation --- .../System.Private.CoreLib/src/System/Math.cs | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Math.cs b/src/libraries/System.Private.CoreLib/src/System/Math.cs index 44ef0875634de..2294a6aed7e7c 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Math.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Math.cs @@ -157,8 +157,8 @@ internal static unsafe ulong BigMul(uint a, uint b) if (Bmi2.IsSupported) { uint low; - uint hi = Bmi2.MultiplyNoFlags(a, b, &low); - return ((ulong)hi << 32) | low; + uint high = Bmi2.MultiplyNoFlags(a, b, &low); + return ((ulong)high << 32) | low; } #endif return ((ulong)a) * b; @@ -179,11 +179,10 @@ public static long BigMul(int a, int b) internal static ulong BigMul(ulong a, uint b, out ulong low) { #if TARGET_64BIT - return Math.BigMul(ulong)a, (ulong)b, out low); + return Math.BigMul((ulong)a, (ulong)b, out low); #else - ulong prodH = (((ulong)(uint)(a >> 32)) * b); ulong prodL = ((ulong)(uint)a) * b; - prodH += (prodL >> 32); + ulong prodH = (prodL >> 32) + (((ulong)(uint)(a >> 32)) * b); low = ((prodH << 32) | (uint)prodL); return (prodH >> 32); @@ -193,7 +192,7 @@ internal static ulong BigMul(ulong a, uint b, out ulong low) /// [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static 
ulong BigMul(uint a, ulong b, out ulong low) - => BigMul(a, b, out low); + => BigMul(b, a, out low); /// Produces the full product of two unsigned 64-bit numbers. /// The first number to multiply. From 488c294204eb220152f2f2891c40e2073afe68ae Mon Sep 17 00:00:00 2001 From: Daniel Svensson Date: Tue, 12 Mar 2024 14:12:00 +0100 Subject: [PATCH 20/23] switch from Bigmul to (ulong)a * (uint)b in IncreaseScale * Gives up to 10ns (or 14% faster) --- .../src/System/Decimal.DecCalc.cs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs index aa1729d37d6a5..75bb1fbb71f56 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs @@ -553,13 +553,13 @@ private static uint IncreaseScale(ref Buf12 bufNum, uint power) bufNum.U2 = (uint)hi64; return (uint)(hi64 >> 32); #else - ulong tmp = Math.BigMul(bufNum.U0, power); + ulong tmp = (ulong)bufNum.U0 * power; bufNum.U0 = (uint)tmp; tmp >>= 32; - tmp += Math.BigMul(bufNum.U1, power); + tmp += (ulong)bufNum.U1 * power; bufNum.U1 = (uint)tmp; tmp >>= 32; - tmp += Math.BigMul(bufNum.U2, power); + tmp += (ulong)bufNum.U2 * power; bufNum.U2 = (uint)tmp; return (uint)(tmp >> 32); #endif @@ -571,20 +571,20 @@ private static uint IncreaseScale(ref Buf12 bufNum, uint power) /// buffer /// Scale factor to multiply by [MethodImpl(MethodImplOptions.AggressiveInlining)] - private unsafe static void IncreaseScale(ref Buf16 bufNum, uint power) + private static void IncreaseScale(ref Buf16 bufNum, uint power) { #if TARGET_64BIT ulong hi64 = Math.BigMul(bufNum.Low64, power, out ulong low64); bufNum.Low64 = low64; bufNum.High64 = Math.BigMul(bufNum.U2, power) + (nuint)hi64; #else - ulong tmp = Math.BigMul(bufNum.U0, power); + ulong tmp = (ulong)bufNum.U0 * power; bufNum.U0 = (uint)tmp; tmp >>= 
32; - tmp += Math.BigMul(bufNum.U1, power); + tmp += (ulong)bufNum.U1 * power; bufNum.U1 = (uint)tmp; tmp >>= 32; - tmp += Math.BigMul(bufNum.U2, power); + tmp += (ulong)bufNum.U2 * power; bufNum.High64 = tmp; #endif } From 0087c61cd1fd6f4131fe1799fee9495ce3be269b Mon Sep 17 00:00:00 2001 From: Daniel Svensson Date: Sat, 16 Mar 2024 13:56:39 +0100 Subject: [PATCH 21/23] Call IncreaseScale(ref Buf12) from IncreaseScale(ref Buf16) for 32bit code --- .../System.Private.CoreLib/src/System/Decimal.DecCalc.cs | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs index 75bb1fbb71f56..8fd4c57286843 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs @@ -578,14 +578,7 @@ private static void IncreaseScale(ref Buf16 bufNum, uint power) bufNum.Low64 = low64; bufNum.High64 = Math.BigMul(bufNum.U2, power) + (nuint)hi64; #else - ulong tmp = (ulong)bufNum.U0 * power; - bufNum.U0 = (uint)tmp; - tmp >>= 32; - tmp += (ulong)bufNum.U1 * power; - bufNum.U1 = (uint)tmp; - tmp >>= 32; - tmp += (ulong)bufNum.U2 * power; - bufNum.High64 = tmp; + bufNum.U3 = IncreaseScale(ref Unsafe.As(ref bufNum), power); #endif } From baae42d34a35d1624bd55357733b48cf04ed5434 Mon Sep 17 00:00:00 2001 From: Daniel Svensson Date: Thu, 27 Jun 2024 21:26:48 +0200 Subject: [PATCH 22/23] Add #pragma warning disable CA2252 --- .../System.Private.CoreLib/src/System/Decimal.DecCalc.cs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs index 8fd4c57286843..1448d229c32a6 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs @@ -8,6 
+8,8 @@ using X86 = System.Runtime.Intrinsics.X86; +#pragma warning disable CA2252 // X86Base.DivRem that requires opting into preview features is used in a few places + namespace System { public partial struct Decimal From 3a9789141050bad8f32a3f550c367842dfd16809 Mon Sep 17 00:00:00 2001 From: Daniel Svensson Date: Thu, 12 Sep 2024 17:11:07 +0200 Subject: [PATCH 23/23] Change suppression to SYSLIB5004 now that DivRem is marked as [Experimental] --- .../System.Private.CoreLib/src/System/Decimal.DecCalc.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs index 18c1c43940add..b4470bed195e0 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs @@ -8,7 +8,7 @@ using X86 = System.Runtime.Intrinsics.X86; -#pragma warning disable CA2252 // X86Base.DivRem that requires opting into preview features is used in a few places +#pragma warning disable SYSLIB5004 // DivRem is marked as [Experimental], see https://github.com/dotnet/runtime/issues/82194 namespace System {