From 03b5226775da64e154130534c53cd056ccda095a Mon Sep 17 00:00:00 2001
From: Daniel Svensson <daniel.svensson@hotmail.se>
Date: Sun, 18 Feb 2024 11:39:33 +0100
Subject: [PATCH 01/23] Improve performance of decimal division

---
 .../src/System/Decimal.DecCalc.cs             | 134 ++++++++++++++----
 1 file changed, 108 insertions(+), 26 deletions(-)
diff --git a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
index ca15b1df6ea407..c530b8ccdfe377 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
@@ -194,27 +194,66 @@ private static void UInt64x64To128(ulong a, ulong b, ref DecCalc result)
             /// <returns>Returns remainder. Quotient overwrites dividend.</returns>
             private static uint Div96By32(ref Buf12 bufNum, uint den)
             {
-                // TODO: https://github.com/dotnet/runtime/issues/5213
-                ulong tmp, div;
+                if (X86.X86Base.X64.IsSupported)
+                {
+                    uint hiRes = 0;
+                    ulong remainder = bufNum.U2;
+
+                    if (remainder < den)
+                        goto Div164bit;
+
+                    (hiRes, remainder) = X86.X86Base.DivRem(bufNum.U2, 0u, den);
+
+                Div164bit:
+                    bufNum.U2 = hiRes;
+                    (bufNum.Low64, remainder) = X86.X86Base.X64.DivRem(bufNum.Low64, remainder, (ulong)den);
+                    return (uint)remainder;
+                }
+                else if (X86.X86Base.IsSupported)
+                {
+                    uint remainder = 0;
+
                 if (bufNum.U2 != 0)
+                        goto Div3Word;
+                    if (bufNum.U1 >= den)
+                        goto Div2Word;
+
+                    remainder = bufNum.U1;
+                    bufNum.U1 = 0;
+                    goto Div1Word;
+
+                Div3Word:
+                    (bufNum.U2, remainder) = X86.X86Base.DivRem(bufNum.U2, 0, den);
+                Div2Word:
+                    (bufNum.U1, remainder) = X86.X86Base.DivRem(bufNum.U1, remainder, den);
+                Div1Word:
+                    (bufNum.U0, remainder) = X86.X86Base.DivRem(bufNum.U0, remainder, den);
+                    return remainder;
+                }
+                else
                 {
+                    ulong tmp, div, rem;
+                    if (bufNum.U2 != 0)
+                    {
                     tmp = bufNum.High64;
-                    div = tmp / den;
+
+                        (div, rem) = Math.DivRem(tmp, den);
                     bufNum.High64 = div;
-                    tmp = ((tmp - (uint)div * den) << 32) | bufNum.U0;
+                        tmp = (rem << 32) | bufNum.U0;
                     if (tmp == 0)
                         return 0;
-                    uint div32 = (uint)(tmp / den);
-                    bufNum.U0 = div32;
-                    return (uint)tmp - div32 * den;
+                        (div, rem) = Math.DivRem(tmp, den);
+                        bufNum.U0 = (uint)div;
+                        return (uint)rem;
                 }
 
                 tmp = bufNum.Low64;
                 if (tmp == 0)
                     return 0;
-                div = tmp / den;
+                    (div, rem) = Math.DivRem(tmp, den);
                 bufNum.Low64 = div;
-                return (uint)(tmp - div * den);
+                    return (uint)rem;
+            }
             }
 
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -322,6 +361,14 @@ private static void Unscale(ref uint low, ref ulong high64, ref int scale)
             private static uint Div96By64(ref Buf12 bufNum, ulong den)
             {
                 Debug.Assert(den > bufNum.High64);
+
+                if (X86.X86Base.X64.IsSupported)
+                {
+                    // Assert above states: den > bufNum.High64 so den > bufNum.U2 and we can be sure we will not overflow
+                    (ulong quotient, bufNum.Low64) = X86.X86Base.X64.DivRem(bufNum.Low64, bufNum.U2, den);
+                    return (uint)quotient;
+                }
+
                 ulong num;
                 uint num2 = bufNum.U2;
                 if (num2 == 0)
@@ -392,6 +439,26 @@ private static uint Div96By64(ref Buf12 bufNum, ulong den)
                 return quo;
             }
 
+            private static uint BigMul64By32(ulong a, uint b, out ulong low)
+            {
+                if (IntPtr.Size == 8)
+                {
+                    return (uint)Math.BigMul(a, b, out low);
+                }
+                else
+                {
+                    uint al = (uint)a;
+                    uint ah = (uint)(a >> 32);
+                    uint bl = (uint)b;
+
+                    ulong mull = ((ulong)al) * bl;
+                    ulong t = ((ulong)ah) * bl + (mull >> 32);
+
+                    low = (t << 32 | mull);
+                    return (uint)(t >> 32);
+                }               
+            }
+
             /// <summary>
             /// Do partial divide, yielding 32-bit result and 96-bit remainder.
             /// Top divisor uint must be larger than top dividend uint. This is
@@ -413,20 +480,25 @@ private static uint Div128By96(ref Buf16 bufNum, ref Buf12 bufDen)
                     //
                     return 0;
 
+
+                uint quo;
+                uint remainder;
+                if (X86.X86Base.IsSupported)
+                {
+                    (quo, remainder) = X86.X86Base.DivRem(bufNum.U2, bufNum.U3, den);
+                }
+                else
+                {
                 // TODO: https://github.com/dotnet/runtime/issues/5213
-                uint quo = (uint)(dividend / den);
-                uint remainder = (uint)dividend - quo * den;
+                    quo = (uint)(dividend / den);
+                    remainder = (uint)dividend - quo * den;
+                }
 
                 // Compute full remainder, rem = dividend - (quo * divisor).
                 //
-                ulong prod1 = Math.BigMul(quo, bufDen.U0); // quo * lo divisor
-                ulong prod2 = Math.BigMul(quo, bufDen.U1); // quo * mid divisor
-                prod2 += prod1 >> 32;
-                prod1 = (uint)prod1 | (prod2 << 32);
-                prod2 >>= 32;
-
-                ulong num = bufNum.Low64;
-                num -= prod1;
+                ulong prod1;
+                uint prod2 = BigMul64By32(bufDen.Low64, quo, out prod1);
+                ulong num = bufNum.Low64 - prod1;
                 remainder -= (uint)prod2;
 
                 // Propagate carries
@@ -479,25 +551,35 @@ private static uint Div128By96(ref Buf16 bufNum, ref Buf12 bufDen)
             /// <returns>Returns highest 32 bits of product</returns>
             private static uint IncreaseScale(ref Buf12 bufNum, uint power)
             {
-                ulong tmp = Math.BigMul(bufNum.U0, power);
-                bufNum.U0 = (uint)tmp;
-                tmp >>= 32;
-                tmp += Math.BigMul(bufNum.U1, power);
-                bufNum.U1 = (uint)tmp;
-                tmp >>= 32;
-                tmp += Math.BigMul(bufNum.U2, power);
+                ulong tmp = BigMul64By32(bufNum.Low64, power, out ulong low);
+                bufNum.Low64 = low;
+                tmp = Math.BigMul(bufNum.U2, power) + tmp;
                 bufNum.U2 = (uint)tmp;
                 return (uint)(tmp >> 32);
             }
 
+            /// <summary>
+            /// Multiply the two numbers 64bit * 32bit.
+            /// The 96 bits of the result overwrite the input. 
+            /// </summary>
+            /// <param name="bufNum">64-bit number as array of uints, least-sig first</param>
+            /// <param name="power">Scale factor to multiply by</param>
             private static void IncreaseScale64(ref Buf12 bufNum, uint power)
             {
+                if (IntPtr.Size == 8)
+                {
+                    bufNum.U2 = (uint)Math.BigMul(bufNum.Low64, power, out ulong low);
+                    bufNum.Low64 = low;
+                }
+                else
+                {
                 ulong tmp = Math.BigMul(bufNum.U0, power);
                 bufNum.U0 = (uint)tmp;
                 tmp >>= 32;
                 tmp += Math.BigMul(bufNum.U1, power);
                 bufNum.High64 = tmp;
             }
+            }
 
             /// <summary>
             /// See if we need to scale the result to fit it in 96 bits.

From f593c2ed21a8c56c2d329a4bc6237b4c960293f6 Mon Sep 17 00:00:00 2001
From: Daniel Svensson <daniel.svensson@hotmail.se>
Date: Sun, 18 Feb 2024 11:42:25 +0100
Subject: [PATCH 02/23] Remove usage of mulx via intrinsic on 32bit x86 since
 it produces worse code than a plain 64-bit multiplication

Job=ShortRun  IterationCount=3  LaunchCount=1
WarmupCount=3

| Method        | a | b          | Mean     | Error     | StdDev    | Allocated |
|-------------- |-- |----------- |---------:|----------:|----------:|----------:|
| Mul64By32_New | 3 | 4294967295 | 2.068 ns | 0.0459 ns | 0.0383 ns |         - |
| Mul64By32_Ori | 3 | 4294967295 | 2.916 ns | 0.0231 ns | 0.0193 ns |         - |
---
 .../FallbackInterfaceMethodAttribute.cs       | 20 +++++++++
 .../src/System/Decimal.DecCalc.cs             | 41 ++++++++++---------
 .../System.Private.CoreLib/src/System/Math.cs | 10 +----
 3 files changed, 42 insertions(+), 29 deletions(-)
 create mode 100644 src/coreclr/System.Private.CoreLib/src/System/Runtime/CompilerServices/Internal/FallbackInterfaceMethodAttribute.cs

diff --git a/src/coreclr/System.Private.CoreLib/src/System/Runtime/CompilerServices/Internal/FallbackInterfaceMethodAttribute.cs b/src/coreclr/System.Private.CoreLib/src/System/Runtime/CompilerServices/Internal/FallbackInterfaceMethodAttribute.cs
new file mode 100644
index 00000000000000..5877e0c6bf334c
--- /dev/null
+++ b/src/coreclr/System.Private.CoreLib/src/System/Runtime/CompilerServices/Internal/FallbackInterfaceMethodAttribute.cs
@@ -0,0 +1,20 @@
+﻿namespace System.Runtime.CompilerServices.Internal
+{
+    /// <summary>
+    /// INTERNAL: Make default Interface methods have "low priority" in case there are multiple 
+    /// possible implementations (the "Diamond dependency problem"), 
+    /// ensuring that any other conflicting implementaion will be selected at runtime.
+    /// </summary>
+    /// <remarks>
+    /// This allows adding default method implementations for existing interfaces without 
+    /// making it a binary breaking change. (It can still be a source breaking change)
+    /// <para>
+    /// Should preferably only be used in the same assembly which defines 
+    /// the interface method beeing overridden.
+    /// </para>
+    /// </remarks>
+    [AttributeUsage(AttributeTargets.Method, AllowMultiple = false, Inherited = false)]
+    sealed class FallbackInterfaceMethodAttribute : Attribute
+    {
+    }
+}
diff --git a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
index c530b8ccdfe377..f21da601266293 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
@@ -213,7 +213,7 @@ private static uint Div96By32(ref Buf12 bufNum, uint den)
                 {
                     uint remainder = 0;
 
-                if (bufNum.U2 != 0)
+                    if (bufNum.U2 != 0)
                         goto Div3Word;
                     if (bufNum.U1 >= den)
                         goto Div2Word;
@@ -235,25 +235,25 @@ private static uint Div96By32(ref Buf12 bufNum, uint den)
                     ulong tmp, div, rem;
                     if (bufNum.U2 != 0)
                     {
-                    tmp = bufNum.High64;
+                        tmp = bufNum.High64;
 
                         (div, rem) = Math.DivRem(tmp, den);
-                    bufNum.High64 = div;
+                        bufNum.High64 = div;
                         tmp = (rem << 32) | bufNum.U0;
-                    if (tmp == 0)
-                        return 0;
+                        if (tmp == 0)
+                            return 0;
                         (div, rem) = Math.DivRem(tmp, den);
                         bufNum.U0 = (uint)div;
                         return (uint)rem;
-                }
+                    }
 
-                tmp = bufNum.Low64;
-                if (tmp == 0)
-                    return 0;
+                    tmp = bufNum.Low64;
+                    if (tmp == 0)
+                        return 0;
                     (div, rem) = Math.DivRem(tmp, den);
-                bufNum.Low64 = div;
+                    bufNum.Low64 = div;
                     return (uint)rem;
-            }
+                }
             }
 
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -456,7 +456,7 @@ private static uint BigMul64By32(ulong a, uint b, out ulong low)
 
                     low = (t << 32 | mull);
                     return (uint)(t >> 32);
-                }               
+                }
             }
 
             /// <summary>
@@ -489,7 +489,7 @@ private static uint Div128By96(ref Buf16 bufNum, ref Buf12 bufDen)
                 }
                 else
                 {
-                // TODO: https://github.com/dotnet/runtime/issues/5213
+                    // TODO: https://github.com/dotnet/runtime/issues/5213
                     quo = (uint)(dividend / den);
                     remainder = (uint)dividend - quo * den;
                 }
@@ -560,7 +560,7 @@ private static uint IncreaseScale(ref Buf12 bufNum, uint power)
 
             /// <summary>
             /// Multiply the two numbers 64bit * 32bit.
-            /// The 96 bits of the result overwrite the input. 
+            /// The 96 bits of the result overwrite the input.
             /// </summary>
             /// <param name="bufNum">64-bit number as array of uints, least-sig first</param>
             /// <param name="power">Scale factor to multiply by</param>
@@ -573,12 +573,12 @@ private static void IncreaseScale64(ref Buf12 bufNum, uint power)
                 }
                 else
                 {
-                ulong tmp = Math.BigMul(bufNum.U0, power);
-                bufNum.U0 = (uint)tmp;
-                tmp >>= 32;
-                tmp += Math.BigMul(bufNum.U1, power);
-                bufNum.High64 = tmp;
-            }
+                    ulong tmp = Math.BigMul(bufNum.U0, power);
+                    bufNum.U0 = (uint)tmp;
+                    tmp >>= 32;
+                    tmp += Math.BigMul(bufNum.U1, power);
+                    bufNum.High64 = tmp;
+                }
             }
 
             /// <summary>
@@ -2084,6 +2084,7 @@ internal static unsafe void VarDecDiv(ref DecCalc d1, ref DecCalc d2)
                             {
                                 if (scale < 0)
                                 {
+                                    // TODO: consider 64bit powers
                                     curScale = Math.Min(9, -scale);
                                     goto HaveScale64;
                                 }
diff --git a/src/libraries/System.Private.CoreLib/src/System/Math.cs b/src/libraries/System.Private.CoreLib/src/System/Math.cs
index 1d1c50a4e2b552..7e99bb0366edfc 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Math.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Math.cs
@@ -151,16 +151,8 @@ internal static void ThrowNegateTwosCompOverflow()
             throw new OverflowException(SR.Overflow_NegateTwosCompNum);
         }
 
-        internal static unsafe ulong BigMul(uint a, uint b)
+        internal static ulong BigMul(uint a, uint b)
         {
-#if TARGET_32BIT
-            if (Bmi2.IsSupported)
-            {
-                uint low;
-                uint high = Bmi2.MultiplyNoFlags(a, b, &low);
-                return ((ulong)high << 32) | low;
-            }
-#endif
             return ((ulong)a) * b;
         }
 

From 218b373823a7b8a67ca09e786f1b799d6821b98e Mon Sep 17 00:00:00 2001
From: Daniel Svensson <daniel.svensson@hotmail.se>
Date: Tue, 20 Feb 2024 11:30:39 +0100
Subject: [PATCH 03/23] update part of Multiply

- Add comment to BigMul64By32 and make it return nuint to avoid clearing upper 32 bits
- Simplify IncreaseScale
---
 .../src/System/Decimal.DecCalc.cs             | 176 +++++++-----------
 1 file changed, 66 insertions(+), 110 deletions(-)

diff --git a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
index f21da601266293..8ec88d5b631b98 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
@@ -192,6 +192,7 @@ private static void UInt64x64To128(ulong a, ulong b, ref DecCalc result)
             /// <param name="bufNum">96-bit dividend as array of uints, least-sig first</param>
             /// <param name="den">32-bit divisor</param>
             /// <returns>Returns remainder. Quotient overwrites dividend.</returns>
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
             private static uint Div96By32(ref Buf12 bufNum, uint den)
             {
                 if (X86.X86Base.X64.IsSupported)
@@ -200,11 +201,11 @@ private static uint Div96By32(ref Buf12 bufNum, uint den)
                     ulong remainder = bufNum.U2;
 
                     if (remainder < den)
-                        goto Div164bit;
+                        goto DivOne64Bit;
 
                     (hiRes, remainder) = X86.X86Base.DivRem(bufNum.U2, 0u, den);
 
-                Div164bit:
+                DivOne64Bit:
                     bufNum.U2 = hiRes;
                     (bufNum.Low64, remainder) = X86.X86Base.X64.DivRem(bufNum.Low64, remainder, (ulong)den);
                     return (uint)remainder;
@@ -439,24 +440,28 @@ private static uint Div96By64(ref Buf12 bufNum, ulong den)
                 return quo;
             }
 
-            private static uint BigMul64By32(ulong a, uint b, out ulong low)
+
+            /// <summary>
+            /// Perform multiplication between 64 and 32 bit numbers, returning lower 64 bits in <paramref name="low"/>
+            /// </summary>
+            /// <returns>hi bits of the result</returns>
+            /// <remarks>returns nuint instead of uint to skip clearing upper 32bits on 64bit platforms</remarks>
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            private static nuint BigMul64By32(ulong a, uint b, out ulong low)
             {
-                if (IntPtr.Size == 8)
-                {
-                    return (uint)Math.BigMul(a, b, out low);
-                }
-                else
-                {
-                    uint al = (uint)a;
-                    uint ah = (uint)(a >> 32);
-                    uint bl = (uint)b;
+#if TARGET_64BIT
+                return (nuint)Math.BigMul(a, b, out low);
+#else
+                uint al = (uint)a;
+                uint ah = (uint)(a >> 32);
+                uint bl = b;
 
-                    ulong mull = ((ulong)al) * bl;
-                    ulong t = ((ulong)ah) * bl + (mull >> 32);
+                ulong prodL = ((ulong)al) * bl;
+                ulong prodH = ((ulong)ah) * bl + (prodL >> 32);
 
-                    low = (t << 32 | mull);
-                    return (uint)(t >> 32);
-                }
+                low = (prodH << 32 | (uint)prodL);
+                return (nuint)(prodH >> 32);
+#endif
             }
 
             /// <summary>
@@ -497,11 +502,12 @@ private static uint Div128By96(ref Buf16 bufNum, ref Buf12 bufDen)
                 // Compute full remainder, rem = dividend - (quo * divisor).
                 //
                 ulong prod1;
-                uint prod2 = BigMul64By32(bufDen.Low64, quo, out prod1);
+                uint prod2 = (uint)BigMul64By32(bufDen.Low64, quo, out prod1);
                 ulong num = bufNum.Low64 - prod1;
                 remainder -= (uint)prod2;
 
                 // Propagate carries
+                // can be simplified if https://github.com/dotnet/runtime/issues/48247 is done
                 //
                 if (num > ~prod1)
                 {
@@ -551,11 +557,11 @@ private static uint Div128By96(ref Buf16 bufNum, ref Buf12 bufDen)
             /// <returns>Returns highest 32 bits of product</returns>
             private static uint IncreaseScale(ref Buf12 bufNum, uint power)
             {
-                ulong tmp = BigMul64By32(bufNum.Low64, power, out ulong low);
-                bufNum.Low64 = low;
-                tmp = Math.BigMul(bufNum.U2, power) + tmp;
-                bufNum.U2 = (uint)tmp;
-                return (uint)(tmp >> 32);
+                ulong hi64 = BigMul64By32(bufNum.Low64, power, out ulong low64);
+                bufNum.Low64 = low64;
+                hi64 = Math.BigMul(bufNum.U2, power) + hi64;
+                bufNum.U2 = (uint)hi64;
+                return (uint)(hi64 >> 32);
             }
 
             /// <summary>
@@ -566,19 +572,8 @@ private static uint IncreaseScale(ref Buf12 bufNum, uint power)
             /// <param name="power">Scale factor to multiply by</param>
             private static void IncreaseScale64(ref Buf12 bufNum, uint power)
             {
-                if (IntPtr.Size == 8)
-                {
-                    bufNum.U2 = (uint)Math.BigMul(bufNum.Low64, power, out ulong low);
-                    bufNum.Low64 = low;
-                }
-                else
-                {
-                    ulong tmp = Math.BigMul(bufNum.U0, power);
-                    bufNum.U0 = (uint)tmp;
-                    tmp >>= 32;
-                    tmp += Math.BigMul(bufNum.U1, power);
-                    bufNum.High64 = tmp;
-                }
+                bufNum.U2 = (uint)BigMul64By32(bufNum.Low64, power, out ulong low64);
+                bufNum.Low64 = low64;
             }
 
             /// <summary>
@@ -1427,12 +1422,8 @@ internal static unsafe void VarDecMul(ref DecCalc d1, ref DecCalc d2)
                     else
                     {
                         // Left value is 32-bit, result fits in 4 uints
-                        tmp = Math.BigMul(d1.Low, d2.Low);
-                        bufProd.U0 = (uint)tmp;
-
-                        tmp = Math.BigMul(d1.Low, d2.Mid) + (tmp >> 32);
-                        bufProd.U1 = (uint)tmp;
-                        tmp >>= 32;
+                        tmp = BigMul64By32(d2.Low64, d1.Low, out ulong low);
+                        bufProd.Low64 = low;
 
                         if (d2.High != 0)
                         {
@@ -1451,12 +1442,8 @@ internal static unsafe void VarDecMul(ref DecCalc d1, ref DecCalc d2)
                 else if ((d2.High | d2.Mid) == 0)
                 {
                     // Right value is 32-bit, result fits in 4 uints
-                    tmp = Math.BigMul(d2.Low, d1.Low);
-                    bufProd.U0 = (uint)tmp;
-
-                    tmp = Math.BigMul(d2.Low, d1.Mid) + (tmp >> 32);
-                    bufProd.U1 = (uint)tmp;
-                    tmp >>= 32;
+                    tmp = BigMul64By32(d1.Low64, d2.Low, out ulong low);
+                    bufProd.Low64 = low;
 
                     if (d1.High != 0)
                     {
@@ -1473,80 +1460,50 @@ internal static unsafe void VarDecMul(ref DecCalc d1, ref DecCalc d2)
                 }
                 else
                 {
-                    // Both operands have bits set in the upper 64 bits.
+                    // At least one operand has bits set in the upper 64 bits.
                     //
                     // Compute and accumulate the 9 partial products into a
-                    // 192-bit (24-byte) result.
+                    // 192-bit (3*64bit) result.
                     //
-                    //        [l-h][l-m][l-l]      left high, middle, low
-                    //         x    [r-h][r-m][r-l]      right high, middle, low
-                    // ------------------------------
+                    //                [l-hi][l-lo]   left high32, low64
+                    //             x  [r-hi][r-lo]   right high32, low64
+                    // -------------------------------
                     //
-                    //             [0-h][0-l]      l-l * r-l
-                    //        [1ah][1al]      l-l * r-m
-                    //        [1bh][1bl]      l-m * r-l
-                    //       [2ah][2al]          l-m * r-m
-                    //       [2bh][2bl]          l-l * r-h
-                    //       [2ch][2cl]          l-h * r-l
-                    //      [3ah][3al]          l-m * r-h
-                    //      [3bh][3bl]          l-h * r-m
-                    // [4-h][4-l]              l-h * r-h
+                    //                [ 0-h][0-l ]   l-lo * r-lo => 64 + 64 bit result
+                    //          [ h*l][h*l ]         l-lo * r-hi => 32 + 64 bit result
+                    //          [ l*h][l*h ]         l-hi * r-lo => 32 + 64 bit result
+                    //          [ h*h]               l-hi * r-hi => 32 + 32 bit result
                     // ------------------------------
-                    // [p-5][p-4][p-3][p-2][p-1][p-0]      prod[] array
+                    //          [Hi64][Mid64][Low64]   bufProd "array"
                     //
 
-                    tmp = Math.BigMul(d1.Low, d2.Low);
-                    bufProd.U0 = (uint)tmp;
-
-                    ulong tmp2 = Math.BigMul(d1.Low, d2.Mid) + (tmp >> 32);
-
-                    tmp = Math.BigMul(d1.Mid, d2.Low);
-                    tmp += tmp2; // this could generate carry
-                    bufProd.U1 = (uint)tmp;
-                    if (tmp < tmp2) // detect carry
-                        tmp2 = (tmp >> 32) | (1UL << 32);
-                    else
-                        tmp2 = tmp >> 32;
-
-                    tmp = Math.BigMul(d1.Mid, d2.Mid) + tmp2;
+                    ulong mid64 = Math.BigMul(d1.Low64, d2.Low64, out tmp);
+                    bufProd.Low64 = tmp;
 
-                    if ((d1.High | d2.High) > 0)
+                    if ((d1.High | d2.High) != 0)
                     {
-                        // Highest 32 bits is non-zero.     Calculate 5 more partial products.
-                        //
-                        tmp2 = Math.BigMul(d1.Low, d2.High);
-                        tmp += tmp2; // this could generate carry
-                        uint tmp3 = 0;
-                        if (tmp < tmp2) // detect carry
-                            tmp3 = 1;
-
-                        tmp2 = Math.BigMul(d1.High, d2.Low);
-                        tmp += tmp2; // this could generate carry
-                        bufProd.U2 = (uint)tmp;
-                        if (tmp < tmp2) // detect carry
-                            tmp3++;
-                        tmp2 = ((ulong)tmp3 << 32) | (tmp >> 32);
-
-                        tmp = Math.BigMul(d1.Mid, d2.High);
-                        tmp += tmp2; // this could generate carry
-                        tmp3 = 0;
-                        if (tmp < tmp2) // detect carry
-                            tmp3 = 1;
-
-                        tmp2 = Math.BigMul(d1.High, d2.Mid);
-                        tmp += tmp2; // this could generate carry
-                        bufProd.U3 = (uint)tmp;
-                        if (tmp < tmp2) // detect carry
-                            tmp3++;
-                        tmp = ((ulong)tmp3 << 32) | (tmp >> 32);
-
-                        bufProd.High64 = Math.BigMul(d1.High, d2.High) + tmp;
-
+                        // hi64 will never overflow since the result will always fit in 192 (2*96) bits
+                        ulong hi64 = Math.BigMul(d1.High, d2.High);
+
+                        // Do crosswise multiplications between upper 32bit and lower 64 bits
+                        hi64 += BigMul64By32(d1.Low64, d2.High, out tmp);
+                        mid64 += tmp;
+                        // propagate carry, can be simplified if https://github.com/dotnet/runtime/issues/48247 is done
+                        if (mid64 < tmp)
+                            ++hi64;
+
+                        hi64 += BigMul64By32(d2.Low64, d1.High, out tmp);
+                        mid64 += tmp;
+                        if (mid64 < tmp)
+                            ++hi64;
+
+                        bufProd.Mid64 = mid64;
+                        bufProd.High64 = hi64;
                         hiProd = 5;
                     }
                     else
                     {
-                        bufProd.Mid64 = tmp;
+                        bufProd.Mid64 = mid64;
                         hiProd = 3;
                     }
                 }
@@ -2084,7 +2041,6 @@ internal static unsafe void VarDecDiv(ref DecCalc d1, ref DecCalc d2)
                             {
                                 if (scale < 0)
                                 {
-                                    // TODO: consider 64bit powers
                                     curScale = Math.Min(9, -scale);
                                     goto HaveScale64;
                                 }

From f19ef92d693c52ff1e404c941126c3031eb34ece Mon Sep 17 00:00:00 2001
From: Daniel Svensson <daniel.svensson@hotmail.se>
Date: Thu, 22 Feb 2024 10:31:49 +0100
Subject: [PATCH 04/23] remove unintentional file

---
 .../FallbackInterfaceMethodAttribute.cs       | 20 -------------------
 1 file changed, 20 deletions(-)
 delete mode 100644 src/coreclr/System.Private.CoreLib/src/System/Runtime/CompilerServices/Internal/FallbackInterfaceMethodAttribute.cs

diff --git a/src/coreclr/System.Private.CoreLib/src/System/Runtime/CompilerServices/Internal/FallbackInterfaceMethodAttribute.cs b/src/coreclr/System.Private.CoreLib/src/System/Runtime/CompilerServices/Internal/FallbackInterfaceMethodAttribute.cs
deleted file mode 100644
index 5877e0c6bf334c..00000000000000
--- a/src/coreclr/System.Private.CoreLib/src/System/Runtime/CompilerServices/Internal/FallbackInterfaceMethodAttribute.cs
+++ /dev/null
@@ -1,20 +0,0 @@
-﻿namespace System.Runtime.CompilerServices.Internal
-{
-    /// <summary>
-    /// INTERNAL: Make default Interface methods have "low priority" in case there are multiple 
-    /// possible implementations (the "Diamond dependency problem"), 
-    /// ensuring that any other conflicting implementaion will be selected at runtime.
-    /// </summary>
-    /// <remarks>
-    /// This allows adding default method implementations for existing interfaces without 
-    /// making it a binary breaking change. (It can still be a source breaking change)
-    /// <para>
-    /// Should preferably only be used in the same assembly which defines 
-    /// the interface method beeing overridden.
-    /// </para>
-    /// </remarks>
-    [AttributeUsage(AttributeTargets.Method, AllowMultiple = false, Inherited = false)]
-    sealed class FallbackInterfaceMethodAttribute : Attribute
-    {
-    }
-}

From e984fc53651c5ae7f342829eca5c70e323688bf5 Mon Sep 17 00:00:00 2001
From: Daniel Svensson <daniel.svensson@hotmail.se>
Date: Thu, 22 Feb 2024 18:31:08 +0100
Subject: [PATCH 05/23] Improve division by 64bit value on x64

---
 .../src/System/Decimal.DecCalc.cs             | 31 ++++++++++++++++---
 1 file changed, 27 insertions(+), 4 deletions(-)

diff --git a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
index 8ec88d5b631b98..bbbfd4c05e6390 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
@@ -352,6 +352,32 @@ private static void Unscale(ref uint low, ref ulong high64, ref int scale)
                     scale--;
             }
 
+            /// <summary>
+            /// Do partial divide, yielding 64-bit result and 64-bit remainder.
+            /// Divisor must be larger than upper 64 bits of dividend.
+            /// </summary>
+            /// <param name="bufNum">128-bit dividend as array of uints, least-sig first</param>
+            /// <param name="den">64-bit divisor</param>
+            /// <returns>Returns quotient. Remainder overwrites lower 64-bits of dividend.</returns>
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            private static unsafe ulong Div128By64(Buf16* bufNum, ulong den)
+            {
+                Debug.Assert(den > bufNum->High64);
+
+                if (X86.X86Base.X64.IsSupported)
+                {
+                    // The assert above guarantees den > bufNum->High64 (the upper half of the dividend), so the 64-bit quotient cannot overflow
+                    (ulong quotient, bufNum->Low64) = X86.X86Base.X64.DivRem(bufNum->Low64, bufNum->High64, den);
+                    return quotient;
+                }
+                else
+                {
+                    uint hiBits = Div96By64(ref *(Buf12*)&bufNum->U1, den);
+                    uint loBits = Div96By64(ref *(Buf12*)bufNum, den);
+                    return ((ulong)hiBits << 32 | loBits);
+                }
+            }
+
             /// <summary>
             /// Do partial divide, yielding 32-bit result and 64-bit remainder.
             /// Divisor must be larger than upper 64 bits of dividend.
@@ -2031,10 +2057,7 @@ internal static unsafe void VarDecDiv(ref DecCalc d1, ref DecCalc d2)
                         // Have a 64-bit divisor in sdlDivisor.  The remainder
                         // (currently 96 bits spread over 4 uints) will be < divisor.
                         //
-                        bufQuo.U2 = 0;
-                        bufQuo.U1 = Div96By64(ref *(Buf12*)&bufRem.U1, divisor);
-                        bufQuo.U0 = Div96By64(ref *(Buf12*)&bufRem, divisor);
-
+                        bufQuo.Low64 = Div128By64(&bufRem, divisor);
                         while (true)
                         {
                             if (bufRem.Low64 == 0)

From f0d62fdba95c93e8e413aac0d21300b6639186f1 Mon Sep 17 00:00:00 2001
From: Daniel Svensson <daniel.svensson@hotmail.se>
Date: Sat, 2 Mar 2024 14:06:47 +0100
Subject: [PATCH 06/23] Remove some more 64bit divides for x86

---
 .../src/System/Decimal.DecCalc.cs             | 70 +++++++++++++------
 1 file changed, 49 insertions(+), 21 deletions(-)

diff --git a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
index bbbfd4c05e6390..8542ce612a2c68 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
@@ -200,12 +200,11 @@ private static uint Div96By32(ref Buf12 bufNum, uint den)
                     uint hiRes = 0;
                     ulong remainder = bufNum.U2;
 
-                    if (remainder < den)
-                        goto DivOne64Bit;
-
-                    (hiRes, remainder) = X86.X86Base.DivRem(bufNum.U2, 0u, den);
+                    if (remainder >= den)
+                    {
+                        (hiRes, remainder) = X86.X86Base.DivRem(bufNum.U2, 0u, den);
+                    }
 
-                DivOne64Bit:
                     bufNum.U2 = hiRes;
                     (bufNum.Low64, remainder) = X86.X86Base.X64.DivRem(bufNum.Low64, remainder, (ulong)den);
                     return (uint)remainder;
@@ -222,12 +221,14 @@ private static uint Div96By32(ref Buf12 bufNum, uint den)
                     remainder = bufNum.U1;
                     bufNum.U1 = 0;
                     goto Div1Word;
-
-                Div3Word:
-                    (bufNum.U2, remainder) = X86.X86Base.DivRem(bufNum.U2, 0, den);
-                Div2Word:
+Div3Word:
+                    if (bufNum.U2 < den)
+                        (bufNum.U2, remainder) = (0, bufNum.U2);
+                    else
+                        (bufNum.U2, remainder) = X86.X86Base.DivRem(bufNum.U2, remainder, den);
+Div2Word:
                     (bufNum.U1, remainder) = X86.X86Base.DivRem(bufNum.U1, remainder, den);
-                Div1Word:
+Div1Word:
                     (bufNum.U0, remainder) = X86.X86Base.DivRem(bufNum.U0, remainder, den);
                     return remainder;
                 }
@@ -815,13 +816,28 @@ private static int OverflowUnscale(ref Buf12 bufQuo, int scale, bool sticky)
                 // We have overflown, so load the high bit with a one.
                 const ulong highbit = 1UL << 32;
                 bufQuo.U2 = (uint)(highbit / 10);
-                ulong tmp = ((highbit % 10) << 32) + bufQuo.U1;
-                uint div = (uint)(tmp / 10);
-                bufQuo.U1 = div;
-                tmp = ((tmp - div * 10) << 32) + bufQuo.U0;
-                div = (uint)(tmp / 10);
-                bufQuo.U0 = div;
-                uint remainder = (uint)(tmp - div * 10);
+
+                uint remainder;
+#if TARGET_32BIT
+                if (X86.X86Base.IsSupported)
+                {
+                    // 32-bit RyuJIT doesn't convert 64-bit division by constant into multiplication by reciprocal.
+                    // Do "32bit" divides instead of calling full 64bit helper
+                    (bufQuo.U1, remainder) = X86.X86Base.DivRem(bufQuo.U1, (uint)(highbit % 10), 10);
+                    (bufQuo.U0, remainder) = X86.X86Base.DivRem(bufQuo.U0, remainder, 10);
+                }
+                else
+#endif
+                {
+                    ulong tmp = ((highbit % 10) << 32) + bufQuo.U1;
+                    uint div = (uint)(tmp / 10);
+                    bufQuo.U1 = div;
+                    tmp = ((tmp - div * 10) << 32) + bufQuo.U0;
+                    div = (uint)(tmp / 10);
+                    bufQuo.U0 = div;
+                    remainder = (uint)(tmp - div * 10);
+                }
+
                 // The remainder is the last digit that does not fit, so we can use it to work out if we need to round up
                 if (remainder > 5 || remainder == 5 && (sticky || (bufQuo.U0 & 1) != 0))
                     Add32To96(ref bufQuo, 1);
@@ -1537,7 +1553,7 @@ internal static unsafe void VarDecMul(ref DecCalc d1, ref DecCalc d2)
                 // Check for leading zero uints on the product
                 //
                 uint* product = (uint*)&bufProd;
-                while (product[(int)hiProd] == 0)
+                while (product[hiProd] == 0)
                 {
                     if (hiProd == 0)
                         goto ReturnZero;
@@ -2017,9 +2033,20 @@ internal static unsafe void VarDecDiv(ref DecCalc d1, ref DecCalc d2)
                             goto ThrowOverflow;
 
                         ulong num = Math.BigMul(remainder, power);
-                        // TODO: https://github.com/dotnet/runtime/issues/5213
-                        uint div = (uint)(num / den);
-                        remainder = (uint)num - div * den;
+                        uint div;
+#if TARGET_32BIT
+                        if (X86.X86Base.IsSupported)
+                        {
+                            (div, remainder) = X86.X86Base.DivRem((uint)num, (uint)(num >> 32), den);
+                        }
+                        else
+#endif
+                        {
+                            // Do full 64bit divide and cast result to 32bit
+                            var divRes = X86.X86Base.X64.IsSupported ? X86.X86Base.X64.DivRem(num, 0, (ulong)den) : Math.DivRem(num, den);
+                            div = (uint)divRes.Quotient;
+                            remainder = (uint)divRes.Remainder;
+                        }
 
                         if (!Add32To96(ref bufQuo, div))
                         {
@@ -2057,6 +2084,7 @@ internal static unsafe void VarDecDiv(ref DecCalc d1, ref DecCalc d2)
                         // Have a 64-bit divisor in sdlDivisor.  The remainder
                         // (currently 96 bits spread over 4 uints) will be < divisor.
                         //
+                        bufQuo.U2 = 0;
                         bufQuo.Low64 = Div128By64(&bufRem, divisor);
                         while (true)
                         {

From 792c3ebf52e6c90a8cacab22a411ae5cb9a220ac Mon Sep 17 00:00:00 2001
From: Daniel Svensson <daniel.svensson@hotmail.se>
Date: Sat, 2 Mar 2024 14:14:09 +0100
Subject: [PATCH 07/23] Call IncreaseScale in one more place

---
 .../System.Private.CoreLib/src/System/Decimal.DecCalc.cs     | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
index 8542ce612a2c68..04c4fffd915972 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
@@ -2301,10 +2301,7 @@ internal static void VarDecMod(ref DecCalc d1, ref DecCalc d2)
                                 break;
                             uint power = iCurScale >= MaxInt32Scale ? TenToPowerNine : UInt32Powers10[iCurScale];
                             scale += iCurScale;
-                            ulong tmp = Math.BigMul(bufQuo.U0, power);
-                            bufQuo.U0 = (uint)tmp;
-                            tmp >>= 32;
-                            bufQuo.High64 = tmp + bufQuo.High64 * power;
+                            IncreaseScale(ref bufQuo, power);
                             if (power != TenToPowerNine)
                                 break;
                         }

From 6dab5f92a07e59defb9507e0757f0a91d83704c0 Mon Sep 17 00:00:00 2001
From: Daniel Svensson <daniel.svensson@hotmail.se>
Date: Sun, 3 Mar 2024 23:21:00 +0100
Subject: [PATCH 08/23] add extra parentheses

---
 .../System.Private.CoreLib/src/System/Decimal.DecCalc.cs        | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
index 04c4fffd915972..db09733da2a81a 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
@@ -486,7 +486,7 @@ private static nuint BigMul64By32(ulong a, uint b, out ulong low)
                 ulong prodL = ((ulong)al) * bl;
                 ulong prodH = ((ulong)ah) * bl + (prodL >> 32);
 
-                low = (prodH << 32 | (uint)prodL);
+                low = ((prodH << 32) | (uint)prodL);
                 return (nuint)(prodH >> 32);
 #endif
             }

From 83efb423c377f1608168f391feb6d1d94692e67a Mon Sep 17 00:00:00 2001
From: Daniel Svensson <daniel.svensson@hotmail.se>
Date: Mon, 4 Mar 2024 13:22:35 +0100
Subject: [PATCH 09/23] review: remove X86.X86Base.X64.DivRem

---
 .../System.Private.CoreLib/src/System/Decimal.DecCalc.cs        | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
index db09733da2a81a..7da824c419373c 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
@@ -2043,7 +2043,7 @@ internal static unsafe void VarDecDiv(ref DecCalc d1, ref DecCalc d2)
 #endif
                         {
                             // Do full 64bit divide and cast result to 32bit
-                            var divRes = X86.X86Base.X64.IsSupported ? X86.X86Base.X64.DivRem(num, 0, (ulong)den) : Math.DivRem(num, den);
+                            var divRes = Math.DivRem(num, den);
                             div = (uint)divRes.Quotient;
                             remainder = (uint)divRes.Remainder;
                         }

From a20f37f6ed0e613333da0737449663bf08403d10 Mon Sep 17 00:00:00 2001
From: Daniel Svensson <daniel.svensson@hotmail.se>
Date: Mon, 4 Mar 2024 22:11:21 +0100
Subject: [PATCH 10/23] Remove BigMul64By32 and add overloads of Math.BigMul
 instead so that it can easily be removed once JIT recognize and optimize
 "ulong * uint"

---
 .../src/System/Decimal.DecCalc.cs             | 38 ++++---------------
 .../System.Private.CoreLib/src/System/Math.cs | 26 +++++++++++++
 2 files changed, 33 insertions(+), 31 deletions(-)

diff --git a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
index 7da824c419373c..d995d8145748de 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
@@ -467,30 +467,6 @@ private static uint Div96By64(ref Buf12 bufNum, ulong den)
                 return quo;
             }
 
-
-            /// <summary>
-            /// Perform multiplication between 64 and 32 bit numbers, returning lower 64 bits in <paramref name="low"/>
-            /// </summary>
-            /// <returns>hi bits of the result</returns>
-            /// <remarks>returns nuint instead of uint to skip clearing upper 32bits on 64bit platforms</remarks>
-            [MethodImpl(MethodImplOptions.AggressiveInlining)]
-            private static nuint BigMul64By32(ulong a, uint b, out ulong low)
-            {
-#if TARGET_64BIT
-                return (nuint)Math.BigMul(a, b, out low);
-#else
-                uint al = (uint)a;
-                uint ah = (uint)(a >> 32);
-                uint bl = b;
-
-                ulong prodL = ((ulong)al) * bl;
-                ulong prodH = ((ulong)ah) * bl + (prodL >> 32);
-
-                low = ((prodH << 32) | (uint)prodL);
-                return (nuint)(prodH >> 32);
-#endif
-            }
-
             /// <summary>
             /// Do partial divide, yielding 32-bit result and 96-bit remainder.
             /// Top divisor uint must be larger than top dividend uint. This is
@@ -529,7 +505,7 @@ private static uint Div128By96(ref Buf16 bufNum, ref Buf12 bufDen)
                 // Compute full remainder, rem = dividend - (quo * divisor).
                 //
                 ulong prod1;
-                uint prod2 = (uint)BigMul64By32(bufDen.Low64, quo, out prod1);
+                uint prod2 = (uint)Math.BigMul(bufDen.Low64, quo, out prod1);
                 ulong num = bufNum.Low64 - prod1;
                 remainder -= (uint)prod2;
 
@@ -584,7 +560,7 @@ private static uint Div128By96(ref Buf16 bufNum, ref Buf12 bufDen)
             /// <returns>Returns highest 32 bits of product</returns>
             private static uint IncreaseScale(ref Buf12 bufNum, uint power)
             {
-                ulong hi64 = BigMul64By32(bufNum.Low64, power, out ulong low64);
+                ulong hi64 = Math.BigMul(bufNum.Low64, power, out ulong low64);
                 bufNum.Low64 = low64;
                 hi64 = Math.BigMul(bufNum.U2, power) + hi64;
                 bufNum.U2 = (uint)hi64;
@@ -599,7 +575,7 @@ private static uint IncreaseScale(ref Buf12 bufNum, uint power)
             /// <param name="power">Scale factor to multiply by</param>
             private static void IncreaseScale64(ref Buf12 bufNum, uint power)
             {
-                bufNum.U2 = (uint)BigMul64By32(bufNum.Low64, power, out ulong low64);
+                bufNum.U2 = (uint)Math.BigMul(bufNum.Low64, power, out ulong low64);
                 bufNum.Low64 = low64;
             }
 
@@ -1464,7 +1440,7 @@ internal static unsafe void VarDecMul(ref DecCalc d1, ref DecCalc d2)
                     else
                     {
                         // Left value is 32-bit, result fits in 4 uints
-                        tmp = BigMul64By32(d2.Low64, d1.Low, out ulong low);
+                        tmp = Math.BigMul(d1.Low, d2.Low64, out ulong low);
                         bufProd.Low64 = low;
 
                         if (d2.High != 0)
@@ -1484,7 +1460,7 @@ internal static unsafe void VarDecMul(ref DecCalc d1, ref DecCalc d2)
                 else if ((d2.High | d2.Mid) == 0)
                 {
                     // Right value is 32-bit, result fits in 4 uints
-                    tmp = BigMul64By32(d1.Low64, d2.Low, out ulong low);
+                    tmp = Math.BigMul(d1.Low64, d2.Low, out ulong low);
                     bufProd.Low64 = low;
 
                     if (d1.High != 0)
@@ -1528,13 +1504,13 @@ internal static unsafe void VarDecMul(ref DecCalc d1, ref DecCalc d2)
                         ulong hi64 = Math.BigMul(d1.High, d2.High);
 
                         // Do crosswise multiplications between upper 32bit and lower 64 bits
-                        hi64 += BigMul64By32(d1.Low64, d2.High, out tmp);
+                        hi64 += Math.BigMul(d1.Low64, d2.High, out tmp);
                         mid64 += tmp;
                         // propagate carry, can be simplified if https://github.com/dotnet/runtime/issues/48247 is done
                         if (mid64 < tmp)
                             ++hi64;
 
-                        hi64 += BigMul64By32(d2.Low64, d1.High, out tmp);
+                        hi64 += Math.BigMul(d2.Low64, d1.High, out tmp);
                         mid64 += tmp;
                         if (mid64 < tmp)
                             ++hi64;
diff --git a/src/libraries/System.Private.CoreLib/src/System/Math.cs b/src/libraries/System.Private.CoreLib/src/System/Math.cs
index 7e99bb0366edfc..3d7952c51fddf9 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Math.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Math.cs
@@ -161,6 +161,32 @@ public static long BigMul(int a, int b)
             return ((long)a) * b;
         }
 
+
+        /// <summary>
+        /// Perform multiplication between 64 and 32 bit numbers, returning lower 64 bits in <paramref name="low"/>
+        /// </summary>
+        /// <returns>The bits of the product above the low 64 (the full product is at most 96 bits wide)</returns>
+        /// <remarks>REMOVE once BigMul(ulong, ulong) is treated as intrinsics and optimizes 32 by 64 multiplications</remarks>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static ulong BigMul(ulong a, uint b, out ulong low)
+        {
+#if TARGET_64BIT
+            return Math.BigMul((ulong)a, (ulong)b, out low); // the (ulong) cast on b selects the (ulong, ulong) overload instead of this one
+#else
+            ulong prodH = (((ulong)(uint)(a >> 32)) * b);
+            ulong prodL = ((ulong)(uint)a) * b;
+            prodH += (prodL >> 32);
+
+            low = ((prodH << 32) | (uint)prodL);
+            return (prodH >> 32);
+#endif
+        }
+
+        /// <inheritdoc cref="BigMul(ulong, uint, out ulong)"/>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        internal static ulong BigMul(uint a, ulong b, out ulong low)
+            => BigMul(b, a, out low); // operands swapped on purpose: BigMul(a, b, ...) would bind to this overload and recurse forever
+
         /// <summary>Produces the full product of two unsigned 64-bit numbers.</summary>
         /// <param name="a">The first number to multiply.</param>
         /// <param name="b">The second number to multiply.</param>

From 2f6e107c9041909740c131a410120c0e22f0e5c8 Mon Sep 17 00:00:00 2001
From: Daniel Svensson <daniel.svensson@hotmail.se>
Date: Mon, 4 Mar 2024 22:44:03 +0100
Subject: [PATCH 11/23] Simplify Div96By32

---
 .../src/System/Decimal.DecCalc.cs                   | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
index d995d8145748de..3fedf3450dcd69 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
@@ -197,15 +197,9 @@ private static uint Div96By32(ref Buf12 bufNum, uint den)
             {
                 if (X86.X86Base.X64.IsSupported)
                 {
-                    uint hiRes = 0;
                     ulong remainder = bufNum.U2;
 
-                    if (remainder >= den)
-                    {
-                        (hiRes, remainder) = X86.X86Base.DivRem(bufNum.U2, 0u, den);
-                    }
-
-                    bufNum.U2 = hiRes;
+                    (bufNum.U2, remainder) = (remainder >= den) ? X86.X86Base.DivRem(bufNum.U2, 0u, den) : (0u, remainder);
                     (bufNum.Low64, remainder) = X86.X86Base.X64.DivRem(bufNum.Low64, remainder, (ulong)den);
                     return (uint)remainder;
                 }
@@ -222,10 +216,7 @@ private static uint Div96By32(ref Buf12 bufNum, uint den)
                     bufNum.U1 = 0;
                     goto Div1Word;
 Div3Word:
-                    if (bufNum.U2 < den)
-                        (bufNum.U2, remainder) = (0, bufNum.U2);
-                    else
-                        (bufNum.U2, remainder) = X86.X86Base.DivRem(bufNum.U2, remainder, den);
+                    (bufNum.U2, remainder) = X86.X86Base.DivRem(bufNum.U2, remainder, den);
 Div2Word:
                     (bufNum.U1, remainder) = X86.X86Base.DivRem(bufNum.U1, remainder, den);
 Div1Word:

From 14a662b485d460a825899fafd4ebf3fe12a4a2c6 Mon Sep 17 00:00:00 2001
From: Daniel Svensson <daniel.svensson@hotmail.se>
Date: Mon, 4 Mar 2024 22:45:41 +0100
Subject: [PATCH 12/23] Remove 64 bit path from Div96By32

---
 .../src/System/Decimal.DecCalc.cs                      | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
index 3fedf3450dcd69..e09ab579ca4023 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
@@ -195,15 +195,7 @@ private static void UInt64x64To128(ulong a, ulong b, ref DecCalc result)
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
             private static uint Div96By32(ref Buf12 bufNum, uint den)
             {
-                if (X86.X86Base.X64.IsSupported)
-                {
-                    ulong remainder = bufNum.U2;
-
-                    (bufNum.U2, remainder) = (remainder >= den) ? X86.X86Base.DivRem(bufNum.U2, 0u, den) : (0u, remainder);
-                    (bufNum.Low64, remainder) = X86.X86Base.X64.DivRem(bufNum.Low64, remainder, (ulong)den);
-                    return (uint)remainder;
-                }
-                else if (X86.X86Base.IsSupported)
+                if (X86.X86Base.IsSupported)
                 {
                     uint remainder = 0;
 

From cfcd3794a5282edd4ce91978ba81d39f2019579d Mon Sep 17 00:00:00 2001
From: Daniel Svensson <daniel.svensson@hotmail.se>
Date: Mon, 11 Mar 2024 22:38:09 +0100
Subject: [PATCH 13/23] Add Div64By32 helper to avoid check for X86 in multiple
 places

---
 .../src/System/Decimal.DecCalc.cs             | 54 ++++++++-----------
 1 file changed, 22 insertions(+), 32 deletions(-)

diff --git a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
index e09ab579ca4023..9a45fb75bced8d 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
@@ -186,6 +186,22 @@ private static void UInt64x64To128(ulong a, ulong b, ref DecCalc result)
                 result.High = (uint)high;
             }
 
+            // Do partial divide for the case where (dividend >> 32) < den, i.e. the quotient fits in 32 bits
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            private static (uint Quotient, uint Remainder) Div64By32(ulong dividend, uint den)
+            {
+                if (X86.X86Base.IsSupported)
+                {
+                    return X86.X86Base.DivRem((uint)dividend, (uint)(dividend >> 32), den);
+                }
+                else
+                {
+                    // TODO: https://github.com/dotnet/runtime/issues/5213
+                    uint quo = (uint)(dividend / den);
+                    return (quo, (uint)dividend - quo * den);
+                }
+            }
+
             /// <summary>
             /// Do full divide, yielding 96-bit result and 32-bit remainder.
             /// </summary>
@@ -235,8 +251,7 @@ private static uint Div96By32(ref Buf12 bufNum, uint den)
                     tmp = bufNum.Low64;
                     if (tmp == 0)
                         return 0;
-                    (div, rem) = Math.DivRem(tmp, den);
-                    bufNum.Low64 = div;
+                    (bufNum.Low64, rem) = Math.DivRem(tmp, den);
                     return (uint)rem;
                 }
             }
@@ -425,9 +440,9 @@ private static uint Div96By64(ref Buf12 bufNum, ulong den)
                     //
                     return 0;
 
-                // TODO: https://github.com/dotnet/runtime/issues/5213
-                quo = (uint)(num64 / denHigh32);
-                num = bufNum.U0 | ((num64 - quo * denHigh32) << 32); // remainder
+
+                (quo, uint rem) = Div64By32(num64, denHigh32);
+                num = bufNum.U0 | ((ulong)rem << 32); // remainder
 
                 // Compute full remainder, rem = dividend - (quo * divisor).
                 //
@@ -471,19 +486,7 @@ private static uint Div128By96(ref Buf16 bufNum, ref Buf12 bufDen)
                     //
                     return 0;
 
-
-                uint quo;
-                uint remainder;
-                if (X86.X86Base.IsSupported)
-                {
-                    (quo, remainder) = X86.X86Base.DivRem(bufNum.U2, bufNum.U3, den);
-                }
-                else
-                {
-                    // TODO: https://github.com/dotnet/runtime/issues/5213
-                    quo = (uint)(dividend / den);
-                    remainder = (uint)dividend - quo * den;
-                }
+                (uint quo, uint remainder) = Div64By32(dividend, den);
 
                 // Compute full remainder, rem = dividend - (quo * divisor).
                 //
@@ -1992,20 +1995,7 @@ internal static unsafe void VarDecDiv(ref DecCalc d1, ref DecCalc d2)
                             goto ThrowOverflow;
 
                         ulong num = Math.BigMul(remainder, power);
-                        uint div;
-#if TARGET_32BIT
-                        if (X86.X86Base.IsSupported)
-                        {
-                            (div, remainder) = X86.X86Base.DivRem((uint)num, (uint)(num >> 32), den);
-                        }
-                        else
-#endif
-                        {
-                            // Do full 64bit divide and cast result to 32bit
-                            var divRes = Math.DivRem(num, den);
-                            div = (uint)divRes.Quotient;
-                            remainder = (uint)divRes.Remainder;
-                        }
+                        (uint div, remainder) = Div64By32(num, den);
 
                         if (!Add32To96(ref bufQuo, div))
                         {

From 0bd0cc35662c39dcfc7fca19961197e4c3b7ae6e Mon Sep 17 00:00:00 2001
From: Daniel Svensson <daniel.svensson@hotmail.se>
Date: Mon, 11 Mar 2024 22:49:48 +0100
Subject: [PATCH 14/23] Use 64*32 multiply in more places

---
 .../src/System/Decimal.DecCalc.cs             | 42 +++++--------------
 1 file changed, 11 insertions(+), 31 deletions(-)

diff --git a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
index 9a45fb75bced8d..0c24990ce64030 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
@@ -964,7 +964,7 @@ internal static unsafe void DecAddSub(ref DecCalc d1, ref DecCalc d2, bool sign)
                     }
 
                     uint power;
-                    ulong tmp64, tmpLow;
+                    ulong tmp64;
 
                     // d1 will need to be multiplied by 10^scale so
                     // it will have the same scale as d2.  We could be
@@ -1005,10 +1005,7 @@ internal static unsafe void DecAddSub(ref DecCalc d1, ref DecCalc d2, bool sign)
                             power = TenToPowerNine;
                             if (scale < MaxInt32Scale)
                                 power = UInt32Powers10[scale];
-                            tmpLow = Math.BigMul((uint)low64, power);
-                            tmp64 = Math.BigMul((uint)(low64 >> 32), power) + (tmpLow >> 32);
-                            low64 = (uint)tmpLow + (tmp64 << 32);
-                            high = (uint)(tmp64 >> 32);
+                            high = (uint)Math.BigMul(low64, power, out low64);
                             if ((scale -= MaxInt32Scale) <= 0)
                                 goto AlignedAdd;
                         } while (high == 0);
@@ -1021,10 +1018,7 @@ internal static unsafe void DecAddSub(ref DecCalc d1, ref DecCalc d2, bool sign)
                         power = TenToPowerNine;
                         if (scale < MaxInt32Scale)
                             power = UInt32Powers10[scale];
-                        tmpLow = Math.BigMul((uint)low64, power);
-                        tmp64 = Math.BigMul((uint)(low64 >> 32), power) + (tmpLow >> 32);
-                        low64 = (uint)tmpLow + (tmp64 << 32);
-                        tmp64 >>= 32;
+                        tmp64 = Math.BigMul(low64, power, out low64);
                         tmp64 += Math.BigMul(high, power);
 
                         scale -= MaxInt32Scale;
@@ -1260,12 +1254,8 @@ internal static long VarCyFromDec(ref DecCalc pdecIn)
                     if (pdecIn.High != 0)
                         goto ThrowOverflow;
                     uint pwr = UInt32Powers10[-scale];
-                    ulong high = Math.BigMul(pwr, pdecIn.Mid);
-                    if (high > uint.MaxValue)
-                        goto ThrowOverflow;
-                    ulong low = Math.BigMul(pwr, pdecIn.Low);
-                    low += high <<= 32;
-                    if (low < high)
+                    ulong high = Math.BigMul(pdecIn.Low64, pwr, out ulong low);
+                    if (high != 0)
                         goto ThrowOverflow;
                     value = (long)low;
                 }
@@ -1348,10 +1338,7 @@ private static int VarDecCmpSub(in decimal d1, in decimal d2)
                     do
                     {
                         uint power = scale >= MaxInt32Scale ? TenToPowerNine : UInt32Powers10[scale];
-                        ulong tmpLow = Math.BigMul((uint)low64, power);
-                        ulong tmp = Math.BigMul((uint)(low64 >> 32), power) + (tmpLow >> 32);
-                        low64 = (uint)tmpLow + (tmp << 32);
-                        tmp >>= 32;
+                        ulong tmp = Math.BigMul(low64, power, out low64);
                         tmp += Math.BigMul(high, power);
                         // If the scaled value has more than 96 significant bits then it's greater than d2
                         if (tmp > uint.MaxValue)
@@ -1638,12 +1625,8 @@ internal static void VarDecFromR4(float input, out DecCalc result)
                         else
                         {
                             ulong low64 = Math.BigMul(mant, UInt32Powers10[power - 9]);
-                            ulong hi64 = Math.BigMul(TenToPowerNine, (uint)(low64 >> 32));
-                            low64 = Math.BigMul(TenToPowerNine, (uint)low64);
-                            result.Low = (uint)low64;
-                            hi64 += low64 >> 32;
-                            result.Mid = (uint)hi64;
-                            hi64 >>= 32;
+                            ulong hi64 = Math.BigMul(TenToPowerNine, low64, out low64);
+                            result.Low64 = low64;
                             result.High = (uint)hi64;
                         }
                     }
@@ -2223,12 +2206,9 @@ internal static void VarDecMod(ref DecCalc d1, ref DecCalc d2)
                     do
                     {
                         uint power = scale >= MaxInt32Scale ? TenToPowerNine : UInt32Powers10[scale];
-                        ulong tmp = Math.BigMul(d2.Low, power);
-                        d2.Low = (uint)tmp;
-                        tmp >>= 32;
-                        tmp += (d2.Mid + ((ulong)d2.High << 32)) * power;
-                        d2.Mid = (uint)tmp;
-                        d2.High = (uint)(tmp >> 32);
+                        uint hi32 = (uint)Math.BigMul(d2.Low64, power, out ulong low64);
+                        d2.Low64 = low64;
+                        d2.High = hi32 + d2.High * power;
                     } while ((scale -= MaxInt32Scale) > 0);
                     scale = 0;
                 }

From 00ca6c7c4ca1f67ab09b0ea8eff0cfde66bd0bbd Mon Sep 17 00:00:00 2001
From: Daniel Svensson <daniel.svensson@hotmail.se>
Date: Mon, 11 Mar 2024 22:58:28 +0100
Subject: [PATCH 15/23] Add new IncreaseScale overload to fix issue with
 Store-To-Load forwarding * Gives around 20% faster perf for 64bit for full
 96bit division * removes cast

---
 .../src/System/Decimal.DecCalc.cs                 | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
index 0c24990ce64030..9379c4c1384d63 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
@@ -553,6 +553,19 @@ private static uint IncreaseScale(ref Buf12 bufNum, uint power)
                 return (uint)(hi64 >> 32);
             }
 
+            /// <summary>
+            /// Multiply the two numbers 96bit * 32bit. The 128 bits of the result overwrite the input.
+            /// </summary>
+            /// <param name="bufNum">buffer</param>
+            /// <param name="power">Scale factor to multiply by</param>
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            private static void IncreaseScale(ref Buf16 bufNum, uint power)
+            {
+                ulong hi64 = Math.BigMul(bufNum.Low64, power, out ulong low64);
+                bufNum.Low64 = low64;
+                bufNum.High64 = Math.BigMul(bufNum.U2, power) + (nuint)hi64;
+            }
+
             /// <summary>
             /// Multiply the two numbers 64bit * 32bit.
             /// The 96 bits of the result overwrite the input.
@@ -2126,7 +2139,7 @@ internal static unsafe void VarDecDiv(ref DecCalc d1, ref DecCalc d2)
                             if (IncreaseScale(ref bufQuo, power) != 0)
                                 goto ThrowOverflow;
 
-                            bufRem.U3 = IncreaseScale(ref *(Buf12*)&bufRem, power);
+                            IncreaseScale(ref bufRem, power);
                             tmp = Div128By96(ref bufRem, ref bufDivisor);
                             if (!Add32To96(ref bufQuo, tmp))
                             {

From 0659dc284b6d7cdfb145ed0c52bc23b2bda914da Mon Sep 17 00:00:00 2001
From: Daniel Svensson <daniel.svensson@hotmail.se>
Date: Mon, 11 Mar 2024 23:23:52 +0100
Subject: [PATCH 16/23] Add back old 32bit code for IncreaseScale * makes the
 5-10% regression 20% faster (25% speedup) for 1/3

---
 .../src/System/Decimal.DecCalc.cs              | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
index 9379c4c1384d63..9b7c6e7dcd3ae8 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
@@ -546,11 +546,23 @@ private static uint Div128By96(ref Buf16 bufNum, ref Buf12 bufDen)
             /// <returns>Returns highest 32 bits of product</returns>
             private static uint IncreaseScale(ref Buf12 bufNum, uint power)
             {
+#if TARGET_64BIT
                 ulong hi64 = Math.BigMul(bufNum.Low64, power, out ulong low64);
                 bufNum.Low64 = low64;
                 hi64 = Math.BigMul(bufNum.U2, power) + hi64;
                 bufNum.U2 = (uint)hi64;
                 return (uint)(hi64 >> 32);
+#else
+                ulong tmp = Math.BigMul(bufNum.U0, power);
+                bufNum.U0 = (uint)tmp;
+                tmp >>= 32;
+                tmp += Math.BigMul(bufNum.U1, power);
+                bufNum.U1 = (uint)tmp;
+                tmp >>= 32;
+                tmp += Math.BigMul(bufNum.U2, power);
+                bufNum.U2 = (uint)tmp;
+                return (uint)(tmp >> 32);
+#endif
             }
 
             /// <summary>
@@ -559,11 +571,15 @@ private static uint IncreaseScale(ref Buf12 bufNum, uint power)
             /// <param name="bufNum">buffer</param>
             /// <param name="power">Scale factor to multiply by</param>
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
-            private static void IncreaseScale(ref Buf16 bufNum, uint power)
+            private unsafe static void IncreaseScale(ref Buf16 bufNum, uint power)
             {
+#if TARGET_64BIT
                 ulong hi64 = Math.BigMul(bufNum.Low64, power, out ulong low64);
                 bufNum.Low64 = low64;
                 bufNum.High64 = Math.BigMul(bufNum.U2, power) + (nuint)hi64;
+#else
+                bufNum.U3 = IncreaseScale(ref *(Buf12*)Unsafe.AsPointer(ref bufNum), power);
+#endif
             }
 
             /// <summary>

From 83191db705e26c32158a55ed9395498385c624ec Mon Sep 17 00:00:00 2001
From: Daniel Svensson <daniel.svensson@hotmail.se>
Date: Mon, 11 Mar 2024 23:38:43 +0100
Subject: [PATCH 17/23] 6% faster with longer increasescale

---
 .../System.Private.CoreLib/src/System/Decimal.DecCalc.cs | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
index 9b7c6e7dcd3ae8..aa1729d37d6a56 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
@@ -578,7 +578,14 @@ private unsafe static void IncreaseScale(ref Buf16 bufNum, uint power)
                 bufNum.Low64 = low64;
                 bufNum.High64 = Math.BigMul(bufNum.U2, power) + (nuint)hi64;
 #else
-                bufNum.U3 = IncreaseScale(ref *(Buf12*)Unsafe.AsPointer(ref bufNum), power);
+                ulong tmp = Math.BigMul(bufNum.U0, power);
+                bufNum.U0 = (uint)tmp;
+                tmp >>= 32;
+                tmp += Math.BigMul(bufNum.U1, power);
+                bufNum.U1 = (uint)tmp;
+                tmp >>= 32;
+                tmp += Math.BigMul(bufNum.U2, power);
+                bufNum.High64 = tmp;
 #endif
             }
 

From f2ada41d6646a0655fa1e48012a9e5a11db2b833 Mon Sep 17 00:00:00 2001
From: Daniel Svensson <daniel.svensson@hotmail.se>
Date: Tue, 12 Mar 2024 11:32:46 +0100
Subject: [PATCH 18/23] Add back Bmi2.MultiplyNoFlags in Math.BigMul

---
 .../System.Private.CoreLib/src/System/Math.cs          | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/libraries/System.Private.CoreLib/src/System/Math.cs b/src/libraries/System.Private.CoreLib/src/System/Math.cs
index 3d7952c51fddf9..44ef0875634de8 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Math.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Math.cs
@@ -151,8 +151,16 @@ internal static void ThrowNegateTwosCompOverflow()
             throw new OverflowException(SR.Overflow_NegateTwosCompNum);
         }
 
-        internal static ulong BigMul(uint a, uint b)
+        internal static unsafe ulong BigMul(uint a, uint b)
         {
+#if TARGET_32BIT
+            if (Bmi2.IsSupported)
+            {
+                uint low;
+                uint hi = Bmi2.MultiplyNoFlags(a, b, &low);
+                return ((ulong)hi << 32) | low;
+            }
+#endif
             return ((ulong)a) * b;
         }
 

From 46aca48a208b5c65eb23e3e9ab42fef0a5b08d21 Mon Sep 17 00:00:00 2001
From: Daniel Svensson <daniel.svensson@hotmail.se>
Date: Tue, 12 Mar 2024 14:00:08 +0100
Subject: [PATCH 19/23] fix Math.BigMul compilation

---
 .../System.Private.CoreLib/src/System/Math.cs         | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/libraries/System.Private.CoreLib/src/System/Math.cs b/src/libraries/System.Private.CoreLib/src/System/Math.cs
index 44ef0875634de8..2294a6aed7e7c1 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Math.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Math.cs
@@ -157,8 +157,8 @@ internal static unsafe ulong BigMul(uint a, uint b)
             if (Bmi2.IsSupported)
             {
                 uint low;
-                uint hi = Bmi2.MultiplyNoFlags(a, b, &low);
-                return ((ulong)hi << 32) | low;
+                uint high = Bmi2.MultiplyNoFlags(a, b, &low);
+                return ((ulong)high << 32) | low;
             }
 #endif
             return ((ulong)a) * b;
@@ -179,11 +179,10 @@ public static long BigMul(int a, int b)
         internal static ulong BigMul(ulong a, uint b, out ulong low)
         {
 #if TARGET_64BIT
-            return Math.BigMul(ulong)a, (ulong)b, out low);
+            return Math.BigMul((ulong)a, (ulong)b, out low);
 #else
-            ulong prodH = (((ulong)(uint)(a >> 32)) * b);
             ulong prodL = ((ulong)(uint)a) * b;
-            prodH += (prodL >> 32);
+            ulong prodH = (prodL >> 32) + (((ulong)(uint)(a >> 32)) * b);
 
             low = ((prodH << 32) | (uint)prodL);
             return (prodH >> 32);
@@ -193,7 +192,7 @@ internal static ulong BigMul(ulong a, uint b, out ulong low)
         /// <inheritdoc cref="BigMul(ulong, uint, out ulong)"/>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         internal static ulong BigMul(uint a, ulong b, out ulong low)
-            => BigMul(a, b, out low);
+            => BigMul(b, a, out low);
 
         /// <summary>Produces the full product of two unsigned 64-bit numbers.</summary>
         /// <param name="a">The first number to multiply.</param>

From 488c294204eb220152f2f2891c40e2073afe68ae Mon Sep 17 00:00:00 2001
From: Daniel Svensson <daniel.svensson@hotmail.se>
Date: Tue, 12 Mar 2024 14:12:00 +0100
Subject: [PATCH 20/23] switch from BigMul to (ulong)a * (uint)b in
 IncreaseScale * Gives up to 10ns (or 14% faster)

---
 .../src/System/Decimal.DecCalc.cs                  | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
index aa1729d37d6a56..75bb1fbb71f56a 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
@@ -553,13 +553,13 @@ private static uint IncreaseScale(ref Buf12 bufNum, uint power)
                 bufNum.U2 = (uint)hi64;
                 return (uint)(hi64 >> 32);
 #else
-                ulong tmp = Math.BigMul(bufNum.U0, power);
+                ulong tmp = (ulong)bufNum.U0 * power;
                 bufNum.U0 = (uint)tmp;
                 tmp >>= 32;
-                tmp += Math.BigMul(bufNum.U1, power);
+                tmp += (ulong)bufNum.U1 * power;
                 bufNum.U1 = (uint)tmp;
                 tmp >>= 32;
-                tmp += Math.BigMul(bufNum.U2, power);
+                tmp += (ulong)bufNum.U2 * power;
                 bufNum.U2 = (uint)tmp;
                 return (uint)(tmp >> 32);
 #endif
@@ -571,20 +571,20 @@ private static uint IncreaseScale(ref Buf12 bufNum, uint power)
             /// <param name="bufNum">buffer</param>
             /// <param name="power">Scale factor to multiply by</param>
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
-            private unsafe static void IncreaseScale(ref Buf16 bufNum, uint power)
+            private static void IncreaseScale(ref Buf16 bufNum, uint power)
             {
 #if TARGET_64BIT
                 ulong hi64 = Math.BigMul(bufNum.Low64, power, out ulong low64);
                 bufNum.Low64 = low64;
                 bufNum.High64 = Math.BigMul(bufNum.U2, power) + (nuint)hi64;
 #else
-                ulong tmp = Math.BigMul(bufNum.U0, power);
+                ulong tmp = (ulong)bufNum.U0 * power;
                 bufNum.U0 = (uint)tmp;
                 tmp >>= 32;
-                tmp += Math.BigMul(bufNum.U1, power);
+                tmp += (ulong)bufNum.U1 * power;
                 bufNum.U1 = (uint)tmp;
                 tmp >>= 32;
-                tmp += Math.BigMul(bufNum.U2, power);
+                tmp += (ulong)bufNum.U2 * power;
                 bufNum.High64 = tmp;
 #endif
             }

From 0087c61cd1fd6f4131fe1799fee9495ce3be269b Mon Sep 17 00:00:00 2001
From: Daniel Svensson <daniel.svensson@hotmail.se>
Date: Sat, 16 Mar 2024 13:56:39 +0100
Subject: [PATCH 21/23] Call IncreaseScale(ref Buf12) from IncreaseScale(ref
 Buf16) for 32bit code

---
 .../System.Private.CoreLib/src/System/Decimal.DecCalc.cs | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
index 75bb1fbb71f56a..8fd4c57286843c 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
@@ -578,14 +578,7 @@ private static void IncreaseScale(ref Buf16 bufNum, uint power)
                 bufNum.Low64 = low64;
                 bufNum.High64 = Math.BigMul(bufNum.U2, power) + (nuint)hi64;
 #else
-                ulong tmp = (ulong)bufNum.U0 * power;
-                bufNum.U0 = (uint)tmp;
-                tmp >>= 32;
-                tmp += (ulong)bufNum.U1 * power;
-                bufNum.U1 = (uint)tmp;
-                tmp >>= 32;
-                tmp += (ulong)bufNum.U2 * power;
-                bufNum.High64 = tmp;
+                bufNum.U3 = IncreaseScale(ref Unsafe.As<Buf16, Buf12>(ref bufNum), power);
 #endif
             }
 

From baae42d34a35d1624bd55357733b48cf04ed5434 Mon Sep 17 00:00:00 2001
From: Daniel Svensson <daniel.svensson@hotmail.se>
Date: Thu, 27 Jun 2024 21:26:48 +0200
Subject: [PATCH 22/23] Add #pragma warning disable CA2252

---
 .../System.Private.CoreLib/src/System/Decimal.DecCalc.cs        | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
index 8fd4c57286843c..1448d229c32a69 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
@@ -8,6 +8,8 @@
 
 using X86 = System.Runtime.Intrinsics.X86;
 
+#pragma warning disable CA2252 // X86Base.DivRem that requires opting into preview features is used in a few places
+
 namespace System
 {
     public partial struct Decimal

From 3a9789141050bad8f32a3f550c367842dfd16809 Mon Sep 17 00:00:00 2001
From: Daniel Svensson <Daniel-Svensson@users.noreply.github.com>
Date: Thu, 12 Sep 2024 17:11:07 +0200
Subject: [PATCH 23/23] Change suppression to SYSLIB5004 now that DivRem is
 marked as [Experimental]

---
 .../System.Private.CoreLib/src/System/Decimal.DecCalc.cs        | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
index 18c1c43940add1..b4470bed195e07 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Decimal.DecCalc.cs
@@ -8,7 +8,7 @@
 
 using X86 = System.Runtime.Intrinsics.X86;
 
-#pragma warning disable CA2252 // X86Base.DivRem that requires opting into preview features is used in a few places
+#pragma warning disable SYSLIB5004 // DivRem is marked as [Experimental], see https://github.com/dotnet/runtime/issues/82194
 
 namespace System
 {