diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
index d5db3dffa5..ee224e0b0b 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
@@ -4,11 +4,16 @@
 using System;
 using System.Buffers.Binary;
 using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+#endif
 
 // ReSharper disable InconsistentNaming
 namespace SixLabors.ImageSharp.Formats.Webp.Lossy
 {
-    internal static class LossyUtils
+    internal static unsafe class LossyUtils
     {
         [MethodImpl(InliningOptions.ShortMethod)]
         public static int Vp8Sse16X16(Span<byte> a, Span<byte> b) => GetSse(a, b, 16, 16);
@@ -61,11 +66,12 @@ public static void Copy(Span<byte> src, Span<byte> dst, int w, int h)
         public static int Vp8Disto16X16(Span<byte> a, Span<byte> b, Span<ushort> w, Span<int> scratch)
         {
             int d = 0;
+            int dataSize = (4 * WebpConstants.Bps) - 16;
             for (int y = 0; y < 16 * WebpConstants.Bps; y += 4 * WebpConstants.Bps)
             {
                 for (int x = 0; x < 16; x += 4)
                 {
-                    d += Vp8Disto4X4(a.Slice(x + y), b.Slice(x + y), w, scratch);
+                    d += Vp8Disto4X4(a.Slice(x + y, dataSize), b.Slice(x + y, dataSize), w, scratch);
                 }
             }
 
@@ -75,9 +81,19 @@ public static int Vp8Disto16X16(Span<byte> a, Span<byte> b, Span<ushort> w, Span
         [MethodImpl(InliningOptions.ShortMethod)]
         public static int Vp8Disto4X4(Span<byte> a, Span<byte> b, Span<ushort> w, Span<int> scratch)
         {
-            int sum1 = TTransform(a, w, scratch);
-            int sum2 = TTransform(b, w, scratch);
-            return Math.Abs(sum2 - sum1) >> 5;
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Sse41.IsSupported)
+            {
+                // Both blocks go through one vectorized pass whose result is the value passed to Math.Abs.
+                return Math.Abs(TTransformSse41(a, b, w, scratch)) >> 5;
+            }
+#endif
+
+            // Scalar path: transform each block on its own, then compare the two sums.
+            int sumA = TTransform(a, w, scratch);
+            int sumB = TTransform(b, w, scratch);
+
+            return Math.Abs(sumB - sumA) >> 5;
         }
 
         public static void DC16(Span<byte> dst, Span<byte> yuv, int offset)
@@ -589,6 +605,127 @@ public static int TTransform(Span<byte> input, Span<ushort> w, Span<int> scratch
             return sum;
         }
 
+#if SUPPORTS_RUNTIME_INTRINSICS
+        /// <summary>
+        /// Hadamard transform of two 4x4 blocks at once, using SSE4.1 intrinsics.
+        /// Returns the weighted sum of the absolute transformed coefficients of inputA minus that of inputB.
+        /// w[] contains a row-major 4 by 4 symmetric matrix.
+        /// </summary>
+        /// <param name="inputA">First block: 4 rows spaced WebpConstants.Bps apart; 16 bytes are loaded per row, the first 4 are used.</param>
+        /// <param name="inputB">Second block, same layout as inputA.</param>
+        /// <param name="w">The 16 weights applied to the absolute coefficients.</param>
+        /// <param name="scratch">Scratch buffer; its first 4 elements are overwritten.</param>
+        /// <returns>The signed difference between the two weighted sums.</returns>
+        public static int TTransformSse41(Span<byte> inputA, Span<byte> inputB, Span<ushort> w, Span<int> scratch)
+        {
+            Span<int> sum = scratch.Slice(0, 4);
+
+            // Load the inputs; each row is sliced to 16 bytes so a too-short span throws rather than being over-read.
+            Vector128<byte> ina0 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputA.Slice(0, 16)));
+            Vector128<byte> ina1 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputA.Slice(WebpConstants.Bps, 16)));
+            Vector128<byte> ina2 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputA.Slice(WebpConstants.Bps * 2, 16)));
+            Vector128<byte> ina3 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputA.Slice(WebpConstants.Bps * 3, 16)));
+            Vector128<byte> inb0 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputB.Slice(0, 16)));
+            Vector128<byte> inb1 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputB.Slice(WebpConstants.Bps, 16)));
+            Vector128<byte> inb2 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputB.Slice(WebpConstants.Bps * 2, 16)));
+            Vector128<byte> inb3 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputB.Slice(WebpConstants.Bps * 3, 16)));
+
+            // Combine inA and inB (we'll do two transforms in parallel).
+            Vector128<int> inab0 = Sse2.UnpackLow(ina0.AsInt32(), inb0.AsInt32());
+            Vector128<int> inab1 = Sse2.UnpackLow(ina1.AsInt32(), inb1.AsInt32());
+            Vector128<int> inab2 = Sse2.UnpackLow(ina2.AsInt32(), inb2.AsInt32());
+            Vector128<int> inab3 = Sse2.UnpackLow(ina3.AsInt32(), inb3.AsInt32());
+            Vector128<short> tmp0 = Sse41.ConvertToVector128Int16(inab0.AsByte());
+            Vector128<short> tmp1 = Sse41.ConvertToVector128Int16(inab1.AsByte());
+            Vector128<short> tmp2 = Sse41.ConvertToVector128Int16(inab2.AsByte());
+            Vector128<short> tmp3 = Sse41.ConvertToVector128Int16(inab3.AsByte());
+
+            // a00 a01 a02 a03   b00 b01 b02 b03
+            // a10 a11 a12 a13   b10 b11 b12 b13
+            // a20 a21 a22 a23   b20 b21 b22 b23
+            // a30 a31 a32 a33   b30 b31 b32 b33
+            // Vertical pass first to avoid a transpose (vertical and horizontal passes
+            // are commutative because w/kWeightY is symmetric) and subsequent transpose.
+            // Calculate a and b (two 4x4 at once).
+            Vector128<short> a0 = Sse2.Add(tmp0, tmp2);
+            Vector128<short> a1 = Sse2.Add(tmp1, tmp3);
+            Vector128<short> a2 = Sse2.Subtract(tmp1, tmp3);
+            Vector128<short> a3 = Sse2.Subtract(tmp0, tmp2);
+            Vector128<short> b0 = Sse2.Add(a0, a1);
+            Vector128<short> b1 = Sse2.Add(a3, a2);
+            Vector128<short> b2 = Sse2.Subtract(a3, a2);
+            Vector128<short> b3 = Sse2.Subtract(a0, a1);
+
+            // Same layout as above, now holding the vertically transformed values.
+            // Transpose the two 4x4.
+            Vector128<short> transpose00 = Sse2.UnpackLow(b0, b1);
+            Vector128<short> transpose01 = Sse2.UnpackLow(b2, b3);
+            Vector128<short> transpose02 = Sse2.UnpackHigh(b0, b1);
+            Vector128<short> transpose03 = Sse2.UnpackHigh(b2, b3);
+
+            // a00 a10 a01 a11   a02 a12 a03 a13
+            // a20 a30 a21 a31   a22 a32 a23 a33
+            // b00 b10 b01 b11   b02 b12 b03 b13
+            // b20 b30 b21 b31   b22 b32 b23 b33
+            Vector128<int> transpose10 = Sse2.UnpackLow(transpose00.AsInt32(), transpose01.AsInt32());
+            Vector128<int> transpose11 = Sse2.UnpackLow(transpose02.AsInt32(), transpose03.AsInt32());
+            Vector128<int> transpose12 = Sse2.UnpackHigh(transpose00.AsInt32(), transpose01.AsInt32());
+            Vector128<int> transpose13 = Sse2.UnpackHigh(transpose02.AsInt32(), transpose03.AsInt32());
+
+            // a00 a10 a20 a30 a01 a11 a21 a31
+            // b00 b10 b20 b30 b01 b11 b21 b31
+            // a02 a12 a22 a32 a03 a13 a23 a33
+            // b02 b12 b22 b32 b03 b13 b23 b33
+            Vector128<long> output0 = Sse2.UnpackLow(transpose10.AsInt64(), transpose11.AsInt64());
+            Vector128<long> output1 = Sse2.UnpackHigh(transpose10.AsInt64(), transpose11.AsInt64());
+            Vector128<long> output2 = Sse2.UnpackLow(transpose12.AsInt64(), transpose13.AsInt64());
+            Vector128<long> output3 = Sse2.UnpackHigh(transpose12.AsInt64(), transpose13.AsInt64());
+
+            // a00 a10 a20 a30   b00 b10 b20 b30
+            // a01 a11 a21 a31   b01 b11 b21 b31
+            // a02 a12 a22 a32   b02 b12 b22 b32
+            // a03 a13 a23 a33   b03 b13 b23 b33
+            // Horizontal pass and difference of weighted sums.
+            Vector128<ushort> w0 = Unsafe.As<ushort, Vector128<ushort>>(ref MemoryMarshal.GetReference(w.Slice(0, 8)));
+            Vector128<ushort> w8 = Unsafe.As<ushort, Vector128<ushort>>(ref MemoryMarshal.GetReference(w.Slice(8, 8)));
+
+            // Calculate a and b (two 4x4 at once).
+            a0 = Sse2.Add(output0.AsInt16(), output2.AsInt16());
+            a1 = Sse2.Add(output1.AsInt16(), output3.AsInt16());
+            a2 = Sse2.Subtract(output1.AsInt16(), output3.AsInt16());
+            a3 = Sse2.Subtract(output0.AsInt16(), output2.AsInt16());
+            b0 = Sse2.Add(a0, a1);
+            b1 = Sse2.Add(a3, a2);
+            b2 = Sse2.Subtract(a3, a2);
+            b3 = Sse2.Subtract(a0, a1);
+
+            // Separate the transforms of inA and inB.
+            Vector128<long> ab0 = Sse2.UnpackLow(b0.AsInt64(), b1.AsInt64());
+            Vector128<long> ab2 = Sse2.UnpackLow(b2.AsInt64(), b3.AsInt64());
+            Vector128<long> bb0 = Sse2.UnpackHigh(b0.AsInt64(), b1.AsInt64());
+            Vector128<long> bb2 = Sse2.UnpackHigh(b2.AsInt64(), b3.AsInt64());
+
+            Vector128<ushort> ab0Abs = Ssse3.Abs(ab0.AsInt16());
+            Vector128<ushort> ab2Abs = Ssse3.Abs(ab2.AsInt16());
+            Vector128<ushort> bb0Abs = Ssse3.Abs(bb0.AsInt16());
+            Vector128<ushort> bb2Abs = Ssse3.Abs(bb2.AsInt16());
+
+            // Weighted sums.
+            Vector128<int> ab0mulw0 = Sse2.MultiplyAddAdjacent(ab0Abs.AsInt16(), w0.AsInt16());
+            Vector128<int> ab2mulw8 = Sse2.MultiplyAddAdjacent(ab2Abs.AsInt16(), w8.AsInt16());
+            Vector128<int> bb0mulw0 = Sse2.MultiplyAddAdjacent(bb0Abs.AsInt16(), w0.AsInt16());
+            Vector128<int> bb2mulw8 = Sse2.MultiplyAddAdjacent(bb2Abs.AsInt16(), w8.AsInt16());
+            Vector128<int> ab0ab2Sum = Sse2.Add(ab0mulw0, ab2mulw8);
+            Vector128<int> bb0bb2Sum = Sse2.Add(bb0mulw0, bb2mulw8);
+
+            // Difference of weighted sums (per lane); the lanes are added up below via the scratch buffer.
+            Vector128<int> result = Sse2.Subtract(ab0ab2Sum, bb0bb2Sum);
+
+            Unsafe.As<int, Vector128<int>>(ref MemoryMarshal.GetReference(sum)) = result;
+            return sum[3] + sum[2] + sum[1] + sum[0];
+        }
+#endif
+
         public static void TransformTwo(Span<short> src, Span<byte> dst, Span<int> scratch)
         {
             TransformOne(src, dst, scratch);
diff --git a/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs b/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs
new file mode 100644
index 0000000000..f8b488fde5
--- /dev/null
+++ b/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs
@@ -0,0 +1,52 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using SixLabors.ImageSharp.Formats.Webp.Lossy;
+using SixLabors.ImageSharp.Tests.TestUtilities;
+using Xunit;
+
+namespace SixLabors.ImageSharp.Tests.Formats.WebP
+{
+    [Trait("Format", "Webp")]
+    public class LossyUtilsTests
+    {
+        // Shared test body; the [Fact]s below run it as-is and with hardware intrinsics allowed/disabled.
+        private static void RunHadamardTransformTest()
+        {
+            byte[] blockA =
+            {
+                27, 27, 28, 29, 29, 28, 27, 27, 27, 28, 28, 29, 29, 28, 28, 27, 129, 129, 129, 129, 129, 129, 129,
+                129, 128, 128, 128, 128, 128, 128, 128, 128, 27, 27, 27, 27, 27, 27, 27, 27, 27, 28, 28, 29, 29, 28,
+                28, 27, 129, 129, 129, 129, 129, 129, 129, 129, 128, 128, 128, 128, 128, 128, 128, 128, 27, 27, 26,
+                26, 26, 26, 27, 27, 27, 28, 28, 29, 29, 28, 28, 27, 129, 129, 129, 129, 129, 129, 129, 129, 128,
+                128, 128, 128, 128, 128, 128, 128, 28, 27, 27, 26, 26, 27, 27, 28, 27, 28, 28, 29, 29, 28, 28, 27
+            };
+
+            byte[] blockB =
+            {
+                28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 204, 204, 204, 204, 204, 204, 204,
+                204, 204, 204, 204, 204, 204, 204, 204, 204, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+                28, 28, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 28, 28, 28,
+                28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 204, 204, 204, 204, 204, 204, 204, 204, 204,
+                204, 204, 204, 204, 204, 204, 204, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28
+            };
+
+            ushort[] weights = { 38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2 };
+            int[] scratch = new int[16];
+
+            int distortion = LossyUtils.Vp8Disto4X4(blockA, blockB, weights, scratch);
+            Assert.Equal(2, distortion);
+        }
+
+        [Fact]
+        public void HadamardTransform_Works() => RunHadamardTransformTest();
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+        [Fact]
+        public void HadamardTransform_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.AllowAll);
+
+        [Fact]
+        public void HadamardTransform_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunHadamardTransformTest, HwIntrinsics.DisableHWIntrinsic);
+#endif
+    }
+}