diff --git a/src/Microsoft.Data.Analysis/ArrowStringDataFrameColumn.cs b/src/Microsoft.Data.Analysis/DataFrameColumns/ArrowStringDataFrameColumn.cs similarity index 100% rename from src/Microsoft.Data.Analysis/ArrowStringDataFrameColumn.cs rename to src/Microsoft.Data.Analysis/DataFrameColumns/ArrowStringDataFrameColumn.cs diff --git a/src/Microsoft.Data.Analysis/BooleanDataFrameColumn.cs b/src/Microsoft.Data.Analysis/DataFrameColumns/PrimitiveDataFrameColumns/BooleanDataFrameColumn.cs similarity index 100% rename from src/Microsoft.Data.Analysis/BooleanDataFrameColumn.cs rename to src/Microsoft.Data.Analysis/DataFrameColumns/PrimitiveDataFrameColumns/BooleanDataFrameColumn.cs diff --git a/src/Microsoft.Data.Analysis/ByteDataFrameColumn.cs b/src/Microsoft.Data.Analysis/DataFrameColumns/PrimitiveDataFrameColumns/ByteDataFrameColumn.cs similarity index 100% rename from src/Microsoft.Data.Analysis/ByteDataFrameColumn.cs rename to src/Microsoft.Data.Analysis/DataFrameColumns/PrimitiveDataFrameColumns/ByteDataFrameColumn.cs diff --git a/src/Microsoft.Data.Analysis/CharDataFrameColumn.cs b/src/Microsoft.Data.Analysis/DataFrameColumns/PrimitiveDataFrameColumns/CharDataFrameColumn.cs similarity index 100% rename from src/Microsoft.Data.Analysis/CharDataFrameColumn.cs rename to src/Microsoft.Data.Analysis/DataFrameColumns/PrimitiveDataFrameColumns/CharDataFrameColumn.cs diff --git a/src/Microsoft.Data.Analysis/DateTimeDataFrameColumn.cs b/src/Microsoft.Data.Analysis/DataFrameColumns/PrimitiveDataFrameColumns/DateTimeDataFrameColumn.cs similarity index 100% rename from src/Microsoft.Data.Analysis/DateTimeDataFrameColumn.cs rename to src/Microsoft.Data.Analysis/DataFrameColumns/PrimitiveDataFrameColumns/DateTimeDataFrameColumn.cs diff --git a/src/Microsoft.Data.Analysis/DecimalDataFrameColumn.cs b/src/Microsoft.Data.Analysis/DataFrameColumns/PrimitiveDataFrameColumns/DecimalDataFrameColumn.cs similarity index 100% rename from src/Microsoft.Data.Analysis/DecimalDataFrameColumn.cs rename to src/Microsoft.Data.Analysis/DataFrameColumns/PrimitiveDataFrameColumns/DecimalDataFrameColumn.cs diff --git a/src/Microsoft.Data.Analysis/DoubleDataFrameColumn.cs b/src/Microsoft.Data.Analysis/DataFrameColumns/PrimitiveDataFrameColumns/DoubleDataFrameColumn.cs similarity index 100% rename from src/Microsoft.Data.Analysis/DoubleDataFrameColumn.cs rename to src/Microsoft.Data.Analysis/DataFrameColumns/PrimitiveDataFrameColumns/DoubleDataFrameColumn.cs diff --git a/src/Microsoft.Data.Analysis/Int16DataFrameColumn.cs b/src/Microsoft.Data.Analysis/DataFrameColumns/PrimitiveDataFrameColumns/Int16DataFrameColumn.cs similarity index 100% rename from src/Microsoft.Data.Analysis/Int16DataFrameColumn.cs rename to src/Microsoft.Data.Analysis/DataFrameColumns/PrimitiveDataFrameColumns/Int16DataFrameColumn.cs diff --git a/src/Microsoft.Data.Analysis/Int32DataFrameColumn.cs b/src/Microsoft.Data.Analysis/DataFrameColumns/PrimitiveDataFrameColumns/Int32DataFrameColumn.cs similarity index 100% rename from src/Microsoft.Data.Analysis/Int32DataFrameColumn.cs rename to src/Microsoft.Data.Analysis/DataFrameColumns/PrimitiveDataFrameColumns/Int32DataFrameColumn.cs diff --git a/src/Microsoft.Data.Analysis/Int64DataFrameColumn.cs b/src/Microsoft.Data.Analysis/DataFrameColumns/PrimitiveDataFrameColumns/Int64DataFrameColumn.cs similarity index 100% rename from src/Microsoft.Data.Analysis/Int64DataFrameColumn.cs rename to src/Microsoft.Data.Analysis/DataFrameColumns/PrimitiveDataFrameColumns/Int64DataFrameColumn.cs diff --git a/src/Microsoft.Data.Analysis/SByteDataFrameColumn.cs b/src/Microsoft.Data.Analysis/DataFrameColumns/PrimitiveDataFrameColumns/SByteDataFrameColumn.cs similarity index 100% rename from src/Microsoft.Data.Analysis/SByteDataFrameColumn.cs rename to src/Microsoft.Data.Analysis/DataFrameColumns/PrimitiveDataFrameColumns/SByteDataFrameColumn.cs diff --git a/src/Microsoft.Data.Analysis/SingleDataFrameColumn.cs b/src/Microsoft.Data.Analysis/DataFrameColumns/PrimitiveDataFrameColumns/SingleDataFrameColumn.cs similarity index 100% rename from src/Microsoft.Data.Analysis/SingleDataFrameColumn.cs rename to src/Microsoft.Data.Analysis/DataFrameColumns/PrimitiveDataFrameColumns/SingleDataFrameColumn.cs diff --git a/src/Microsoft.Data.Analysis/UInt16DataFrameColumn.cs b/src/Microsoft.Data.Analysis/DataFrameColumns/PrimitiveDataFrameColumns/UInt16DataFrameColumn.cs similarity index 100% rename from src/Microsoft.Data.Analysis/UInt16DataFrameColumn.cs rename to src/Microsoft.Data.Analysis/DataFrameColumns/PrimitiveDataFrameColumns/UInt16DataFrameColumn.cs diff --git a/src/Microsoft.Data.Analysis/UInt32DataFrameColumn.cs b/src/Microsoft.Data.Analysis/DataFrameColumns/PrimitiveDataFrameColumns/UInt32DataFrameColumn.cs similarity index 100% rename from src/Microsoft.Data.Analysis/UInt32DataFrameColumn.cs rename to src/Microsoft.Data.Analysis/DataFrameColumns/PrimitiveDataFrameColumns/UInt32DataFrameColumn.cs diff --git a/src/Microsoft.Data.Analysis/UInt64DataFrameColumn.cs b/src/Microsoft.Data.Analysis/DataFrameColumns/PrimitiveDataFrameColumns/UInt64DataFrameColumn.cs similarity index 100% rename from src/Microsoft.Data.Analysis/UInt64DataFrameColumn.cs rename to src/Microsoft.Data.Analysis/DataFrameColumns/PrimitiveDataFrameColumns/UInt64DataFrameColumn.cs diff --git a/src/Microsoft.Data.Analysis/StringDataFrameColumn.cs b/src/Microsoft.Data.Analysis/DataFrameColumns/StringDataFrameColumn.cs similarity index 100% rename from src/Microsoft.Data.Analysis/StringDataFrameColumn.cs rename to src/Microsoft.Data.Analysis/DataFrameColumns/StringDataFrameColumn.cs diff --git a/src/Microsoft.Data.Analysis/VBufferDataFrameColumn.cs b/src/Microsoft.Data.Analysis/DataFrameColumns/VBufferDataFrameColumn.cs similarity index 100% rename from src/Microsoft.Data.Analysis/VBufferDataFrameColumn.cs rename to src/Microsoft.Data.Analysis/DataFrameColumns/VBufferDataFrameColumn.cs diff --git a/src/Microsoft.Data.Analysis/ArrayUtility.cs b/src/Microsoft.Data.Analysis/Utils/ArrayUtility.cs similarity index 100% rename from src/Microsoft.Data.Analysis/ArrayUtility.cs rename to src/Microsoft.Data.Analysis/Utils/ArrayUtility.cs diff --git a/src/Microsoft.Data.Analysis/BitUtility.cs b/src/Microsoft.Data.Analysis/Utils/BitUtility.cs similarity index 100% rename from src/Microsoft.Data.Analysis/BitUtility.cs rename to src/Microsoft.Data.Analysis/Utils/BitUtility.cs diff --git a/test/Microsoft.Data.Analysis.Tests/ArrowStringColumnTests.cs b/test/Microsoft.Data.Analysis.Tests/ArrowStringColumnTests.cs new file mode 100644 index 0000000000..c2f2c9040c --- /dev/null +++ b/test/Microsoft.Data.Analysis.Tests/ArrowStringColumnTests.cs @@ -0,0 +1,106 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using Apache.Arrow; +using Xunit; + +namespace Microsoft.Data.Analysis.Tests +{ + public class ArrowStringColumnTests + { + + [Fact] + public void TestBasicArrowStringColumn() + { + StringArray strArray = new StringArray.Builder().Append("foo").Append("bar").Build(); + Memory dataMemory = new byte[] { 102, 111, 111, 98, 97, 114 }; + Memory nullMemory = new byte[] { 0, 0, 0, 0 }; + Memory offsetMemory = new byte[] { 0, 0, 0, 0, 3, 0, 0, 0, 6, 0, 0, 0 }; + + ArrowStringDataFrameColumn stringColumn = new ArrowStringDataFrameColumn("String", dataMemory, offsetMemory, nullMemory, strArray.Length, strArray.NullCount); + Assert.Equal(2, stringColumn.Length); + Assert.Equal("foo", stringColumn[0]); + Assert.Equal("bar", stringColumn[1]); + } + + [Fact] + public void TestArrowStringColumnWithNulls() + { + string data = "joemark"; + byte[] bytes = Encoding.UTF8.GetBytes(data); + Memory dataMemory = new Memory(bytes); + Memory nullMemory = new byte[] { 0b1101 }; + Memory offsetMemory = new byte[] { 0, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0, 7, 0, 0, 0 }; + ArrowStringDataFrameColumn stringColumn = new ArrowStringDataFrameColumn("String", dataMemory, offsetMemory, nullMemory, 4, 1); + + Assert.Equal(4, stringColumn.Length); + Assert.Equal("joe", stringColumn[0]); + Assert.Null(stringColumn[1]); + Assert.Equal("mark", stringColumn[2]); + Assert.Equal("", stringColumn[3]); + + List ret = stringColumn[0, 4]; + Assert.Equal("joe", ret[0]); + Assert.Null(ret[1]); + Assert.Equal("mark", ret[2]); + Assert.Equal("", ret[3]); + } + + [Fact] + public void TestArrowStringColumnClone() + { + StringArray strArray = new StringArray.Builder().Append("foo").Append("bar").Build(); + Memory dataMemory = new byte[] { 102, 111, 111, 98, 97, 114 }; + Memory nullMemory = new byte[] { 0, 0, 0, 0 }; + Memory offsetMemory = new byte[] { 0, 0, 0, 0, 3, 0, 0, 0, 6, 0, 0, 0 }; + + ArrowStringDataFrameColumn stringColumn = new ArrowStringDataFrameColumn("String", dataMemory, offsetMemory, nullMemory, strArray.Length, strArray.NullCount); + + DataFrameColumn clone = stringColumn.Clone(numberOfNullsToAppend: 5); + Assert.Equal(7, clone.Length); + Assert.Equal(stringColumn[0], clone[0]); + Assert.Equal(stringColumn[1], clone[1]); + for (int i = 2; i < 7; i++) + Assert.Null(clone[i]); + } + + [Fact] + public void TestArrowStringApply() + { + ArrowStringDataFrameColumn column = DataFrameTests.CreateArrowStringColumn(10); + ArrowStringDataFrameColumn ret = column.Apply((string cur) => + { + if (cur != null) + { + return cur + "123"; + } + return null; + }); + for (long i = 0; i < column.Length; i++) + { + if (column[i] != null) + { + Assert.Equal(column[i] + "123", ret[i]); + } + else + { + Assert.Null(ret[i]); + } + } + Assert.Equal(1, ret.NullCount); + + // Test null counts + ret = column.Apply((string cur) => + { + return null; + }); + Assert.Equal(column.Length, ret.NullCount); + } + } +} diff --git a/test/Microsoft.Data.Analysis.Tests/DataFrameJoinTests.cs b/test/Microsoft.Data.Analysis.Tests/DataFrameJoinExtensionsTests.cs similarity index 84% rename from test/Microsoft.Data.Analysis.Tests/DataFrameJoinTests.cs rename to test/Microsoft.Data.Analysis.Tests/DataFrameJoinExtensionsTests.cs index a465ee70f5..6a20bdee47 100644 --- a/test/Microsoft.Data.Analysis.Tests/DataFrameJoinTests.cs +++ b/test/Microsoft.Data.Analysis.Tests/DataFrameJoinExtensionsTests.cs @@ -8,10 +8,10 @@ namespace Microsoft.Data.Analysis.Tests { - public class DataFrameJoinTests + public class DataFrameJoinExtensionsTests { [Fact] - public void DataFrameJoinTests_GetSortedListsIntersection_EmptyCollections_EmptyResult() + public void GetSortedListsIntersection_EmptyCollections_EmptyResult() { // Arrange @@ -28,7 +28,7 @@ public void DataFrameJoinTests_GetSortedListsIntersection_EmptyCollections_Empty } [Fact] - public void DataFrameJoinTests_GetSortedListsIntersection_EmptyCollections_FirstIsNotEmpty_EmptyResult() + public void GetSortedListsIntersection_EmptyCollections_FirstIsNotEmpty_EmptyResult() { // Arrange @@ -51,7 +51,7 @@ public void DataFrameJoinTests_GetSortedListsIntersection_EmptyCollections_First } [Fact] - public void DataFrameJoinTests_GetSortedListsIntersection_EmptyCollections_SecondIsNotEmpty_EmptyResult() + public void GetSortedListsIntersection_EmptyCollections_SecondIsNotEmpty_EmptyResult() { // Arrange @@ -74,7 +74,7 @@ public void DataFrameJoinTests_GetSortedListsIntersection_EmptyCollections_Secon } [Fact] - public void DataFrameJoinTests_GetSortedListsIntersection_SortedCollections_WithoutIntersection_Success() + public void GetSortedListsIntersection_SortedCollections_WithoutIntersection_Success() { // Arrange @@ -105,7 +105,7 @@ public void DataFrameJoinTests_GetSortedListsIntersection_SortedCollections_With } [Fact] - public void DataFrameJoinTests_GetSortedListsIntersection_SortedCollections_WithIntersection_Success() + public void GetSortedListsIntersection_SortedCollections_WithIntersection_Success() { // Arrange diff --git a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.BinaryOperations.cs b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.BinaryOperations.cs new file mode 100644 index 0000000000..c077bd201d --- /dev/null +++ b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.BinaryOperations.cs @@ -0,0 +1,477 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using Apache.Arrow; +using Xunit; + +namespace Microsoft.Data.Analysis.Tests +{ + public partial class DataFrameTests + { + [Fact] + public void TestBinaryOperations() + { + DataFrame df = MakeDataFrameWithTwoColumns(12); + IReadOnlyList listOfInts = new List() { 5, 5 }; + + // The following binary ops return a copy + var ret = df.Add(5); + Assert.Equal(0, df[0, 0]); + Assert.Equal(5, ret[0, 0]); + ret = df.Add(listOfInts); + Assert.Equal(0, df[0, 0]); + Assert.Equal(5, ret[0, 0]); + ret = df.Subtract(5); + Assert.Equal(0, df[0, 0]); + Assert.Equal(-5, ret[0, 0]); + ret = df.Subtract(listOfInts); + Assert.Equal(0, df[0, 0]); + Assert.Equal(-5, ret[0, 0]); + ret = df.Multiply(5); + Assert.Equal(1, df[1, 0]); + Assert.Equal(5, ret[1, 0]); + ret = df.Multiply(listOfInts); + Assert.Equal(1, df[1, 0]); + Assert.Equal(5, ret[1, 0]); + ret = df.Divide(5); + Assert.Equal(5, df[5, 0]); + Assert.Equal(1, ret[5, 0]); + ret = df.Divide(listOfInts); + Assert.Equal(5, df[5, 0]); + Assert.Equal(1, ret[5, 0]); + ret = df.Modulo(5); + Assert.Equal(5, df[5, 0]); + Assert.Equal(0, ret[5, 0]); + ret = df.Modulo(listOfInts); + Assert.Equal(5, df[5, 0]); + Assert.Equal(0, ret[5, 0]); + + Assert.Equal(true, df.ElementwiseGreaterThanOrEqual(5)[7, 0]); + Assert.Equal(true, df.ElementwiseGreaterThanOrEqual(listOfInts)[7, 0]); + Assert.Equal(true, df.ElementwiseLessThanOrEqual(5)[4, 0]); + Assert.Equal(true, df.ElementwiseLessThanOrEqual(listOfInts)[4, 0]); + Assert.Equal(false, df.ElementwiseGreaterThan(5)[5, 0]); + Assert.Equal(false, df.ElementwiseGreaterThan(listOfInts)[5, 0]); + Assert.Equal(false, df.ElementwiseLessThan(5)[5, 0]); + Assert.Equal(false, df.ElementwiseLessThan(listOfInts)[5, 0]); + // The following binary ops are in place + Assert.Equal(5, df.Add(5, inPlace: true)[0, 0]); + Assert.Equal(10, df.Add(listOfInts, inPlace: true)[0, 0]); + Assert.Equal(5, df.Subtract(5, inPlace: true)[0, 0]); + Assert.Equal(0, df.Subtract(listOfInts, inPlace: true)[0, 0]); + Assert.Equal(5, df.Multiply(5, inPlace: true)[1, 0]); + Assert.Equal(25, df.Multiply(listOfInts, inPlace: true)[1, 0]); + Assert.Equal(5, df.Divide(5, inPlace: true)[1, 0]); + Assert.Equal(1, df.Divide(listOfInts, inPlace: true)[1, 0]); + Assert.Equal(1, df.Modulo(5, inPlace: true)[1, 0]); + Assert.Equal(1, df.Modulo(listOfInts, inPlace: true)[1, 0]); + Assert.Equal(2, df.LeftShift(1)[1, 0]); + Assert.Equal(1, df.RightShift(1)[2, 0]); + } + + [Fact] + public void TestBinaryOperationsWithColumns() + { + int length = 10; + var df1 = MakeDataFrameWithNumericColumns(length); + var df2 = MakeDataFrameWithNumericColumns(length); + + DataFrameColumn newColumn; + DataFrameColumn verify; + for (int i = 0; i < df1.Columns.Count; i++) + { + newColumn = df1.Columns[df1.Columns[i].Name] + df2.Columns[df2.Columns[i].Name]; + verify = newColumn.ElementwiseEquals(df1.Columns[i] * 2); + Assert.Equal(true, verify[0]); + + newColumn = df1.Columns[df1.Columns[i].Name] - df2.Columns[df2.Columns[i].Name]; + verify = newColumn.ElementwiseEquals(0); + Assert.Equal(true, verify[0]); + + newColumn = df1.Columns[df1.Columns[i].Name] * df2.Columns[df2.Columns[i].Name]; + verify = newColumn.ElementwiseEquals(df1.Columns[i] * df1.Columns[i]); + Assert.Equal(true, verify[0]); + + var df1Column = df1.Columns[i] + 1; + var df2Column = df2.Columns[i] + 1; + newColumn = df1Column / df2Column; + verify = newColumn.ElementwiseEquals(1); + Assert.Equal(true, verify[0]); + + newColumn = df1Column % df2Column; + verify = newColumn.ElementwiseEquals(0); + Assert.Equal(true, verify[0]); + + verify = df1.Columns[df1.Columns[i].Name].ElementwiseEquals(df2.Columns[df2.Columns[i].Name]); + Assert.True(verify.All()); + + verify = df1.Columns[df1.Columns[i].Name].ElementwiseNotEquals(df2.Columns[df2.Columns[i].Name]); + Assert.False(verify.Any()); + + verify = df1.Columns[df1.Columns[i].Name].ElementwiseGreaterThanOrEqual(df2.Columns[df2.Columns[i].Name]); + Assert.True(verify.All()); + + verify = df1.Columns[df1.Columns[i].Name].ElementwiseLessThanOrEqual(df2.Columns[df2.Columns[i].Name]); + Assert.True(verify.All()); + + verify = df1.Columns[df1.Columns[i].Name].ElementwiseGreaterThan(df2.Columns[df2.Columns[i].Name]); + Assert.False(verify.Any()); + + verify = df1.Columns[df1.Columns[i].Name].ElementwiseLessThan(df2.Columns[df2.Columns[i].Name]); + Assert.False(verify.Any()); + } + } + + [Fact] + public void TestBinaryOperationsWithConversions() + { + DataFrame df = DataFrameTests.MakeDataFrameWithTwoColumns(10); + + // Add a double to an int column + DataFrame dfd = df.Add(5.0f); + var dtype = dfd.Columns[0].DataType; + Assert.True(dtype == typeof(double)); + + // Add a decimal to an int column + DataFrame dfm = df.Add(5.0m); + dtype = dfm.Columns[0].DataType; + Assert.True(dtype == typeof(decimal)); + + // int + bool should throw + Assert.Throws(() => df.Add(true)); + + var dataFrameColumn1 = new DoubleDataFrameColumn("Double1", Enumerable.Range(0, 10).Select(x => (double)x)); + df.Columns[0] = dataFrameColumn1; + // Double + comparison ops should throw + Assert.Throws(() => df.And(true)); + } + + [Fact] + public void TestBinaryOperationsOnBoolColumn() + { + var df = new DataFrame(); + var dataFrameColumn1 = new BooleanDataFrameColumn("Bool1", Enumerable.Range(0, 10).Select(x => true)); + var dataFrameColumn2 = new BooleanDataFrameColumn("Bool2", Enumerable.Range(0, 10).Select(x => true)); + df.Columns.Insert(0, dataFrameColumn1); + df.Columns.Insert(1, dataFrameColumn2); + + // bool + int should throw + Assert.Throws(() => df.Add(5)); + // Left shift should throw + Assert.Throws(() => df.LeftShift(5)); + + IReadOnlyList listOfBools = new List() { true, false }; + // boolean and And should work + var newdf = df.And(true); + Assert.Equal(true, newdf[4, 0]); + var newdf1 = df.And(listOfBools); + Assert.Equal(false, newdf1[4, 1]); + + newdf = df.Or(true); + Assert.Equal(true, newdf[4, 0]); + newdf1 = df.Or(listOfBools); + Assert.Equal(true, newdf1[4, 1]); + + newdf = df.Xor(true); + Assert.Equal(false, newdf[4, 0]); + newdf1 = df.Xor(listOfBools); + Assert.Equal(true, newdf1[4, 1]); + } + + [Fact] + public void TestBinaryOperationsOnDateTimeColumn() + { + var df = new DataFrame(); + var dataFrameColumn1 = new DateTimeDataFrameColumn("DateTime1", Enumerable.Range(0, 5).Select(x => SampleDateTime.AddDays(x))); + // Make the second data frame column have one value that is different + var dataFrameColumn2 = new DateTimeDataFrameColumn("DateTime2", Enumerable.Range(0, 4).Select(x => SampleDateTime.AddDays(x))); + dataFrameColumn2.Append(SampleDateTime.AddDays(6)); + df.Columns.Insert(0, dataFrameColumn1); + df.Columns.Insert(1, dataFrameColumn2); + + // DateTime + int should throw + Assert.Throws(() => df.Add(5)); + // Left shift should throw + Assert.Throws(() => df.LeftShift(5)); + // Right shift should throw + Assert.Throws(() => df.RightShift(5)); + + // And should throw + Assert.Throws(() => df.And(true)); + // Or should throw + Assert.Throws(() => df.Or(true)); + // Xor should throw + Assert.Throws(() => df.Xor(true)); + + var equalsResult = dataFrameColumn1.ElementwiseEquals(dataFrameColumn2); + Assert.True(equalsResult[0]); + Assert.False(equalsResult[4]); + + var equalsToScalarResult = df["DateTime1"].ElementwiseEquals(SampleDateTime); + Assert.True(equalsToScalarResult[0]); + Assert.False(equalsToScalarResult[1]); + + var notEqualsResult = dataFrameColumn1.ElementwiseNotEquals(dataFrameColumn2); + Assert.False(notEqualsResult[0]); + Assert.True(notEqualsResult[4]); + + var notEqualsToScalarResult = df["DateTime1"].ElementwiseNotEquals(SampleDateTime); + Assert.False(notEqualsToScalarResult[0]); + Assert.True(notEqualsToScalarResult[1]); + } + + [Fact] + public void TestBinaryOperationsOnArrowStringColumn() + { + var df = new DataFrame(); + var strArrayBuilder = new StringArray.Builder(); + for (int i = 0; i < 10; i++) + { + strArrayBuilder.Append(i.ToString()); + } + StringArray strArray = strArrayBuilder.Build(); + + ArrowStringDataFrameColumn stringColumn = new ArrowStringDataFrameColumn("String", strArray.ValueBuffer.Memory, strArray.ValueOffsetsBuffer.Memory, strArray.NullBitmapBuffer.Memory, strArray.Length, strArray.NullCount); + df.Columns.Insert(0, stringColumn); + + DataFrameColumn newCol = stringColumn.ElementwiseEquals(4); + Assert.Equal(true, newCol[4]); + Assert.Equal(false, newCol[0]); + Assert.Equal(false, newCol[5]); + + newCol = stringColumn.ElementwiseEquals("4"); + Assert.Equal(true, newCol[4]); + Assert.Equal(false, newCol[0]); + + newCol = stringColumn.ElementwiseEquals("foo"); + Assert.False(newCol.All()); + newCol = stringColumn.ElementwiseEquals(null); + Assert.False(newCol.All()); + + ArrowStringDataFrameColumn stringColumnCopy = new ArrowStringDataFrameColumn("String", strArray.ValueBuffer.Memory, strArray.ValueOffsetsBuffer.Memory, strArray.NullBitmapBuffer.Memory, strArray.Length, strArray.NullCount); + newCol = stringColumn.ElementwiseEquals(stringColumnCopy); + Assert.True(newCol.All()); + + DataFrameColumn stringColumnCopyAsBaseColumn = stringColumnCopy; + newCol = stringColumn.ElementwiseEquals(stringColumnCopyAsBaseColumn); + Assert.True(newCol.All()); + + newCol = stringColumn.ElementwiseNotEquals(5); + Assert.Equal(true, newCol[0]); + Assert.Equal(false, newCol[5]); + + newCol = stringColumn.ElementwiseNotEquals("5"); + Assert.Equal(true, newCol[0]); + Assert.Equal(false, newCol[5]); + + newCol = stringColumn.ElementwiseNotEquals("foo"); + Assert.True(newCol.All()); + newCol = stringColumn.ElementwiseNotEquals(null); + Assert.True(newCol.All()); + + newCol = stringColumn.ElementwiseNotEquals(stringColumnCopy); + Assert.False(newCol.All()); + + newCol = stringColumn.ElementwiseNotEquals(stringColumnCopyAsBaseColumn); + Assert.False(newCol.All()); + } + + [Fact] + public void TestBinaryOperationsOnStringColumn() + { + var df = new DataFrame(); + DataFrameColumn stringColumn = new StringDataFrameColumn("String", Enumerable.Range(0, 10).Select(x => x.ToString())); + df.Columns.Insert(0, stringColumn); + + DataFrameColumn newCol = stringColumn.ElementwiseEquals(5); + Assert.Equal(true, newCol[5]); + Assert.Equal(false, newCol[0]); + + newCol = (stringColumn as StringDataFrameColumn).ElementwiseEquals("5"); + Assert.Equal(true, newCol[5]); + Assert.Equal(false, newCol[0]); + + DataFrameColumn stringColumnCopy = new StringDataFrameColumn("String", Enumerable.Range(0, 10).Select(x => x.ToString())); + newCol = stringColumn.ElementwiseEquals(stringColumnCopy); + Assert.Equal(true, newCol[5]); + Assert.Equal(true, newCol[0]); + + StringDataFrameColumn typedStringColumn = stringColumn as StringDataFrameColumn; + StringDataFrameColumn typedStringColumnCopy = stringColumnCopy as StringDataFrameColumn; + newCol = typedStringColumn.ElementwiseEquals(typedStringColumnCopy); + Assert.True(newCol.All()); + + newCol = stringColumn.ElementwiseNotEquals(5); + Assert.Equal(false, newCol[5]); + Assert.Equal(true, newCol[0]); + + newCol = typedStringColumn.ElementwiseNotEquals("5"); + Assert.Equal(false, newCol[5]); + Assert.Equal(true, newCol[0]); + + newCol = stringColumn.ElementwiseNotEquals(stringColumnCopy); + Assert.Equal(false, newCol[5]); + Assert.Equal(false, newCol[0]); + + newCol = typedStringColumn.ElementwiseNotEquals(typedStringColumnCopy); + Assert.False(newCol.All()); + + newCol = typedStringColumn.Add("suffix"); + for (int i = 0; i < newCol.Length; i++) + { + Assert.Equal(newCol[i], typedStringColumn[i] + "suffix"); + } + DataFrameColumn addString = typedStringColumn + "suffix"; + for (int i = 0; i < addString.Length; i++) + { + Assert.Equal(addString[i], typedStringColumn[i] + "suffix"); + } + Assert.True(newCol.ElementwiseEquals(addString).All()); + addString = "prefix" + typedStringColumn; + for (int i = 0; i < addString.Length; i++) + { + Assert.Equal(addString[i], "prefix" + typedStringColumn[i]); + } + } + + [Fact] + public void TestBinaryOperatorsWithConversions() + { + var df = MakeDataFrameWithNumericColumns(10); + + DataFrame tempDf = df + 1; + Assert.Equal(tempDf[0, 0], (byte)df[0, 0] + (double)1); + tempDf = df + 1.1; + Assert.Equal(tempDf[0, 0], (byte)df[0, 0] + 1.1); + tempDf = df + 1.1m; + Assert.Equal(tempDf[0, 0], (byte)df[0, 0] + 1.1m); + Assert.True(typeof(decimal) == tempDf.Columns["Int"].DataType); + + tempDf = df - 1.1; + Assert.Equal(tempDf[0, 0], (byte)df[0, 0] - 1.1); + tempDf = df - 1.1m; + Assert.Equal(tempDf[0, 0], (byte)df[0, 0] - 1.1m); + Assert.True(typeof(decimal) == tempDf.Columns["Int"].DataType); + + tempDf = df * 1.1; + Assert.Equal(tempDf[0, 0], (byte)df[0, 0] * 1.1); + tempDf = df * 1.1m; + Assert.Equal(tempDf[0, 0], (byte)df[0, 0] * 1.1m); + Assert.True(typeof(decimal) == tempDf.Columns["Int"].DataType); + + tempDf = df / 1.1; + Assert.Equal(tempDf[0, 0], (byte)df[0, 0] / 1.1); + tempDf = df / 1.1m; + Assert.Equal(tempDf[0, 0], (byte)df[0, 0] / 1.1m); + Assert.True(typeof(decimal) == tempDf.Columns["Int"].DataType); + + tempDf = df % 1.1; + Assert.Equal(tempDf[0, 0], (byte)df[0, 0] % 1.1); + tempDf = df % 1.1m; + Assert.Equal(tempDf[0, 0], (byte)df[0, 0] % 1.1m); + Assert.True(typeof(decimal) == tempDf.Columns["Int"].DataType); + + tempDf = 1 + df; + Assert.Equal(tempDf[0, 0], (byte)df[0, 0] + (double)1); + tempDf = 1.1 + df; + Assert.Equal(tempDf[0, 0], (byte)df[0, 0] + 1.1); + tempDf = 1.1m + df; + Assert.Equal(tempDf[0, 0], (byte)df[0, 0] + 1.1m); + Assert.True(typeof(decimal) == tempDf.Columns["Int"].DataType); + + tempDf = 1.1 - df; + Assert.Equal(tempDf[0, 0], 1.1 - (byte)df[0, 0]); + tempDf = 1.1m - df; + Assert.Equal(tempDf[0, 0], 1.1m - (byte)df[0, 0]); + Assert.True(typeof(decimal) == tempDf.Columns["Int"].DataType); + + tempDf = 1.1 * df; + Assert.Equal(tempDf[0, 0], (byte)df[0, 0] * 1.1); + tempDf = 1.1m * df; + Assert.Equal(tempDf[0, 0], (byte)df[0, 0] * 1.1m); + Assert.True(typeof(decimal) == tempDf.Columns["Int"].DataType); + + // To prevent a divide by zero + var plusOne = df + 1; + tempDf = 1.1 / plusOne; + Assert.Equal(tempDf[0, 0], 1.1 / (double)plusOne[0, 0]); + var plusDecimal = df + 1.1m; + tempDf = 1.1m / plusDecimal; + Assert.Equal(tempDf[0, 0], (1.1m) / (decimal)plusDecimal[0, 0]); + Assert.True(typeof(decimal) == tempDf.Columns["Int"].DataType); + + tempDf = 1.1 % plusOne; + Assert.Equal(tempDf[0, 0], 1.1 % (double)plusOne[0, 0]); + tempDf = 1.1m % plusDecimal; + Assert.Equal(tempDf[0, 0], 1.1m % (decimal)plusDecimal[0, 0]); + Assert.True(typeof(decimal) == tempDf.Columns["Int"].DataType); + + Assert.Equal((byte)0, df[0, 0]); + } + + [Fact] + public void TestBinaryOperationsOnColumns() + { + Int32DataFrameColumn column = new Int32DataFrameColumn("Int", Enumerable.Range(0, 10)); + Assert.ThrowsAny(() => column.Add(5.5, inPlace: true)); + Assert.ThrowsAny(() => column.ReverseAdd(5.5, inPlace: true)); + string str = "A String"; + Assert.ThrowsAny(() => column.Add(str, inPlace: true)); + Assert.ThrowsAny(() => column.ReverseAdd(str, inPlace: true)); + } + + [Fact] + public void TestBinaryOperationsOnExplodedNumericColumns() + { + DataFrame df = MakeDataFrameWithNumericAndBoolColumns(10, withNulls: false); + Int32DataFrameColumn ints = df.Columns["Int"] as Int32DataFrameColumn; + Int32DataFrameColumn res = ints.Add(1).Subtract(1).Multiply(10).Divide(10).LeftShift(2).RightShift(2); + Assert.True(res.ElementwiseEquals(ints).All()); + Assert.True(res.ElementwiseGreaterThanOrEqual(ints).All()); + Assert.True(res.ElementwiseLessThanOrEqual(ints).All()); + Assert.False(res.ElementwiseNotEquals(ints).All()); + Assert.False(res.ElementwiseGreaterThan(ints).All()); + Assert.False(res.ElementwiseLessThan(ints).All()); + + // Test inPlace + Int32DataFrameColumn inPlace = ints.Add(1, inPlace: true).Subtract(1, inPlace: true).Multiply(10, inPlace: true).Divide(10, inPlace: true).LeftShift(2, inPlace: true).RightShift(2, inPlace: true).Add(100, inPlace: true); + Assert.True(inPlace.ElementwiseEquals(ints).All()); + Assert.True(inPlace.ElementwiseGreaterThanOrEqual(ints).All()); + Assert.True(inPlace.ElementwiseLessThanOrEqual(ints).All()); + Assert.False(inPlace.ElementwiseNotEquals(ints).All()); + Assert.False(inPlace.ElementwiseGreaterThan(ints).All()); + Assert.False(inPlace.ElementwiseLessThan(ints).All()); + + Assert.False(inPlace.ElementwiseEquals(res).All()); + Assert.True(inPlace.ElementwiseGreaterThanOrEqual(res).All()); + Assert.False(inPlace.ElementwiseLessThanOrEqual(res).All()); + Assert.True(inPlace.ElementwiseNotEquals(res).All()); + Assert.True(inPlace.ElementwiseGreaterThan(res).All()); + Assert.False(inPlace.ElementwiseLessThan(res).All()); + + // Test Bool column + BooleanDataFrameColumn bools = df.Columns["Bool"] as BooleanDataFrameColumn; + BooleanDataFrameColumn allFalse = bools.Or(true).And(true).Xor(true); + Assert.True(allFalse.ElementwiseEquals(false).All()); + + // Test inPlace + BooleanDataFrameColumn inPlaceAllFalse = bools.Or(true, inPlace: true).And(true, inPlace: true).Xor(true, inPlace: true); + Assert.True(inPlaceAllFalse.ElementwiseEquals(bools).All()); + + // Test Reverse Operations + Int32DataFrameColumn reverse = ints.ReverseAdd(1).ReverseSubtract(1).ReverseMultiply(-1); + Assert.True(reverse.ElementwiseEquals(ints).All()); + + // Test inPlace + Int32DataFrameColumn reverseInPlace = ints.ReverseAdd(1, inPlace: true).ReverseSubtract(1, inPlace: true).ReverseMultiply(-1, inPlace: true).ReverseDivide(100, inPlace: true); + Assert.True(reverseInPlace.ElementwiseEquals(ints).All()); + Assert.False(reverseInPlace.ElementwiseEquals(reverse).All()); + } + } +} diff --git a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.Computations.cs b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.Computations.cs new file mode 100644 index 0000000000..d62048aa89 --- /dev/null +++ b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.Computations.cs @@ -0,0 +1,477 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using Xunit; + +namespace Microsoft.Data.Analysis.Tests +{ + public partial class DataFrameTests + { + [Fact] + public void TestComputations() + { + DataFrame df = MakeDataFrameWithAllMutableColumnTypes(10); + df["Int"][0] = -10; + Assert.Equal(-10, df.Columns["Int"][0]); + + DataFrameColumn absColumn = df.Columns["Int"].Abs(); + Assert.Equal(10, absColumn[0]); + Assert.Equal(-10, df.Columns["Int"][0]); + df.Columns["Int"].Abs(true); + Assert.Equal(10, df.Columns["Int"][0]); + + Assert.Throws(() => df.Columns["Byte"].All()); + Assert.Throws(() => df.Columns["Byte"].Any()); + Assert.Throws(() => df.Columns["Char"].All()); + Assert.Throws(() => df.Columns["Char"].Any()); + Assert.Throws(() => df.Columns["DateTime"].All()); + Assert.Throws(() => df.Columns["DateTime"].Any()); + Assert.Throws(() => df.Columns["Decimal"].All()); + Assert.Throws(() => df.Columns["Decimal"].Any()); + Assert.Throws(() => df.Columns["Double"].All()); + Assert.Throws(() => df.Columns["Double"].Any()); + Assert.Throws(() => df.Columns["Float"].All()); + Assert.Throws(() => df.Columns["Float"].Any()); + Assert.Throws(() => df.Columns["Int"].All()); + Assert.Throws(() => df.Columns["Int"].Any()); + Assert.Throws(() => df.Columns["Long"].All()); + Assert.Throws(() => df.Columns["Long"].Any()); + Assert.Throws(() => df.Columns["Sbyte"].All()); + Assert.Throws(() => df.Columns["Sbyte"].Any()); + Assert.Throws(() => df.Columns["Short"].All()); + Assert.Throws(() => df.Columns["Short"].Any()); + Assert.Throws(() => df.Columns["Uint"].All()); + Assert.Throws(() => df.Columns["Uint"].Any()); + Assert.Throws(() => df.Columns["Ulong"].All()); + Assert.Throws(() => df.Columns["Ulong"].Any()); + Assert.Throws(() => df.Columns["Ushort"].All()); + Assert.Throws(() => df.Columns["Ushort"].Any()); + + bool any = df.Columns["Bool"].Any(); + bool all = df.Columns["Bool"].All(); + Assert.True(any); + Assert.False(all); + + // Test the computation results + df.Columns["Double"][0] = 100.0; + DataFrameColumn doubleColumn = df.Columns["Double"].CumulativeMax(); + for (int i = 0; i < doubleColumn.Length; i++) + { + if (i == 5) + Assert.Null(doubleColumn[i]); + else + Assert.Equal(100.0, (double)doubleColumn[i]); + } + Assert.Equal(1.0, df.Columns["Double"][1]); + df.Columns["Double"].CumulativeMax(true); + for (int i = 0; i < df.Columns["Double"].Length; i++) + { + if (i == 5) + Assert.Null(df.Columns["Double"][i]); + else + Assert.Equal(100.0, (double)df.Columns["Double"][i]); + } + + df.Columns["Float"][0] = -10.0f; + DataFrameColumn floatColumn = df.Columns["Float"].CumulativeMin(); + for (int i = 0; i < floatColumn.Length; i++) + { + if (i == 5) + Assert.Null(floatColumn[i]); + else + Assert.Equal(-10.0f, (float)floatColumn[i]); + } + Assert.Equal(9.0f, df.Columns["Float"][9]); + df.Columns["Float"].CumulativeMin(true); + for (int i = 0; i < df.Columns["Float"].Length; i++) + { + if (i == 5) + Assert.Null(df.Columns["Float"][i]); + else + Assert.Equal(-10.0f, (float)df.Columns["Float"][i]); + } + + DataFrameColumn uintColumn = df.Columns["Uint"].CumulativeProduct(); + Assert.Equal((uint)0, uintColumn[8]); + Assert.Equal((uint)8, df.Columns["Uint"][8]); + df.Columns["Uint"].CumulativeProduct(true); + Assert.Equal((uint)0, df.Columns["Uint"][9]); + + DataFrameColumn ushortColumn = df.Columns["Ushort"].CumulativeSum(); + Assert.Equal((ushort)40, ushortColumn[9]); + Assert.Equal((ushort)9, df.Columns["Ushort"][9]); + df.Columns["Ushort"].CumulativeSum(true); + Assert.Equal((ushort)40, df.Columns["Ushort"][9]); + + Assert.Equal(100.0, df.Columns["Double"].Max()); + Assert.Equal(-10.0f, df.Columns["Float"].Min()); + Assert.Equal((uint)0, df.Columns["Uint"].Product()); + Assert.Equal((ushort)130, df.Columns["Ushort"].Sum()); + + df.Columns["Double"][0] = 100.1; + Assert.Equal(100.1, df.Columns["Double"][0]); + DataFrameColumn roundColumn = df.Columns["Double"].Round(); + Assert.Equal(100.0, roundColumn[0]); + Assert.Equal(100.1, df.Columns["Double"][0]); + df.Columns["Double"].Round(true); + Assert.Equal(100.0, df.Columns["Double"][0]); + + // Test that none of the numeric column types throw + for (int i = 0; i < df.Columns.Count; i++) + { + DataFrameColumn column = df.Columns[i]; + if (column.DataType == typeof(bool)) + { + Assert.Throws(() => column.CumulativeMax()); + Assert.Throws(() => column.CumulativeMin()); + Assert.Throws(() => column.CumulativeProduct()); + Assert.Throws(() => column.CumulativeSum()); + Assert.Throws(() => column.Max()); + Assert.Throws(() => column.Min()); + Assert.Throws(() => column.Product()); + Assert.Throws(() => column.Sum()); + continue; + } + else if (column.DataType == typeof(string)) + { + Assert.Throws(() => column.CumulativeMax()); + Assert.Throws(() => column.CumulativeMin()); + Assert.Throws(() => column.CumulativeProduct()); + Assert.Throws(() => column.CumulativeSum()); + Assert.Throws(() => column.Max()); + Assert.Throws(() => column.Min()); + Assert.Throws(() => column.Product()); + Assert.Throws(() => column.Sum()); + continue; + } + else if (column.DataType == typeof(DateTime)) + { + column.CumulativeMax(); + column.CumulativeMin(); + column.Max(); + column.Min(); + + Assert.Throws(() => column.CumulativeProduct()); + Assert.Throws(() => column.CumulativeSum()); + Assert.Throws(() => column.Product()); + Assert.Throws(() => column.Sum()); + continue; + } + + column.CumulativeMax(); + column.CumulativeMin(); + column.CumulativeProduct(); + column.CumulativeSum(); + column.Max(); + column.Min(); + column.Product(); + column.Sum(); + } + } + + [Fact] + public void TestComputationsIncludingDateTime() + { + DataFrame df = MakeDataFrameWithNumericStringAndDateTimeColumns(10); + df["Int"][0] = -10; + Assert.Equal(-10, df.Columns["Int"][0]); + + DataFrameColumn absColumn = df.Columns["Int"].Abs(); + Assert.Equal(10, absColumn[0]); + Assert.Equal(-10, df.Columns["Int"][0]); + df.Columns["Int"].Abs(true); + Assert.Equal(10, df.Columns["Int"][0]); + + Assert.Throws(() => df.Columns["Byte"].All()); + Assert.Throws(() => df.Columns["Byte"].Any()); + Assert.Throws(() => df.Columns["Char"].All()); + Assert.Throws(() => df.Columns["Char"].Any()); + Assert.Throws(() => df.Columns["Decimal"].All()); + Assert.Throws(() => df.Columns["Decimal"].Any()); + Assert.Throws(() => df.Columns["Double"].All()); + Assert.Throws(() => df.Columns["Double"].Any()); + Assert.Throws(() => df.Columns["Float"].All()); + Assert.Throws(() => df.Columns["Float"].Any()); + Assert.Throws(() => df.Columns["Int"].All()); + Assert.Throws(() => df.Columns["Int"].Any()); + Assert.Throws(() => df.Columns["Long"].All()); + Assert.Throws(() => df.Columns["Long"].Any()); + Assert.Throws(() => df.Columns["Sbyte"].All()); + Assert.Throws(() => df.Columns["Sbyte"].Any()); + Assert.Throws(() => df.Columns["Short"].All()); + Assert.Throws(() => df.Columns["Short"].Any()); + Assert.Throws(() => df.Columns["Uint"].All()); + Assert.Throws(() => df.Columns["Uint"].Any()); + Assert.Throws(() => df.Columns["Ulong"].All()); + Assert.Throws(() => df.Columns["Ulong"].Any()); + Assert.Throws(() => df.Columns["Ushort"].All()); + Assert.Throws(() => df.Columns["Ushort"].Any()); + Assert.Throws(() => df.Columns["DateTime"].All()); + Assert.Throws(() => df.Columns["DateTime"].Any()); + + // Test the computation results + var maxDate = SampleDateTime.AddDays(100); + df.Columns["DateTime"][0] = maxDate; + DataFrameColumn dateTimeColumn = df.Columns["DateTime"].CumulativeMax(); + for (int i = 0; i < dateTimeColumn.Length; i++) + { + if (i == 5) + Assert.Null(dateTimeColumn[i]); + else + Assert.Equal(maxDate, (DateTime)dateTimeColumn[i]); + } + Assert.Equal(maxDate, dateTimeColumn.Max()); + + df.Columns["Double"][0] = 100.0; + DataFrameColumn doubleColumn = df.Columns["Double"].CumulativeMax(); + for (int i = 0; i < doubleColumn.Length; i++) + { + if (i == 5) + Assert.Null(doubleColumn[i]); + else + Assert.Equal(100.0, (double)doubleColumn[i]); + } + Assert.Equal(1.0, df.Columns["Double"][1]); + df.Columns["Double"].CumulativeMax(true); + for (int i = 0; i < df.Columns["Double"].Length; i++) + { + if (i == 5) + Assert.Null(df.Columns["Double"][i]); + else + Assert.Equal(100.0, (double)df.Columns["Double"][i]); + } + + df.Columns["Float"][0] = -10.0f; + DataFrameColumn floatColumn = df.Columns["Float"].CumulativeMin(); + for (int i = 0; i < floatColumn.Length; i++) + { + if (i == 5) + Assert.Null(floatColumn[i]); + else + Assert.Equal(-10.0f, (float)floatColumn[i]); + } + Assert.Equal(9.0f, df.Columns["Float"][9]); + df.Columns["Float"].CumulativeMin(true); + for (int i = 0; i < df.Columns["Float"].Length; i++) + { + if (i == 5) + Assert.Null(df.Columns["Float"][i]); + else + Assert.Equal(-10.0f, (float)df.Columns["Float"][i]); + } + + DataFrameColumn uintColumn = df.Columns["Uint"].CumulativeProduct(); + Assert.Equal((uint)0, uintColumn[8]); + Assert.Equal((uint)8, df.Columns["Uint"][8]); + df.Columns["Uint"].CumulativeProduct(true); + Assert.Equal((uint)0, df.Columns["Uint"][9]); + + DataFrameColumn ushortColumn = df.Columns["Ushort"].CumulativeSum(); + Assert.Equal((ushort)40, ushortColumn[9]); + Assert.Equal((ushort)9, df.Columns["Ushort"][9]); + df.Columns["Ushort"].CumulativeSum(true); + Assert.Equal((ushort)40, df.Columns["Ushort"][9]); + + Assert.Equal(100.0, df.Columns["Double"].Max()); + Assert.Equal(-10.0f, df.Columns["Float"].Min()); + Assert.Equal((uint)0, df.Columns["Uint"].Product()); + Assert.Equal((ushort)130, df.Columns["Ushort"].Sum()); + + df.Columns["Double"][0] = 100.1; + Assert.Equal(100.1, df.Columns["Double"][0]); + DataFrameColumn roundColumn = df.Columns["Double"].Round(); + Assert.Equal(100.0, roundColumn[0]); + Assert.Equal(100.1, df.Columns["Double"][0]); + df.Columns["Double"].Round(true); + Assert.Equal(100.0, df.Columns["Double"][0]); + + // Test that none of the numeric column types throw + for (int i = 0; i < df.Columns.Count; i++) + { + DataFrameColumn column = df.Columns[i]; + if (column.DataType == typeof(bool)) + { + Assert.Throws(() => column.CumulativeMax()); + Assert.Throws(() => column.CumulativeMin()); + Assert.Throws(() => column.CumulativeProduct()); + Assert.Throws(() => column.CumulativeSum()); + Assert.Throws(() => column.Max()); + Assert.Throws(() => column.Min()); + Assert.Throws(() => column.Product()); + Assert.Throws(() => column.Sum()); + continue; + } + else if (column.DataType == typeof(string)) + { + Assert.Throws(() => column.CumulativeMax()); + Assert.Throws(() => column.CumulativeMin()); + Assert.Throws(() => column.CumulativeProduct()); + Assert.Throws(() => column.CumulativeSum()); + Assert.Throws(() => column.Max()); + Assert.Throws(() => column.Min()); + Assert.Throws(() => column.Product()); + Assert.Throws(() => column.Sum()); + continue; + } + else if (column.DataType == typeof(DateTime)) + { + Assert.Throws(() => column.CumulativeProduct()); + Assert.Throws(() => column.CumulativeSum()); + Assert.Throws(() => column.Product()); + Assert.Throws(() => column.Sum()); + continue; + } + column.CumulativeMax(); + column.CumulativeMin(); + column.CumulativeProduct(); + column.CumulativeSum(); + column.Max(); + column.Min(); + column.Product(); + column.Sum(); + } + } + + [Fact] + public void TestIntComputations_MaxMin_WithNulls() + { + var column = new Int32DataFrameColumn("Int", new int?[] + { + null, + 2, + 1, + 4, + 3, + null + }); + + Assert.Equal(1, column.Min()); + Assert.Equal(4, column.Max()); + } + + [Fact] + public void TestIntSum_OnColumnWithNullsOnly() + { + var column = new Int32DataFrameColumn("Int", new int?[] { null, null }); + Assert.Null(column.Sum()); + } + + [Fact] + public void TestIntSum_OnEmptyColumn() + { + var column = new Int32DataFrameColumn("Int"); + Assert.Null(column.Sum()); + } + + [Fact] + public void TestIntComputations_MaxMin_OnEmptyColumn() + { + var column = new Int32DataFrameColumn("Int"); + + Assert.Null(column.Min()); + Assert.Null(column.Max()); + } + + [Fact] + public void TestDateTimeComputations_MaxMin_OnEmptyColumn() + { + var column = new DateTimeDataFrameColumn("DateTime"); + + Assert.Null(column.Min()); + Assert.Null(column.Max()); + } + + [Fact] + public void TestDateTimeComputations_MaxMin_WithNulls() + { + var dateTimeColumn = new DateTimeDataFrameColumn("DateTime", new DateTime?[] + { + null, + new DateTime(2022, 1, 1), + new DateTime(2020, 1, 1), + new DateTime(2023, 1, 1), + new DateTime(2021, 1, 1), + null + }); + + Assert.Equal(new DateTime(2020, 1, 1), dateTimeColumn.Min()); + Assert.Equal(new DateTime(2023, 1, 1), dateTimeColumn.Max()); + } + + [Theory] + [InlineData(5, 10)] + [InlineData(-15, 10)] + [InlineData(-5, 10)] + public void TestComputations_WithNegativeNumbers_MaxMin_Calculated(int startingFrom, int length) + { + // Arrange + + IEnumerable range = Enumerable.Range(startingFrom, length); + + int max = range.Max(); + int min = range.Min(); + + DataFrame df = MakeDataFrameWithNumericColumns(length, withNulls: false, startingFrom); + + var byteColumn = (PrimitiveDataFrameColumn)df.Columns["Byte"]; + var decimalColumn = (PrimitiveDataFrameColumn)df.Columns["Decimal"]; + var doubleColumn = (PrimitiveDataFrameColumn)df.Columns["Double"]; + var floatColumn = (PrimitiveDataFrameColumn)df.Columns["Float"]; + var intColumn = (PrimitiveDataFrameColumn)df.Columns["Int"]; + var longColumn = (PrimitiveDataFrameColumn)df.Columns["Long"]; + var sbyteColumn = (PrimitiveDataFrameColumn)df.Columns["Sbyte"]; + var shortColumn = (PrimitiveDataFrameColumn)df.Columns["Short"]; + var uintColumn = (PrimitiveDataFrameColumn)df.Columns["Uint"]; + var ulongColumn = (PrimitiveDataFrameColumn)df.Columns["Ulong"]; + var ushortColumn = (PrimitiveDataFrameColumn)df.Columns["Ushort"]; + + // Act, Assert + + // We need to iterate over all range with conversion to byte due to negative numbers issue + Assert.Equal((byte)byteColumn.Max(), range.Select(x => (byte)x).Max()); + + Assert.Equal((decimal)decimalColumn.Max(), (decimal)max); + Assert.Equal((double)doubleColumn.Max(), (double)max); + Assert.Equal((float)floatColumn.Max(), (float)max); + Assert.Equal((int)intColumn.Max(), (int)max); + Assert.Equal((long)longColumn.Max(), (long)max); + Assert.Equal((sbyte)sbyteColumn.Max(), (sbyte)max); + Assert.Equal((short)shortColumn.Max(), (short)max); + + // We need to iterate over all range with conversion to uint due to negative numbers issue + Assert.Equal((uint)uintColumn.Max(), range.Select(x => (uint)x).Max()); + + // We need to iterate over all range with conversion to ulong due to negative numbers issue + Assert.Equal((ulong)ulongColumn.Max(), range.Select(x => (ulong)x).Max()); + + // We need to iterate over all range with conversion to ushort due to negative numbers issue + Assert.Equal((ushort)ushortColumn.Max(), range.Select(x => (ushort)x).Max()); + + // We need to iterate over all range with conversion to byte due to negative numbers issue + Assert.Equal((byte)byteColumn.Min(), range.Select(x => (byte)x).Min()); + + Assert.Equal((decimal)decimalColumn.Min(), (decimal)min); + Assert.Equal((double)doubleColumn.Min(), (double)min); + Assert.Equal((float)floatColumn.Min(), (float)min); + Assert.Equal((int)intColumn.Min(), (int)min); + Assert.Equal((long)longColumn.Min(), (long)min); + Assert.Equal((sbyte)sbyteColumn.Min(), (sbyte)min); + Assert.Equal((short)shortColumn.Min(), (short)min); + + // We need to iterate over all range with conversion to uint due to negative numbers issue + Assert.Equal((uint)uintColumn.Min(), range.Select(x => (uint)x).Min()); + + // We need to iterate over all range with conversion to ulong due to negative numbers issue + Assert.Equal((ulong)ulongColumn.Min(), range.Select(x => (ulong)x).Min()); + + // We need to iterate over all range with conversion to ushort due to negative numbers issue + Assert.Equal((ushort)ushortColumn.Min(), range.Select(x => (ushort)x).Min()); + } + } +} diff --git a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.Filter.cs b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.Filter.cs new file mode 100644 index 0000000000..765d78b5dd --- /dev/null +++ b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.Filter.cs @@ -0,0 +1,63 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using Xunit; + +namespace Microsoft.Data.Analysis.Tests +{ + public partial class DataFrameTests + { + [Fact] + public void TestColumnFilter() + { + DataFrame df = MakeDataFrameWithNumericColumns(10); + DataFrameColumn filtered = df.Columns["Int"].Filter(3, 7); + Assert.Equal(4, filtered.Length); + Assert.Equal(3, filtered[0]); + Assert.Equal(4, filtered[1]); + Assert.Equal(6, filtered[2]); + Assert.Equal(7, filtered[3]); + } + + [Fact] + public void TestDataFrameFilter() + { + DataFrame df = MakeDataFrameWithAllMutableColumnTypes(10); + DataFrame boolColumnFiltered = df[df.Columns["Bool"].ElementwiseEquals(true)]; + List verify = new List { 0, 2, 4, 6, 8 }; + Assert.Equal(5, boolColumnFiltered.Rows.Count); + for (int i = 0; i < boolColumnFiltered.Columns.Count; i++) + { + DataFrameColumn column = boolColumnFiltered.Columns[i]; + if (column.Name == "Char" || column.Name == "Bool" || column.Name == "String" || column.Name == "DateTime") + continue; + for (int j = 0; j < column.Length; j++) + { + Assert.Equal(verify[j].ToString(), column[j].ToString()); + } + } + DataFrame intEnumerableFiltered = df[Enumerable.Range(0, 10)]; + DataFrame boolEnumerableFiltered = df[Enumerable.Range(0, 10).Select(x => true)]; + DataFrame longEnumerableFiltered = df[Enumerable.Range(0, 10).Select(x => (long)x)]; + Assert.Equal(intEnumerableFiltered.Columns.Count, df.Columns.Count); + Assert.Equal(boolEnumerableFiltered.Columns.Count, df.Columns.Count); + Assert.Equal(longEnumerableFiltered.Columns.Count, df.Columns.Count); + for (int i = 0; i < intEnumerableFiltered.Columns.Count; i++) + { + DataFrameColumn intFilteredColumn = intEnumerableFiltered.Columns[i]; + DataFrameColumn dfColumn = df.Columns[i]; + DataFrameColumn boolFilteredColumn = boolEnumerableFiltered.Columns[i]; + DataFrameColumn longFilteredColumn = longEnumerableFiltered.Columns[i]; + Assert.True(intFilteredColumn.ElementwiseEquals(dfColumn).All()); + Assert.True(boolFilteredColumn.ElementwiseEquals(dfColumn).All()); + Assert.True(longFilteredColumn.ElementwiseEquals(dfColumn).All()); + } + } + } +} diff --git a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.Join.cs b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.Join.cs new file mode 100644 index 0000000000..e782296dd9 --- /dev/null +++ b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.Join.cs @@ -0,0 +1,162 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using Xunit; + +namespace Microsoft.Data.Analysis.Tests +{ + public partial class DataFrameTests + { + [Fact] + public void TestJoin() + { + DataFrame left = MakeDataFrameWithAllMutableColumnTypes(10); + DataFrame right = MakeDataFrameWithAllMutableColumnTypes(5); + + // Tests with right.Rows.Count < left.Rows.Count + // Left join + DataFrame join = left.Join(right); + Assert.Equal(join.Rows.Count, left.Rows.Count); + Assert.Equal(join.Columns.Count, left.Columns.Count + right.Columns.Count); + Assert.Null(join.Columns["Int_right"][6]); + VerifyJoin(join, left, right, JoinAlgorithm.Left); + + // Right join + join = left.Join(right, joinAlgorithm: JoinAlgorithm.Right); + Assert.Equal(join.Rows.Count, right.Rows.Count); + Assert.Equal(join.Columns.Count, left.Columns.Count + right.Columns.Count); + Assert.Equal(join.Columns["Int_right"][3], right.Columns["Int"][3]); + Assert.Null(join.Columns["Int_right"][2]); + VerifyJoin(join, left, right, JoinAlgorithm.Right); + + // Outer join + join = left.Join(right, joinAlgorithm: JoinAlgorithm.FullOuter); + Assert.Equal(join.Rows.Count, left.Rows.Count); + Assert.Equal(join.Columns.Count, left.Columns.Count + right.Columns.Count); + Assert.Null(join.Columns["Int_right"][6]); + VerifyJoin(join, left, right, JoinAlgorithm.FullOuter); + + // Inner join + join = left.Join(right, joinAlgorithm: JoinAlgorithm.Inner); + Assert.Equal(join.Rows.Count, right.Rows.Count); + Assert.Equal(join.Columns.Count, left.Columns.Count + right.Columns.Count); + Assert.Equal(join.Columns["Int_right"][3], right.Columns["Int"][3]); + Assert.Null(join.Columns["Int_right"][2]); + VerifyJoin(join, left, right, JoinAlgorithm.Inner); + + // Tests with right.Rows.Count > left.Rows.Count + // Left join + right = MakeDataFrameWithAllMutableColumnTypes(15); + join = left.Join(right); + Assert.Equal(join.Rows.Count, left.Rows.Count); + Assert.Equal(join.Columns.Count, left.Columns.Count + right.Columns.Count); + Assert.Equal(join.Columns["Int_right"][6], right.Columns["Int"][6]); + VerifyJoin(join, left, right, JoinAlgorithm.Left); + + // Right join + join = left.Join(right, joinAlgorithm: JoinAlgorithm.Right); + Assert.Equal(join.Rows.Count, right.Rows.Count); + Assert.Equal(join.Columns.Count, left.Columns.Count + right.Columns.Count); + Assert.Equal(join.Columns["Int_right"][2], right.Columns["Int"][2]); + Assert.Null(join.Columns["Int_left"][12]); + VerifyJoin(join, left, right, JoinAlgorithm.Right); + + // Outer join + join = left.Join(right, joinAlgorithm: JoinAlgorithm.FullOuter); + Assert.Equal(join.Rows.Count, right.Rows.Count); + Assert.Equal(join.Columns.Count, left.Columns.Count + right.Columns.Count); + Assert.Null(join.Columns["Int_left"][12]); + VerifyJoin(join, left, right, JoinAlgorithm.FullOuter); + + // Inner join + join = left.Join(right, joinAlgorithm: JoinAlgorithm.Inner); + Assert.Equal(join.Rows.Count, left.Rows.Count); + Assert.Equal(join.Columns.Count, left.Columns.Count + right.Columns.Count); + Assert.Equal(join.Columns["Int_right"][2], right.Columns["Int"][2]); + VerifyJoin(join, left, right, JoinAlgorithm.Inner); + } + + private void VerifyJoin(DataFrame join, DataFrame left, DataFrame right, JoinAlgorithm joinAlgorithm) + { + Int64DataFrameColumn mapIndices = new Int64DataFrameColumn("map", join.Rows.Count); + for (long i = 0; i < join.Rows.Count; i++) + { + mapIndices[i] = i; + } + for (int i = 0; i < join.Columns.Count; i++) + { + DataFrameColumn joinColumn = join.Columns[i]; + DataFrameColumn isEqual; + + if (joinAlgorithm == JoinAlgorithm.Left) + { + if (i < left.Columns.Count) + { + DataFrameColumn leftColumn = left.Columns[i]; + isEqual = joinColumn.ElementwiseEquals(leftColumn); + } + else + { + int columnIndex = i - left.Columns.Count; + DataFrameColumn rightColumn = right.Columns[columnIndex]; + DataFrameColumn compareColumn = rightColumn.Length <= join.Rows.Count ? rightColumn.Clone(numberOfNullsToAppend: join.Rows.Count - rightColumn.Length) : rightColumn.Clone(mapIndices); + isEqual = joinColumn.ElementwiseEquals(compareColumn); + } + } + else if (joinAlgorithm == JoinAlgorithm.Right) + { + if (i < left.Columns.Count) + { + DataFrameColumn leftColumn = left.Columns[i]; + DataFrameColumn compareColumn = leftColumn.Length <= join.Rows.Count ? leftColumn.Clone(numberOfNullsToAppend: join.Rows.Count - leftColumn.Length) : leftColumn.Clone(mapIndices); + isEqual = joinColumn.ElementwiseEquals(compareColumn); + } + else + { + int columnIndex = i - left.Columns.Count; + DataFrameColumn rightColumn = right.Columns[columnIndex]; + isEqual = joinColumn.ElementwiseEquals(rightColumn); + } + } + else if (joinAlgorithm == JoinAlgorithm.Inner) + { + if (i < left.Columns.Count) + { + DataFrameColumn leftColumn = left.Columns[i]; + isEqual = joinColumn.ElementwiseEquals(leftColumn.Clone(mapIndices)); + } + else + { + int columnIndex = i - left.Columns.Count; + DataFrameColumn rightColumn = right.Columns[columnIndex]; + isEqual = joinColumn.ElementwiseEquals(rightColumn.Clone(mapIndices)); + } + } + else + { + if (i < left.Columns.Count) + { + DataFrameColumn leftColumn = left.Columns[i]; + isEqual = joinColumn.ElementwiseEquals(leftColumn.Clone(numberOfNullsToAppend: join.Rows.Count - leftColumn.Length)); + } + else + { + int columnIndex = i - left.Columns.Count; + DataFrameColumn rightColumn = right.Columns[columnIndex]; + isEqual = joinColumn.ElementwiseEquals(rightColumn.Clone(numberOfNullsToAppend: join.Rows.Count - rightColumn.Length)); + } + } + for (int j = 0; j < join.Rows.Count; j++) + { + Assert.Equal(true, isEqual[j]); + } + } + } + } +} diff --git a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.Merge.cs b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.Merge.cs new file mode 100644 index 0000000000..b507e846e8 --- /dev/null +++ b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.Merge.cs @@ -0,0 +1,807 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using Xunit; + +namespace Microsoft.Data.Analysis.Tests +{ + public partial class DataFrameTests + { + [Theory] + [InlineData(1, 2)] + [InlineData(2, 1)] + public void TestDataCorrectnessForInnerMerge(int leftCount, int rightCount) + { + DataFrame left = MakeDataFrameWithNumericColumns(leftCount, false); + DataFrameColumn leftStringColumn = new StringDataFrameColumn("String", Enumerable.Range(0, leftCount).Select(x => "Left")); + left.Columns.Insert(left.Columns.Count, leftStringColumn); + + DataFrame right = MakeDataFrameWithNumericColumns(rightCount, false); + DataFrameColumn rightStringColumn = new StringDataFrameColumn("String", Enumerable.Range(0, rightCount).Select(x => "Right")); + right.Columns.Insert(right.Columns.Count, rightStringColumn); + + DataFrame merge = left.Merge(right, "Int", "Int", joinAlgorithm: JoinAlgorithm.Inner); + + Assert.Equal("Left", (string)merge.Columns["String_left"][0]); + Assert.Equal("Right", (string)merge.Columns["String_right"][0]); + } + + [Fact] + public void TestMerge() + { + DataFrame left = MakeDataFrameWithAllMutableColumnTypes(10); + DataFrame right = MakeDataFrameWithAllMutableColumnTypes(5); + + // Tests with right.Rows.Count < left.Rows.Count + // Left merge + DataFrame merge = left.Merge(right, "Int", "Int"); + Assert.Equal(10, merge.Rows.Count); + Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); + Assert.Null(merge.Columns["Int_right"][6]); + Assert.Null(merge.Columns["Int_left"][5]); + VerifyMerge(merge, left, right, JoinAlgorithm.Left); + + // Right merge + merge = left.Merge(right, "Int", "Int", joinAlgorithm: JoinAlgorithm.Right); + Assert.Equal(5, merge.Rows.Count); + Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); + Assert.Equal(merge.Columns["Int_right"][3], right.Columns["Int"][3]); + Assert.Null(merge.Columns["Int_right"][2]); + VerifyMerge(merge, left, right, JoinAlgorithm.Right); + + // Outer merge + merge = left.Merge(right, "Int", "Int", joinAlgorithm: JoinAlgorithm.FullOuter); + Assert.Equal(merge.Rows.Count, left.Rows.Count); + Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); + Assert.Null(merge.Columns["Int_right"][6]); + VerifyMerge(merge, left, right, JoinAlgorithm.FullOuter); + + // Inner merge + merge = left.Merge(right, "Int", "Int", joinAlgorithm: JoinAlgorithm.Inner); + Assert.Equal(merge.Rows.Count, right.Rows.Count); + Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); + Assert.Equal(merge.Columns["Int_right"][2], right.Columns["Int"][3]); + Assert.Null(merge.Columns["Int_right"][4]); + VerifyMerge(merge, left, right, JoinAlgorithm.Inner); + + // Tests with right.Rows.Count > left.Rows.Count + // Left merge + right = MakeDataFrameWithAllMutableColumnTypes(15); + merge = left.Merge(right, "Int", "Int"); + Assert.Equal(merge.Rows.Count, left.Rows.Count); + Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); + Assert.Equal(merge.Columns["Int_right"][6], right.Columns["Int"][6]); + VerifyMerge(merge, left, right, JoinAlgorithm.Left); + + // Right merge + merge = left.Merge(right, "Int", "Int", joinAlgorithm: JoinAlgorithm.Right); + Assert.Equal(merge.Rows.Count, right.Rows.Count); + Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); + Assert.Equal(merge.Columns["Int_right"][2], right.Columns["Int"][2]); + Assert.Null(merge.Columns["Int_left"][12]); + VerifyMerge(merge, left, right, JoinAlgorithm.Right); + + // Outer merge + merge = left.Merge(right, "Int", "Int", joinAlgorithm: JoinAlgorithm.FullOuter); + Assert.Equal(16, merge.Rows.Count); + Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); + Assert.Null(merge.Columns["Int_left"][12]); + Assert.Null(merge.Columns["Int_left"][15]); + VerifyMerge(merge, left, right, JoinAlgorithm.FullOuter); + + // Inner merge + merge = left.Merge(right, "Int", "Int", joinAlgorithm: JoinAlgorithm.Inner); + Assert.Equal(9, merge.Rows.Count); + Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); + Assert.Equal(merge.Columns["Int_right"][2], right.Columns["Int"][2]); + VerifyMerge(merge, left, right, JoinAlgorithm.Inner); + } + + private void MatchRowsOnMergedDataFrame(DataFrame merge, DataFrame left, DataFrame right, long mergeRow, long? leftRow, long? rightRow) + { + Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); + DataFrameRow dataFrameMergeRow = merge.Rows[mergeRow]; + int columnIndex = 0; + foreach (object value in dataFrameMergeRow) + { + object compare = null; + if (columnIndex < left.Columns.Count) + { + if (leftRow != null) + { + compare = left.Rows[leftRow.Value][columnIndex]; + } + } + else + { + int rightColumnIndex = columnIndex - left.Columns.Count; + if (rightRow != null) + { + compare = right.Rows[rightRow.Value][rightColumnIndex]; + } + } + Assert.Equal(value, compare); + columnIndex++; + } + } + + [Theory] + [InlineData(10, 5, JoinAlgorithm.Left)] + [InlineData(5, 10, JoinAlgorithm.Right)] + public void TestMergeEdgeCases_LeftOrRight(int leftLength, int rightLength, JoinAlgorithm joinAlgorithm) + { + DataFrame left = MakeDataFrameWithAllMutableColumnTypes(leftLength); + if (leftLength > 5) + { + left["Int"][8] = null; + } + DataFrame right = MakeDataFrameWithAllMutableColumnTypes(rightLength); + if (rightLength > 5) + { + right["Int"][8] = null; + } + + DataFrame merge = left.Merge(right, "Int", "Int", joinAlgorithm: joinAlgorithm); + Assert.Equal(10, merge.Rows.Count); + Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); + int[] matchedFullRows = new int[] { 0, 1, 3, 4 }; + for (long i = 0; i < matchedFullRows.Length; i++) + { + int rowIndex = matchedFullRows[i]; + MatchRowsOnMergedDataFrame(merge, left, right, rowIndex, rowIndex, rowIndex); + } + + int[] matchedLeftOrRightRowsNullOtherRows = new int[] { 2, 5, 6, 7, 8, 9 }; + for (long i = 0; i < matchedLeftOrRightRowsNullOtherRows.Length; i++) + { + int rowIndex = matchedLeftOrRightRowsNullOtherRows[i]; + if (leftLength > 5) + { + MatchRowsOnMergedDataFrame(merge, left, right, rowIndex, rowIndex, null); + } + else + { + MatchRowsOnMergedDataFrame(merge, left, right, rowIndex, null, rowIndex); + } + } + } + + [Fact] + public void TestMergeEdgeCases_Inner() + { + DataFrame left = MakeDataFrameWithAllMutableColumnTypes(5); + DataFrame right = MakeDataFrameWithAllMutableColumnTypes(10); + left["Int"][3] = null; + right["Int"][6] = null; + // Creates this case: + /* + * Left: Right: + * 0 0 + * 1 1 + * null(2) 2 + * null(3) 3 + * 4 4 + * null(5) + * null(6) + * 7 + * 8 + * 9 + */ + /* + * Merge will result in a DataFrame like: + * Int_Left Int_Right + * 0 0 + * 1 1 + * 4 4 + * null(2) null(5) + * null(3) null(5) + * null(2) null(6) + * null(3) null(6) + */ + + DataFrame merge = left.Merge(right, "Int", "Int", joinAlgorithm: JoinAlgorithm.Inner); + Assert.Equal(7, merge.Rows.Count); + Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); + + int[] mergeRows = new int[] { 0, 1, 2, 3, 4, 5, 6 }; + int[] leftRows = new int[] { 0, 1, 4, 2, 3, 2, 3 }; + int[] rightRows = new int[] { 0, 1, 4, 5, 5, 6, 6 }; + for (long i = 0; i < mergeRows.Length; i++) + { + int rowIndex = mergeRows[i]; + int leftRowIndex = leftRows[i]; + int rightRowIndex = rightRows[i]; + MatchRowsOnMergedDataFrame(merge, left, right, rowIndex, leftRowIndex, rightRowIndex); + } + } + + [Fact] + public void TestMergeEdgeCases_Outer() + { + DataFrame left = MakeDataFrameWithAllMutableColumnTypes(5); + left["Int"][3] = null; + DataFrame right = MakeDataFrameWithAllMutableColumnTypes(5); + right["Int"][1] = 5; + right["Int"][3] = null; + right["Int"][4] = 6; + + // Creates this case: + /* + * Left: Right: RowIndex: + * 0 0 0 + * 1 5 1 + * null null 2 + * null(3) null(3) 3 + * 4 6 4 + */ + + /* + * Merge will result in a DataFrame like: + * Int_left: Int_right: Merged: Index: + * 0 0 0 - 0 0 + * 1 null 1 - N 1 + * null null 2 - 2 2 + * null null(3) 2 - 3 3 + * null(3) null 3 - 2 4 + * null(3) null(3) 3 - 3 5 + * 4 null 4 - N 6 + * null 5 N - 1 7 + * null 6 N - 4 8 + */ + + DataFrame merge = left.Merge(right, "Int", "Int", joinAlgorithm: JoinAlgorithm.FullOuter); + Assert.Equal(9, merge.Rows.Count); + Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); + + int[] mergeRows = new int[] { 0, 2, 3, 4, 5 }; + int[] leftRows = new int[] { 0, 2, 2, 3, 3 }; + int[] rightRows = new int[] { 0, 2, 3, 2, 3 }; + for (long i = 0; i < mergeRows.Length; i++) + { + int rowIndex = mergeRows[i]; + int leftRowIndex = leftRows[i]; + int rightRowIndex = rightRows[i]; + MatchRowsOnMergedDataFrame(merge, left, right, rowIndex, leftRowIndex, rightRowIndex); + } + + mergeRows = new int[] { 1, 6 }; + leftRows = new int[] { 1, 4 }; + for (long i = 0; i < mergeRows.Length; i++) + { + int rowIndex = mergeRows[i]; + int leftRowIndex = leftRows[i]; + MatchRowsOnMergedDataFrame(merge, left, right, rowIndex, leftRowIndex, null); + } + + mergeRows = new int[] { 7, 8 }; + rightRows = new int[] { 1, 4 }; + for (long i = 0; i < mergeRows.Length; i++) + { + int rowIndex = mergeRows[i]; + int rightRowIndex = rightRows[i]; + MatchRowsOnMergedDataFrame(merge, left, right, rowIndex, null, rightRowIndex); + } + } + + [Fact] + public void TestMerge_ByTwoColumns_Complex_LeftJoin() + { + //Test left merge by to int type columns + + //Arrange + var left = new DataFrame(); + left.Columns.Add(new Int32DataFrameColumn("Index", new[] { 0, 1, 2, 3, 4, 5 })); + left.Columns.Add(new Int32DataFrameColumn("G1", new[] { 0, 1, 1, 2, 2, 3 })); + left.Columns.Add(new Int32DataFrameColumn("G2", new[] { 3, 1, 2, 1, 2, 1 })); + + var right = new DataFrame(); + right.Columns.Add(new Int32DataFrameColumn("Index", new[] { 0, 1, 2, 3 })); + right.Columns.Add(new Int32DataFrameColumn("G1", new[] { 1, 1, 1, 2 })); + right.Columns.Add(new Int32DataFrameColumn("G2", new[] { 1, 2, 1, 1 })); + + // Creates this case: + /* ------------------------- + * Left | Right + * I G1 G2 | I G1 G2 + * ------------------------- + * 0 0 3 | 0 1 1 + * 1 1 1 | 1 1 2 + * 2 1 2 | 2 1 1 + * 3 2 1 | 3 2 1 + * 4 2 2 + * 5 3 1 + */ + + /* + * Merge will result in a DataFrame like: + * IL G1 G2 IR Merged: + * ------------------------- + * 0 0 3 0 - N + * 1 1 1 0 1 1 1 - 0 + * 1 1 1 2 1 1 1 - 2 + * 2 1 2 1 1 2 2 - 1 + * 3 2 1 3 2 1 3 - 3 + * 4 2 2 4 - N + * 5 3 1 5 - N + */ + + //Act + var merge = left.Merge(right, new[] { "G1", "G2" }, new[] { "G1", "G2" }); + + //Assert + var expectedMerged = new (int? Left, int? Right)[] { + (0, null), + (1, 0), + (1, 2), + (2, 1), + (3, 3), + (4, null), + (5, null) + }; + + Assert.Equal(expectedMerged.Length, merge.Rows.Count); + Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); + + for (long i = 0; i < expectedMerged.Length; i++) + { + MatchRowsOnMergedDataFrame(merge, left, right, i, expectedMerged[i].Left, expectedMerged[i].Right); + } + + } + + [Fact] + public void TestMerge_ByTwoColumns_Simple_ManyToMany_LeftJoin() + { + //Test left merge by to int type columns + + //Arrange + var left = new DataFrame(); + left.Columns.Add(new Int32DataFrameColumn("Index", new[] { 0, 1, 2 })); + left.Columns.Add(new Int32DataFrameColumn("G1", new[] { 1, 1, 3 })); + left.Columns.Add(new Int32DataFrameColumn("G2", new[] { 1, 1, 3 })); + + var right = new DataFrame(); + right.Columns.Add(new Int32DataFrameColumn("Index", new[] { 0, 1, 2 })); + right.Columns.Add(new Int32DataFrameColumn("G1", new[] { 1, 1, 0 })); + right.Columns.Add(new Int32DataFrameColumn("G2", new[] { 1, 1, 0 })); + + // Creates this case: + /* --------------------------- + * Left | Right + * I G1 G2 | I G1 G2 + * --------------------------- + * 0 1 1 | 0 1 1 + * 1 1 1 | 1 1 1 + * 2 3 3 | 2 0 0 + */ + + /* + * Merge will result in a DataFrame like: + * IL G1 G2 IR Merged: + * ------------------------- + * 0 1 1 0 1 1 0 - 0 + * 0 1 1 1 1 1 0 - 1 + * 1 1 1 0 1 1 1 - 0 + * 1 1 1 1 1 1 1 - 1 + * 2 3 3 2 - N + */ + + //Act + var merge = left.Merge(right, new[] { "G1", "G2" }, new[] { "G1", "G2" }); + + //Assert + var expectedMerged = new (int? Left, int? Right)[] { + (0, 0), + (0, 1), + (1, 0), + (1, 1), + (2, null) + }; + + Assert.Equal(expectedMerged.Length, merge.Rows.Count); + Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); + + for (long i = 0; i < expectedMerged.Length; i++) + { + MatchRowsOnMergedDataFrame(merge, left, right, i, expectedMerged[i].Left, expectedMerged[i].Right); + } + } + + [Fact] + public void TestMerge_ByTwoColumns_Simple_ManyToMany_RightJoin() + { + //Test left merge by to int type columns + + //Arrange + var left = new DataFrame(); + left.Columns.Add(new Int32DataFrameColumn("Index", new[] { 0, 1, 2 })); + left.Columns.Add(new Int32DataFrameColumn("G1", new[] { 1, 1, 3 })); + left.Columns.Add(new Int32DataFrameColumn("G2", new[] { 1, 1, 3 })); + + var right = new DataFrame(); + right.Columns.Add(new Int32DataFrameColumn("Index", new[] { 0, 1, 2 })); + right.Columns.Add(new Int32DataFrameColumn("G1", new[] { 1, 1, 0 })); + right.Columns.Add(new Int32DataFrameColumn("G2", new[] { 1, 1, 0 })); + + // Creates this case: + /* --------------------------- + * Left | Right + * I G1 G2 | I G1 G2 + * --------------------------- + * 0 1 1 | 0 1 1 + * 1 1 1 | 1 1 1 + * 2 3 3 | 2 0 0 + */ + + /* + * Merge will result in a DataFrame like: + * IL G1 G2 IR Merged: + * ------------------------- + * 0 1 1 0 1 1 0 - 0 + * 1 1 1 0 1 1 1 - 0 + * 0 1 1 1 1 1 0 - 1 + * 1 1 1 1 1 1 1 - 1 + * 2 0 0 N - 2 + */ + + //Act + var merge = left.Merge(right, new[] { "G1", "G2" }, new[] { "G1", "G2" }, joinAlgorithm: JoinAlgorithm.Right); + + //Assert + var expectedMerged = new (int? Left, int? Right)[] { + (0, 0), + (1, 0), + (0, 1), + (1, 1), + (null, 2) + }; + + Assert.Equal(expectedMerged.Length, merge.Rows.Count); + Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); + + for (long i = 0; i < expectedMerged.Length; i++) + { + MatchRowsOnMergedDataFrame(merge, left, right, i, expectedMerged[i].Left, expectedMerged[i].Right); + } + } + + [Fact] + public void TestMerge_ByTwoColumns_Simple_ManyToMany_InnerJoin() + { + //Test left merge by to int type columns + + //Arrange + var left = new DataFrame(); + left.Columns.Add(new Int32DataFrameColumn("Index", new[] { 0, 1, 2 })); + left.Columns.Add(new Int32DataFrameColumn("G1", new[] { 1, 1, 3 })); + left.Columns.Add(new Int32DataFrameColumn("G2", new[] { 1, 1, 3 })); + + var right = new DataFrame(); + right.Columns.Add(new Int32DataFrameColumn("Index", new[] { 0, 1, 2 })); + right.Columns.Add(new Int32DataFrameColumn("G1", new[] { 1, 1, 0 })); + right.Columns.Add(new Int32DataFrameColumn("G2", new[] { 1, 1, 0 })); + + // Creates this case: + /* --------------------------- + * Left | Right + * I G1 G2 | I G1 G2 + * --------------------------- + * 0 1 1 | 0 1 1 + * 1 1 1 | 1 1 1 + * 2 3 3 | 2 0 0 + */ + + /* + * Merge will result in a DataFrame like: + * IL G1 G2 IR Merged: + * ------------------------- + * 0 1 1 0 1 1 0 - 0 + * 1 1 1 0 1 1 1 - 0 + * 0 1 1 1 1 1 0 - 1 + * 1 1 1 1 1 1 1 - 1 + */ + + //Act + var merge = left.Merge(right, new[] { "G1", "G2" }, new[] { "G1", "G2" }, joinAlgorithm: JoinAlgorithm.Inner); + + //Assert + var expectedMerged = new (int? Left, int? Right)[] { + (0, 0), + (1, 0), + (0, 1), + (1, 1) + }; + + Assert.Equal(expectedMerged.Length, merge.Rows.Count); + Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); + + for (long i = 0; i < expectedMerged.Length; i++) + { + MatchRowsOnMergedDataFrame(merge, left, right, i, expectedMerged[i].Left, expectedMerged[i].Right); + } + } + + [Fact] + public void TestMerge_ByTwoColumns_Simple_ManyToMany_OuterJoin() + { + //Test left merge by to int type columns + + //Arrange + var left = new DataFrame(); + left.Columns.Add(new Int32DataFrameColumn("Index", new[] { 0, 1, 2 })); + left.Columns.Add(new Int32DataFrameColumn("G1", new[] { 1, 1, 3 })); + left.Columns.Add(new Int32DataFrameColumn("G2", new[] { 1, 1, 3 })); + + var right = new DataFrame(); + right.Columns.Add(new Int32DataFrameColumn("Index", new[] { 0, 1, 2 })); + right.Columns.Add(new Int32DataFrameColumn("G1", new[] { 1, 1, 0 })); + right.Columns.Add(new Int32DataFrameColumn("G2", new[] { 1, 1, 0 })); + + // Creates this case: + /* --------------------------- + * Left | Right + * I G1 G2 | I G1 G2 + * --------------------------- + * 0 1 1 | 0 1 1 + * 1 1 1 | 1 1 1 + * 2 3 3 | 2 0 0 + */ + + /* + * Merge will result in a DataFrame like: + * IL G1 G2 IR Merged: + * ------------------------- + * 0 1 1 0 1 1 0 - 0 + * 0 1 1 1 1 1 0 - 1 + * 1 1 1 0 1 1 1 - 0 + * 1 1 1 1 1 1 1 - 1 + * 2 3 3 2 - N + * 2 0 0 N - 2 + */ + + //Act + var merge = left.Merge(right, new[] { "G1", "G2" }, new[] { "G1", "G2" }, joinAlgorithm: JoinAlgorithm.FullOuter); + + //Assert + var expectedMerged = new (int? Left, int? Right)[] { + (0, 0), + (0, 1), + (1, 0), + (1, 1), + (2, null), + (null, 2) + }; + + Assert.Equal(expectedMerged.Length, merge.Rows.Count); + Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); + + for (long i = 0; i < expectedMerged.Length; i++) + { + MatchRowsOnMergedDataFrame(merge, left, right, i, expectedMerged[i].Left, expectedMerged[i].Right); + } + } + + [Fact] + public void TestMerge_ByThreeColumns_OneToOne_LeftJoin() + { + //Test merge by LEFT join of int and string columns + + //Arrange + var left = new DataFrame(); + left.Columns.Add(new Int32DataFrameColumn("Index", new[] { 0, 1, 2 })); + left.Columns.Add(new Int32DataFrameColumn("G1", new[] { 1, 1, 2 })); + left.Columns.Add(new Int32DataFrameColumn("G2", new[] { 1, 2, 1 })); + left.Columns.Add(new StringDataFrameColumn("G3", new[] { "A", "B", "C" })); + + var right = new DataFrame(); + right.Columns.Add(new Int32DataFrameColumn("Index", new[] { 0, 1, 2 })); + right.Columns.Add(new Int32DataFrameColumn("G1", new[] { 0, 1, 1 })); + right.Columns.Add(new Int32DataFrameColumn("G2", new[] { 1, 1, 2 })); + right.Columns.Add(new StringDataFrameColumn("G3", new[] { "Z", "Y", "B" })); + + // Creates this case: + /* ----------------------------- + * Left | Right + * I G1 G2 G3 | I G1 G2 G3 + * ------------------------------ + * 0 1 1 A | 0 0 1 Z + * 1 1 2 B | 1 1 1 Y + * 2 2 1 C | 2 1 2 B + */ + + /* + * Merge will result in a DataFrame like: + * IL G1 G2 G3 IR Merged: + * ------------------------- + * 0 1 1 A 0 - N + * 1 1 2 B 2 1 2 B 1 - 2 + * 2 2 1 C 2 - N + */ + + //Act + var merge = left.Merge(right, new[] { "G1", "G2", "G3" }, new[] { "G1", "G2", "G3" }); + + //Assert + var expectedMerged = new (int? Left, int? Right)[] { + (0, null), + (1, 2), + (2, null) + }; + + Assert.Equal(expectedMerged.Length, merge.Rows.Count); + Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); + + for (long i = 0; i < expectedMerged.Length; i++) + { + MatchRowsOnMergedDataFrame(merge, left, right, i, expectedMerged[i].Left, expectedMerged[i].Right); + } + } + + [Fact] + public void TestMerge_ByThreeColumns_OneToOne_RightJoin() + { + //Test merge by RIGHT join of int and string columns + + //Arrange + var left = new DataFrame(); + left.Columns.Add(new Int32DataFrameColumn("Index", new[] { 0, 1, 2 })); + left.Columns.Add(new Int32DataFrameColumn("G1", new[] { 1, 1, 2 })); + left.Columns.Add(new Int32DataFrameColumn("G2", new[] { 1, 2, 1 })); + left.Columns.Add(new StringDataFrameColumn("G3", new[] { "A", "B", "C" })); + + var right = new DataFrame(); + right.Columns.Add(new Int32DataFrameColumn("Index", new[] { 0, 1, 2 })); + right.Columns.Add(new Int32DataFrameColumn("G1", new[] { 0, 1, 1 })); + right.Columns.Add(new Int32DataFrameColumn("G2", new[] { 1, 1, 2 })); + right.Columns.Add(new StringDataFrameColumn("G3", new[] { "Z", "Y", "B" })); + + // Creates this case: + /* ----------------------------- + * Left | Right + * I G1 G2 G3 | I G1 G2 G3 + * ------------------------------ + * 0 1 1 A | 0 0 1 Z + * 1 1 2 B | 1 1 1 Y + * 2 2 1 C | 2 1 2 B + */ + + /* + * Merge will result in a DataFrame like: + * IL G1 G2 G3 IR Merged: + * ------------------------- + * 0 0 1 Z N - 0 + * 1 1 1 Y N - 1 + * 1 1 2 B 2 1 2 B 1 - 2 + */ + + //Act + var merge = left.Merge(right, new[] { "G1", "G2", "G3" }, new[] { "G1", "G2", "G3" }, joinAlgorithm: JoinAlgorithm.Right); + + //Assert + var expectedMerged = new (int? Left, int? Right)[] { + (null, 0), + (null, 1), + (1, 2) + }; + + Assert.Equal(expectedMerged.Length, merge.Rows.Count); + Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); + + for (long i = 0; i < expectedMerged.Length; i++) + { + MatchRowsOnMergedDataFrame(merge, left, right, i, expectedMerged[i].Left, expectedMerged[i].Right); + } + } + + [Fact] + public void TestMerge_Issue5778() + { + DataFrame left = MakeDataFrameWithAllMutableColumnTypes(2, false); + DataFrame right = MakeDataFrameWithAllMutableColumnTypes(1); + + DataFrame merge = left.Merge(right, "Int", "Int"); + + Assert.Equal(2, merge.Rows.Count); + Assert.Equal(0, (int)merge.Columns["Int_left"][0]); + Assert.Equal(1, (int)merge.Columns["Int_left"][1]); + MatchRowsOnMergedDataFrame(merge, left, right, 0, 0, 0); + MatchRowsOnMergedDataFrame(merge, left, right, 1, 1, 0); + } + + [Fact] + //Issue 6127 + public void TestMerge_CorrectColumnTypes() + { + DataFrame left = MakeDataFrameWithAllMutableColumnTypes(2, false); + DataFrame right = MakeDataFrameWithAllMutableColumnTypes(1); + + DataFrame merge = left.Merge(right, "Int", "Int"); + + Assert.NotNull(merge.Columns.GetBooleanColumn("Bool_left")); + Assert.NotNull(merge.Columns.GetBooleanColumn("Bool_right")); + + Assert.NotNull(merge.Columns.GetDecimalColumn("Decimal_left")); + Assert.NotNull(merge.Columns.GetDecimalColumn("Decimal_right")); + + Assert.NotNull(merge.Columns.GetSingleColumn("Float_left")); + Assert.NotNull(merge.Columns.GetSingleColumn("Float_right")); + + Assert.NotNull(merge.Columns.GetDoubleColumn("Double_left")); + Assert.NotNull(merge.Columns.GetDoubleColumn("Double_right")); + + Assert.NotNull(merge.Columns.GetByteColumn("Byte_left")); + Assert.NotNull(merge.Columns.GetByteColumn("Byte_right")); + + Assert.NotNull(merge.Columns.GetCharColumn("Char_left")); + Assert.NotNull(merge.Columns.GetCharColumn("Char_right")); + + Assert.NotNull(merge.Columns.GetInt16Column("Short_left")); + Assert.NotNull(merge.Columns.GetInt16Column("Short_right")); + + Assert.NotNull(merge.Columns.GetUInt16Column("Ushort_left")); + Assert.NotNull(merge.Columns.GetUInt16Column("Ushort_right")); + + Assert.NotNull(merge.Columns.GetInt32Column("Int_left")); + Assert.NotNull(merge.Columns.GetInt32Column("Int_right")); + + Assert.NotNull(merge.Columns.GetUInt32Column("Uint_left")); + Assert.NotNull(merge.Columns.GetUInt32Column("Uint_right")); + + Assert.NotNull(merge.Columns.GetInt64Column("Long_left")); + Assert.NotNull(merge.Columns.GetInt64Column("Long_right")); + + Assert.NotNull(merge.Columns.GetUInt64Column("Ulong_left")); + Assert.NotNull(merge.Columns.GetUInt64Column("Ulong_right")); + + Assert.NotNull(merge.Columns.GetDateTimeColumn("DateTime_left")); + Assert.NotNull(merge.Columns.GetDateTimeColumn("DateTime_right")); + } + + private void VerifyMerge(DataFrame merge, DataFrame left, DataFrame right, JoinAlgorithm joinAlgorithm) + { + if (joinAlgorithm == JoinAlgorithm.Left || joinAlgorithm == JoinAlgorithm.Inner) + { + HashSet intersection = new HashSet(); + for (int i = 0; i < merge.Columns["Int_left"].Length; i++) + { + if (merge.Columns["Int_left"][i] == null) + continue; + intersection.Add((int)merge.Columns["Int_left"][i]); + } + for (int i = 0; i < left.Columns["Int"].Length; i++) + { + if (left.Columns["Int"][i] != null && intersection.Contains((int)left.Columns["Int"][i])) + intersection.Remove((int)left.Columns["Int"][i]); + } + Assert.Empty(intersection); + } + else if (joinAlgorithm == JoinAlgorithm.Right) + { + HashSet intersection = new HashSet(); + for (int i = 0; i < merge.Columns["Int_right"].Length; i++) + { + if (merge.Columns["Int_right"][i] == null) + continue; + intersection.Add((int)merge.Columns["Int_right"][i]); + } + for (int i = 0; i < right.Columns["Int"].Length; i++) + { + if (right.Columns["Int"][i] != null && intersection.Contains((int)right.Columns["Int"][i])) + intersection.Remove((int)right.Columns["Int"][i]); + } + Assert.Empty(intersection); + } + else if (joinAlgorithm == JoinAlgorithm.FullOuter) + { + VerifyMerge(merge, left, right, JoinAlgorithm.Left); + VerifyMerge(merge, left, right, JoinAlgorithm.Right); + } + } + } +} diff --git a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.Sort.cs b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.Sort.cs new file mode 100644 index 0000000000..f198ff003d --- /dev/null +++ b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.Sort.cs @@ -0,0 +1,139 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using Xunit; + +namespace Microsoft.Data.Analysis.Tests +{ + public partial class DataFrameTests + { + [Fact] + public void TestSplitAndSort() + { + DataFrame df = MakeDataFrameWithAllMutableColumnTypes(20); + df.Columns["Int"][0] = 100000; + df.Columns["Int"][df.Rows.Count - 1] = -1; + df.Columns["Int"][5] = 200000; + DataFrame dfTest; + DataFrame dfTrain = SplitTrainTest(df, 0.8f, out dfTest); + + // Sort by "Int" in ascending order + var sortedDf = dfTrain.OrderBy("Int"); + Assert.Null(sortedDf.Columns["Int"][sortedDf.Rows.Count - 1]); + Assert.Equal(1, sortedDf.Columns["Int"][0]); + Assert.Equal(100000, sortedDf.Columns["Int"][sortedDf.Rows.Count - 3]); + Assert.Equal(200000, sortedDf.Columns["Int"][sortedDf.Rows.Count - 2]); + } + + [Fact] + public void TestStringColumnSort() + { + // StringDataFrameColumn specific sort tests + StringDataFrameColumn strColumn = new StringDataFrameColumn("String", 0); + Assert.Equal(0, strColumn.NullCount); + for (int i = 0; i < 5; i++) + { + strColumn.Append(null); + } + Assert.Equal(5, strColumn.NullCount); + // Should handle all nulls + StringDataFrameColumn sortedStrColumn = strColumn.Sort() as StringDataFrameColumn; + Assert.Equal(5, sortedStrColumn.NullCount); + Assert.Null(sortedStrColumn[0]); + + for (int i = 0; i < 5; i++) + { + strColumn.Append(i.ToString()); + } + Assert.Equal(5, strColumn.NullCount); + + // Ascending sort + sortedStrColumn = strColumn.Sort() as StringDataFrameColumn; + Assert.Equal("0", sortedStrColumn[0]); + Assert.Null(sortedStrColumn[9]); + + // Descending sort + sortedStrColumn = strColumn.Sort(false) as StringDataFrameColumn; + Assert.Equal("4", sortedStrColumn[0]); + Assert.Null(sortedStrColumn[9]); + } + + [Theory] + [InlineData(5)] + [InlineData(12)] + [InlineData(100)] + [InlineData(1000)] + public void TestPrimitiveColumnSort(int numberOfNulls) + { + // Primitive Column Sort + Int32DataFrameColumn intColumn = new Int32DataFrameColumn("Int", 0); + Assert.Equal(0, intColumn.NullCount); + intColumn.AppendMany(null, numberOfNulls); + Assert.Equal(numberOfNulls, intColumn.NullCount); + + // Should handle all nulls + PrimitiveDataFrameColumn sortedIntColumn = intColumn.Sort(); + Assert.Equal(numberOfNulls, sortedIntColumn.NullCount); + Assert.Null(sortedIntColumn[0]); + + for (int i = 0; i < 5; i++) + { + intColumn.Append(i); + } + Assert.Equal(numberOfNulls, intColumn.NullCount); + + // Ascending sort + sortedIntColumn = intColumn.Sort(); + Assert.Equal(0, sortedIntColumn[0]); + Assert.Null(sortedIntColumn[9]); + + // Descending sort + sortedIntColumn = intColumn.Sort(ascending: false); + Assert.Equal(4, sortedIntColumn[0]); + Assert.Null(sortedIntColumn[9]); + } + + [Fact] + public void TestSortWithDifferentNullCountsInColumns() + { + DataFrame dataFrame = MakeDataFrameWithAllMutableColumnTypes(10); + dataFrame["Int"][3] = null; + dataFrame["String"][3] = null; + DataFrame sorted = dataFrame.OrderBy("Int"); + void Verify(DataFrame sortedDataFrame) + { + Assert.Equal(10, sortedDataFrame.Rows.Count); + DataFrameRow lastRow = sortedDataFrame.Rows[sortedDataFrame.Rows.Count - 1]; + DataFrameRow penultimateRow = sortedDataFrame.Rows[sortedDataFrame.Rows.Count - 2]; + foreach (object value in lastRow) + { + Assert.Null(value); + } + + for (int i = 0; i < sortedDataFrame.Columns.Count; i++) + { + string columnName = sortedDataFrame.Columns[i].Name; + if (columnName != "String" && columnName != "Int") + { + Assert.Equal(dataFrame[columnName][3], penultimateRow[i]); + } + else if (columnName == "String" || columnName == "Int") + { + Assert.Null(penultimateRow[i]); + } + } + } + + Verify(sorted); + + sorted = dataFrame.OrderBy("String"); + Verify(sorted); + } + } +} diff --git a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.Utils.cs b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.Utils.cs new file mode 100644 index 0000000000..c08a96d8b5 --- /dev/null +++ b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.Utils.cs @@ -0,0 +1,231 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using Apache.Arrow; +using Microsoft.ML.Data; + +namespace Microsoft.Data.Analysis.Tests +{ + public partial class DataFrameTests + { + public static DataFrame MakeDataFrameWithTwoColumns(int length, bool withNulls = true) + { + DataFrameColumn dataFrameColumn1 = new Int32DataFrameColumn("Int1", Enumerable.Range(0, length).Select(x => x)); + DataFrameColumn dataFrameColumn2 = new Int32DataFrameColumn("Int2", Enumerable.Range(10, length).Select(x => x)); + if (withNulls) + { + dataFrameColumn1[length / 2] = null; + dataFrameColumn2[length / 2] = null; + } + DataFrame dataFrame = new DataFrame(); + dataFrame.Columns.Insert(0, dataFrameColumn1); + dataFrame.Columns.Insert(1, dataFrameColumn2); + return dataFrame; + } + + public static ArrowStringDataFrameColumn CreateArrowStringColumn(int length, bool withNulls = true) + { + byte[] dataMemory = new byte[length * 3]; + byte[] nullMemory = new byte[BitUtility.ByteCount(length)]; + byte[] offsetMemory = new byte[(length + 1) * 4]; + + // Initialize offset with 0 as the first value + offsetMemory[0] = 0; + offsetMemory[1] = 0; + offsetMemory[2] = 0; + offsetMemory[3] = 0; + + // Append "foo" length times, with a possible `null` in the middle + int validStringsIndex = 0; + for (int i = 0; i < length; i++) + { + if (withNulls && i == length / 2) + { + BitUtility.SetBit(nullMemory, i, false); + } + else + { + int dataMemoryIndex = validStringsIndex * 3; + dataMemory[dataMemoryIndex++] = 102; + dataMemory[dataMemoryIndex++] = 111; + dataMemory[dataMemoryIndex++] = 111; + BitUtility.SetBit(nullMemory, i, true); + + validStringsIndex++; + } + + // write the current length to (index + 1) + int offsetIndex = (i + 1) * 4; + int offsetValue = 3 * validStringsIndex; + byte[] offsetValueBytes = BitConverter.GetBytes(offsetValue); + offsetMemory[offsetIndex++] = offsetValueBytes[0]; + offsetMemory[offsetIndex++] = offsetValueBytes[1]; + offsetMemory[offsetIndex++] = offsetValueBytes[2]; + offsetMemory[offsetIndex++] = offsetValueBytes[3]; + } + + int nullCount = withNulls ? 1 : 0; + return new ArrowStringDataFrameColumn("ArrowString", dataMemory, offsetMemory, nullMemory, length, nullCount); + } + + public static VBufferDataFrameColumn CreateVBufferDataFrameColumn(int length) + { + var buffers = Enumerable.Repeat(new VBuffer(5, new[] { 0, 1, 2, 3, 4 }), length).ToArray(); + return new VBufferDataFrameColumn("VBuffer", buffers); + } + + public static DataFrame MakeDataFrameWithAllColumnTypes(int length, bool withNulls = true) + { + DataFrame df = MakeDataFrameWithAllMutableAndArrowColumnTypes(length, withNulls); + + var vBufferColumn = CreateVBufferDataFrameColumn(length); + df.Columns.Insert(df.Columns.Count, vBufferColumn); + + return df; + } + + public static DataFrame MakeDataFrameWithAllMutableAndArrowColumnTypes(int length, bool withNulls = true) + { + DataFrame df = MakeDataFrameWithAllMutableColumnTypes(length, withNulls); + DataFrameColumn arrowStringColumn = CreateArrowStringColumn(length, withNulls); + df.Columns.Insert(df.Columns.Count, arrowStringColumn); + + return df; + } + + public static DataFrame MakeDataFrameWithAllMutableColumnTypes(int length, bool withNulls = true) + { + DataFrame df = MakeDataFrameWithNumericStringAndDateTimeColumns(length, withNulls); + DataFrameColumn boolColumn = new BooleanDataFrameColumn("Bool", Enumerable.Range(0, length).Select(x => x % 2 == 0)); + df.Columns.Insert(df.Columns.Count, boolColumn); + if (withNulls) + { + boolColumn[length / 2] = null; + } + return df; + } + + public static DataFrame MakeDataFrameWithNumericAndBoolColumns(int length, bool withNulls = true) + { + DataFrame df = MakeDataFrameWithNumericColumns(length, withNulls); + DataFrameColumn boolColumn = new BooleanDataFrameColumn("Bool", Enumerable.Range(0, length).Select(x => x % 2 == 0)); + df.Columns.Insert(df.Columns.Count, boolColumn); + if (withNulls) + { + boolColumn[length / 2] = null; + } + return df; + } + + public static DataFrame MakeDataFrameWithNumericAndStringColumns(int length, bool withNulls = true) + { + DataFrame df = MakeDataFrameWithNumericColumns(length, withNulls); + DataFrameColumn stringColumn = new StringDataFrameColumn("String", Enumerable.Range(0, length).Select(x => x.ToString())); + df.Columns.Insert(df.Columns.Count, stringColumn); + if (withNulls) + { + stringColumn[length / 2] = null; + } + + DataFrameColumn charColumn = new CharDataFrameColumn("Char", Enumerable.Range(0, length).Select(x => (char)(x + 65))); + df.Columns.Insert(df.Columns.Count, charColumn); + if (withNulls) + { + charColumn[length / 2] = null; + } + return df; + } + + internal static DateTime SampleDateTime = new DateTime(2021, 06, 04); + public static DataFrame MakeDataFrameWithNumericStringAndDateTimeColumns(int length, bool withNulls = true) + { + DataFrame df = MakeDataFrameWithNumericAndStringColumns(length, withNulls); + + DataFrameColumn dateTimeColumn = new DateTimeDataFrameColumn("DateTime", Enumerable.Range(0, length).Select(x => SampleDateTime.AddDays(x))); + df.Columns.Insert(df.Columns.Count, dateTimeColumn); + if (withNulls) + { + dateTimeColumn[length / 2] = null; + } + return df; + } + + public static DataFrame MakeDataFrameWithNumericColumns(int length, bool withNulls = true, int startingFrom = 0) + { + IEnumerable range = Enumerable.Range(startingFrom, length); + + var byteColumn = new ByteDataFrameColumn("Byte", range.Select(x => (byte)x)); + var decimalColumn = new DecimalDataFrameColumn("Decimal", range.Select(x => (decimal)x)); + var doubleColumn = new DoubleDataFrameColumn("Double", range.Select(x => (double)x)); + var floatColumn = new SingleDataFrameColumn("Float", range.Select(x => (float)x)); + var intColumn = new Int32DataFrameColumn("Int", range.Select(x => x)); + var longColumn = new Int64DataFrameColumn("Long", range.Select(x => (long)x)); + var sbyteColumn = new SByteDataFrameColumn("Sbyte", range.Select(x => (sbyte)x)); + var shortColumn = new Int16DataFrameColumn("Short", range.Select(x => (short)x)); + var uintColumn = new UInt32DataFrameColumn("Uint", range.Select(x => (uint)x)); + var ulongColumn = new UInt64DataFrameColumn("Ulong", range.Select(x => (ulong)x)); + var ushortColumn = new UInt16DataFrameColumn("Ushort", range.Select(x => (ushort)x)); + + var columnsList = new List + { + byteColumn, + decimalColumn, + doubleColumn, + floatColumn, + intColumn, + longColumn, + sbyteColumn, + shortColumn, + uintColumn, + ulongColumn, + ushortColumn + }; + + var dataFrame = new DataFrame(columnsList); + + if (withNulls) + { + for (var i = 0; i < dataFrame.Columns.Count; i++) + { + dataFrame.Columns[i][length / 2] = null; + } + } + + return dataFrame; + } + + public static DataFrame MakeDataFrame(int length, bool withNulls = true) + where T1 : unmanaged + where T2 : unmanaged + { + DataFrameColumn baseColumn1 = DataFrameColumn.Create("Column1", Enumerable.Range(0, length).Select(x => (T1)Convert.ChangeType(x % 2 == 0 ? 0 : 1, typeof(T1)))); + DataFrameColumn baseColumn2 = DataFrameColumn.Create("Column2", Enumerable.Range(0, length).Select(x => (T2)Convert.ChangeType(x % 2 == 0 ? 0 : 1, typeof(T2)))); + DataFrame dataFrame = new DataFrame(new List { baseColumn1, baseColumn2 }); + + if (withNulls) + { + for (int i = 0; i < dataFrame.Columns.Count; i++) + { + dataFrame.Columns[i][length / 2] = null; + } + } + + return dataFrame; + } + + public DataFrame SplitTrainTest(DataFrame input, float testRatio, out DataFrame Test) + { + IEnumerable randomIndices = Enumerable.Range(0, (int)input.Rows.Count); + IEnumerable trainIndices = randomIndices.Take((int)(input.Rows.Count * testRatio)); + IEnumerable testIndices = randomIndices.Skip((int)(input.Rows.Count * testRatio)); + Test = input[testIndices]; + return input[trainIndices]; + } + } +} diff --git a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs index d5540edf23..1e67caf85f 100644 --- a/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs +++ b/test/Microsoft.Data.Analysis.Tests/DataFrameTests.cs @@ -6,276 +6,14 @@ using System.Collections.Generic; using System.Linq; using System.Text; -using Apache.Arrow; using Microsoft.ML; using Microsoft.ML.Data; -using Microsoft.ML.TestFramework.Attributes; using Xunit; namespace Microsoft.Data.Analysis.Tests { public partial class DataFrameTests { - public static DataFrame MakeDataFrameWithTwoColumns(int length, bool withNulls = true) - { - DataFrameColumn dataFrameColumn1 = new Int32DataFrameColumn("Int1", Enumerable.Range(0, length).Select(x => x)); - DataFrameColumn dataFrameColumn2 = new Int32DataFrameColumn("Int2", Enumerable.Range(10, length).Select(x => x)); - if (withNulls) - { - dataFrameColumn1[length / 2] = null; - dataFrameColumn2[length / 2] = null; - } - DataFrame dataFrame = new DataFrame(); - dataFrame.Columns.Insert(0, dataFrameColumn1); - dataFrame.Columns.Insert(1, dataFrameColumn2); - return dataFrame; - } - - public static ArrowStringDataFrameColumn CreateArrowStringColumn(int length, bool withNulls = true) - { - byte[] dataMemory = new byte[length * 3]; - byte[] nullMemory = new byte[BitUtility.ByteCount(length)]; - byte[] offsetMemory = new byte[(length + 1) * 4]; - - // Initialize offset with 0 as the first value - offsetMemory[0] = 0; - offsetMemory[1] = 0; - offsetMemory[2] = 0; - offsetMemory[3] = 0; - - // Append "foo" length times, with a possible `null` in the middle - int validStringsIndex = 0; - for (int i = 0; i < length; i++) - { - if (withNulls && i == length / 2) - { - BitUtility.SetBit(nullMemory, i, false); - } - else - { - int dataMemoryIndex = validStringsIndex * 3; - dataMemory[dataMemoryIndex++] = 102; - dataMemory[dataMemoryIndex++] = 111; - dataMemory[dataMemoryIndex++] = 111; - BitUtility.SetBit(nullMemory, i, true); - - validStringsIndex++; - } - - // write the current length to (index + 1) - int offsetIndex = (i + 1) * 4; - int offsetValue = 3 * validStringsIndex; - byte[] offsetValueBytes = BitConverter.GetBytes(offsetValue); - offsetMemory[offsetIndex++] = offsetValueBytes[0]; - offsetMemory[offsetIndex++] = offsetValueBytes[1]; - offsetMemory[offsetIndex++] = offsetValueBytes[2]; - offsetMemory[offsetIndex++] = offsetValueBytes[3]; - } - - int nullCount = withNulls ? 1 : 0; - return new ArrowStringDataFrameColumn("ArrowString", dataMemory, offsetMemory, nullMemory, length, nullCount); - } - - public static VBufferDataFrameColumn CreateVBufferDataFrameColumn(int length) - { - var buffers = Enumerable.Repeat(new VBuffer(5, new[] { 0, 1, 2, 3, 4 }), length).ToArray(); - return new VBufferDataFrameColumn("VBuffer", buffers); - } - - public static DataFrame MakeDataFrameWithAllColumnTypes(int length, bool withNulls = true) - { - DataFrame df = MakeDataFrameWithAllMutableAndArrowColumnTypes(length, withNulls); - - var vBufferColumn = CreateVBufferDataFrameColumn(length); - df.Columns.Insert(df.Columns.Count, vBufferColumn); - - return df; - } - - public static DataFrame MakeDataFrameWithAllMutableAndArrowColumnTypes(int length, bool withNulls = true) - { - DataFrame df = MakeDataFrameWithAllMutableColumnTypes(length, withNulls); - DataFrameColumn arrowStringColumn = CreateArrowStringColumn(length, withNulls); - df.Columns.Insert(df.Columns.Count, arrowStringColumn); - - return df; - } - - public static DataFrame MakeDataFrameWithAllMutableColumnTypes(int length, bool withNulls = true) - { - DataFrame df = MakeDataFrameWithNumericStringAndDateTimeColumns(length, withNulls); - DataFrameColumn boolColumn = new BooleanDataFrameColumn("Bool", Enumerable.Range(0, length).Select(x => x % 2 == 0)); - df.Columns.Insert(df.Columns.Count, boolColumn); - if (withNulls) - { - boolColumn[length / 2] = null; - } - return df; - } - - public static DataFrame MakeDataFrameWithNumericAndBoolColumns(int length, bool withNulls = true) - { - DataFrame df = MakeDataFrameWithNumericColumns(length, withNulls); - DataFrameColumn boolColumn = new BooleanDataFrameColumn("Bool", Enumerable.Range(0, length).Select(x => x % 2 == 0)); - df.Columns.Insert(df.Columns.Count, boolColumn); - if (withNulls) - { - boolColumn[length / 2] = null; - } - return df; - } - - public static DataFrame MakeDataFrameWithNumericAndStringColumns(int length, bool withNulls = true) - { - DataFrame df = MakeDataFrameWithNumericColumns(length, withNulls); - DataFrameColumn stringColumn = new StringDataFrameColumn("String", Enumerable.Range(0, length).Select(x => x.ToString())); - df.Columns.Insert(df.Columns.Count, stringColumn); - if (withNulls) - { - stringColumn[length / 2] = null; - } - - DataFrameColumn charColumn = new CharDataFrameColumn("Char", Enumerable.Range(0, length).Select(x => (char)(x + 65))); - df.Columns.Insert(df.Columns.Count, charColumn); - if (withNulls) - { - charColumn[length / 2] = null; - } - return df; - } - - internal static DateTime SampleDateTime = new DateTime(2021, 06, 04); - public static DataFrame MakeDataFrameWithNumericStringAndDateTimeColumns(int length, bool withNulls = true) - { - DataFrame df = MakeDataFrameWithNumericAndStringColumns(length, withNulls); - - DataFrameColumn dateTimeColumn = new DateTimeDataFrameColumn("DateTime", Enumerable.Range(0, length).Select(x => SampleDateTime.AddDays(x))); - df.Columns.Insert(df.Columns.Count, dateTimeColumn); - if (withNulls) - { - dateTimeColumn[length / 2] = null; - } - return df; - } - - public static DataFrame MakeDataFrameWithNumericColumns(int length, bool withNulls = true, int startingFrom = 0) - { - IEnumerable range = Enumerable.Range(startingFrom, length); - - var byteColumn = new ByteDataFrameColumn("Byte", range.Select(x => (byte)x)); - var decimalColumn = new DecimalDataFrameColumn("Decimal", range.Select(x => (decimal)x)); - var doubleColumn = new DoubleDataFrameColumn("Double", range.Select(x => (double)x)); - var floatColumn = new SingleDataFrameColumn("Float", range.Select(x => (float)x)); - var intColumn = new Int32DataFrameColumn("Int", range.Select(x => x)); - var longColumn = new Int64DataFrameColumn("Long", range.Select(x => (long)x)); - var sbyteColumn = new SByteDataFrameColumn("Sbyte", range.Select(x => (sbyte)x)); - var shortColumn = new Int16DataFrameColumn("Short", range.Select(x => (short)x)); - var uintColumn = new UInt32DataFrameColumn("Uint", range.Select(x => (uint)x)); - var ulongColumn = new UInt64DataFrameColumn("Ulong", range.Select(x => (ulong)x)); - var ushortColumn = new UInt16DataFrameColumn("Ushort", range.Select(x => (ushort)x)); - - var columnsList = new List - { - byteColumn, - decimalColumn, - doubleColumn, - floatColumn, - intColumn, - longColumn, - sbyteColumn, - shortColumn, - uintColumn, - ulongColumn, - ushortColumn - }; - - var dataFrame = new DataFrame(columnsList); - - if (withNulls) - { - for (var i = 0; i < dataFrame.Columns.Count; i++) - { - dataFrame.Columns[i][length / 2] = null; - } - } - - return dataFrame; - } - - public static DataFrame MakeDataFrame(int length, bool withNulls = true) - where T1 : unmanaged - where T2 : unmanaged - { - DataFrameColumn baseColumn1 = DataFrameColumn.Create("Column1", Enumerable.Range(0, length).Select(x => (T1)Convert.ChangeType(x % 2 == 0 ? 0 : 1, typeof(T1)))); - DataFrameColumn baseColumn2 = DataFrameColumn.Create("Column2", Enumerable.Range(0, length).Select(x => (T2)Convert.ChangeType(x % 2 == 0 ? 0 : 1, typeof(T2)))); - DataFrame dataFrame = new DataFrame(new List { baseColumn1, baseColumn2 }); - - if (withNulls) - { - for (int i = 0; i < dataFrame.Columns.Count; i++) - { - dataFrame.Columns[i][length / 2] = null; - } - } - - return dataFrame; - } - - public DataFrame SplitTrainTest(DataFrame input, float testRatio, out DataFrame Test) - { - IEnumerable randomIndices = Enumerable.Range(0, (int)input.Rows.Count); - IEnumerable trainIndices = randomIndices.Take((int)(input.Rows.Count * testRatio)); - IEnumerable testIndices = randomIndices.Skip((int)(input.Rows.Count * testRatio)); - Test = input[testIndices]; - return input[trainIndices]; - } - - [Fact] - public void TestVBufferColumn_Creation() - { - var vBufferColumn = CreateVBufferDataFrameColumn(10); - - Assert.Equal(10, vBufferColumn.Length); - Assert.Equal(5, vBufferColumn[0].GetValues().Length); - Assert.Equal(0, vBufferColumn[0].GetValues()[0]); - } - - [Fact] - public void TestVBufferColumn_Indexer() - { - var buffer = new VBuffer(5, new[] { 4, 3, 2, 1, 0 }); - - var vBufferColumn = new VBufferDataFrameColumn("VBuffer", 1); - vBufferColumn[0] = buffer; - - Assert.Equal(1, vBufferColumn.Length); - Assert.Equal(5, vBufferColumn[0].GetValues().Length); - Assert.Equal(0, vBufferColumn[0].GetValues()[4]); - } - - [X64Fact("32-bit doesn't allow to allocate more than 2 Gb")] - public void TestVBufferColumn_Indexer_MoreThanMaxInt() - { - var originalValues = new[] { 4, 3, 2, 1, 0 }; - - var length = VBufferDataFrameColumn.MaxCapacity + 3; - - var vBufferColumn = new VBufferDataFrameColumn("VBuffer", length); - long index = length - 2; - - vBufferColumn[index] = new VBuffer(5, originalValues); - - var values = vBufferColumn[index].GetValues(); - - Assert.Equal(length, vBufferColumn.Length); - Assert.Equal(5, values.Length); - - for (int i = 0; i < values.Length; i++) - { - Assert.Equal(originalValues[i], values[i]); - } - } - [Fact] public void TestIndexer() { @@ -463,418 +201,6 @@ public void RenameColumnWithRenameColumnTests() Assert.True(ReferenceEquals(city, renamedColumn)); } - [Fact] - public void TestBinaryOperations() - { - DataFrame df = MakeDataFrameWithTwoColumns(12); - IReadOnlyList listOfInts = new List() { 5, 5 }; - - // The following binary ops return a copy - var ret = df.Add(5); - Assert.Equal(0, df[0, 0]); - Assert.Equal(5, ret[0, 0]); - ret = df.Add(listOfInts); - Assert.Equal(0, df[0, 0]); - Assert.Equal(5, ret[0, 0]); - ret = df.Subtract(5); - Assert.Equal(0, df[0, 0]); - Assert.Equal(-5, ret[0, 0]); - ret = df.Subtract(listOfInts); - Assert.Equal(0, df[0, 0]); - Assert.Equal(-5, ret[0, 0]); - ret = df.Multiply(5); - Assert.Equal(1, df[1, 0]); - Assert.Equal(5, ret[1, 0]); - ret = df.Multiply(listOfInts); - Assert.Equal(1, df[1, 0]); - Assert.Equal(5, ret[1, 0]); - ret = df.Divide(5); - Assert.Equal(5, df[5, 0]); - Assert.Equal(1, ret[5, 0]); - ret = df.Divide(listOfInts); - Assert.Equal(5, df[5, 0]); - Assert.Equal(1, ret[5, 0]); - ret = df.Modulo(5); - Assert.Equal(5, df[5, 0]); - Assert.Equal(0, ret[5, 0]); - ret = df.Modulo(listOfInts); - Assert.Equal(5, df[5, 0]); - Assert.Equal(0, ret[5, 0]); - - Assert.Equal(true, df.ElementwiseGreaterThanOrEqual(5)[7, 0]); - Assert.Equal(true, df.ElementwiseGreaterThanOrEqual(listOfInts)[7, 0]); - Assert.Equal(true, df.ElementwiseLessThanOrEqual(5)[4, 0]); - Assert.Equal(true, df.ElementwiseLessThanOrEqual(listOfInts)[4, 0]); - Assert.Equal(false, df.ElementwiseGreaterThan(5)[5, 0]); - Assert.Equal(false, df.ElementwiseGreaterThan(listOfInts)[5, 0]); - Assert.Equal(false, df.ElementwiseLessThan(5)[5, 0]); - Assert.Equal(false, df.ElementwiseLessThan(listOfInts)[5, 0]); - // The following binary ops are in place - Assert.Equal(5, df.Add(5, inPlace: true)[0, 0]); - Assert.Equal(10, df.Add(listOfInts, inPlace: true)[0, 0]); - Assert.Equal(5, df.Subtract(5, inPlace: true)[0, 0]); - Assert.Equal(0, df.Subtract(listOfInts, inPlace: true)[0, 0]); - Assert.Equal(5, df.Multiply(5, inPlace: true)[1, 0]); - Assert.Equal(25, df.Multiply(listOfInts, inPlace: true)[1, 0]); - Assert.Equal(5, df.Divide(5, inPlace: true)[1, 0]); - Assert.Equal(1, df.Divide(listOfInts, inPlace: true)[1, 0]); - Assert.Equal(1, df.Modulo(5, inPlace: true)[1, 0]); - Assert.Equal(1, df.Modulo(listOfInts, inPlace: true)[1, 0]); - Assert.Equal(2, df.LeftShift(1)[1, 0]); - Assert.Equal(1, df.RightShift(1)[2, 0]); - } - - [Fact] - public void TestBinaryOperationsWithColumns() - { - int length = 10; - var df1 = MakeDataFrameWithNumericColumns(length); - var df2 = MakeDataFrameWithNumericColumns(length); - - DataFrameColumn newColumn; - DataFrameColumn verify; - for (int i = 0; i < df1.Columns.Count; i++) - { - newColumn = df1.Columns[df1.Columns[i].Name] + df2.Columns[df2.Columns[i].Name]; - verify = newColumn.ElementwiseEquals(df1.Columns[i] * 2); - Assert.Equal(true, verify[0]); - - newColumn = df1.Columns[df1.Columns[i].Name] - df2.Columns[df2.Columns[i].Name]; - verify = newColumn.ElementwiseEquals(0); - Assert.Equal(true, verify[0]); - - newColumn = df1.Columns[df1.Columns[i].Name] * df2.Columns[df2.Columns[i].Name]; - verify = newColumn.ElementwiseEquals(df1.Columns[i] * df1.Columns[i]); - Assert.Equal(true, verify[0]); - - var df1Column = df1.Columns[i] + 1; - var df2Column = df2.Columns[i] + 1; - newColumn = df1Column / df2Column; - verify = newColumn.ElementwiseEquals(1); - Assert.Equal(true, verify[0]); - - newColumn = df1Column % df2Column; - verify = newColumn.ElementwiseEquals(0); - Assert.Equal(true, verify[0]); - - verify = df1.Columns[df1.Columns[i].Name].ElementwiseEquals(df2.Columns[df2.Columns[i].Name]); - Assert.True(verify.All()); - - verify = df1.Columns[df1.Columns[i].Name].ElementwiseNotEquals(df2.Columns[df2.Columns[i].Name]); - Assert.False(verify.Any()); - - verify = df1.Columns[df1.Columns[i].Name].ElementwiseGreaterThanOrEqual(df2.Columns[df2.Columns[i].Name]); - Assert.True(verify.All()); - - verify = df1.Columns[df1.Columns[i].Name].ElementwiseLessThanOrEqual(df2.Columns[df2.Columns[i].Name]); - Assert.True(verify.All()); - - verify = df1.Columns[df1.Columns[i].Name].ElementwiseGreaterThan(df2.Columns[df2.Columns[i].Name]); - Assert.False(verify.Any()); - - verify = df1.Columns[df1.Columns[i].Name].ElementwiseLessThan(df2.Columns[df2.Columns[i].Name]); - Assert.False(verify.Any()); - } - } - - [Fact] - public void TestBinaryOperationsWithConversions() - { - DataFrame df = DataFrameTests.MakeDataFrameWithTwoColumns(10); - - // Add a double to an int column - DataFrame dfd = df.Add(5.0f); - var dtype = dfd.Columns[0].DataType; - Assert.True(dtype == typeof(double)); - - // Add a decimal to an int column - DataFrame dfm = df.Add(5.0m); - dtype = dfm.Columns[0].DataType; - Assert.True(dtype == typeof(decimal)); - - // int + bool should throw - Assert.Throws(() => df.Add(true)); - - var dataFrameColumn1 = new DoubleDataFrameColumn("Double1", Enumerable.Range(0, 10).Select(x => (double)x)); - df.Columns[0] = dataFrameColumn1; - // Double + comparison ops should throw - Assert.Throws(() => df.And(true)); - } - - [Fact] - public void TestBinaryOperationsOnBoolColumn() - { - var df = new DataFrame(); - var dataFrameColumn1 = new BooleanDataFrameColumn("Bool1", Enumerable.Range(0, 10).Select(x => true)); - var dataFrameColumn2 = new BooleanDataFrameColumn("Bool2", Enumerable.Range(0, 10).Select(x => true)); - df.Columns.Insert(0, dataFrameColumn1); - df.Columns.Insert(1, dataFrameColumn2); - - // bool + int should throw - Assert.Throws(() => df.Add(5)); - // Left shift should throw - Assert.Throws(() => df.LeftShift(5)); - - IReadOnlyList listOfBools = new List() { true, false }; - // boolean and And should work - var newdf = df.And(true); - Assert.Equal(true, newdf[4, 0]); - var newdf1 = df.And(listOfBools); - Assert.Equal(false, newdf1[4, 1]); - - newdf = df.Or(true); - Assert.Equal(true, newdf[4, 0]); - newdf1 = df.Or(listOfBools); - Assert.Equal(true, newdf1[4, 1]); - - newdf = df.Xor(true); - Assert.Equal(false, newdf[4, 0]); - newdf1 = df.Xor(listOfBools); - Assert.Equal(true, newdf1[4, 1]); - } - - [Fact] - public void TestBinaryOperationsOnDateTimeColumn() - { - var df = new DataFrame(); - var dataFrameColumn1 = new DateTimeDataFrameColumn("DateTime1", Enumerable.Range(0, 5).Select(x => SampleDateTime.AddDays(x))); - // Make the second data frame column have one value that is different - var dataFrameColumn2 = new DateTimeDataFrameColumn("DateTime2", Enumerable.Range(0, 4).Select(x => SampleDateTime.AddDays(x))); - dataFrameColumn2.Append(SampleDateTime.AddDays(6)); - df.Columns.Insert(0, dataFrameColumn1); - df.Columns.Insert(1, dataFrameColumn2); - - // DateTime + int should throw - Assert.Throws(() => df.Add(5)); - // Left shift should throw - Assert.Throws(() => df.LeftShift(5)); - // Right shift should throw - Assert.Throws(() => df.RightShift(5)); - - // And should throw - Assert.Throws(() => df.And(true)); - // Or should throw - Assert.Throws(() => df.Or(true)); - // Xor should throw - Assert.Throws(() => df.Xor(true)); - - var equalsResult = dataFrameColumn1.ElementwiseEquals(dataFrameColumn2); - Assert.True(equalsResult[0]); - Assert.False(equalsResult[4]); - - var equalsToScalarResult = df["DateTime1"].ElementwiseEquals(SampleDateTime); - Assert.True(equalsToScalarResult[0]); - Assert.False(equalsToScalarResult[1]); - - var notEqualsResult = dataFrameColumn1.ElementwiseNotEquals(dataFrameColumn2); - Assert.False(notEqualsResult[0]); - Assert.True(notEqualsResult[4]); - - var notEqualsToScalarResult = df["DateTime1"].ElementwiseNotEquals(SampleDateTime); - Assert.False(notEqualsToScalarResult[0]); - Assert.True(notEqualsToScalarResult[1]); - } - - [Fact] - public void TestBinaryOperationsOnArrowStringColumn() - { - var df = new DataFrame(); - var strArrayBuilder = new StringArray.Builder(); - for (int i = 0; i < 10; i++) - { - strArrayBuilder.Append(i.ToString()); - } - StringArray strArray = strArrayBuilder.Build(); - - ArrowStringDataFrameColumn stringColumn = new ArrowStringDataFrameColumn("String", strArray.ValueBuffer.Memory, strArray.ValueOffsetsBuffer.Memory, strArray.NullBitmapBuffer.Memory, strArray.Length, strArray.NullCount); - df.Columns.Insert(0, stringColumn); - - DataFrameColumn newCol = stringColumn.ElementwiseEquals(4); - Assert.Equal(true, newCol[4]); - Assert.Equal(false, newCol[0]); - Assert.Equal(false, newCol[5]); - - newCol = stringColumn.ElementwiseEquals("4"); - Assert.Equal(true, newCol[4]); - Assert.Equal(false, newCol[0]); - - newCol = stringColumn.ElementwiseEquals("foo"); - Assert.False(newCol.All()); - newCol = stringColumn.ElementwiseEquals(null); - Assert.False(newCol.All()); - - ArrowStringDataFrameColumn stringColumnCopy = new ArrowStringDataFrameColumn("String", strArray.ValueBuffer.Memory, strArray.ValueOffsetsBuffer.Memory, strArray.NullBitmapBuffer.Memory, strArray.Length, strArray.NullCount); - newCol = stringColumn.ElementwiseEquals(stringColumnCopy); - Assert.True(newCol.All()); - - DataFrameColumn stringColumnCopyAsBaseColumn = stringColumnCopy; - newCol = stringColumn.ElementwiseEquals(stringColumnCopyAsBaseColumn); - Assert.True(newCol.All()); - - newCol = stringColumn.ElementwiseNotEquals(5); - Assert.Equal(true, newCol[0]); - Assert.Equal(false, newCol[5]); - - newCol = stringColumn.ElementwiseNotEquals("5"); - Assert.Equal(true, newCol[0]); - Assert.Equal(false, newCol[5]); - - newCol = stringColumn.ElementwiseNotEquals("foo"); - Assert.True(newCol.All()); - newCol = stringColumn.ElementwiseNotEquals(null); - Assert.True(newCol.All()); - - newCol = stringColumn.ElementwiseNotEquals(stringColumnCopy); - Assert.False(newCol.All()); - - newCol = stringColumn.ElementwiseNotEquals(stringColumnCopyAsBaseColumn); - Assert.False(newCol.All()); - } - - [Fact] - public void TestBinaryOperationsOnStringColumn() - { - var df = new DataFrame(); - DataFrameColumn stringColumn = new StringDataFrameColumn("String", Enumerable.Range(0, 10).Select(x => x.ToString())); - df.Columns.Insert(0, stringColumn); - - DataFrameColumn newCol = stringColumn.ElementwiseEquals(5); - Assert.Equal(true, newCol[5]); - Assert.Equal(false, newCol[0]); - - newCol = (stringColumn as StringDataFrameColumn).ElementwiseEquals("5"); - Assert.Equal(true, newCol[5]); - Assert.Equal(false, newCol[0]); - - DataFrameColumn stringColumnCopy = new StringDataFrameColumn("String", Enumerable.Range(0, 10).Select(x => x.ToString())); - newCol = stringColumn.ElementwiseEquals(stringColumnCopy); - Assert.Equal(true, newCol[5]); - Assert.Equal(true, newCol[0]); - - StringDataFrameColumn typedStringColumn = stringColumn as StringDataFrameColumn; - StringDataFrameColumn typedStringColumnCopy = stringColumnCopy as StringDataFrameColumn; - newCol = typedStringColumn.ElementwiseEquals(typedStringColumnCopy); - Assert.True(newCol.All()); - - newCol = stringColumn.ElementwiseNotEquals(5); - Assert.Equal(false, newCol[5]); - Assert.Equal(true, newCol[0]); - - newCol = typedStringColumn.ElementwiseNotEquals("5"); - Assert.Equal(false, newCol[5]); - Assert.Equal(true, newCol[0]); - - newCol = stringColumn.ElementwiseNotEquals(stringColumnCopy); - Assert.Equal(false, newCol[5]); - Assert.Equal(false, newCol[0]); - - newCol = typedStringColumn.ElementwiseNotEquals(typedStringColumnCopy); - Assert.False(newCol.All()); - - newCol = typedStringColumn.Add("suffix"); - for (int i = 0; i < newCol.Length; i++) - { - Assert.Equal(newCol[i], typedStringColumn[i] + "suffix"); - } - DataFrameColumn addString = typedStringColumn + "suffix"; - for (int i = 0; i < addString.Length; i++) - { - Assert.Equal(addString[i], typedStringColumn[i] + "suffix"); - } - Assert.True(newCol.ElementwiseEquals(addString).All()); - addString = "prefix" + typedStringColumn; - for (int i = 0; i < addString.Length; i++) - { - Assert.Equal(addString[i], "prefix" + typedStringColumn[i]); - } - } - - [Fact] - public void TestBinaryOperatorsWithConversions() - { - var df = MakeDataFrameWithNumericColumns(10); - - DataFrame tempDf = df + 1; - Assert.Equal(tempDf[0, 0], (byte)df[0, 0] + (double)1); - tempDf = df + 1.1; - Assert.Equal(tempDf[0, 0], (byte)df[0, 0] + 1.1); - tempDf = df + 1.1m; - Assert.Equal(tempDf[0, 0], (byte)df[0, 0] + 1.1m); - Assert.True(typeof(decimal) == tempDf.Columns["Int"].DataType); - - tempDf = df - 1.1; - Assert.Equal(tempDf[0, 0], (byte)df[0, 0] - 1.1); - tempDf = df - 1.1m; - Assert.Equal(tempDf[0, 0], (byte)df[0, 0] - 1.1m); - Assert.True(typeof(decimal) == tempDf.Columns["Int"].DataType); - - tempDf = df * 1.1; - Assert.Equal(tempDf[0, 0], (byte)df[0, 0] * 1.1); - tempDf = df * 1.1m; - Assert.Equal(tempDf[0, 0], (byte)df[0, 0] * 1.1m); - Assert.True(typeof(decimal) == tempDf.Columns["Int"].DataType); - - tempDf = df / 1.1; - Assert.Equal(tempDf[0, 0], (byte)df[0, 0] / 1.1); - tempDf = df / 1.1m; - Assert.Equal(tempDf[0, 0], (byte)df[0, 0] / 1.1m); - Assert.True(typeof(decimal) == tempDf.Columns["Int"].DataType); - - tempDf = df % 1.1; - Assert.Equal(tempDf[0, 0], (byte)df[0, 0] % 1.1); - tempDf = df % 1.1m; - Assert.Equal(tempDf[0, 0], (byte)df[0, 0] % 1.1m); - Assert.True(typeof(decimal) == tempDf.Columns["Int"].DataType); - - tempDf = 1 + df; - Assert.Equal(tempDf[0, 0], (byte)df[0, 0] + (double)1); - tempDf = 1.1 + df; - Assert.Equal(tempDf[0, 0], (byte)df[0, 0] + 1.1); - tempDf = 1.1m + df; - Assert.Equal(tempDf[0, 0], (byte)df[0, 0] + 1.1m); - Assert.True(typeof(decimal) == tempDf.Columns["Int"].DataType); - - tempDf = 1.1 - df; - Assert.Equal(tempDf[0, 0], 1.1 - (byte)df[0, 0]); - tempDf = 1.1m - df; - Assert.Equal(tempDf[0, 0], 1.1m - (byte)df[0, 0]); - Assert.True(typeof(decimal) == tempDf.Columns["Int"].DataType); - - tempDf = 1.1 * df; - Assert.Equal(tempDf[0, 0], (byte)df[0, 0] * 1.1); - tempDf = 1.1m * df; - Assert.Equal(tempDf[0, 0], (byte)df[0, 0] * 1.1m); - Assert.True(typeof(decimal) == tempDf.Columns["Int"].DataType); - - // To prevent a divide by zero - var plusOne = df + 1; - tempDf = 1.1 / plusOne; - Assert.Equal(tempDf[0, 0], 1.1 / (double)plusOne[0, 0]); - var plusDecimal = df + 1.1m; - tempDf = 1.1m / plusDecimal; - Assert.Equal(tempDf[0, 0], (1.1m) / (decimal)plusDecimal[0, 0]); - Assert.True(typeof(decimal) == tempDf.Columns["Int"].DataType); - - tempDf = 1.1 % plusOne; - Assert.Equal(tempDf[0, 0], 1.1 % (double)plusOne[0, 0]); - tempDf = 1.1m % plusDecimal; - Assert.Equal(tempDf[0, 0], 1.1m % (decimal)plusDecimal[0, 0]); - Assert.True(typeof(decimal) == tempDf.Columns["Int"].DataType); - - Assert.Equal((byte)0, df[0, 0]); - } - - [Fact] - public void TestBinaryOperationsOnColumns() - { - Int32DataFrameColumn column = new Int32DataFrameColumn("Int", Enumerable.Range(0, 10)); - Assert.ThrowsAny(() => column.Add(5.5, inPlace: true)); - Assert.ThrowsAny(() => column.ReverseAdd(5.5, inPlace: true)); - string str = "A String"; - Assert.ThrowsAny(() => column.Add(str, inPlace: true)); - Assert.ThrowsAny(() => column.ReverseAdd(str, inPlace: true)); - } - [Fact] public void TestColumnReverseOrderState() { @@ -893,467 +219,6 @@ public void TestProjectionAndAppend() Assert.Equal(16, df.Columns["Int3"][2]); } - [Fact] - public void TestComputations() - { - DataFrame df = MakeDataFrameWithAllMutableColumnTypes(10); - df["Int"][0] = -10; - Assert.Equal(-10, df.Columns["Int"][0]); - - DataFrameColumn absColumn = df.Columns["Int"].Abs(); - Assert.Equal(10, absColumn[0]); - Assert.Equal(-10, df.Columns["Int"][0]); - df.Columns["Int"].Abs(true); - Assert.Equal(10, df.Columns["Int"][0]); - - Assert.Throws(() => df.Columns["Byte"].All()); - Assert.Throws(() => df.Columns["Byte"].Any()); - Assert.Throws(() => df.Columns["Char"].All()); - Assert.Throws(() => df.Columns["Char"].Any()); - Assert.Throws(() => df.Columns["DateTime"].All()); - Assert.Throws(() => df.Columns["DateTime"].Any()); - Assert.Throws(() => df.Columns["Decimal"].All()); - Assert.Throws(() => df.Columns["Decimal"].Any()); - Assert.Throws(() => df.Columns["Double"].All()); - Assert.Throws(() => df.Columns["Double"].Any()); - Assert.Throws(() => df.Columns["Float"].All()); - Assert.Throws(() => df.Columns["Float"].Any()); - Assert.Throws(() => df.Columns["Int"].All()); - Assert.Throws(() => df.Columns["Int"].Any()); - Assert.Throws(() => df.Columns["Long"].All()); - Assert.Throws(() => df.Columns["Long"].Any()); - Assert.Throws(() => df.Columns["Sbyte"].All()); - Assert.Throws(() => df.Columns["Sbyte"].Any()); - Assert.Throws(() => df.Columns["Short"].All()); - Assert.Throws(() => df.Columns["Short"].Any()); - Assert.Throws(() => df.Columns["Uint"].All()); - Assert.Throws(() => df.Columns["Uint"].Any()); - Assert.Throws(() => df.Columns["Ulong"].All()); - Assert.Throws(() => df.Columns["Ulong"].Any()); - Assert.Throws(() => df.Columns["Ushort"].All()); - Assert.Throws(() => df.Columns["Ushort"].Any()); - - bool any = df.Columns["Bool"].Any(); - bool all = df.Columns["Bool"].All(); - Assert.True(any); - Assert.False(all); - - // Test the computation results - df.Columns["Double"][0] = 100.0; - DataFrameColumn doubleColumn = df.Columns["Double"].CumulativeMax(); - for (int i = 0; i < doubleColumn.Length; i++) - { - if (i == 5) - Assert.Null(doubleColumn[i]); - else - Assert.Equal(100.0, (double)doubleColumn[i]); - } - Assert.Equal(1.0, df.Columns["Double"][1]); - df.Columns["Double"].CumulativeMax(true); - for (int i = 0; i < df.Columns["Double"].Length; i++) - { - if (i == 5) - Assert.Null(df.Columns["Double"][i]); - else - Assert.Equal(100.0, (double)df.Columns["Double"][i]); - } - - df.Columns["Float"][0] = -10.0f; - DataFrameColumn floatColumn = df.Columns["Float"].CumulativeMin(); - for (int i = 0; i < floatColumn.Length; i++) - { - if (i == 5) - Assert.Null(floatColumn[i]); - else - Assert.Equal(-10.0f, (float)floatColumn[i]); - } - Assert.Equal(9.0f, df.Columns["Float"][9]); - df.Columns["Float"].CumulativeMin(true); - for (int i = 0; i < df.Columns["Float"].Length; i++) - { - if (i == 5) - Assert.Null(df.Columns["Float"][i]); - else - Assert.Equal(-10.0f, (float)df.Columns["Float"][i]); - } - - DataFrameColumn uintColumn = df.Columns["Uint"].CumulativeProduct(); - Assert.Equal((uint)0, uintColumn[8]); - Assert.Equal((uint)8, df.Columns["Uint"][8]); - df.Columns["Uint"].CumulativeProduct(true); - Assert.Equal((uint)0, df.Columns["Uint"][9]); - - DataFrameColumn ushortColumn = df.Columns["Ushort"].CumulativeSum(); - Assert.Equal((ushort)40, ushortColumn[9]); - Assert.Equal((ushort)9, df.Columns["Ushort"][9]); - df.Columns["Ushort"].CumulativeSum(true); - Assert.Equal((ushort)40, df.Columns["Ushort"][9]); - - Assert.Equal(100.0, df.Columns["Double"].Max()); - Assert.Equal(-10.0f, df.Columns["Float"].Min()); - Assert.Equal((uint)0, df.Columns["Uint"].Product()); - Assert.Equal((ushort)130, df.Columns["Ushort"].Sum()); - - df.Columns["Double"][0] = 100.1; - Assert.Equal(100.1, df.Columns["Double"][0]); - DataFrameColumn roundColumn = df.Columns["Double"].Round(); - Assert.Equal(100.0, roundColumn[0]); - Assert.Equal(100.1, df.Columns["Double"][0]); - df.Columns["Double"].Round(true); - Assert.Equal(100.0, df.Columns["Double"][0]); - - // Test that none of the numeric column types throw - for (int i = 0; i < df.Columns.Count; i++) - { - DataFrameColumn column = df.Columns[i]; - if (column.DataType == typeof(bool)) - { - Assert.Throws(() => column.CumulativeMax()); - Assert.Throws(() => column.CumulativeMin()); - Assert.Throws(() => column.CumulativeProduct()); - Assert.Throws(() => column.CumulativeSum()); - Assert.Throws(() => column.Max()); - Assert.Throws(() => column.Min()); - Assert.Throws(() => column.Product()); - Assert.Throws(() => column.Sum()); - continue; - } - else if (column.DataType == typeof(string)) - { - Assert.Throws(() => column.CumulativeMax()); - Assert.Throws(() => column.CumulativeMin()); - Assert.Throws(() => column.CumulativeProduct()); - Assert.Throws(() => column.CumulativeSum()); - Assert.Throws(() => column.Max()); - Assert.Throws(() => column.Min()); - Assert.Throws(() => column.Product()); - Assert.Throws(() => column.Sum()); - continue; - } - else if (column.DataType == typeof(DateTime)) - { - column.CumulativeMax(); - column.CumulativeMin(); - column.Max(); - column.Min(); - - Assert.Throws(() => column.CumulativeProduct()); - Assert.Throws(() => column.CumulativeSum()); - Assert.Throws(() => column.Product()); - Assert.Throws(() => column.Sum()); - continue; - } - - column.CumulativeMax(); - column.CumulativeMin(); - column.CumulativeProduct(); - column.CumulativeSum(); - column.Max(); - column.Min(); - column.Product(); - column.Sum(); - } - } - - [Fact] - public void TestComputationsIncludingDateTime() - { - DataFrame df = MakeDataFrameWithNumericStringAndDateTimeColumns(10); - df["Int"][0] = -10; - Assert.Equal(-10, df.Columns["Int"][0]); - - DataFrameColumn absColumn = df.Columns["Int"].Abs(); - Assert.Equal(10, absColumn[0]); - Assert.Equal(-10, df.Columns["Int"][0]); - df.Columns["Int"].Abs(true); - Assert.Equal(10, df.Columns["Int"][0]); - - Assert.Throws(() => df.Columns["Byte"].All()); - Assert.Throws(() => df.Columns["Byte"].Any()); - Assert.Throws(() => df.Columns["Char"].All()); - Assert.Throws(() => df.Columns["Char"].Any()); - Assert.Throws(() => df.Columns["Decimal"].All()); - Assert.Throws(() => df.Columns["Decimal"].Any()); - Assert.Throws(() => df.Columns["Double"].All()); - Assert.Throws(() => df.Columns["Double"].Any()); - Assert.Throws(() => df.Columns["Float"].All()); - Assert.Throws(() => df.Columns["Float"].Any()); - Assert.Throws(() => df.Columns["Int"].All()); - Assert.Throws(() => df.Columns["Int"].Any()); - Assert.Throws(() => df.Columns["Long"].All()); - Assert.Throws(() => df.Columns["Long"].Any()); - Assert.Throws(() => df.Columns["Sbyte"].All()); - Assert.Throws(() => df.Columns["Sbyte"].Any()); - Assert.Throws(() => df.Columns["Short"].All()); - Assert.Throws(() => df.Columns["Short"].Any()); - Assert.Throws(() => df.Columns["Uint"].All()); - Assert.Throws(() => df.Columns["Uint"].Any()); - Assert.Throws(() => df.Columns["Ulong"].All()); - Assert.Throws(() => df.Columns["Ulong"].Any()); - Assert.Throws(() => df.Columns["Ushort"].All()); - Assert.Throws(() => df.Columns["Ushort"].Any()); - Assert.Throws(() => df.Columns["DateTime"].All()); - Assert.Throws(() => df.Columns["DateTime"].Any()); - - // Test the computation results - var maxDate = SampleDateTime.AddDays(100); - df.Columns["DateTime"][0] = maxDate; - DataFrameColumn dateTimeColumn = df.Columns["DateTime"].CumulativeMax(); - for (int i = 0; i < dateTimeColumn.Length; i++) - { - if (i == 5) - Assert.Null(dateTimeColumn[i]); - else - Assert.Equal(maxDate, (DateTime)dateTimeColumn[i]); - } - Assert.Equal(maxDate, dateTimeColumn.Max()); - - df.Columns["Double"][0] = 100.0; - DataFrameColumn doubleColumn = df.Columns["Double"].CumulativeMax(); - for (int i = 0; i < doubleColumn.Length; i++) - { - if (i == 5) - Assert.Null(doubleColumn[i]); - else - Assert.Equal(100.0, (double)doubleColumn[i]); - } - Assert.Equal(1.0, df.Columns["Double"][1]); - df.Columns["Double"].CumulativeMax(true); - for (int i = 0; i < df.Columns["Double"].Length; i++) - { - if (i == 5) - Assert.Null(df.Columns["Double"][i]); - else - Assert.Equal(100.0, (double)df.Columns["Double"][i]); - } - - df.Columns["Float"][0] = -10.0f; - DataFrameColumn floatColumn = df.Columns["Float"].CumulativeMin(); - for (int i = 0; i < floatColumn.Length; i++) - { - if (i == 5) - Assert.Null(floatColumn[i]); - else - Assert.Equal(-10.0f, (float)floatColumn[i]); - } - Assert.Equal(9.0f, df.Columns["Float"][9]); - df.Columns["Float"].CumulativeMin(true); - for (int i = 0; i < df.Columns["Float"].Length; i++) - { - if (i == 5) - Assert.Null(df.Columns["Float"][i]); - else - Assert.Equal(-10.0f, (float)df.Columns["Float"][i]); - } - - DataFrameColumn uintColumn = df.Columns["Uint"].CumulativeProduct(); - Assert.Equal((uint)0, uintColumn[8]); - Assert.Equal((uint)8, df.Columns["Uint"][8]); - df.Columns["Uint"].CumulativeProduct(true); - Assert.Equal((uint)0, df.Columns["Uint"][9]); - - DataFrameColumn ushortColumn = df.Columns["Ushort"].CumulativeSum(); - Assert.Equal((ushort)40, ushortColumn[9]); - Assert.Equal((ushort)9, df.Columns["Ushort"][9]); - df.Columns["Ushort"].CumulativeSum(true); - Assert.Equal((ushort)40, df.Columns["Ushort"][9]); - - Assert.Equal(100.0, df.Columns["Double"].Max()); - Assert.Equal(-10.0f, df.Columns["Float"].Min()); - Assert.Equal((uint)0, df.Columns["Uint"].Product()); - Assert.Equal((ushort)130, df.Columns["Ushort"].Sum()); - - df.Columns["Double"][0] = 100.1; - Assert.Equal(100.1, df.Columns["Double"][0]); - DataFrameColumn roundColumn = df.Columns["Double"].Round(); - Assert.Equal(100.0, roundColumn[0]); - Assert.Equal(100.1, df.Columns["Double"][0]); - df.Columns["Double"].Round(true); - Assert.Equal(100.0, df.Columns["Double"][0]); - - // Test that none of the numeric column types throw - for (int i = 0; i < df.Columns.Count; i++) - { - DataFrameColumn column = df.Columns[i]; - if (column.DataType == typeof(bool)) - { - Assert.Throws(() => column.CumulativeMax()); - Assert.Throws(() => column.CumulativeMin()); - Assert.Throws(() => column.CumulativeProduct()); - Assert.Throws(() => column.CumulativeSum()); - Assert.Throws(() => column.Max()); - Assert.Throws(() => column.Min()); - Assert.Throws(() => column.Product()); - Assert.Throws(() => column.Sum()); - continue; - } - else if (column.DataType == typeof(string)) - { - Assert.Throws(() => column.CumulativeMax()); - Assert.Throws(() => column.CumulativeMin()); - Assert.Throws(() => column.CumulativeProduct()); - Assert.Throws(() => column.CumulativeSum()); - Assert.Throws(() => column.Max()); - Assert.Throws(() => column.Min()); - Assert.Throws(() => column.Product()); - Assert.Throws(() => column.Sum()); - continue; - } - else if (column.DataType == typeof(DateTime)) - { - Assert.Throws(() => column.CumulativeProduct()); - Assert.Throws(() => column.CumulativeSum()); - Assert.Throws(() => column.Product()); - Assert.Throws(() => column.Sum()); - continue; - } - column.CumulativeMax(); - column.CumulativeMin(); - column.CumulativeProduct(); - column.CumulativeSum(); - column.Max(); - column.Min(); - column.Product(); - column.Sum(); - } - } - - [Fact] - public void TestIntComputations_MaxMin_WithNulls() - { - var column = new Int32DataFrameColumn("Int", new int?[] - { - null, - 2, - 1, - 4, - 3, - null - }); - - Assert.Equal(1, column.Min()); - Assert.Equal(4, column.Max()); - } - - [Fact] - public void TestIntSum_OnColumnWithNullsOnly() - { - var column = new Int32DataFrameColumn("Int", new int?[] { null, null }); - Assert.Null(column.Sum()); - } - - [Fact] - public void TestIntSum_OnEmptyColumn() - { - var column = new Int32DataFrameColumn("Int"); - Assert.Null(column.Sum()); - } - - [Fact] - public void TestIntComputations_MaxMin_OnEmptyColumn() - { - var column = new Int32DataFrameColumn("Int"); - - Assert.Null(column.Min()); - Assert.Null(column.Max()); - } - - [Fact] - public void TestDateTimeComputations_MaxMin_OnEmptyColumn() - { - var column = new DateTimeDataFrameColumn("DateTime"); - - Assert.Null(column.Min()); - Assert.Null(column.Max()); - } - - [Fact] - public void TestDateTimeComputations_MaxMin_WithNulls() - { - var dateTimeColumn = new DateTimeDataFrameColumn("DateTime", new DateTime?[] - { - null, - new DateTime(2022, 1, 1), - new DateTime(2020, 1, 1), - new DateTime(2023, 1, 1), - new DateTime(2021, 1, 1), - null - }); - - Assert.Equal(new DateTime(2020, 1, 1), dateTimeColumn.Min()); - Assert.Equal(new DateTime(2023, 1, 1), dateTimeColumn.Max()); - } - - [Theory] - [InlineData(5, 10)] - [InlineData(-15, 10)] - [InlineData(-5, 10)] - public void TestComputations_WithNegativeNumbers_MaxMin_Calculated(int startingFrom, int length) - { - // Arrange - - IEnumerable range = Enumerable.Range(startingFrom, length); - - int max = range.Max(); - int min = range.Min(); - - DataFrame df = MakeDataFrameWithNumericColumns(length, withNulls: false, startingFrom); - - var byteColumn = (PrimitiveDataFrameColumn)df.Columns["Byte"]; - var decimalColumn = (PrimitiveDataFrameColumn)df.Columns["Decimal"]; - var doubleColumn = (PrimitiveDataFrameColumn)df.Columns["Double"]; - var floatColumn = (PrimitiveDataFrameColumn)df.Columns["Float"]; - var intColumn = (PrimitiveDataFrameColumn)df.Columns["Int"]; - var longColumn = (PrimitiveDataFrameColumn)df.Columns["Long"]; - var sbyteColumn = (PrimitiveDataFrameColumn)df.Columns["Sbyte"]; - var shortColumn = (PrimitiveDataFrameColumn)df.Columns["Short"]; - var uintColumn = (PrimitiveDataFrameColumn)df.Columns["Uint"]; - var ulongColumn = (PrimitiveDataFrameColumn)df.Columns["Ulong"]; - var ushortColumn = (PrimitiveDataFrameColumn)df.Columns["Ushort"]; - - // Act, Assert - - // We need to iterate over all range with conversion to byte due to negative numbers issue - Assert.Equal((byte)byteColumn.Max(), range.Select(x => (byte)x).Max()); - - Assert.Equal((decimal)decimalColumn.Max(), (decimal)max); - Assert.Equal((double)doubleColumn.Max(), (double)max); - Assert.Equal((float)floatColumn.Max(), (float)max); - Assert.Equal((int)intColumn.Max(), (int)max); - Assert.Equal((long)longColumn.Max(), (long)max); - Assert.Equal((sbyte)sbyteColumn.Max(), (sbyte)max); - Assert.Equal((short)shortColumn.Max(), (short)max); - - // We need to iterate over all range with conversion to uint due to negative numbers issue - Assert.Equal((uint)uintColumn.Max(), range.Select(x => (uint)x).Max()); - - // We need to iterate over all range with conversion to ulong due to negative numbers issue - Assert.Equal((ulong)ulongColumn.Max(), range.Select(x => (ulong)x).Max()); - - // We need to iterate over all range with conversion to ushort due to negative numbers issue - Assert.Equal((ushort)ushortColumn.Max(), range.Select(x => (ushort)x).Max()); - - // We need to iterate over all range with conversion to byte due to negative numbers issue - Assert.Equal((byte)byteColumn.Min(), range.Select(x => (byte)x).Min()); - - Assert.Equal((decimal)decimalColumn.Min(), (decimal)min); - Assert.Equal((double)doubleColumn.Min(), (double)min); - Assert.Equal((float)floatColumn.Min(), (float)min); - Assert.Equal((int)intColumn.Min(), (int)min); - Assert.Equal((long)longColumn.Min(), (long)min); - Assert.Equal((sbyte)sbyteColumn.Min(), (sbyte)min); - Assert.Equal((short)shortColumn.Min(), (short)min); - - // We need to iterate over all range with conversion to uint due to negative numbers issue - Assert.Equal((uint)uintColumn.Min(), range.Select(x => (uint)x).Min()); - - // We need to iterate over all range with conversion to ulong due to negative numbers issue - Assert.Equal((ulong)ulongColumn.Min(), range.Select(x => (ulong)x).Min()); - - // We need to iterate over all range with conversion to ushort due to negative numbers issue - Assert.Equal((ushort)ushortColumn.Min(), range.Select(x => (ushort)x).Min()); - } - [Fact] public void TestOrderBy() { @@ -1390,316 +255,6 @@ public void TestOrderBy() Assert.Equal(9, sortedDf.Columns["Int"][1]); } - [Fact] - public void TestSplitAndSort() - { - DataFrame df = MakeDataFrameWithAllMutableColumnTypes(20); - df.Columns["Int"][0] = 100000; - df.Columns["Int"][df.Rows.Count - 1] = -1; - df.Columns["Int"][5] = 200000; - DataFrame dfTest; - DataFrame dfTrain = SplitTrainTest(df, 0.8f, out dfTest); - - // Sort by "Int" in ascending order - var sortedDf = dfTrain.OrderBy("Int"); - Assert.Null(sortedDf.Columns["Int"][sortedDf.Rows.Count - 1]); - Assert.Equal(1, sortedDf.Columns["Int"][0]); - Assert.Equal(100000, sortedDf.Columns["Int"][sortedDf.Rows.Count - 3]); - Assert.Equal(200000, sortedDf.Columns["Int"][sortedDf.Rows.Count - 2]); - } - - [Fact] - public void TestStringColumnSort() - { - // StringDataFrameColumn specific sort tests - StringDataFrameColumn strColumn = new StringDataFrameColumn("String", 0); - Assert.Equal(0, strColumn.NullCount); - for (int i = 0; i < 5; i++) - { - strColumn.Append(null); - } - Assert.Equal(5, strColumn.NullCount); - // Should handle all nulls - StringDataFrameColumn sortedStrColumn = strColumn.Sort() as StringDataFrameColumn; - Assert.Equal(5, sortedStrColumn.NullCount); - Assert.Null(sortedStrColumn[0]); - - for (int i = 0; i < 5; i++) - { - strColumn.Append(i.ToString()); - } - Assert.Equal(5, strColumn.NullCount); - - // Ascending sort - sortedStrColumn = strColumn.Sort() as StringDataFrameColumn; - Assert.Equal("0", sortedStrColumn[0]); - Assert.Null(sortedStrColumn[9]); - - // Descending sort - sortedStrColumn = strColumn.Sort(false) as StringDataFrameColumn; - Assert.Equal("4", sortedStrColumn[0]); - Assert.Null(sortedStrColumn[9]); - } - - [Theory] - [InlineData(5)] - [InlineData(12)] - [InlineData(100)] - [InlineData(1000)] - public void TestPrimitiveColumnSort(int numberOfNulls) - { - // Primitive Column Sort - Int32DataFrameColumn intColumn = new Int32DataFrameColumn("Int", 0); - Assert.Equal(0, intColumn.NullCount); - intColumn.AppendMany(null, numberOfNulls); - Assert.Equal(numberOfNulls, intColumn.NullCount); - - // Should handle all nulls - PrimitiveDataFrameColumn sortedIntColumn = intColumn.Sort(); - Assert.Equal(numberOfNulls, sortedIntColumn.NullCount); - Assert.Null(sortedIntColumn[0]); - - for (int i = 0; i < 5; i++) - { - intColumn.Append(i); - } - Assert.Equal(numberOfNulls, intColumn.NullCount); - - // Ascending sort - sortedIntColumn = intColumn.Sort(); - Assert.Equal(0, sortedIntColumn[0]); - Assert.Null(sortedIntColumn[9]); - - // Descending sort - sortedIntColumn = intColumn.Sort(ascending: false); - Assert.Equal(4, sortedIntColumn[0]); - Assert.Null(sortedIntColumn[9]); - } - - [Fact] - public void TestSortWithDifferentNullCountsInColumns() - { - DataFrame dataFrame = MakeDataFrameWithAllMutableColumnTypes(10); - dataFrame["Int"][3] = null; - dataFrame["String"][3] = null; - DataFrame sorted = dataFrame.OrderBy("Int"); - void Verify(DataFrame sortedDataFrame) - { - Assert.Equal(10, sortedDataFrame.Rows.Count); - DataFrameRow lastRow = sortedDataFrame.Rows[sortedDataFrame.Rows.Count - 1]; - DataFrameRow penultimateRow = sortedDataFrame.Rows[sortedDataFrame.Rows.Count - 2]; - foreach (object value in lastRow) - { - Assert.Null(value); - } - - for (int i = 0; i < sortedDataFrame.Columns.Count; i++) - { - string columnName = sortedDataFrame.Columns[i].Name; - if (columnName != "String" && columnName != "Int") - { - Assert.Equal(dataFrame[columnName][3], penultimateRow[i]); - } - else if (columnName == "String" || columnName == "Int") - { - Assert.Null(penultimateRow[i]); - } - } - } - - Verify(sorted); - - sorted = dataFrame.OrderBy("String"); - Verify(sorted); - } - - private void VerifyJoin(DataFrame join, DataFrame left, DataFrame right, JoinAlgorithm joinAlgorithm) - { - Int64DataFrameColumn mapIndices = new Int64DataFrameColumn("map", join.Rows.Count); - for (long i = 0; i < join.Rows.Count; i++) - { - mapIndices[i] = i; - } - for (int i = 0; i < join.Columns.Count; i++) - { - DataFrameColumn joinColumn = join.Columns[i]; - DataFrameColumn isEqual; - - if (joinAlgorithm == JoinAlgorithm.Left) - { - if (i < left.Columns.Count) - { - DataFrameColumn leftColumn = left.Columns[i]; - isEqual = joinColumn.ElementwiseEquals(leftColumn); - } - else - { - int columnIndex = i - left.Columns.Count; - DataFrameColumn rightColumn = right.Columns[columnIndex]; - DataFrameColumn compareColumn = rightColumn.Length <= join.Rows.Count ? rightColumn.Clone(numberOfNullsToAppend: join.Rows.Count - rightColumn.Length) : rightColumn.Clone(mapIndices); - isEqual = joinColumn.ElementwiseEquals(compareColumn); - } - } - else if (joinAlgorithm == JoinAlgorithm.Right) - { - if (i < left.Columns.Count) - { - DataFrameColumn leftColumn = left.Columns[i]; - DataFrameColumn compareColumn = leftColumn.Length <= join.Rows.Count ? leftColumn.Clone(numberOfNullsToAppend: join.Rows.Count - leftColumn.Length) : leftColumn.Clone(mapIndices); - isEqual = joinColumn.ElementwiseEquals(compareColumn); - } - else - { - int columnIndex = i - left.Columns.Count; - DataFrameColumn rightColumn = right.Columns[columnIndex]; - isEqual = joinColumn.ElementwiseEquals(rightColumn); - } - } - else if (joinAlgorithm == JoinAlgorithm.Inner) - { - if (i < left.Columns.Count) - { - DataFrameColumn leftColumn = left.Columns[i]; - isEqual = joinColumn.ElementwiseEquals(leftColumn.Clone(mapIndices)); - } - else - { - int columnIndex = i - left.Columns.Count; - DataFrameColumn rightColumn = right.Columns[columnIndex]; - isEqual = joinColumn.ElementwiseEquals(rightColumn.Clone(mapIndices)); - } - } - else - { - if (i < left.Columns.Count) - { - DataFrameColumn leftColumn = left.Columns[i]; - isEqual = joinColumn.ElementwiseEquals(leftColumn.Clone(numberOfNullsToAppend: join.Rows.Count - leftColumn.Length)); - } - else - { - int columnIndex = i - left.Columns.Count; - DataFrameColumn rightColumn = right.Columns[columnIndex]; - isEqual = joinColumn.ElementwiseEquals(rightColumn.Clone(numberOfNullsToAppend: join.Rows.Count - rightColumn.Length)); - } - } - for (int j = 0; j < join.Rows.Count; j++) - { - Assert.Equal(true, isEqual[j]); - } - } - } - - private void VerifyMerge(DataFrame merge, DataFrame left, DataFrame right, JoinAlgorithm joinAlgorithm) - { - if (joinAlgorithm == JoinAlgorithm.Left || joinAlgorithm == JoinAlgorithm.Inner) - { - HashSet intersection = new HashSet(); - for (int i = 0; i < merge.Columns["Int_left"].Length; i++) - { - if (merge.Columns["Int_left"][i] == null) - continue; - intersection.Add((int)merge.Columns["Int_left"][i]); - } - for (int i = 0; i < left.Columns["Int"].Length; i++) - { - if (left.Columns["Int"][i] != null && intersection.Contains((int)left.Columns["Int"][i])) - intersection.Remove((int)left.Columns["Int"][i]); - } - Assert.Empty(intersection); - } - else if (joinAlgorithm == JoinAlgorithm.Right) - { - HashSet intersection = new HashSet(); - for (int i = 0; i < merge.Columns["Int_right"].Length; i++) - { - if (merge.Columns["Int_right"][i] == null) - continue; - intersection.Add((int)merge.Columns["Int_right"][i]); - } - for (int i = 0; i < right.Columns["Int"].Length; i++) - { - if (right.Columns["Int"][i] != null && intersection.Contains((int)right.Columns["Int"][i])) - intersection.Remove((int)right.Columns["Int"][i]); - } - Assert.Empty(intersection); - } - else if (joinAlgorithm == JoinAlgorithm.FullOuter) - { - VerifyMerge(merge, left, right, JoinAlgorithm.Left); - VerifyMerge(merge, left, right, JoinAlgorithm.Right); - } - } - - [Fact] - public void TestJoin() - { - DataFrame left = MakeDataFrameWithAllMutableColumnTypes(10); - DataFrame right = MakeDataFrameWithAllMutableColumnTypes(5); - - // Tests with right.Rows.Count < left.Rows.Count - // Left join - DataFrame join = left.Join(right); - Assert.Equal(join.Rows.Count, left.Rows.Count); - Assert.Equal(join.Columns.Count, left.Columns.Count + right.Columns.Count); - Assert.Null(join.Columns["Int_right"][6]); - VerifyJoin(join, left, right, JoinAlgorithm.Left); - - // Right join - join = left.Join(right, joinAlgorithm: JoinAlgorithm.Right); - Assert.Equal(join.Rows.Count, right.Rows.Count); - Assert.Equal(join.Columns.Count, left.Columns.Count + right.Columns.Count); - Assert.Equal(join.Columns["Int_right"][3], right.Columns["Int"][3]); - Assert.Null(join.Columns["Int_right"][2]); - VerifyJoin(join, left, right, JoinAlgorithm.Right); - - // Outer join - join = left.Join(right, joinAlgorithm: JoinAlgorithm.FullOuter); - Assert.Equal(join.Rows.Count, left.Rows.Count); - Assert.Equal(join.Columns.Count, left.Columns.Count + right.Columns.Count); - Assert.Null(join.Columns["Int_right"][6]); - VerifyJoin(join, left, right, JoinAlgorithm.FullOuter); - - // Inner join - join = left.Join(right, joinAlgorithm: JoinAlgorithm.Inner); - Assert.Equal(join.Rows.Count, right.Rows.Count); - Assert.Equal(join.Columns.Count, left.Columns.Count + right.Columns.Count); - Assert.Equal(join.Columns["Int_right"][3], right.Columns["Int"][3]); - Assert.Null(join.Columns["Int_right"][2]); - VerifyJoin(join, left, right, JoinAlgorithm.Inner); - - // Tests with right.Rows.Count > left.Rows.Count - // Left join - right = MakeDataFrameWithAllMutableColumnTypes(15); - join = left.Join(right); - Assert.Equal(join.Rows.Count, left.Rows.Count); - Assert.Equal(join.Columns.Count, left.Columns.Count + right.Columns.Count); - Assert.Equal(join.Columns["Int_right"][6], right.Columns["Int"][6]); - VerifyJoin(join, left, right, JoinAlgorithm.Left); - - // Right join - join = left.Join(right, joinAlgorithm: JoinAlgorithm.Right); - Assert.Equal(join.Rows.Count, right.Rows.Count); - Assert.Equal(join.Columns.Count, left.Columns.Count + right.Columns.Count); - Assert.Equal(join.Columns["Int_right"][2], right.Columns["Int"][2]); - Assert.Null(join.Columns["Int_left"][12]); - VerifyJoin(join, left, right, JoinAlgorithm.Right); - - // Outer join - join = left.Join(right, joinAlgorithm: JoinAlgorithm.FullOuter); - Assert.Equal(join.Rows.Count, right.Rows.Count); - Assert.Equal(join.Columns.Count, left.Columns.Count + right.Columns.Count); - Assert.Null(join.Columns["Int_left"][12]); - VerifyJoin(join, left, right, JoinAlgorithm.FullOuter); - - // Inner join - join = left.Join(right, joinAlgorithm: JoinAlgorithm.Inner); - Assert.Equal(join.Rows.Count, left.Rows.Count); - Assert.Equal(join.Columns.Count, left.Columns.Count + right.Columns.Count); - Assert.Equal(join.Columns["Int_right"][2], right.Columns["Int"][2]); - VerifyJoin(join, left, right, JoinAlgorithm.Inner); - } - [Fact] public void TestGroupBy() { @@ -2002,18 +557,6 @@ public void TestColumnClamp() Assert.Equal(7, df.Columns["Int"][9]); } - [Fact] - public void TestColumnFilter() - { - DataFrame df = MakeDataFrameWithNumericColumns(10); - DataFrameColumn filtered = df.Columns["Int"].Filter(3, 7); - Assert.Equal(4, filtered.Length); - Assert.Equal(3, filtered[0]); - Assert.Equal(4, filtered[1]); - Assert.Equal(6, filtered[2]); - Assert.Equal(7, filtered[3]); - } - [Fact] public void TestDataFrameClamp() { @@ -2060,51 +603,16 @@ void VerifyDataFrameClamp(DataFrame clampedColumn) VerifyDataFrameClamp(clamped); for (int i = 0; i < 10; i++) { - if (i != 5) - Assert.Equal(i, df.Columns["Int"][i]); - else - Assert.Null(df.Columns["Int"][5]); - } - - // Inplace - df.Clamp(3, 7, true); - VerifyDataFrameClamp(df); - - } - - [Fact] - public void TestDataFrameFilter() - { - DataFrame df = MakeDataFrameWithAllMutableColumnTypes(10); - DataFrame boolColumnFiltered = df[df.Columns["Bool"].ElementwiseEquals(true)]; - List verify = new List { 0, 2, 4, 6, 8 }; - Assert.Equal(5, boolColumnFiltered.Rows.Count); - for (int i = 0; i < boolColumnFiltered.Columns.Count; i++) - { - DataFrameColumn column = boolColumnFiltered.Columns[i]; - if (column.Name == "Char" || column.Name == "Bool" || column.Name == "String" || column.Name == "DateTime") - continue; - for (int j = 0; j < column.Length; j++) - { - Assert.Equal(verify[j].ToString(), column[j].ToString()); - } - } - DataFrame intEnumerableFiltered = df[Enumerable.Range(0, 10)]; - DataFrame boolEnumerableFiltered = df[Enumerable.Range(0, 10).Select(x => true)]; - DataFrame longEnumerableFiltered = df[Enumerable.Range(0, 10).Select(x => (long)x)]; - Assert.Equal(intEnumerableFiltered.Columns.Count, df.Columns.Count); - Assert.Equal(boolEnumerableFiltered.Columns.Count, df.Columns.Count); - Assert.Equal(longEnumerableFiltered.Columns.Count, df.Columns.Count); - for (int i = 0; i < intEnumerableFiltered.Columns.Count; i++) - { - DataFrameColumn intFilteredColumn = intEnumerableFiltered.Columns[i]; - DataFrameColumn dfColumn = df.Columns[i]; - DataFrameColumn boolFilteredColumn = boolEnumerableFiltered.Columns[i]; - DataFrameColumn longFilteredColumn = longEnumerableFiltered.Columns[i]; - Assert.True(intFilteredColumn.ElementwiseEquals(dfColumn).All()); - Assert.True(boolFilteredColumn.ElementwiseEquals(dfColumn).All()); - Assert.True(longFilteredColumn.ElementwiseEquals(dfColumn).All()); + if (i != 5) + Assert.Equal(i, df.Columns["Int"][i]); + else + Assert.Null(df.Columns["Int"][5]); } + + // Inplace + df.Clamp(3, 7, true); + VerifyDataFrameClamp(df); + } [Fact] @@ -2172,757 +680,6 @@ public void TestSample() Assert.Throws(() => df.Sample(13)); } - [Theory] - [InlineData(1, 2)] - [InlineData(2, 1)] - public void TestDataCorrectnessForInnerMerge(int leftCount, int rightCount) - { - DataFrame left = MakeDataFrameWithNumericColumns(leftCount, false); - DataFrameColumn leftStringColumn = new StringDataFrameColumn("String", Enumerable.Range(0, leftCount).Select(x => "Left")); - left.Columns.Insert(left.Columns.Count, leftStringColumn); - - DataFrame right = MakeDataFrameWithNumericColumns(rightCount, false); - DataFrameColumn rightStringColumn = new StringDataFrameColumn("String", Enumerable.Range(0, rightCount).Select(x => "Right")); - right.Columns.Insert(right.Columns.Count, rightStringColumn); - - DataFrame merge = left.Merge(right, "Int", "Int", joinAlgorithm: JoinAlgorithm.Inner); - - Assert.Equal("Left", (string)merge.Columns["String_left"][0]); - Assert.Equal("Right", (string)merge.Columns["String_right"][0]); - } - - [Fact] - public void TestMerge() - { - DataFrame left = MakeDataFrameWithAllMutableColumnTypes(10); - DataFrame right = MakeDataFrameWithAllMutableColumnTypes(5); - - // Tests with right.Rows.Count < left.Rows.Count - // Left merge - DataFrame merge = left.Merge(right, "Int", "Int"); - Assert.Equal(10, merge.Rows.Count); - Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); - Assert.Null(merge.Columns["Int_right"][6]); - Assert.Null(merge.Columns["Int_left"][5]); - VerifyMerge(merge, left, right, JoinAlgorithm.Left); - - // Right merge - merge = left.Merge(right, "Int", "Int", joinAlgorithm: JoinAlgorithm.Right); - Assert.Equal(5, merge.Rows.Count); - Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); - Assert.Equal(merge.Columns["Int_right"][3], right.Columns["Int"][3]); - Assert.Null(merge.Columns["Int_right"][2]); - VerifyMerge(merge, left, right, JoinAlgorithm.Right); - - // Outer merge - merge = left.Merge(right, "Int", "Int", joinAlgorithm: JoinAlgorithm.FullOuter); - Assert.Equal(merge.Rows.Count, left.Rows.Count); - Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); - Assert.Null(merge.Columns["Int_right"][6]); - VerifyMerge(merge, left, right, JoinAlgorithm.FullOuter); - - // Inner merge - merge = left.Merge(right, "Int", "Int", joinAlgorithm: JoinAlgorithm.Inner); - Assert.Equal(merge.Rows.Count, right.Rows.Count); - Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); - Assert.Equal(merge.Columns["Int_right"][2], right.Columns["Int"][3]); - Assert.Null(merge.Columns["Int_right"][4]); - VerifyMerge(merge, left, right, JoinAlgorithm.Inner); - - // Tests with right.Rows.Count > left.Rows.Count - // Left merge - right = MakeDataFrameWithAllMutableColumnTypes(15); - merge = left.Merge(right, "Int", "Int"); - Assert.Equal(merge.Rows.Count, left.Rows.Count); - Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); - Assert.Equal(merge.Columns["Int_right"][6], right.Columns["Int"][6]); - VerifyMerge(merge, left, right, JoinAlgorithm.Left); - - // Right merge - merge = left.Merge(right, "Int", "Int", joinAlgorithm: JoinAlgorithm.Right); - Assert.Equal(merge.Rows.Count, right.Rows.Count); - Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); - Assert.Equal(merge.Columns["Int_right"][2], right.Columns["Int"][2]); - Assert.Null(merge.Columns["Int_left"][12]); - VerifyMerge(merge, left, right, JoinAlgorithm.Right); - - // Outer merge - merge = left.Merge(right, "Int", "Int", joinAlgorithm: JoinAlgorithm.FullOuter); - Assert.Equal(16, merge.Rows.Count); - Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); - Assert.Null(merge.Columns["Int_left"][12]); - Assert.Null(merge.Columns["Int_left"][15]); - VerifyMerge(merge, left, right, JoinAlgorithm.FullOuter); - - // Inner merge - merge = left.Merge(right, "Int", "Int", joinAlgorithm: JoinAlgorithm.Inner); - Assert.Equal(9, merge.Rows.Count); - Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); - Assert.Equal(merge.Columns["Int_right"][2], right.Columns["Int"][2]); - VerifyMerge(merge, left, right, JoinAlgorithm.Inner); - } - - private void MatchRowsOnMergedDataFrame(DataFrame merge, DataFrame left, DataFrame right, long mergeRow, long? leftRow, long? rightRow) - { - Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); - DataFrameRow dataFrameMergeRow = merge.Rows[mergeRow]; - int columnIndex = 0; - foreach (object value in dataFrameMergeRow) - { - object compare = null; - if (columnIndex < left.Columns.Count) - { - if (leftRow != null) - { - compare = left.Rows[leftRow.Value][columnIndex]; - } - } - else - { - int rightColumnIndex = columnIndex - left.Columns.Count; - if (rightRow != null) - { - compare = right.Rows[rightRow.Value][rightColumnIndex]; - } - } - Assert.Equal(value, compare); - columnIndex++; - } - } - - [Theory] - [InlineData(10, 5, JoinAlgorithm.Left)] - [InlineData(5, 10, JoinAlgorithm.Right)] - public void TestMergeEdgeCases_LeftOrRight(int leftLength, int rightLength, JoinAlgorithm joinAlgorithm) - { - DataFrame left = MakeDataFrameWithAllMutableColumnTypes(leftLength); - if (leftLength > 5) - { - left["Int"][8] = null; - } - DataFrame right = MakeDataFrameWithAllMutableColumnTypes(rightLength); - if (rightLength > 5) - { - right["Int"][8] = null; - } - - DataFrame merge = left.Merge(right, "Int", "Int", joinAlgorithm: joinAlgorithm); - Assert.Equal(10, merge.Rows.Count); - Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); - int[] matchedFullRows = new int[] { 0, 1, 3, 4 }; - for (long i = 0; i < matchedFullRows.Length; i++) - { - int rowIndex = matchedFullRows[i]; - MatchRowsOnMergedDataFrame(merge, left, right, rowIndex, rowIndex, rowIndex); - } - - int[] matchedLeftOrRightRowsNullOtherRows = new int[] { 2, 5, 6, 7, 8, 9 }; - for (long i = 0; i < matchedLeftOrRightRowsNullOtherRows.Length; i++) - { - int rowIndex = matchedLeftOrRightRowsNullOtherRows[i]; - if (leftLength > 5) - { - MatchRowsOnMergedDataFrame(merge, left, right, rowIndex, rowIndex, null); - } - else - { - MatchRowsOnMergedDataFrame(merge, left, right, rowIndex, null, rowIndex); - } - } - } - - [Fact] - public void TestMergeEdgeCases_Inner() - { - DataFrame left = MakeDataFrameWithAllMutableColumnTypes(5); - DataFrame right = MakeDataFrameWithAllMutableColumnTypes(10); - left["Int"][3] = null; - right["Int"][6] = null; - // Creates this case: - /* - * Left: Right: - * 0 0 - * 1 1 - * null(2) 2 - * null(3) 3 - * 4 4 - * null(5) - * null(6) - * 7 - * 8 - * 9 - */ - /* - * Merge will result in a DataFrame like: - * Int_Left Int_Right - * 0 0 - * 1 1 - * 4 4 - * null(2) null(5) - * null(3) null(5) - * null(2) null(6) - * null(3) null(6) - */ - - DataFrame merge = left.Merge(right, "Int", "Int", joinAlgorithm: JoinAlgorithm.Inner); - Assert.Equal(7, merge.Rows.Count); - Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); - - int[] mergeRows = new int[] { 0, 1, 2, 3, 4, 5, 6 }; - int[] leftRows = new int[] { 0, 1, 4, 2, 3, 2, 3 }; - int[] rightRows = new int[] { 0, 1, 4, 5, 5, 6, 6 }; - for (long i = 0; i < mergeRows.Length; i++) - { - int rowIndex = mergeRows[i]; - int leftRowIndex = leftRows[i]; - int rightRowIndex = rightRows[i]; - MatchRowsOnMergedDataFrame(merge, left, right, rowIndex, leftRowIndex, rightRowIndex); - } - } - - [Fact] - public void TestMergeEdgeCases_Outer() - { - DataFrame left = MakeDataFrameWithAllMutableColumnTypes(5); - left["Int"][3] = null; - DataFrame right = MakeDataFrameWithAllMutableColumnTypes(5); - right["Int"][1] = 5; - right["Int"][3] = null; - right["Int"][4] = 6; - - // Creates this case: - /* - * Left: Right: RowIndex: - * 0 0 0 - * 1 5 1 - * null null 2 - * null(3) null(3) 3 - * 4 6 4 - */ - - /* - * Merge will result in a DataFrame like: - * Int_left: Int_right: Merged: Index: - * 0 0 0 - 0 0 - * 1 null 1 - N 1 - * null null 2 - 2 2 - * null null(3) 2 - 3 3 - * null(3) null 3 - 2 4 - * null(3) null(3) 3 - 3 5 - * 4 null 4 - N 6 - * null 5 N - 1 7 - * null 6 N - 4 8 - */ - - DataFrame merge = left.Merge(right, "Int", "Int", joinAlgorithm: JoinAlgorithm.FullOuter); - Assert.Equal(9, merge.Rows.Count); - Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); - - int[] mergeRows = new int[] { 0, 2, 3, 4, 5 }; - int[] leftRows = new int[] { 0, 2, 2, 3, 3 }; - int[] rightRows = new int[] { 0, 2, 3, 2, 3 }; - for (long i = 0; i < mergeRows.Length; i++) - { - int rowIndex = mergeRows[i]; - int leftRowIndex = leftRows[i]; - int rightRowIndex = rightRows[i]; - MatchRowsOnMergedDataFrame(merge, left, right, rowIndex, leftRowIndex, rightRowIndex); - } - - mergeRows = new int[] { 1, 6 }; - leftRows = new int[] { 1, 4 }; - for (long i = 0; i < mergeRows.Length; i++) - { - int rowIndex = mergeRows[i]; - int leftRowIndex = leftRows[i]; - MatchRowsOnMergedDataFrame(merge, left, right, rowIndex, leftRowIndex, null); - } - - mergeRows = new int[] { 7, 8 }; - rightRows = new int[] { 1, 4 }; - for (long i = 0; i < mergeRows.Length; i++) - { - int rowIndex = mergeRows[i]; - int rightRowIndex = rightRows[i]; - MatchRowsOnMergedDataFrame(merge, left, right, rowIndex, null, rightRowIndex); - } - } - - [Fact] - public void TestMerge_ByTwoColumns_Complex_LeftJoin() - { - //Test left merge by to int type columns - - //Arrange - var left = new DataFrame(); - left.Columns.Add(new Int32DataFrameColumn("Index", new[] { 0, 1, 2, 3, 4, 5 })); - left.Columns.Add(new Int32DataFrameColumn("G1", new[] { 0, 1, 1, 2, 2, 3 })); - left.Columns.Add(new Int32DataFrameColumn("G2", new[] { 3, 1, 2, 1, 2, 1 })); - - var right = new DataFrame(); - right.Columns.Add(new Int32DataFrameColumn("Index", new[] { 0, 1, 2, 3 })); - right.Columns.Add(new Int32DataFrameColumn("G1", new[] { 1, 1, 1, 2 })); - right.Columns.Add(new Int32DataFrameColumn("G2", new[] { 1, 2, 1, 1 })); - - // Creates this case: - /* ------------------------- - * Left | Right - * I G1 G2 | I G1 G2 - * ------------------------- - * 0 0 3 | 0 1 1 - * 1 1 1 | 1 1 2 - * 2 1 2 | 2 1 1 - * 3 2 1 | 3 2 1 - * 4 2 2 - * 5 3 1 - */ - - /* - * Merge will result in a DataFrame like: - * IL G1 G2 IR Merged: - * ------------------------- - * 0 0 3 0 - N - * 1 1 1 0 1 1 1 - 0 - * 1 1 1 2 1 1 1 - 2 - * 2 1 2 1 1 2 2 - 1 - * 3 2 1 3 2 1 3 - 3 - * 4 2 2 4 - N - * 5 3 1 5 - N - */ - - //Act - var merge = left.Merge(right, new[] { "G1", "G2" }, new[] { "G1", "G2" }); - - //Assert - var expectedMerged = new (int? Left, int? Right)[] { - (0, null), - (1, 0), - (1, 2), - (2, 1), - (3, 3), - (4, null), - (5, null) - }; - - Assert.Equal(expectedMerged.Length, merge.Rows.Count); - Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); - - for (long i = 0; i < expectedMerged.Length; i++) - { - MatchRowsOnMergedDataFrame(merge, left, right, i, expectedMerged[i].Left, expectedMerged[i].Right); - } - - } - - [Fact] - public void TestMerge_ByTwoColumns_Simple_ManyToMany_LeftJoin() - { - //Test left merge by to int type columns - - //Arrange - var left = new DataFrame(); - left.Columns.Add(new Int32DataFrameColumn("Index", new[] { 0, 1, 2 })); - left.Columns.Add(new Int32DataFrameColumn("G1", new[] { 1, 1, 3 })); - left.Columns.Add(new Int32DataFrameColumn("G2", new[] { 1, 1, 3 })); - - var right = new DataFrame(); - right.Columns.Add(new Int32DataFrameColumn("Index", new[] { 0, 1, 2 })); - right.Columns.Add(new Int32DataFrameColumn("G1", new[] { 1, 1, 0 })); - right.Columns.Add(new Int32DataFrameColumn("G2", new[] { 1, 1, 0 })); - - // Creates this case: - /* --------------------------- - * Left | Right - * I G1 G2 | I G1 G2 - * --------------------------- - * 0 1 1 | 0 1 1 - * 1 1 1 | 1 1 1 - * 2 3 3 | 2 0 0 - */ - - /* - * Merge will result in a DataFrame like: - * IL G1 G2 IR Merged: - * ------------------------- - * 0 1 1 0 1 1 0 - 0 - * 0 1 1 1 1 1 0 - 1 - * 1 1 1 0 1 1 1 - 0 - * 1 1 1 1 1 1 1 - 1 - * 2 3 3 2 - N - */ - - //Act - var merge = left.Merge(right, new[] { "G1", "G2" }, new[] { "G1", "G2" }); - - //Assert - var expectedMerged = new (int? Left, int? Right)[] { - (0, 0), - (0, 1), - (1, 0), - (1, 1), - (2, null) - }; - - Assert.Equal(expectedMerged.Length, merge.Rows.Count); - Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); - - for (long i = 0; i < expectedMerged.Length; i++) - { - MatchRowsOnMergedDataFrame(merge, left, right, i, expectedMerged[i].Left, expectedMerged[i].Right); - } - } - - [Fact] - public void TestMerge_ByTwoColumns_Simple_ManyToMany_RightJoin() - { - //Test left merge by to int type columns - - //Arrange - var left = new DataFrame(); - left.Columns.Add(new Int32DataFrameColumn("Index", new[] { 0, 1, 2 })); - left.Columns.Add(new Int32DataFrameColumn("G1", new[] { 1, 1, 3 })); - left.Columns.Add(new Int32DataFrameColumn("G2", new[] { 1, 1, 3 })); - - var right = new DataFrame(); - right.Columns.Add(new Int32DataFrameColumn("Index", new[] { 0, 1, 2 })); - right.Columns.Add(new Int32DataFrameColumn("G1", new[] { 1, 1, 0 })); - right.Columns.Add(new Int32DataFrameColumn("G2", new[] { 1, 1, 0 })); - - // Creates this case: - /* --------------------------- - * Left | Right - * I G1 G2 | I G1 G2 - * --------------------------- - * 0 1 1 | 0 1 1 - * 1 1 1 | 1 1 1 - * 2 3 3 | 2 0 0 - */ - - /* - * Merge will result in a DataFrame like: - * IL G1 G2 IR Merged: - * ------------------------- - * 0 1 1 0 1 1 0 - 0 - * 1 1 1 0 1 1 1 - 0 - * 0 1 1 1 1 1 0 - 1 - * 1 1 1 1 1 1 1 - 1 - * 2 0 0 N - 2 - */ - - //Act - var merge = left.Merge(right, new[] { "G1", "G2" }, new[] { "G1", "G2" }, joinAlgorithm: JoinAlgorithm.Right); - - //Assert - var expectedMerged = new (int? Left, int? Right)[] { - (0, 0), - (1, 0), - (0, 1), - (1, 1), - (null, 2) - }; - - Assert.Equal(expectedMerged.Length, merge.Rows.Count); - Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); - - for (long i = 0; i < expectedMerged.Length; i++) - { - MatchRowsOnMergedDataFrame(merge, left, right, i, expectedMerged[i].Left, expectedMerged[i].Right); - } - } - - [Fact] - public void TestMerge_ByTwoColumns_Simple_ManyToMany_InnerJoin() - { - //Test left merge by to int type columns - - //Arrange - var left = new DataFrame(); - left.Columns.Add(new Int32DataFrameColumn("Index", new[] { 0, 1, 2 })); - left.Columns.Add(new Int32DataFrameColumn("G1", new[] { 1, 1, 3 })); - left.Columns.Add(new Int32DataFrameColumn("G2", new[] { 1, 1, 3 })); - - var right = new DataFrame(); - right.Columns.Add(new Int32DataFrameColumn("Index", new[] { 0, 1, 2 })); - right.Columns.Add(new Int32DataFrameColumn("G1", new[] { 1, 1, 0 })); - right.Columns.Add(new Int32DataFrameColumn("G2", new[] { 1, 1, 0 })); - - // Creates this case: - /* --------------------------- - * Left | Right - * I G1 G2 | I G1 G2 - * --------------------------- - * 0 1 1 | 0 1 1 - * 1 1 1 | 1 1 1 - * 2 3 3 | 2 0 0 - */ - - /* - * Merge will result in a DataFrame like: - * IL G1 G2 IR Merged: - * ------------------------- - * 0 1 1 0 1 1 0 - 0 - * 1 1 1 0 1 1 1 - 0 - * 0 1 1 1 1 1 0 - 1 - * 1 1 1 1 1 1 1 - 1 - */ - - //Act - var merge = left.Merge(right, new[] { "G1", "G2" }, new[] { "G1", "G2" }, joinAlgorithm: JoinAlgorithm.Inner); - - //Assert - var expectedMerged = new (int? Left, int? Right)[] { - (0, 0), - (1, 0), - (0, 1), - (1, 1) - }; - - Assert.Equal(expectedMerged.Length, merge.Rows.Count); - Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); - - for (long i = 0; i < expectedMerged.Length; i++) - { - MatchRowsOnMergedDataFrame(merge, left, right, i, expectedMerged[i].Left, expectedMerged[i].Right); - } - } - - [Fact] - public void TestMerge_ByTwoColumns_Simple_ManyToMany_OuterJoin() - { - //Test left merge by to int type columns - - //Arrange - var left = new DataFrame(); - left.Columns.Add(new Int32DataFrameColumn("Index", new[] { 0, 1, 2 })); - left.Columns.Add(new Int32DataFrameColumn("G1", new[] { 1, 1, 3 })); - left.Columns.Add(new Int32DataFrameColumn("G2", new[] { 1, 1, 3 })); - - var right = new DataFrame(); - right.Columns.Add(new Int32DataFrameColumn("Index", new[] { 0, 1, 2 })); - right.Columns.Add(new Int32DataFrameColumn("G1", new[] { 1, 1, 0 })); - right.Columns.Add(new Int32DataFrameColumn("G2", new[] { 1, 1, 0 })); - - // Creates this case: - /* --------------------------- - * Left | Right - * I G1 G2 | I G1 G2 - * --------------------------- - * 0 1 1 | 0 1 1 - * 1 1 1 | 1 1 1 - * 2 3 3 | 2 0 0 - */ - - /* - * Merge will result in a DataFrame like: - * IL G1 G2 IR Merged: - * ------------------------- - * 0 1 1 0 1 1 0 - 0 - * 0 1 1 1 1 1 0 - 1 - * 1 1 1 0 1 1 1 - 0 - * 1 1 1 1 1 1 1 - 1 - * 2 3 3 2 - N - * 2 0 0 N - 2 - */ - - //Act - var merge = left.Merge(right, new[] { "G1", "G2" }, new[] { "G1", "G2" }, joinAlgorithm: JoinAlgorithm.FullOuter); - - //Assert - var expectedMerged = new (int? Left, int? Right)[] { - (0, 0), - (0, 1), - (1, 0), - (1, 1), - (2, null), - (null, 2) - }; - - Assert.Equal(expectedMerged.Length, merge.Rows.Count); - Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); - - for (long i = 0; i < expectedMerged.Length; i++) - { - MatchRowsOnMergedDataFrame(merge, left, right, i, expectedMerged[i].Left, expectedMerged[i].Right); - } - } - - [Fact] - public void TestMerge_ByThreeColumns_OneToOne_LeftJoin() - { - //Test merge by LEFT join of int and string columns - - //Arrange - var left = new DataFrame(); - left.Columns.Add(new Int32DataFrameColumn("Index", new[] { 0, 1, 2 })); - left.Columns.Add(new Int32DataFrameColumn("G1", new[] { 1, 1, 2 })); - left.Columns.Add(new Int32DataFrameColumn("G2", new[] { 1, 2, 1 })); - left.Columns.Add(new StringDataFrameColumn("G3", new[] { "A", "B", "C" })); - - var right = new DataFrame(); - right.Columns.Add(new Int32DataFrameColumn("Index", new[] { 0, 1, 2 })); - right.Columns.Add(new Int32DataFrameColumn("G1", new[] { 0, 1, 1 })); - right.Columns.Add(new Int32DataFrameColumn("G2", new[] { 1, 1, 2 })); - right.Columns.Add(new StringDataFrameColumn("G3", new[] { "Z", "Y", "B" })); - - // Creates this case: - /* ----------------------------- - * Left | Right - * I G1 G2 G3 | I G1 G2 G3 - * ------------------------------ - * 0 1 1 A | 0 0 1 Z - * 1 1 2 B | 1 1 1 Y - * 2 2 1 C | 2 1 2 B - */ - - /* - * Merge will result in a DataFrame like: - * IL G1 G2 G3 IR Merged: - * ------------------------- - * 0 1 1 A 0 - N - * 1 1 2 B 2 1 2 B 1 - 2 - * 2 2 1 C 2 - N - */ - - //Act - var merge = left.Merge(right, new[] { "G1", "G2", "G3" }, new[] { "G1", "G2", "G3" }); - - //Assert - var expectedMerged = new (int? Left, int? Right)[] { - (0, null), - (1, 2), - (2, null) - }; - - Assert.Equal(expectedMerged.Length, merge.Rows.Count); - Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); - - for (long i = 0; i < expectedMerged.Length; i++) - { - MatchRowsOnMergedDataFrame(merge, left, right, i, expectedMerged[i].Left, expectedMerged[i].Right); - } - } - - [Fact] - public void TestMerge_ByThreeColumns_OneToOne_RightJoin() - { - //Test merge by RIGHT join of int and string columns - - //Arrange - var left = new DataFrame(); - left.Columns.Add(new Int32DataFrameColumn("Index", new[] { 0, 1, 2 })); - left.Columns.Add(new Int32DataFrameColumn("G1", new[] { 1, 1, 2 })); - left.Columns.Add(new Int32DataFrameColumn("G2", new[] { 1, 2, 1 })); - left.Columns.Add(new StringDataFrameColumn("G3", new[] { "A", "B", "C" })); - - var right = new DataFrame(); - right.Columns.Add(new Int32DataFrameColumn("Index", new[] { 0, 1, 2 })); - right.Columns.Add(new Int32DataFrameColumn("G1", new[] { 0, 1, 1 })); - right.Columns.Add(new Int32DataFrameColumn("G2", new[] { 1, 1, 2 })); - right.Columns.Add(new StringDataFrameColumn("G3", new[] { "Z", "Y", "B" })); - - // Creates this case: - /* ----------------------------- - * Left | Right - * I G1 G2 G3 | I G1 G2 G3 - * ------------------------------ - * 0 1 1 A | 0 0 1 Z - * 1 1 2 B | 1 1 1 Y - * 2 2 1 C | 2 1 2 B - */ - - /* - * Merge will result in a DataFrame like: - * IL G1 G2 G3 IR Merged: - * ------------------------- - * 0 0 1 Z N - 0 - * 1 1 1 Y N - 1 - * 1 1 2 B 2 1 2 B 1 - 2 - */ - - //Act - var merge = left.Merge(right, new[] { "G1", "G2", "G3" }, new[] { "G1", "G2", "G3" }, joinAlgorithm: JoinAlgorithm.Right); - - //Assert - var expectedMerged = new (int? Left, int? Right)[] { - (null, 0), - (null, 1), - (1, 2) - }; - - Assert.Equal(expectedMerged.Length, merge.Rows.Count); - Assert.Equal(merge.Columns.Count, left.Columns.Count + right.Columns.Count); - - for (long i = 0; i < expectedMerged.Length; i++) - { - MatchRowsOnMergedDataFrame(merge, left, right, i, expectedMerged[i].Left, expectedMerged[i].Right); - } - } - - [Fact] - public void TestMerge_Issue5778() - { - DataFrame left = MakeDataFrameWithAllMutableColumnTypes(2, false); - DataFrame right = MakeDataFrameWithAllMutableColumnTypes(1); - - DataFrame merge = left.Merge(right, "Int", "Int"); - - Assert.Equal(2, merge.Rows.Count); - Assert.Equal(0, (int)merge.Columns["Int_left"][0]); - Assert.Equal(1, (int)merge.Columns["Int_left"][1]); - MatchRowsOnMergedDataFrame(merge, left, right, 0, 0, 0); - MatchRowsOnMergedDataFrame(merge, left, right, 1, 1, 0); - } - - [Fact] - //Issue 6127 - public void TestMerge_CorrectColumnTypes() - { - DataFrame left = MakeDataFrameWithAllMutableColumnTypes(2, false); - DataFrame right = MakeDataFrameWithAllMutableColumnTypes(1); - - DataFrame merge = left.Merge(right, "Int", "Int"); - - Assert.NotNull(merge.Columns.GetBooleanColumn("Bool_left")); - Assert.NotNull(merge.Columns.GetBooleanColumn("Bool_right")); - - Assert.NotNull(merge.Columns.GetDecimalColumn("Decimal_left")); - Assert.NotNull(merge.Columns.GetDecimalColumn("Decimal_right")); - - Assert.NotNull(merge.Columns.GetSingleColumn("Float_left")); - Assert.NotNull(merge.Columns.GetSingleColumn("Float_right")); - - Assert.NotNull(merge.Columns.GetDoubleColumn("Double_left")); - Assert.NotNull(merge.Columns.GetDoubleColumn("Double_right")); - - Assert.NotNull(merge.Columns.GetByteColumn("Byte_left")); - Assert.NotNull(merge.Columns.GetByteColumn("Byte_right")); - - Assert.NotNull(merge.Columns.GetCharColumn("Char_left")); - Assert.NotNull(merge.Columns.GetCharColumn("Char_right")); - - Assert.NotNull(merge.Columns.GetInt16Column("Short_left")); - Assert.NotNull(merge.Columns.GetInt16Column("Short_right")); - - Assert.NotNull(merge.Columns.GetUInt16Column("Ushort_left")); - Assert.NotNull(merge.Columns.GetUInt16Column("Ushort_right")); - - Assert.NotNull(merge.Columns.GetInt32Column("Int_left")); - Assert.NotNull(merge.Columns.GetInt32Column("Int_right")); - - Assert.NotNull(merge.Columns.GetUInt32Column("Uint_left")); - Assert.NotNull(merge.Columns.GetUInt32Column("Uint_right")); - - Assert.NotNull(merge.Columns.GetInt64Column("Long_left")); - Assert.NotNull(merge.Columns.GetInt64Column("Long_right")); - - Assert.NotNull(merge.Columns.GetUInt64Column("Ulong_left")); - Assert.NotNull(merge.Columns.GetUInt64Column("Ulong_right")); - - Assert.NotNull(merge.Columns.GetDateTimeColumn("DateTime_left")); - Assert.NotNull(merge.Columns.GetDateTimeColumn("DateTime_right")); - - } - [Fact] public void TestDescription() { @@ -3466,87 +1223,6 @@ void AssertLengthTypeAndValues(DataFrameColumn column, Type type) AssertLengthTypeAndValues(ushortColumn, typeof(ushort)); } - [Fact] - public void TestBinaryOperationsOnExplodedNumericColumns() - { - DataFrame df = MakeDataFrameWithNumericAndBoolColumns(10, withNulls: false); - Int32DataFrameColumn ints = df.Columns["Int"] as Int32DataFrameColumn; - Int32DataFrameColumn res = ints.Add(1).Subtract(1).Multiply(10).Divide(10).LeftShift(2).RightShift(2); - Assert.True(res.ElementwiseEquals(ints).All()); - Assert.True(res.ElementwiseGreaterThanOrEqual(ints).All()); - Assert.True(res.ElementwiseLessThanOrEqual(ints).All()); - Assert.False(res.ElementwiseNotEquals(ints).All()); - Assert.False(res.ElementwiseGreaterThan(ints).All()); - Assert.False(res.ElementwiseLessThan(ints).All()); - - // Test inPlace - Int32DataFrameColumn inPlace = ints.Add(1, inPlace: true).Subtract(1, inPlace: true).Multiply(10, inPlace: true).Divide(10, inPlace: true).LeftShift(2, inPlace: true).RightShift(2, inPlace: true).Add(100, inPlace: true); - Assert.True(inPlace.ElementwiseEquals(ints).All()); - Assert.True(inPlace.ElementwiseGreaterThanOrEqual(ints).All()); - Assert.True(inPlace.ElementwiseLessThanOrEqual(ints).All()); - Assert.False(inPlace.ElementwiseNotEquals(ints).All()); - Assert.False(inPlace.ElementwiseGreaterThan(ints).All()); - Assert.False(inPlace.ElementwiseLessThan(ints).All()); - - Assert.False(inPlace.ElementwiseEquals(res).All()); - Assert.True(inPlace.ElementwiseGreaterThanOrEqual(res).All()); - Assert.False(inPlace.ElementwiseLessThanOrEqual(res).All()); - Assert.True(inPlace.ElementwiseNotEquals(res).All()); - Assert.True(inPlace.ElementwiseGreaterThan(res).All()); - Assert.False(inPlace.ElementwiseLessThan(res).All()); - - // Test Bool column - BooleanDataFrameColumn bools = df.Columns["Bool"] as BooleanDataFrameColumn; - BooleanDataFrameColumn allFalse = bools.Or(true).And(true).Xor(true); - Assert.True(allFalse.ElementwiseEquals(false).All()); - - // Test inPlace - BooleanDataFrameColumn inPlaceAllFalse = bools.Or(true, inPlace: true).And(true, inPlace: true).Xor(true, inPlace: true); - Assert.True(inPlaceAllFalse.ElementwiseEquals(bools).All()); - - // Test Reverse Operations - Int32DataFrameColumn reverse = ints.ReverseAdd(1).ReverseSubtract(1).ReverseMultiply(-1); - Assert.True(reverse.ElementwiseEquals(ints).All()); - - // Test inPlace - Int32DataFrameColumn reverseInPlace = ints.ReverseAdd(1, inPlace: true).ReverseSubtract(1, inPlace: true).ReverseMultiply(-1, inPlace: true).ReverseDivide(100, inPlace: true); - Assert.True(reverseInPlace.ElementwiseEquals(ints).All()); - Assert.False(reverseInPlace.ElementwiseEquals(reverse).All()); - } - - [Fact] - public void TestArrowStringApply() - { - ArrowStringDataFrameColumn column = CreateArrowStringColumn(10); - ArrowStringDataFrameColumn ret = column.Apply((string cur) => - { - if (cur != null) - { - return cur + "123"; - } - return null; - }); - for (long i = 0; i < column.Length; i++) - { - if (column[i] != null) - { - Assert.Equal(column[i] + "123", ret[i]); - } - else - { - Assert.Null(ret[i]); - } - } - Assert.Equal(1, ret.NullCount); - - // Test null counts - ret = column.Apply((string cur) => - { - return null; - }); - Assert.Equal(column.Length, ret.NullCount); - } - [Fact] public void GetColumnTests() { @@ -3627,40 +1303,6 @@ public void TestMeanMedian() } - [Fact] - public void Test_PrimitiveColumnNotEqualsNull() - { - var col = new DoubleDataFrameColumn("col", new double?[] { 1.23, null, 2, 3 }); - var dfTest = new DataFrame(col); - - var filteredNullDf = dfTest.Filter(dfTest["col"].ElementwiseNotEquals(null)); - - Assert.True(filteredNullDf.Columns.IndexOf("col") >= 0); - Assert.Equal(3, filteredNullDf.Columns["col"].Length); - - Assert.Equal(1.23, filteredNullDf.Columns["col"][0]); - Assert.Equal(2.0, filteredNullDf.Columns["col"][1]); - Assert.Equal(3.0, filteredNullDf.Columns["col"][2]); - } - - [Fact] - public void Test_PrimitiveColumnEqualsNull() - { - var index = new Int32DataFrameColumn("index", new int[] { 1, 2, 3, 4, 5 }); - var col = new DoubleDataFrameColumn("col", new double?[] { 1.23, null, 2, 3, null }); ; - var dfTest = new DataFrame(index, col); - - var filteredNullDf = dfTest.Filter(dfTest["col"].ElementwiseEquals(null)); - - Assert.True(filteredNullDf.Columns.IndexOf("col") >= 0); - Assert.True(filteredNullDf.Columns.IndexOf("index") >= 0); - - Assert.Equal(2, filteredNullDf.Rows.Count); - - Assert.Equal(2, filteredNullDf.Columns["index"][0]); - Assert.Equal(5, filteredNullDf.Columns["index"][1]); - } - [Fact] public void Test_StringColumnNotEqualsNull() { @@ -3694,110 +1336,5 @@ public void Test_StringColumnEqualsNull() Assert.Equal(2, filteredNullDf.Columns["index"][0]); Assert.Equal(5, filteredNullDf.Columns["index"][1]); } - - [Fact] - public void Test_ArithmeticsAddWithNull() - { - // Arrange - //Number of elements shoult be higher than 8 to test SIMD - var left_column = new Int32DataFrameColumn("Left", new int?[] { 1, 1, null, null, 4, 5, 6, 7, 8, 9 }); - var right_column = new Int32DataFrameColumn("Right", new int?[] { 1, null, 1, null, 4, 5, 6, 7, 8, 9 }); - - // Act - var sum = left_column + right_column; - - // Assert - Assert.Equal(3, sum.NullCount); - - Assert.Equal(2, sum[0]); // 1 + 1 - Assert.Null(sum[1]); // null + 1 - Assert.Null(sum[2]); // 1 + null - Assert.Null(sum[3]); // null + null - Assert.Equal(8, sum[4]); - Assert.Equal(10, sum[5]); - Assert.Equal(12, sum[6]); - Assert.Equal(14, sum[7]); - Assert.Equal(16, sum[8]); - Assert.Equal(18, sum[9]); - } - - [Fact] - public void Test_ArithmeticsAddScalarWithNull() - { - // Arrange - //Number of elements shoult be higher than 8 to test SIMD - var left_column = new Int32DataFrameColumn("Left", new int?[] { 0, 1, null, null, 4, 5, 6, 7, 8, null }); - - // Act - var sum = left_column + 5; - - // Assert - Assert.Equal(3, sum.NullCount); - - Assert.Equal(5, sum[0]); // 1 + 1 - Assert.Equal(6, sum[1]); // 1 + 1 - Assert.Null(sum[2]); // 1 + null - Assert.Null(sum[3]); // null + null - Assert.Equal(9, sum[4]); - Assert.Equal(10, sum[5]); - Assert.Equal(11, sum[6]); - Assert.Equal(12, sum[7]); - Assert.Equal(13, sum[8]); - Assert.Null(sum[9]); - } - - [Fact] - public void Test_ArithmeticsDiffWithNull() - { - // Arrange - var left_column = new Int32DataFrameColumn("Left", new int?[] { 1, 1, null, null }); - var right_column = new Int32DataFrameColumn("Right", new int?[] { 1, null, 1, null }); - - // Act - var diff = left_column - (right_column); - - // Assert - Assert.Equal(3, diff.NullCount); - Assert.Equal(0, diff[0]); // 1 - 1 - Assert.Null(diff[1]); // null - 1 - Assert.Null(diff[2]); // 1 - null - Assert.Null(diff[3]); // null - null - } - - [Fact] - public void Test_ArithmeticsMultWithNull() - { - // Arrange - var left_column = new Int32DataFrameColumn("Left", new int?[] { 4, 1, null, null }); - var right_column = new Int32DataFrameColumn("Right", new int?[] { 2, null, 1, null }); - - // Act - var mult = left_column * right_column; - - // Assert - Assert.Equal(3, mult.NullCount); - Assert.Equal(8, mult[0]); // 1 * 1 - Assert.Null(mult[1]); // null * 1 - Assert.Null(mult[2]); // 1 * null - Assert.Null(mult[3]); // null * null - } - - [Fact] - public void Test_ArithmeticsDivWithNull() - { - // Arrange - var left_column = new Int32DataFrameColumn("Left", new int?[] { 4, 1, null, null }); - var right_column = new Int32DataFrameColumn("Right", new int?[] { 2, null, 1, null }); - - // Act - var div = left_column / right_column; - - // Assert - Assert.Equal(3, div.NullCount); - Assert.Equal(2, div[0]); // 1 / 1 - Assert.Null(div[1]); // null / 1 - Assert.Null(div[2]); // 1 / null - Assert.Null(div[3]); // null / null - } } } diff --git a/test/Microsoft.Data.Analysis.Tests/BufferTests.cs b/test/Microsoft.Data.Analysis.Tests/PrimitiveDataFrameColumnTests.cs similarity index 80% rename from test/Microsoft.Data.Analysis.Tests/BufferTests.cs rename to test/Microsoft.Data.Analysis.Tests/PrimitiveDataFrameColumnTests.cs index 6e67d79dc9..6e04171800 100644 --- a/test/Microsoft.Data.Analysis.Tests/BufferTests.cs +++ b/test/Microsoft.Data.Analysis.Tests/PrimitiveDataFrameColumnTests.cs @@ -8,12 +8,11 @@ using System.Text; using Apache.Arrow; using Microsoft.ML.TestFramework.Attributes; -using Newtonsoft.Json.Linq; using Xunit; namespace Microsoft.Data.Analysis.Tests { - public class BufferTests + public class PrimitiveDataFrameColumnTests { [X64Fact("32-bit doesn't allow to allocate more than 2 Gb")] public void TestGetterAndSetterForColumnsGreaterThanMaxCapacity() @@ -373,7 +372,6 @@ public void TestNotNullableColumnCloneWithIndicesMapAsEnumerableInt() Assert.Equal(column[indicesMap[i]], clonedColumn[i]); } - [Fact] public void TestNullableColumnCloneWithIndicesMapAndSmallerSize() { @@ -406,60 +404,6 @@ public void TestNullableColumnCloneWithIndicesMap_OutOfRange() } - [Fact] - public void TestBasicArrowStringColumn() - { - StringArray strArray = new StringArray.Builder().Append("foo").Append("bar").Build(); - Memory dataMemory = new byte[] { 102, 111, 111, 98, 97, 114 }; - Memory nullMemory = new byte[] { 0, 0, 0, 0 }; - Memory offsetMemory = new byte[] { 0, 0, 0, 0, 3, 0, 0, 0, 6, 0, 0, 0 }; - - ArrowStringDataFrameColumn stringColumn = new ArrowStringDataFrameColumn("String", dataMemory, offsetMemory, nullMemory, strArray.Length, strArray.NullCount); - Assert.Equal(2, stringColumn.Length); - Assert.Equal("foo", stringColumn[0]); - Assert.Equal("bar", stringColumn[1]); - } - - [Fact] - public void TestArrowStringColumnWithNulls() - { - string data = "joemark"; - byte[] bytes = Encoding.UTF8.GetBytes(data); - Memory dataMemory = new Memory(bytes); - Memory nullMemory = new byte[] { 0b1101 }; - Memory offsetMemory = new byte[] { 0, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0, 7, 0, 0, 0 }; - ArrowStringDataFrameColumn stringColumn = new ArrowStringDataFrameColumn("String", dataMemory, offsetMemory, nullMemory, 4, 1); - - Assert.Equal(4, stringColumn.Length); - Assert.Equal("joe", stringColumn[0]); - Assert.Null(stringColumn[1]); - Assert.Equal("mark", stringColumn[2]); - Assert.Equal("", stringColumn[3]); - - List ret = stringColumn[0, 4]; - Assert.Equal("joe", ret[0]); - Assert.Null(ret[1]); - Assert.Equal("mark", ret[2]); - Assert.Equal("", ret[3]); - } - - [Fact] - public void TestArrowStringColumnClone() - { - StringArray strArray = new StringArray.Builder().Append("foo").Append("bar").Build(); - Memory dataMemory = new byte[] { 102, 111, 111, 98, 97, 114 }; - Memory nullMemory = new byte[] { 0, 0, 0, 0 }; - Memory offsetMemory = new byte[] { 0, 0, 0, 0, 3, 0, 0, 0, 6, 0, 0, 0 }; - - ArrowStringDataFrameColumn stringColumn = new ArrowStringDataFrameColumn("String", dataMemory, offsetMemory, nullMemory, strArray.Length, strArray.NullCount); - - DataFrameColumn clone = stringColumn.Clone(numberOfNullsToAppend: 5); - Assert.Equal(7, clone.Length); - Assert.Equal(stringColumn[0], clone[0]); - Assert.Equal(stringColumn[1], clone[1]); - for (int i = 2; i < 7; i++) - Assert.Null(clone[i]); - } [X64Fact("32-bit doesn't allow to allocate more than 2 Gb")] public void TestAppend_SizeMoreThanMaxBufferCapacity() @@ -481,6 +425,145 @@ public void TestAppendMany_SizeMoreThanMaxBufferCapacity() Assert.Equal(MaxCapacityInBytes + 5, intColumn.Length); } + [Fact] + public void Test_PrimitiveColumnNotEqualsNull() + { + var col = new DoubleDataFrameColumn("col", new double?[] { 1.23, null, 2, 3 }); + var dfTest = new DataFrame(col); + + var filteredNullDf = dfTest.Filter(dfTest["col"].ElementwiseNotEquals(null)); + + Assert.True(filteredNullDf.Columns.IndexOf("col") >= 0); + Assert.Equal(3, filteredNullDf.Columns["col"].Length); + + Assert.Equal(1.23, filteredNullDf.Columns["col"][0]); + Assert.Equal(2.0, filteredNullDf.Columns["col"][1]); + Assert.Equal(3.0, filteredNullDf.Columns["col"][2]); + } + + [Fact] + public void Test_PrimitiveColumnEqualsNull() + { + var index = new Int32DataFrameColumn("index", new int[] { 1, 2, 3, 4, 5 }); + var col = new DoubleDataFrameColumn("col", new double?[] { 1.23, null, 2, 3, null }); ; + var dfTest = new DataFrame(index, col); + + var filteredNullDf = dfTest.Filter(dfTest["col"].ElementwiseEquals(null)); + + Assert.True(filteredNullDf.Columns.IndexOf("col") >= 0); + Assert.True(filteredNullDf.Columns.IndexOf("index") >= 0); + + Assert.Equal(2, filteredNullDf.Rows.Count); + + Assert.Equal(2, filteredNullDf.Columns["index"][0]); + Assert.Equal(5, filteredNullDf.Columns["index"][1]); + } + + [Fact] + public void Test_ArithmeticsAddWithNull() + { + // Arrange + //Number of elements shoult be higher than 8 to test SIMD + var left_column = new Int32DataFrameColumn("Left", new int?[] { 1, 1, null, null, 4, 5, 6, 7, 8, 9 }); + var right_column = new Int32DataFrameColumn("Right", new int?[] { 1, null, 1, null, 4, 5, 6, 7, 8, 9 }); + + // Act + var sum = left_column + right_column; + + // Assert + Assert.Equal(3, sum.NullCount); + + Assert.Equal(2, sum[0]); // 1 + 1 + Assert.Null(sum[1]); // null + 1 + Assert.Null(sum[2]); // 1 + null + Assert.Null(sum[3]); // null + null + Assert.Equal(8, sum[4]); + Assert.Equal(10, sum[5]); + Assert.Equal(12, sum[6]); + Assert.Equal(14, sum[7]); + Assert.Equal(16, sum[8]); + Assert.Equal(18, sum[9]); + } + + [Fact] + public void Test_ArithmeticsAddScalarWithNull() + { + // Arrange + //Number of elements shoult be higher than 8 to test SIMD + var left_column = new Int32DataFrameColumn("Left", new int?[] { 0, 1, null, null, 4, 5, 6, 7, 8, null }); + + // Act + var sum = left_column + 5; + + // Assert + Assert.Equal(3, sum.NullCount); + + Assert.Equal(5, sum[0]); // 1 + 1 + Assert.Equal(6, sum[1]); // 1 + 1 + Assert.Null(sum[2]); // 1 + null + Assert.Null(sum[3]); // null + null + Assert.Equal(9, sum[4]); + Assert.Equal(10, sum[5]); + Assert.Equal(11, sum[6]); + Assert.Equal(12, sum[7]); + Assert.Equal(13, sum[8]); + Assert.Null(sum[9]); + } + + [Fact] + public void Test_ArithmeticsDiffWithNull() + { + // Arrange + var left_column = new Int32DataFrameColumn("Left", new int?[] { 1, 1, null, null }); + var right_column = new Int32DataFrameColumn("Right", new int?[] { 1, null, 1, null }); + + // Act + var diff = left_column - (right_column); + + // Assert + Assert.Equal(3, diff.NullCount); + Assert.Equal(0, diff[0]); // 1 - 1 + Assert.Null(diff[1]); // null - 1 + Assert.Null(diff[2]); // 1 - null + Assert.Null(diff[3]); // null - null + } + + [Fact] + public void Test_ArithmeticsMultWithNull() + { + // Arrange + var left_column = new Int32DataFrameColumn("Left", new int?[] { 4, 1, null, null }); + var right_column = new Int32DataFrameColumn("Right", new int?[] { 2, null, 1, null }); + + // Act + var mult = left_column * right_column; + + // Assert + Assert.Equal(3, mult.NullCount); + Assert.Equal(8, mult[0]); // 1 * 1 + Assert.Null(mult[1]); // null * 1 + Assert.Null(mult[2]); // 1 * null + Assert.Null(mult[3]); // null * null + } + + [Fact] + public void Test_ArithmeticsDivWithNull() + { + // Arrange + var left_column = new Int32DataFrameColumn("Left", new int?[] { 4, 1, null, null }); + var right_column = new Int32DataFrameColumn("Right", new int?[] { 2, null, 1, null }); + + // Act + var div = left_column / right_column; + + // Assert + Assert.Equal(3, div.NullCount); + Assert.Equal(2, div[0]); // 1 / 1 + Assert.Null(div[1]); // null / 1 + Assert.Null(div[2]); // 1 / null + Assert.Null(div[3]); // null / null + } + //#if !NETFRAMEWORK // https://github.com/dotnet/corefxlab/issues/2796 // [Fact] // public void TestPrimitiveColumnGetReadOnlyBuffers() diff --git a/test/Microsoft.Data.Analysis.Tests/VBufferColumnTests.cs b/test/Microsoft.Data.Analysis.Tests/VBufferColumnTests.cs new file mode 100644 index 0000000000..e727a1f2af --- /dev/null +++ b/test/Microsoft.Data.Analysis.Tests/VBufferColumnTests.cs @@ -0,0 +1,66 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using Microsoft.ML.Data; +using Microsoft.ML.TestFramework.Attributes; +using Xunit; + + +namespace Microsoft.Data.Analysis.Tests +{ + public class VBufferColumnTests + { + [Fact] + public void TestVBufferColumn_Creation() + { + var buffers = Enumerable.Repeat(new VBuffer(5, new[] { 0, 1, 2, 3, 4 }), 10).ToArray(); + var vBufferColumn = new VBufferDataFrameColumn("VBuffer", buffers); + + Assert.Equal(10, vBufferColumn.Length); + Assert.Equal(5, vBufferColumn[0].GetValues().Length); + Assert.Equal(0, vBufferColumn[0].GetValues()[0]); + } + + [Fact] + public void TestVBufferColumn_Indexer() + { + var buffer = new VBuffer(5, new[] { 4, 3, 2, 1, 0 }); + + var vBufferColumn = new VBufferDataFrameColumn("VBuffer", 1); + vBufferColumn[0] = buffer; + + Assert.Equal(1, vBufferColumn.Length); + Assert.Equal(5, vBufferColumn[0].GetValues().Length); + Assert.Equal(0, vBufferColumn[0].GetValues()[4]); + } + + [X64Fact("32-bit doesn't allow to allocate more than 2 Gb")] + public void TestVBufferColumn_Indexer_MoreThanMaxInt() + { + var originalValues = new[] { 4, 3, 2, 1, 0 }; + + var length = VBufferDataFrameColumn.MaxCapacity + 3; + + var vBufferColumn = new VBufferDataFrameColumn("VBuffer", length); + long index = length - 2; + + vBufferColumn[index] = new VBuffer(5, originalValues); + + var values = vBufferColumn[index].GetValues(); + + Assert.Equal(length, vBufferColumn.Length); + Assert.Equal(5, values.Length); + + for (int i = 0; i < values.Length; i++) + { + Assert.Equal(originalValues[i], values[i]); + } + } + } +}