diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Categorical/OneHotEncoding.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Categorical/OneHotEncoding.cs
new file mode 100644
index 0000000000..5fd246bd5a
--- /dev/null
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Categorical/OneHotEncoding.cs
@@ -0,0 +1,88 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using Microsoft.ML.Data;
+using static Microsoft.ML.Transforms.OneHotEncodingEstimator;
+
+namespace Microsoft.ML.Samples.Dynamic
+{
+    public static class OneHotEncoding
+    {
+        public static void Example()
+        {
+            // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
+            // as well as the source of randomness.
+            var mlContext = new MLContext();
+
+            // Get a small dataset as an IEnumerable.
+            var samples = new List<DataPoint>()
+            {
+                new DataPoint(){ Label = 0, Education = "0-5yrs" },
+                new DataPoint(){ Label = 1, Education = "0-5yrs" },
+                new DataPoint(){ Label = 45, Education = "6-11yrs" },
+                new DataPoint(){ Label = 50, Education = "6-11yrs" },
+                new DataPoint(){ Label = 50, Education = "11-15yrs" },
+            };
+
+            // Convert training data to IDataView.
+            var trainData = mlContext.Data.LoadFromEnumerable(samples);
+
+            // A pipeline for one hot encoding the Education column into a bag (indicator vector).
+            var bagPipeline = mlContext.Transforms.Categorical.OneHotEncoding("EducationOneHotEncoded", "Education", OutputKind.Bag);
+            // Fit to data.
+            var bagTransformer = bagPipeline.Fit(trainData);
+
+            // Get transformed data
+            var bagTransformedData = bagTransformer.Transform(trainData);
+            // Getting the data of the newly created column, so we can preview it.
+            var bagEncodedColumn = bagTransformedData.GetColumn<float[]>("EducationOneHotEncoded");
+
+            // A pipeline for encoding the Education column as a key (category index) instead of a vector.
+            var keyPipeline = mlContext.Transforms.Categorical.OneHotEncoding("EducationOneHotEncoded", "Education", OutputKind.Key);
+            // Fit to data.
+            var keyTransformer = keyPipeline.Fit(trainData);
+
+            // Get transformed data
+            var keyTransformedData = keyTransformer.Transform(trainData);
+            // Getting the data of the newly created column, so we can preview it.
+            var keyEncodedColumn = keyTransformedData.GetColumn<uint>("EducationOneHotEncoded");
+
+            Console.WriteLine("One Hot Encoding based on the bagging strategy.");
+            foreach (var row in bagEncodedColumn)
+            {
+                for (var i = 0; i < row.Length; i++)
+                    Console.Write($"{row[i]} ");
+
+                // Terminate the row so that every encoded vector is printed on its own line.
+                Console.WriteLine();
+            }
+
+            // data column obtained post-transformation.
+            // Since there are three categories in the Education column of the trainData, the output vector
+            // for one hot will have three slots.
+            //
+            // 1 0 0
+            // 1 0 0
+            // 0 1 0
+            // 0 1 0
+            // 0 0 1
+
+            Console.WriteLine("One Hot Encoding with key type output.");
+            foreach (var element in keyEncodedColumn)
+                Console.WriteLine(element);
+
+            // 1
+            // 1
+            // 2
+            // 2
+            // 3
+        }
+
+        private class DataPoint
+        {
+            public float Label { get; set; }
+
+            public string Education { get; set; }
+        }
+    }
+}
\ No newline at end of file
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValues.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValues.cs
new file mode 100644
index 0000000000..15d448deee
--- /dev/null
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValues.cs
@@ -0,0 +1,66 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using Microsoft.ML.Data;
+
+namespace Microsoft.ML.Samples.Dynamic
+{
+    public static class IndicateMissingValues
+    {
+        public static void Example()
+        {
+            // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
+            // as well as the source of randomness.
+            var mlContext = new MLContext();
+
+            var samples = new List<DataPoint>()
+            {
+                new DataPoint(){ Label = 3, Features = new float[3] {1, 1, 0} },
+                new DataPoint(){ Label = 32, Features = new float[3] {0, float.NaN, 1} },
+                new DataPoint(){ Label = float.NaN, Features = new float[3] {-1, float.NaN, -3} },
+            };
+            // Convert training data to IDataView, the general data type used in ML.NET.
+            var data = mlContext.Data.LoadFromEnumerable(samples);
+
+            // IndicateMissingValues is used to create a boolean vector containing
+            // 'true' where the value in the input column is NaN. This value can be used
+            // to replace missing values with other values.
+            IEstimator<ITransformer> pipeline = mlContext.Transforms.IndicateMissingValues("MissingIndicator", "Features");
+
+            // Now we can transform the data and look at the output to confirm the behavior of the estimator.
+            // This operation doesn't actually evaluate data until we read the data below.
+            var transformer = pipeline.Fit(data);
+            var transformedData = transformer.Transform(data);
+
+            // We can extract the newly created column as an IEnumerable of SampleDataTransformed, the class we define below.
+            var rowEnumerable = mlContext.Data.CreateEnumerable<SampleDataTransformed>(transformedData, reuseRowObject: false);
+
+            // A small printing utility that renders a vector as "[v0 v1 v2]", with no trailing separator.
+            Func<object[], string> vectorPrinter = (object[] vector) => "[" + string.Join(" ", vector) + "]";
+
+            // And finally, we can write out the rows of the dataset, looking at the columns of interest.
+            foreach (var row in rowEnumerable)
+            {
+                Console.WriteLine($"Label: {row.Label} Features: {vectorPrinter(row.Features.Cast<object>().ToArray())} MissingIndicator: {vectorPrinter(row.MissingIndicator.Cast<object>().ToArray())}");
+            }
+
+            // Expected output:
+            //
+            // Label: 3 Features: [1 1 0] MissingIndicator: [False False False]
+            // Label: 32 Features: [0 NaN 1] MissingIndicator: [False True False]
+            // Label: NaN Features: [-1 NaN -3] MissingIndicator: [False True False]
+        }
+
+        private class DataPoint
+        {
+            public float Label { get; set; }
+            [VectorType(3)]
+            public float[] Features { get; set; }
+        }
+
+        private sealed class SampleDataTransformed : DataPoint
+        {
+            public bool[] MissingIndicator { get; set; }
+        }
+    }
+}
\ No newline at end of file
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValues.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValues.cs
new file mode 100644
index 0000000000..1bcc4ef5f5
--- /dev/null
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValues.cs
@@ -0,0 +1,96 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using Microsoft.ML.Data;
+using static Microsoft.ML.Transforms.MissingValueReplacingEstimator.ColumnOptions;
+
+namespace Microsoft.ML.Samples.Dynamic
+{
+    public static class ReplaceMissingValues
+    {
+        public static void Example()
+        {
+            // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
+            // as well as the source of randomness.
+            var mlContext = new MLContext();
+
+            var samples = new List<DataPoint>()
+            {
+                new DataPoint(){ Label = 3, Features = new float[3] {1, 1, 0} },
+                new DataPoint(){ Label = 32, Features = new float[3] {0, float.NaN, 1} },
+                new DataPoint(){ Label = 5, Features = new float[3] {-1, 2, -3} },
+                new DataPoint(){ Label = 9, Features = new float[3] {-1, 6, -3} },
+            };
+            // Convert training data to IDataView, the general data type used in ML.NET.
+            var data = mlContext.Data.LoadFromEnumerable(samples);
+
+            // ReplaceMissingValues is used to create a column where missing values are replaced according to the ReplacementMode.
+            var meanPipeline = mlContext.Transforms.ReplaceMissingValues("MissingReplaced", "Features", ReplacementMode.Mean);
+
+            // Now we can transform the data and look at the output to confirm the behavior of the estimator.
+            // This operation doesn't actually evaluate data until we read the data below.
+            var meanTransformer = meanPipeline.Fit(data);
+            var meanTransformedData = meanTransformer.Transform(data);
+
+            // We can extract the newly created column as an IEnumerable of SampleDataTransformed, the class we define below.
+            var meanRowEnumerable = mlContext.Data.CreateEnumerable<SampleDataTransformed>(meanTransformedData, reuseRowObject: false);
+
+            // The same estimator, this time configured with ReplacementMode.DefaultValue instead of ReplacementMode.Mean.
+            var defaultPipeline = mlContext.Transforms.ReplaceMissingValues("MissingReplaced", "Features", ReplacementMode.DefaultValue);
+
+            // Now we can transform the data and look at the output to confirm the behavior of the estimator.
+            // This operation doesn't actually evaluate data until we read the data below.
+            var defaultTransformer = defaultPipeline.Fit(data);
+            var defaultTransformedData = defaultTransformer.Transform(data);
+
+            // We can extract the newly created column as an IEnumerable of SampleDataTransformed, the class we define below.
+            var defaultRowEnumerable = mlContext.Data.CreateEnumerable<SampleDataTransformed>(defaultTransformedData, reuseRowObject: false);
+
+            // A small printing utility that renders a vector as "[v0 v1 v2]", with no trailing separator.
+            Func<object[], string> vectorPrinter = (object[] vector) => "[" + string.Join(" ", vector) + "]";
+
+            // And finally, we can write out the rows of the mean-replaced dataset, looking at the columns of interest.
+            foreach (var row in meanRowEnumerable)
+            {
+                Console.WriteLine($"Label: {row.Label} Features: {vectorPrinter(row.Features.Cast<object>().ToArray())} MissingReplaced: {vectorPrinter(row.MissingReplaced.Cast<object>().ToArray())}");
+            }
+
+            // Expected output:
+            // Notice how the NaN in the second slot of the Features column is replaced by 3, the mean of the
+            // non-missing values (1, 2, 6) found in that slot across all rows.
+            //
+            //Label: 3 Features: [1 1 0] MissingReplaced: [1 1 0]
+            //Label: 32 Features: [0 NaN 1] MissingReplaced: [0 3 1]
+            //Label: 5 Features: [-1 2 -3] MissingReplaced: [-1 2 -3]
+            //Label: 9 Features: [-1 6 -3] MissingReplaced: [-1 6 -3]
+
+            // Next, we write out the rows of the default-value-replaced dataset, looking at the same columns.
+            foreach (var row in defaultRowEnumerable)
+            {
+                Console.WriteLine($"Label: {row.Label} Features: {vectorPrinter(row.Features.Cast<object>().ToArray())} MissingReplaced: {vectorPrinter(row.MissingReplaced.Cast<object>().ToArray())}");
+            }
+
+            // Expected output:
+            // Notice how the NaN of the Features column for the second row is replaced by 0, the default value for floats.
+            //
+            //Label: 3 Features: [1 1 0] MissingReplaced: [1 1 0]
+            //Label: 32 Features: [0 NaN 1] MissingReplaced: [0 0 1]
+            //Label: 5 Features: [-1 2 -3] MissingReplaced: [-1 2 -3]
+            //Label: 9 Features: [-1 6 -3] MissingReplaced: [-1 6 -3]
+        }
+
+        private class DataPoint
+        {
+            public float Label { get; set; }
+
+            [VectorType(3)]
+            public float[] Features { get; set; }
+        }
+
+        private sealed class SampleDataTransformed : DataPoint
+        {
+            [VectorType(3)]
+            public float[] MissingReplaced { get; set; }
+        }
+    }
+}
diff --git a/docs/samples/Microsoft.ML.Samples/Program.cs b/docs/samples/Microsoft.ML.Samples/Program.cs
index 00424b9516..ef67739045 100644
--- a/docs/samples/Microsoft.ML.Samples/Program.cs
+++ b/docs/samples/Microsoft.ML.Samples/Program.cs
@@ -6,7 +6,7 @@ internal static class Program
     {
         static void Main(string[] args)
         {
-            CustomMapping.Example();
+            ReplaceMissingValues.Example();
         }
     }
 }
diff --git a/src/Microsoft.ML.Transforms/CategoricalCatalog.cs b/src/Microsoft.ML.Transforms/CategoricalCatalog.cs
index 6d17d4ed78..59c5553d2a 100644
--- a/src/Microsoft.ML.Transforms/CategoricalCatalog.cs
+++ b/src/Microsoft.ML.Transforms/CategoricalCatalog.cs
@@ -20,6 +20,12 @@ public static class CategoricalCatalog
         /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
         /// <param name="inputColumnName">Name of column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
         /// <param name="outputKind">The conversion mode.</param>
+        /// <example>
+        /// <format type="text/markdown">
+        /// <![CDATA[
+        ///  [!code-csharp[OneHotEncoding](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Categorical/OneHotEncoding.cs)]
+        /// ]]></format>
+        /// </example>
         public static OneHotEncodingEstimator OneHotEncoding(this TransformsCatalog.CategoricalTransforms catalog,
                 string outputColumnName,
                 string inputColumnName = null,
diff --git a/src/Microsoft.ML.Transforms/ExtensionsCatalog.cs b/src/Microsoft.ML.Transforms/ExtensionsCatalog.cs
index 56b1739034..2e283ad89f 100644
--- a/src/Microsoft.ML.Transforms/ExtensionsCatalog.cs
+++ b/src/Microsoft.ML.Transforms/ExtensionsCatalog.cs
@@ -29,6 +29,12 @@ public static MissingValueIndicatorEstimator IndicateMissingValues(this Transfor
         /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
         /// <param name="inputColumnName">Name of column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.
         /// If left to <value>null</value> the <paramref name="inputColumnName"/> will get replaced.</param>
+        /// <example>
+        /// <format type="text/markdown">
+        /// <![CDATA[
+        ///  [!code-csharp[IndicateMissingValues](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/IndicateMissingValues.cs)]
+        /// ]]></format>
+        /// </example>
         public static MissingValueIndicatorEstimator IndicateMissingValues(this TransformsCatalog catalog,
             string outputColumnName,
             string inputColumnName = null)
@@ -46,6 +52,12 @@ public static MissingValueIndicatorEstimator IndicateMissingValues(this Transfor
         /// <param name="inputColumnName">Name of column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.
         /// If not provided, the <paramref name="inputColumnName"/> will be replaced with the results of the transforms.</param>
         /// <param name="replacementMode">The type of replacement to use as specified in <see cref="MissingValueReplacingEstimator.ColumnOptions.ReplacementMode"/></param>
+        /// <example>
+        /// <format type="text/markdown">
+        /// <![CDATA[
+        ///  [!code-csharp[ReplaceMissingValues](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ReplaceMissingValues.cs)]
+        /// ]]></format>
+        /// </example>
         public static MissingValueReplacingEstimator ReplaceMissingValues(this TransformsCatalog catalog,
             string outputColumnName,
             string inputColumnName = null,