From 778b905144d870d4958e3bde7c1dd5265831c868 Mon Sep 17 00:00:00 2001 From: XiaoYun Zhang Date: Thu, 8 Sep 2022 14:14:21 -0700 Subject: [PATCH 1/4] update --- .../Microsoft.ML.AutoML.Samples/Program.cs | 44 +++++++ .../AutoMLExperiment/IMonitor.cs | 1 + .../CodeGen/fast_tree_search_space.json | 5 + .../CodeGen/search-space-schema.json | 3 +- .../SweepableEstimator/Estimators/FastTree.cs | 4 + .../AutoMLExperimentTests.cs | 113 ++++++++++++++++++ 6 files changed, 169 insertions(+), 1 deletion(-) diff --git a/docs/samples/Microsoft.ML.AutoML.Samples/Program.cs b/docs/samples/Microsoft.ML.AutoML.Samples/Program.cs index 646c3546ae..bf38cbf134 100644 --- a/docs/samples/Microsoft.ML.AutoML.Samples/Program.cs +++ b/docs/samples/Microsoft.ML.AutoML.Samples/Program.cs @@ -1,4 +1,5 @@ using System; +using Microsoft.ML.Data; namespace Microsoft.ML.AutoML.Samples { @@ -6,6 +7,27 @@ public class Program { public static void Main(string[] args) { + var context = new MLContext(1); + context.Log += (o, e) => + { + if (e.Source.StartsWith("AutoMLExperiment")) + { + Console.WriteLine(e.RawMessage); + } + }; + var trainPath = @"D:/large_csv.csv"; + var dataset = context.Data.LoadFromTextFile(trainPath, ',', hasHeader: false); + var experiment = context.Auto().CreateExperiment(); + var label = "Entry(Text)"; + var pipeline = context.Transforms.Conversion.MapValueToKey(label, label) + .Append(context.Auto().MultiClassification(label, featureColumnName: "_data", useFastTree: true, useFastForest: false, useLgbm: false, fastTreeOption: new CodeGen.FastTreeOption { DiskTranspose = true, })); + + experiment.SetDataset(context.Data.TrainTestSplit(dataset)) + .SetMulticlassClassificationMetric(MulticlassClassificationMetric.MacroAccuracy, label) + .SetPipeline(pipeline) + .SetTrainingTimeInSeconds(10000); + + experiment.Run(); try { RecommendationExperiment.Run(); @@ -36,4 +58,26 @@ public static void Main(string[] args) Console.ReadLine(); } } + + class ModelInput + { + [LoadColumn(0), 
NoColumn] + public string _data0 { get; set; } + + [LoadColumn(1), NoColumn] + public float ignoreData1 { get; set; } + + [LoadColumn(2, 4205)] + public float[] _data { get; set; } + + [LoadColumn(4206), NoColumn]//(4206,4208)] + public float _ignoreData4206 { get; set; } + [LoadColumn(4207), NoColumn]//(4206,4208)] + public float _ignoreData4207 { get; set; } + [LoadColumn(4208), NoColumn]//(4206,4208)] + public float _ignoreData4208 { get; set; } + + [LoadColumn(4209), ColumnName("Entry(Text)")] + public string _label { get; set; } + } } diff --git a/src/Microsoft.ML.AutoML/AutoMLExperiment/IMonitor.cs b/src/Microsoft.ML.AutoML/AutoMLExperiment/IMonitor.cs index 6a28d80010..af867f07c1 100644 --- a/src/Microsoft.ML.AutoML/AutoMLExperiment/IMonitor.cs +++ b/src/Microsoft.ML.AutoML/AutoMLExperiment/IMonitor.cs @@ -50,6 +50,7 @@ public virtual void ReportCompletedTrial(TrialResult result) public virtual void ReportFailTrial(TrialSettings settings, Exception exception = null) { + _logger.Trace(exception?.Message + exception?.StackTrace); /* exception is optional (defaults to null); null-conditional access keeps failure reporting from throwing */ _logger.Info($"Update Failed Trial - Id: {settings.TrialId} - Pipeline: {_pipeline.ToString(settings.Parameter)}"); } diff --git a/src/Microsoft.ML.AutoML/CodeGen/fast_tree_search_space.json b/src/Microsoft.ML.AutoML/CodeGen/fast_tree_search_space.json index 0a3af2790e..21fda7e1a0 100644 --- a/src/Microsoft.ML.AutoML/CodeGen/fast_tree_search_space.json +++ b/src/Microsoft.ML.AutoML/CodeGen/fast_tree_search_space.json @@ -75,6 +75,11 @@ { "name": "ExampleWeightColumnName", "type": "string" + }, + { + "name": "DiskTranspose", + "type": "boolean", + "default": false } ] } diff --git a/src/Microsoft.ML.AutoML/CodeGen/search-space-schema.json b/src/Microsoft.ML.AutoML/CodeGen/search-space-schema.json index f99c390b62..595f721e3d 100644 --- a/src/Microsoft.ML.AutoML/CodeGen/search-space-schema.json +++ b/src/Microsoft.ML.AutoML/CodeGen/search-space-schema.json @@ -154,7 +154,8 @@ "NumberOfIterations", "Quiet", "OutputAsFloatArray", -
"ModelFactory" + "ModelFactory", + "DiskTranspose" ] }, "option_type": { diff --git a/src/Microsoft.ML.AutoML/SweepableEstimator/Estimators/FastTree.cs b/src/Microsoft.ML.AutoML/SweepableEstimator/Estimators/FastTree.cs index cc6ae89136..b89e63bf34 100644 --- a/src/Microsoft.ML.AutoML/SweepableEstimator/Estimators/FastTree.cs +++ b/src/Microsoft.ML.AutoML/SweepableEstimator/Estimators/FastTree.cs @@ -22,6 +22,7 @@ public override IEstimator BuildFromOption(MLContext context, Fast NumberOfThreads = AutoMlUtils.GetNumberOfThreadFromEnvrionment(), MaximumBinCountPerFeature = param.MaximumBinCountPerFeature, FeatureFraction = param.FeatureFraction, + DiskTranspose = param.DiskTranspose, }; return context.MulticlassClassification.Trainers.OneVersusAll(context.BinaryClassification.Trainers.FastTree(option), labelColumnName: param.LabelColumnName); @@ -43,6 +44,7 @@ public override IEstimator BuildFromOption(MLContext context, Fast ExampleWeightColumnName = param.ExampleWeightColumnName, NumberOfThreads = AutoMlUtils.GetNumberOfThreadFromEnvrionment(), MaximumBinCountPerFeature = param.MaximumBinCountPerFeature, + DiskTranspose = param.DiskTranspose, FeatureFraction = param.FeatureFraction, }; @@ -65,6 +67,7 @@ public override IEstimator BuildFromOption(MLContext context, Fast ExampleWeightColumnName = param.ExampleWeightColumnName, NumberOfThreads = AutoMlUtils.GetNumberOfThreadFromEnvrionment(), MaximumBinCountPerFeature = param.MaximumBinCountPerFeature, + DiskTranspose = param.DiskTranspose, FeatureFraction = param.FeatureFraction, }; @@ -87,6 +90,7 @@ public override IEstimator BuildFromOption(MLContext context, Fast ExampleWeightColumnName = param.ExampleWeightColumnName, NumberOfThreads = AutoMlUtils.GetNumberOfThreadFromEnvrionment(), MaximumBinCountPerFeature = param.MaximumBinCountPerFeature, + DiskTranspose = param.DiskTranspose, FeatureFraction = param.FeatureFraction, }; diff --git a/test/Microsoft.ML.AutoML.Tests/AutoMLExperimentTests.cs 
b/test/Microsoft.ML.AutoML.Tests/AutoMLExperimentTests.cs index e38dace417..b498f510de 100644 --- a/test/Microsoft.ML.AutoML.Tests/AutoMLExperimentTests.cs +++ b/test/Microsoft.ML.AutoML.Tests/AutoMLExperimentTests.cs @@ -5,6 +5,7 @@ using System; using System.Collections.Generic; using System.Diagnostics; +using System.IO; using System.Linq; using System.Text; using System.Threading; @@ -12,10 +13,12 @@ using FluentAssertions; using Microsoft.Data.Analysis; using Microsoft.Extensions.DependencyInjection; +using Microsoft.ML.Data; using Microsoft.ML.AutoML.CodeGen; using Microsoft.ML.Runtime; using Microsoft.ML.TestFramework; using Microsoft.ML.TestFramework.Attributes; +using Tensorflow; using Xunit; using Xunit.Abstractions; @@ -327,6 +330,94 @@ public async Task AutoMLExperiment_Taxi_Fare_CV_5_Test() var result = await experiment.RunAsync(); result.Metric.Should().BeGreaterThan(0.5); } + + [Fact] + public async Task Generate_300GB_csv() + { + var rnd = new Random(); + ModelInput GenerateRandomRow() + { + return new ModelInput + { + _data0 = rnd.NextSingle() > 0.5 ? "a" : "b", + ignoreData1 = rnd.NextSingle(), + _data = Enumerable.Repeat(rnd.NextSingle(), 4204).ToArray(), + _ignoreData4206 = rnd.NextSingle(), + _ignoreData4207 = rnd.NextSingle(), + _ignoreData4208 = rnd.NextSingle(), + _label = rnd.NextSingle() > 0.5 ? 
"True" : "False", + }; + } + + var filePath = @"D:/large_csv.csv"; + var fileInfo = new FileInfo(filePath); + + using (var fileStream = fileInfo.Open(FileMode.Append)) + using (var stream = new StreamWriter(fileStream)) + { + var i = 0; + while ((!fileInfo.Exists || fileInfo.Length < 300.0 * 1024 * 1024 * 1024) && i < 100) + { + fileInfo.Refresh(); + Output.WriteLine($"{fileInfo.Length / (1024 * 1024 * 1024 * 1.0)}"); + var taskNum = 10; + var taskPool = new Task[taskNum]; + for (int _i = 0; _i != taskNum; ++_i) + { + var t = Task.Factory.StartNew(() => + { + var sb = new StringBuilder(); + var rows = Enumerable.Range(0, 10000).Select(i => GenerateRandomRow()); + foreach (var row in rows) + { + var line = $"\"{row._data0}\",{row.ignoreData1.ToString("F2")},{string.Join(",", row._data.Select(d => d.ToString("F2")))}, {row._ignoreData4206.ToString("F2")}, {row._ignoreData4207.ToString("F2")}, {row._ignoreData4208.ToString("F2")},\"{row._label}\""; + sb.AppendLine(line); + } + + return sb.ToString(); + }); + taskPool[_i] = t; + } + + var getRandomRow = await Task.WhenAll(taskPool); + foreach (var row in getRandomRow) + { + await stream.WriteAsync(row); + } + + await stream.FlushAsync(); + i++; + } + } + } + + [Fact] + public async Task Large_csv_test() + { + var context = new MLContext(1); + context.Log += (o, e) => + { + if (e.Source.StartsWith("AutoMLExperiment")) + { + this.Output.WriteLine(e.RawMessage); + } + }; + var trainPath = @"D:/large_csv.csv"; + var dataset = context.Data.LoadFromTextFile(trainPath, ',', hasHeader: false); + var experiment = context.Auto().CreateExperiment(); + var label = "Entry(Text)"; + var pipeline = context.Auto().Featurizer(dataset, excludeColumns: new[] { label }) + .Append(context.Transforms.Conversion.MapValueToKey(label, label)) + .Append(context.Auto().MultiClassification(label)); + + experiment.SetDataset(context.Data.TrainTestSplit(dataset)) + .SetMulticlassClassificationMetric(MulticlassClassificationMetric.MacroAccuracy, 
label) + .SetPipeline(pipeline) + .SetTrainingTimeInSeconds(50); + + var result = await experiment.RunAsync(); + result.Metric.Should().BeGreaterThan(0.5); + } } class DummyTrialRunner : ITrialRunner @@ -426,4 +517,26 @@ public void Stop() _timer = null; } } + + class ModelInput + { + [LoadColumn(0), NoColumn] + public string _data0 { get; set; } + + [LoadColumn(1), NoColumn] + public float ignoreData1 { get; set; } + + [LoadColumn(2, 4205)] + public float[] _data { get; set; } + + [LoadColumn(4206), NoColumn]//(4206,4208)] + public float _ignoreData4206 { get; set; } + [LoadColumn(4207), NoColumn]//(4206,4208)] + public float _ignoreData4207 { get; set; } + [LoadColumn(4208), NoColumn]//(4206,4208)] + public float _ignoreData4208 { get; set; } + + [LoadColumn(4209), ColumnName("Entry(Text)")] + public string _label { get; set; } + } } From b50a10e72001eba6c86ea132626eebf9e24a1726 Mon Sep 17 00:00:00 2001 From: Xiaoyun Zhang Date: Thu, 8 Sep 2022 15:45:19 -0700 Subject: [PATCH 2/4] Update AutoMLExperimentTests.cs --- test/Microsoft.ML.AutoML.Tests/AutoMLExperimentTests.cs | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/Microsoft.ML.AutoML.Tests/AutoMLExperimentTests.cs b/test/Microsoft.ML.AutoML.Tests/AutoMLExperimentTests.cs index b498f510de..46f79326d2 100644 --- a/test/Microsoft.ML.AutoML.Tests/AutoMLExperimentTests.cs +++ b/test/Microsoft.ML.AutoML.Tests/AutoMLExperimentTests.cs @@ -331,7 +331,6 @@ public async Task AutoMLExperiment_Taxi_Fare_CV_5_Test() result.Metric.Should().BeGreaterThan(0.5); } - [Fact] public async Task Generate_300GB_csv() { var rnd = new Random(); @@ -391,7 +390,6 @@ ModelInput GenerateRandomRow() } } - [Fact] public async Task Large_csv_test() { var context = new MLContext(1); From e6015896cefc5f94376923568e2478eee6740d56 Mon Sep 17 00:00:00 2001 From: XiaoYun Zhang Date: Wed, 10 May 2023 11:30:40 -0700 Subject: [PATCH 3/4] checkout test file --- .../Microsoft.ML.AutoML.Samples/Program.cs | 46 +--- .../AutoMLExperimentTests.cs | 
219 +++++++----------- 2 files changed, 88 insertions(+), 177 deletions(-) diff --git a/docs/samples/Microsoft.ML.AutoML.Samples/Program.cs b/docs/samples/Microsoft.ML.AutoML.Samples/Program.cs index bf38cbf134..8de189de4a 100644 --- a/docs/samples/Microsoft.ML.AutoML.Samples/Program.cs +++ b/docs/samples/Microsoft.ML.AutoML.Samples/Program.cs @@ -1,5 +1,4 @@ using System; -using Microsoft.ML.Data; namespace Microsoft.ML.AutoML.Samples { @@ -7,29 +6,10 @@ public class Program { public static void Main(string[] args) { - var context = new MLContext(1); - context.Log += (o, e) => - { - if (e.Source.StartsWith("AutoMLExperiment")) - { - Console.WriteLine(e.RawMessage); - } - }; - var trainPath = @"D:/large_csv.csv"; - var dataset = context.Data.LoadFromTextFile(trainPath, ',', hasHeader: false); - var experiment = context.Auto().CreateExperiment(); - var label = "Entry(Text)"; - var pipeline = context.Transforms.Conversion.MapValueToKey(label, label) - .Append(context.Auto().MultiClassification(label, featureColumnName: "_data", useFastTree: true, useFastForest: false, useLgbm: false, fastTreeOption: new CodeGen.FastTreeOption { DiskTranspose = true, })); - - experiment.SetDataset(context.Data.TrainTestSplit(dataset)) - .SetMulticlassClassificationMetric(MulticlassClassificationMetric.MacroAccuracy, label) - .SetPipeline(pipeline) - .SetTrainingTimeInSeconds(10000); - - experiment.Run(); try { + AutoMLExperiment.RunAsync().Wait(); + RecommendationExperiment.Run(); Console.Clear(); @@ -58,26 +38,4 @@ public static void Main(string[] args) Console.ReadLine(); } } - - class ModelInput - { - [LoadColumn(0), NoColumn] - public string _data0 { get; set; } - - [LoadColumn(1), NoColumn] - public float ignoreData1 { get; set; } - - [LoadColumn(2, 4205)] - public float[] _data { get; set; } - - [LoadColumn(4206), NoColumn]//(4206,4208)] - public float _ignoreData4206 { get; set; } - [LoadColumn(4207), NoColumn]//(4206,4208)] - public float _ignoreData4207 { get; set; } - 
[LoadColumn(4208), NoColumn]//(4206,4208)] - public float _ignoreData4208 { get; set; } - - [LoadColumn(4209), ColumnName("Entry(Text)")] - public string _label { get; set; } - } } diff --git a/test/Microsoft.ML.AutoML.Tests/AutoMLExperimentTests.cs b/test/Microsoft.ML.AutoML.Tests/AutoMLExperimentTests.cs index 48b3392da5..1820a9447f 100644 --- a/test/Microsoft.ML.AutoML.Tests/AutoMLExperimentTests.cs +++ b/test/Microsoft.ML.AutoML.Tests/AutoMLExperimentTests.cs @@ -13,12 +13,10 @@ using FluentAssertions; using Microsoft.Data.Analysis; using Microsoft.Extensions.DependencyInjection; -using Microsoft.ML.Data; using Microsoft.ML.AutoML.CodeGen; using Microsoft.ML.Runtime; using Microsoft.ML.TestFramework; using Microsoft.ML.TestFramework.Attributes; -using Tensorflow; using Xunit; using Xunit.Abstractions; @@ -75,7 +73,7 @@ public async Task AutoMLExperiment_cancel_trial_when_exceeds_memory_limit_Async( // the following experiment set memory usage limit to 0.01mb // so all trials should be canceled and there should be no successful trials. // therefore when experiment finishes, it should throw timeout exception with no model trained message. 
- experiment.SetTrainingTimeInSeconds(10) + experiment.SetMaxModelToExplore(10) .SetTrialRunner((serviceProvider) => { var channel = serviceProvider.GetService(); @@ -83,8 +81,7 @@ public async Task AutoMLExperiment_cancel_trial_when_exceeds_memory_limit_Async( return new DummyTrialRunner(settings, 5, channel); }) .SetTuner() - .SetMaximumMemoryUsageInMegaByte(0.01) - .SetPerformanceMonitor(); + .SetMaximumMemoryUsageInMegaByte(0.01); var runExperimentAction = async () => await experiment.RunAsync(); await runExperimentAction.Should().ThrowExactlyAsync(); @@ -154,6 +151,13 @@ public async Task AutoMLExperiment_return_current_best_trial_when_ct_is_canceled public async Task AutoMLExperiment_finish_training_when_time_is_up_Async() { var context = new MLContext(1); + context.Log += (o, e) => + { + if (e.Source.StartsWith("AutoMLExperiment")) + { + this.Output.WriteLine(e.RawMessage); + } + }; var experiment = context.Auto().CreateExperiment(); experiment.SetTrainingTimeInSeconds(5) @@ -161,7 +165,7 @@ public async Task AutoMLExperiment_finish_training_when_time_is_up_Async() { var channel = serviceProvider.GetService(); var settings = serviceProvider.GetService(); - return new DummyTrialRunner(settings, 1, channel); + return new DummyTrialRunner(settings, 0, channel); }) .SetTuner(); @@ -173,6 +177,32 @@ public async Task AutoMLExperiment_finish_training_when_time_is_up_Async() cts.IsCancellationRequested.Should().BeFalse(); } + [Fact] + public async Task AutoMLExperiment_finish_training_when_reach_to_max_model_async() + { + var context = new MLContext(1); + var experiment = context.Auto().CreateExperiment(); + experiment.SetMaxModelToExplore(5) + .SetTrialRunner((serviceProvider) => + { + var channel = serviceProvider.GetService(); + var settings = serviceProvider.GetService(); + return new DummyTrialRunner(settings, 1, channel); + }) + .SetTuner(); + + var runModelCounts = 0; + context.Log += (o, e) => + { + if (e.RawMessage.Contains("Update Completed Trial")) + { + 
runModelCounts++; + } + }; + await experiment.RunAsync(); + runModelCounts.Should().Be(5); + } + [Fact] public async Task AutoMLExperiment_UCI_Adult_Train_Test_Split_Test() @@ -188,12 +218,12 @@ public async Task AutoMLExperiment_UCI_Adult_Train_Test_Split_Test() var data = DatasetUtil.GetUciAdultDataView(); var experiment = context.Auto().CreateExperiment(); var pipeline = context.Auto().Featurizer(data, "_Features_", excludeColumns: new[] { DatasetUtil.UciAdultLabel }) - .Append(context.Auto().BinaryClassification(DatasetUtil.UciAdultLabel, "_Features_", useLgbm: false, useSdca: false, useLbfgs: false)); + .Append(context.Auto().BinaryClassification(DatasetUtil.UciAdultLabel, "_Features_", useLgbm: false, useSdcaLogisticRegression: false, useLbfgsLogisticRegression: false)); experiment.SetDataset(context.Data.TrainTestSplit(data)) .SetBinaryClassificationMetric(BinaryClassificationMetric.AreaUnderRocCurve, DatasetUtil.UciAdultLabel) .SetPipeline(pipeline) - .SetTrainingTimeInSeconds(1); + .SetMaxModelToExplore(1); var result = await experiment.RunAsync(); result.Metric.Should().BeGreaterThan(0.8); @@ -213,12 +243,12 @@ public async Task AutoMLExperiment_UCI_Adult_CV_5_Test() var data = DatasetUtil.GetUciAdultDataView(); var experiment = context.Auto().CreateExperiment(); var pipeline = context.Auto().Featurizer(data, "_Features_", excludeColumns: new[] { DatasetUtil.UciAdultLabel }) - .Append(context.Auto().BinaryClassification(DatasetUtil.UciAdultLabel, "_Features_", useLgbm: false, useSdca: false, useLbfgs: false)); + .Append(context.Auto().BinaryClassification(DatasetUtil.UciAdultLabel, "_Features_", useLgbm: false, useSdcaLogisticRegression: false, useLbfgsLogisticRegression: false)); experiment.SetDataset(data, 5) .SetBinaryClassificationMetric(BinaryClassificationMetric.AreaUnderRocCurve, DatasetUtil.UciAdultLabel) .SetPipeline(pipeline) - .SetTrainingTimeInSeconds(10); + .SetMaxModelToExplore(1); var result = await experiment.RunAsync(); 
result.Metric.Should().BeGreaterThan(0.8); @@ -230,7 +260,7 @@ public async Task AutoMLExperiment_Iris_CV_5_Test() var context = new MLContext(1); context.Log += (o, e) => { - if (e.Source.StartsWith("AutoMLExperiment")) + if (e.RawMessage.Contains("Trial")) { this.Output.WriteLine(e.RawMessage); } @@ -240,12 +270,12 @@ public async Task AutoMLExperiment_Iris_CV_5_Test() var label = "Label"; var pipeline = context.Auto().Featurizer(data, excludeColumns: new[] { label }) .Append(context.Transforms.Conversion.MapValueToKey(label, label)) - .Append(context.Auto().MultiClassification(label, useLgbm: false, useSdca: false, useLbfgs: false)); + .Append(context.Auto().MultiClassification(label, useLgbm: false, useSdcaMaximumEntrophy: false, useLbfgsMaximumEntrophy: false)); experiment.SetDataset(data, 5) .SetMulticlassClassificationMetric(MulticlassClassificationMetric.MacroAccuracy, label) .SetPipeline(pipeline) - .SetTrainingTimeInSeconds(10); + .SetMaxModelToExplore(1); var result = await experiment.RunAsync(); result.Metric.Should().BeGreaterThan(0.8); @@ -267,12 +297,12 @@ public async Task AutoMLExperiment_Iris_Train_Test_Split_Test() var label = "Label"; var pipeline = context.Auto().Featurizer(data, excludeColumns: new[] { label }) .Append(context.Transforms.Conversion.MapValueToKey(label, label)) - .Append(context.Auto().MultiClassification(label, useLgbm: false, useSdca: false, useLbfgs: false)); + .Append(context.Auto().MultiClassification(label, useLgbm: false, useSdcaMaximumEntrophy: false, useLbfgsMaximumEntrophy: false)); experiment.SetDataset(context.Data.TrainTestSplit(data)) .SetMulticlassClassificationMetric(MulticlassClassificationMetric.MacroAccuracy, label) .SetPipeline(pipeline) - .SetTrainingTimeInSeconds(10); + .SetMaxModelToExplore(1); var result = await experiment.RunAsync(); result.Metric.Should().BeGreaterThan(0.8); @@ -294,12 +324,12 @@ public async Task AutoMLExperiment_Taxi_Fare_Train_Test_Split_Test() var experiment = 
context.Auto().CreateExperiment(); var label = DatasetUtil.TaxiFareLabel; var pipeline = context.Auto().Featurizer(train, excludeColumns: new[] { label }) - .Append(context.Auto().Regression(label, useLgbm: false, useSdca: false, useLbfgs: false)); + .Append(context.Auto().Regression(label, useLgbm: false, useSdca: false, useLbfgsPoissonRegression: false)); experiment.SetDataset(train, test) .SetRegressionMetric(RegressionMetric.RSquared, label) .SetPipeline(pipeline) - .SetTrainingTimeInSeconds(50); + .SetMaxModelToExplore(1); var result = await experiment.RunAsync(); result.Metric.Should().BeGreaterThan(0.5); @@ -309,125 +339,64 @@ public async Task AutoMLExperiment_Taxi_Fare_Train_Test_Split_Test() public async Task AutoMLExperiment_Taxi_Fare_CV_5_Test() { var context = new MLContext(1); - context.Log += (o, e) => - { - if (e.Source.StartsWith("AutoMLExperiment")) - { - this.Output.WriteLine(e.RawMessage); - } - }; var train = DatasetUtil.GetTaxiFareTrainDataView(); var experiment = context.Auto().CreateExperiment(); var label = DatasetUtil.TaxiFareLabel; var pipeline = context.Auto().Featurizer(train, excludeColumns: new[] { label }) - .Append(context.Auto().Regression(label, useLgbm: false, useSdca: false, useLbfgs: false)); + .Append(context.Auto().Regression(label, useLgbm: false, useSdca: false, useLbfgsPoissonRegression: false)); experiment.SetDataset(train, 5) .SetRegressionMetric(RegressionMetric.RSquared, label) .SetPipeline(pipeline) - .SetTrainingTimeInSeconds(50); + .SetMaxModelToExplore(1); var result = await experiment.RunAsync(); result.Metric.Should().BeGreaterThan(0.5); } - public async Task Generate_300GB_csv() + [Fact] + public async Task AutoMLExperiment_Taxi_Fare_CV_5_SamplingKey_Test() { - var rnd = new Random(); - ModelInput GenerateRandomRow() - { - return new ModelInput - { - _data0 = rnd.NextSingle() > 0.5 ? 
"a" : "b", - ignoreData1 = rnd.NextSingle(), - _data = Enumerable.Repeat(rnd.NextSingle(), 4204).ToArray(), - _ignoreData4206 = rnd.NextSingle(), - _ignoreData4207 = rnd.NextSingle(), - _ignoreData4208 = rnd.NextSingle(), - _label = rnd.NextSingle() > 0.5 ? "True" : "False", - }; - } + var context = new MLContext(1); + var train = DatasetUtil.GetTaxiFareTrainDataView(); + var experiment = context.Auto().CreateExperiment(); + var label = DatasetUtil.TaxiFareLabel; + var pipeline = context.Auto().Featurizer(train, excludeColumns: new[] { label }) + .Append(context.Auto().Regression(label, useLgbm: false, useSdca: false, useLbfgsPoissonRegression: false)); - var filePath = @"D:/large_csv.csv"; - var fileInfo = new FileInfo(filePath); + experiment.SetDataset(train, 5, "vendor_id") + .SetRegressionMetric(RegressionMetric.RSquared, label) + .SetPipeline(pipeline) + .SetMaxModelToExplore(1); - using (var fileStream = fileInfo.Open(FileMode.Append)) - using (var stream = new StreamWriter(fileStream)) - { - var i = 0; - while ((!fileInfo.Exists || fileInfo.Length < 300.0 * 1024 * 1024 * 1024) && i < 100) - { - fileInfo.Refresh(); - Output.WriteLine($"{fileInfo.Length / (1024 * 1024 * 1024 * 1.0)}"); - var taskNum = 10; - var taskPool = new Task[taskNum]; - for (int _i = 0; _i != taskNum; ++_i) - { - var t = Task.Factory.StartNew(() => - { - var sb = new StringBuilder(); - var rows = Enumerable.Range(0, 10000).Select(i => GenerateRandomRow()); - foreach (var row in rows) - { - var line = $"\"{row._data0}\",{row.ignoreData1.ToString("F2")},{string.Join(",", row._data.Select(d => d.ToString("F2")))}, {row._ignoreData4206.ToString("F2")}, {row._ignoreData4207.ToString("F2")}, {row._ignoreData4208.ToString("F2")},\"{row._label}\""; - sb.AppendLine(line); - } - - return sb.ToString(); - }); - taskPool[_i] = t; - } - - var getRandomRow = await Task.WhenAll(taskPool); - foreach (var row in getRandomRow) - { - await stream.WriteAsync(row); - } - - await stream.FlushAsync(); - i++; - 
} - } + var result = await experiment.RunAsync(); + result.Metric.Should().BeGreaterThan(0.2); + result.Metric.Should().BeLessThan(0.5); } - public async Task Large_csv_test() + [Fact] + public void AutoMLExperiment_should_use_seed_from_context_if_provided() { - var context = new MLContext(1); - context.Log += (o, e) => - { - if (e.Source.StartsWith("AutoMLExperiment")) - { - this.Output.WriteLine(e.RawMessage); - } - }; - var trainPath = @"D:/large_csv.csv"; - var dataset = context.Data.LoadFromTextFile(trainPath, ',', hasHeader: false); + var context = new MLContext(); var experiment = context.Auto().CreateExperiment(); - var label = "Entry(Text)"; - var pipeline = context.Auto().Featurizer(dataset, excludeColumns: new[] { label }) - .Append(context.Transforms.Conversion.MapValueToKey(label, label)) - .Append(context.Auto().MultiClassification(label)); + var settings = experiment.ServiceCollection.BuildServiceProvider().GetRequiredService(); + settings.Seed.Should().BeNull(); - experiment.SetDataset(context.Data.TrainTestSplit(dataset)) - .SetMulticlassClassificationMetric(MulticlassClassificationMetric.MacroAccuracy, label) - .SetPipeline(pipeline) - .SetTrainingTimeInSeconds(50); - - var result = await experiment.RunAsync(); - result.Metric.Should().BeGreaterThan(0.5); + context = new MLContext(1); + experiment = context.Auto().CreateExperiment(); + settings = experiment.ServiceCollection.BuildServiceProvider().GetRequiredService(); + settings.Seed.Should().Be(1); } } class DummyTrialRunner : ITrialRunner { private readonly int _finishAfterNSeconds; - private readonly CancellationToken _ct; private readonly IChannel _logger; public DummyTrialRunner(AutoMLExperiment.AutoMLExperimentSettings automlSettings, int finishAfterNSeconds, IChannel logger) { _finishAfterNSeconds = finishAfterNSeconds; - _ct = automlSettings.CancellationToken; _logger = logger; } @@ -439,7 +408,7 @@ public async Task RunAsync(TrialSettings settings, CancellationToke { _logger.Info("Update 
Running Trial"); await Task.Delay(_finishAfterNSeconds * 1000, ct); - _ct.ThrowIfCancellationRequested(); + ct.ThrowIfCancellationRequested(); _logger.Info("Update Completed Trial"); var metric = 1.000 + 0.01 * settings.TrialId; return new TrialResult @@ -462,9 +431,7 @@ public DummyPeformanceMonitor() _checkIntervalInMilliseconds = 1000; } - public event EventHandler CpuUsage; - - public event EventHandler MemoryUsageInMegaByte; + public event EventHandler PerformanceMetricsUpdated; public void Dispose() { @@ -480,6 +447,10 @@ public void Dispose() return 1000; } + public void OnPerformanceMetricsUpdatedHandler(TrialSettings trialSettings, TrialPerformanceMetrics metrics, CancellationTokenSource trialCancellationTokenSource) + { + } + public void Start() { if (_timer == null) @@ -487,13 +458,17 @@ public void Start() _timer = new System.Timers.Timer(_checkIntervalInMilliseconds); _timer.Elapsed += (o, e) => { - CpuUsage?.Invoke(this, 100); - MemoryUsageInMegaByte?.Invoke(this, 1000); + PerformanceMetricsUpdated?.Invoke(this, new TrialPerformanceMetrics() { PeakCpuUsage = 100, PeakMemoryUsage = 1000 }); }; _timer.AutoReset = true; - _timer.Enabled = true; } + _timer.Enabled = true; + } + + public void Pause() + { + _timer.Enabled = false; } public void Stop() @@ -503,26 +478,4 @@ public void Stop() _timer = null; } } - - class ModelInput - { - [LoadColumn(0), NoColumn] - public string _data0 { get; set; } - - [LoadColumn(1), NoColumn] - public float ignoreData1 { get; set; } - - [LoadColumn(2, 4205)] - public float[] _data { get; set; } - - [LoadColumn(4206), NoColumn]//(4206,4208)] - public float _ignoreData4206 { get; set; } - [LoadColumn(4207), NoColumn]//(4206,4208)] - public float _ignoreData4207 { get; set; } - [LoadColumn(4208), NoColumn]//(4206,4208)] - public float _ignoreData4208 { get; set; } - - [LoadColumn(4209), ColumnName("Entry(Text)")] - public string _label { get; set; } - } } From abad9ef9492977818c85f1fb08db71616423e706 Mon Sep 17 00:00:00 2001 
From: XiaoYun Zhang Date: Wed, 10 May 2023 13:24:06 -0700 Subject: [PATCH 4/4] fix tests --- ...EstimatorPipeline_search_space_init_value_test.approved.txt | 3 ++- ...ablePipelineFromIEstimatorAndBinaryClassifiers.approved.txt | 3 ++- ...pablePipelineFromIEstimatorAndMultiClassifiers.approved.txt | 3 ++- ...teSweepablePipelineFromIEstimatorAndRegressors.approved.txt | 3 ++- ...pelineFromIEstimatorAndSweepableEstimatorArray.approved.txt | 3 ++- ...elineFromSweepableEstimatorAndMultiClassifiers.approved.txt | 3 ++- ...omSweepableEstimatorAndSweepableEstimatorArray.approved.txt | 3 ++- ...pelineFromSweepablePipelineAndMultiClassifiers.approved.txt | 3 ++- ...romSweepablePipelineAndSweepableEstimatorArray.approved.txt | 3 ++- 9 files changed, 18 insertions(+), 9 deletions(-) diff --git a/test/Microsoft.ML.AutoML.Tests/ApprovalTests/SweepableEstimatorPipelineTest.SweepableEstimatorPipeline_search_space_init_value_test.approved.txt b/test/Microsoft.ML.AutoML.Tests/ApprovalTests/SweepableEstimatorPipelineTest.SweepableEstimatorPipeline_search_space_init_value_test.approved.txt index 6ff945b59f..1c38c7ae66 100644 --- a/test/Microsoft.ML.AutoML.Tests/ApprovalTests/SweepableEstimatorPipelineTest.SweepableEstimatorPipeline_search_space_init_value_test.approved.txt +++ b/test/Microsoft.ML.AutoML.Tests/ApprovalTests/SweepableEstimatorPipelineTest.SweepableEstimatorPipeline_search_space_init_value_test.approved.txt @@ -23,6 +23,7 @@ "FeatureFraction": 1, "LearningRate": 0.10, "LabelColumnName": "Label", - "FeatureColumnName": "Feature" + "FeatureColumnName": "Feature", + "DiskTranspose": false } } \ No newline at end of file diff --git a/test/Microsoft.ML.AutoML.Tests/ApprovalTests/SweepableExtensionTest.CreateSweepablePipelineFromIEstimatorAndBinaryClassifiers.approved.txt b/test/Microsoft.ML.AutoML.Tests/ApprovalTests/SweepableExtensionTest.CreateSweepablePipelineFromIEstimatorAndBinaryClassifiers.approved.txt index 58efe42258..0e5fc3a63f 100644 --- 
a/test/Microsoft.ML.AutoML.Tests/ApprovalTests/SweepableExtensionTest.CreateSweepablePipelineFromIEstimatorAndBinaryClassifiers.approved.txt +++ b/test/Microsoft.ML.AutoML.Tests/ApprovalTests/SweepableExtensionTest.CreateSweepablePipelineFromIEstimatorAndBinaryClassifiers.approved.txt @@ -16,7 +16,8 @@ "FeatureFraction": 1, "LearningRate": 0.1, "LabelColumnName": "Label", - "FeatureColumnName": "Features" + "FeatureColumnName": "Features", + "DiskTranspose": false } }, "e2": { diff --git a/test/Microsoft.ML.AutoML.Tests/ApprovalTests/SweepableExtensionTest.CreateSweepablePipelineFromIEstimatorAndMultiClassifiers.approved.txt b/test/Microsoft.ML.AutoML.Tests/ApprovalTests/SweepableExtensionTest.CreateSweepablePipelineFromIEstimatorAndMultiClassifiers.approved.txt index b02699087d..688ecf3f5e 100644 --- a/test/Microsoft.ML.AutoML.Tests/ApprovalTests/SweepableExtensionTest.CreateSweepablePipelineFromIEstimatorAndMultiClassifiers.approved.txt +++ b/test/Microsoft.ML.AutoML.Tests/ApprovalTests/SweepableExtensionTest.CreateSweepablePipelineFromIEstimatorAndMultiClassifiers.approved.txt @@ -16,7 +16,8 @@ "FeatureFraction": 1, "LearningRate": 0.1, "LabelColumnName": "Label", - "FeatureColumnName": "Features" + "FeatureColumnName": "Features", + "DiskTranspose": false } }, "e2": { diff --git a/test/Microsoft.ML.AutoML.Tests/ApprovalTests/SweepableExtensionTest.CreateSweepablePipelineFromIEstimatorAndRegressors.approved.txt b/test/Microsoft.ML.AutoML.Tests/ApprovalTests/SweepableExtensionTest.CreateSweepablePipelineFromIEstimatorAndRegressors.approved.txt index b02699087d..688ecf3f5e 100644 --- a/test/Microsoft.ML.AutoML.Tests/ApprovalTests/SweepableExtensionTest.CreateSweepablePipelineFromIEstimatorAndRegressors.approved.txt +++ b/test/Microsoft.ML.AutoML.Tests/ApprovalTests/SweepableExtensionTest.CreateSweepablePipelineFromIEstimatorAndRegressors.approved.txt @@ -16,7 +16,8 @@ "FeatureFraction": 1, "LearningRate": 0.1, "LabelColumnName": "Label", - "FeatureColumnName": 
"Features" + "FeatureColumnName": "Features", + "DiskTranspose": false } }, "e2": { diff --git a/test/Microsoft.ML.AutoML.Tests/ApprovalTests/SweepableExtensionTest.CreateSweepablePipelineFromIEstimatorAndSweepableEstimatorArray.approved.txt b/test/Microsoft.ML.AutoML.Tests/ApprovalTests/SweepableExtensionTest.CreateSweepablePipelineFromIEstimatorAndSweepableEstimatorArray.approved.txt index b02699087d..688ecf3f5e 100644 --- a/test/Microsoft.ML.AutoML.Tests/ApprovalTests/SweepableExtensionTest.CreateSweepablePipelineFromIEstimatorAndSweepableEstimatorArray.approved.txt +++ b/test/Microsoft.ML.AutoML.Tests/ApprovalTests/SweepableExtensionTest.CreateSweepablePipelineFromIEstimatorAndSweepableEstimatorArray.approved.txt @@ -16,7 +16,8 @@ "FeatureFraction": 1, "LearningRate": 0.1, "LabelColumnName": "Label", - "FeatureColumnName": "Features" + "FeatureColumnName": "Features", + "DiskTranspose": false } }, "e2": { diff --git a/test/Microsoft.ML.AutoML.Tests/ApprovalTests/SweepableExtensionTest.CreateSweepablePipelineFromSweepableEstimatorAndMultiClassifiers.approved.txt b/test/Microsoft.ML.AutoML.Tests/ApprovalTests/SweepableExtensionTest.CreateSweepablePipelineFromSweepableEstimatorAndMultiClassifiers.approved.txt index 258dc76181..d8c41123ef 100644 --- a/test/Microsoft.ML.AutoML.Tests/ApprovalTests/SweepableExtensionTest.CreateSweepablePipelineFromSweepableEstimatorAndMultiClassifiers.approved.txt +++ b/test/Microsoft.ML.AutoML.Tests/ApprovalTests/SweepableExtensionTest.CreateSweepablePipelineFromSweepableEstimatorAndMultiClassifiers.approved.txt @@ -22,7 +22,8 @@ "FeatureFraction": 1, "LearningRate": 0.1, "LabelColumnName": "Label", - "FeatureColumnName": "Features" + "FeatureColumnName": "Features", + "DiskTranspose": false } }, "e2": { diff --git a/test/Microsoft.ML.AutoML.Tests/ApprovalTests/SweepableExtensionTest.CreateSweepablePipelineFromSweepableEstimatorAndSweepableEstimatorArray.approved.txt 
b/test/Microsoft.ML.AutoML.Tests/ApprovalTests/SweepableExtensionTest.CreateSweepablePipelineFromSweepableEstimatorAndSweepableEstimatorArray.approved.txt index 3f6be2bbb3..574da3184f 100644 --- a/test/Microsoft.ML.AutoML.Tests/ApprovalTests/SweepableExtensionTest.CreateSweepablePipelineFromSweepableEstimatorAndSweepableEstimatorArray.approved.txt +++ b/test/Microsoft.ML.AutoML.Tests/ApprovalTests/SweepableExtensionTest.CreateSweepablePipelineFromSweepableEstimatorAndSweepableEstimatorArray.approved.txt @@ -16,7 +16,8 @@ "FeatureFraction": 1, "LearningRate": 0.1, "LabelColumnName": "Label", - "FeatureColumnName": "Features" + "FeatureColumnName": "Features", + "DiskTranspose": false } }, "e2": { diff --git a/test/Microsoft.ML.AutoML.Tests/ApprovalTests/SweepableExtensionTest.CreateSweepablePipelineFromSweepablePipelineAndMultiClassifiers.approved.txt b/test/Microsoft.ML.AutoML.Tests/ApprovalTests/SweepableExtensionTest.CreateSweepablePipelineFromSweepablePipelineAndMultiClassifiers.approved.txt index 5ea3e5977e..312c7424f5 100644 --- a/test/Microsoft.ML.AutoML.Tests/ApprovalTests/SweepableExtensionTest.CreateSweepablePipelineFromSweepablePipelineAndMultiClassifiers.approved.txt +++ b/test/Microsoft.ML.AutoML.Tests/ApprovalTests/SweepableExtensionTest.CreateSweepablePipelineFromSweepablePipelineAndMultiClassifiers.approved.txt @@ -20,7 +20,8 @@ "FeatureFraction": 1, "LearningRate": 0.1, "LabelColumnName": "Label", - "FeatureColumnName": "Features" + "FeatureColumnName": "Features", + "DiskTranspose": false } }, "e3": { diff --git a/test/Microsoft.ML.AutoML.Tests/ApprovalTests/SweepableExtensionTest.CreateSweepablePipelineFromSweepablePipelineAndSweepableEstimatorArray.approved.txt b/test/Microsoft.ML.AutoML.Tests/ApprovalTests/SweepableExtensionTest.CreateSweepablePipelineFromSweepablePipelineAndSweepableEstimatorArray.approved.txt index 5ea3e5977e..312c7424f5 100644 --- 
a/test/Microsoft.ML.AutoML.Tests/ApprovalTests/SweepableExtensionTest.CreateSweepablePipelineFromSweepablePipelineAndSweepableEstimatorArray.approved.txt +++ b/test/Microsoft.ML.AutoML.Tests/ApprovalTests/SweepableExtensionTest.CreateSweepablePipelineFromSweepablePipelineAndSweepableEstimatorArray.approved.txt @@ -20,7 +20,8 @@ "FeatureFraction": 1, "LearningRate": 0.1, "LabelColumnName": "Label", - "FeatureColumnName": "Features" + "FeatureColumnName": "Features", + "DiskTranspose": false } }, "e3": {