From d3ffb2a1335e68625feb55e3286c437e30765a90 Mon Sep 17 00:00:00 2001 From: J W Date: Sun, 19 Jul 2020 12:13:53 -0400 Subject: [PATCH 1/4] Update to only get result if awaiter has completed --- src/Microsoft.ML.Data/Transforms/RowShufflingTransformer.cs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/Microsoft.ML.Data/Transforms/RowShufflingTransformer.cs b/src/Microsoft.ML.Data/Transforms/RowShufflingTransformer.cs index 1dc52d81f5..da50c9c409 100644 --- a/src/Microsoft.ML.Data/Transforms/RowShufflingTransformer.cs +++ b/src/Microsoft.ML.Data/Transforms/RowShufflingTransformer.cs @@ -634,7 +634,9 @@ protected override bool MoveNextCore() while (_liveCount < _poolRows && !_doneConsuming) { // We are under capacity. Try to get some more. - while (_toConsumeChannel.Reader.WaitToReadAsync().GetAwaiter().GetResult()) + var waitToReadAwaiter = _toConsumeChannel.Reader.WaitToReadAsync().GetAwaiter(); + + while (waitToReadAwaiter.IsCompleted && waitToReadAwaiter.GetResult()) { var hasReadItem = _toConsumeChannel.Reader.TryRead(out int got); if (hasReadItem) From e131724d9c1dc37f82bfe82b4c5109f8b9234c65 Mon Sep 17 00:00:00 2001 From: J W Date: Mon, 27 Jul 2020 07:35:48 -0400 Subject: [PATCH 2/4] Add thread sleep --- .../Transforms/RowShufflingTransformer.cs | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/Microsoft.ML.Data/Transforms/RowShufflingTransformer.cs b/src/Microsoft.ML.Data/Transforms/RowShufflingTransformer.cs index da50c9c409..fd529208c0 100644 --- a/src/Microsoft.ML.Data/Transforms/RowShufflingTransformer.cs +++ b/src/Microsoft.ML.Data/Transforms/RowShufflingTransformer.cs @@ -634,25 +634,25 @@ protected override bool MoveNextCore() while (_liveCount < _poolRows && !_doneConsuming) { // We are under capacity. Try to get some more. - var waitToReadAwaiter = _toConsumeChannel.Reader.WaitToReadAsync().GetAwaiter(); - - while (waitToReadAwaiter.IsCompleted && waitToReadAwaiter.GetResult()) + var hasReadItem = _toConsumeChannel.Reader.TryRead(out int got); + if (hasReadItem) { - var hasReadItem = _toConsumeChannel.Reader.TryRead(out int got); - if (hasReadItem) + if (got == 0) { - if (got == 0) - { - // We've reached the end of the Channel. There's no reason - // to attempt further communication with the producer. - // Check whether something horrible happened. - if (_producerTaskException != null) - throw Ch.Except(_producerTaskException, "Shuffle input cursor reader failed with an exception"); - _doneConsuming = true; - break; - } - _liveCount += got; + // We've reached the end of the Channel. There's no reason + // to attempt further communication with the producer. + // Check whether something horrible happened. + if (_producerTaskException != null) + throw Ch.Except(_producerTaskException, "Shuffle input cursor reader failed with an exception"); + _doneConsuming = true; + break; } + _liveCount += got; + } + else + { + // Sleeping for one millisecond to stop the thread from spinning while waiting for the producer. + Thread.Sleep(1); } } if (_liveCount == 0) From 3ab5678d85e1fdad69307bee2bb69ec94b1f3a39 Mon Sep 17 00:00:00 2001 From: J W Date: Wed, 29 Jul 2020 06:07:40 -0400 Subject: [PATCH 3/4] Add regression scenario to make sure async fix works --- .../SamplesDatasetUtils.cs | 43 +++++++++++++++++++ .../Scenarios/RegressionTest.cs | 40 +++++++++++++++++ 2 files changed, 83 insertions(+) create mode 100644 test/Microsoft.ML.Tests/Scenarios/RegressionTest.cs diff --git a/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs b/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs index 51ce75b3af..2fba1b33b2 100644 --- a/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs +++ b/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs @@ -185,6 +185,49 @@ public static string DownloadTensorFlowSentimentModel() return path; } + public static string DownloadTaxiFareData() + { + string githubPath = "https://raw.githubusercontent.com/dotnet/machinelearning-samples/master/samples/csharp/getting-started/Regression_TaxiFarePrediction/TaxiFarePrediction/Data/taxi-fare-train.csv"; + string dataFile = "taxi-fare-train.csv"; + + Download(githubPath, dataFile); + + return dataFile; + } + + public static IDataView LoadTaxiFareDataset(MLContext context) + { + var dataFile = DownloadTaxiFareData(); + + var dataView = context.Data.LoadFromTextFile(dataFile, hasHeader: true, separatorChar: ','); + + return dataView; + } + + public class TaxiData + { + [LoadColumn(0)] + public string VendorId; + + [LoadColumn(1)] + public string RateCode; + + [LoadColumn(2)] + public float PassengerCount; + + [LoadColumn(3)] + public float TripTime; + + [LoadColumn(4)] + public float TripDistance; + + [LoadColumn(5)] + public string PaymentType; + + [LoadColumn(6)] + public float FareAmount; + } + private static string Download(string baseGitPath, string dataFile) { if (File.Exists(dataFile)) diff --git a/test/Microsoft.ML.Tests/Scenarios/RegressionTest.cs b/test/Microsoft.ML.Tests/Scenarios/RegressionTest.cs new file mode 100644 index 0000000000..e371432a54 --- /dev/null +++ b/test/Microsoft.ML.Tests/Scenarios/RegressionTest.cs @@ -0,0 +1,40 @@ +using Xunit; + +namespace Microsoft.ML.Scenarios +{ + public partial class ScenariosTests + { + [Fact] + public void TestRegressionScenario() + { + var context = new MLContext(); + + IDataView taxiData = Microsoft.ML.SamplesUtils.DatasetUtils.LoadTaxiFareDataset(context); + var splitData = context.Data.TrainTestSplit(taxiData, testFraction: 0.2); + + IDataView trainingDataView = context.Data.FilterRowsByColumn(splitData.TrainSet, "FareAmount", lowerBound: 1, upperBound: 150); + + var dataProcessPipeline = context.Transforms.CopyColumns(outputColumnName: "Label", inputColumnName: "FareAmount") + .Append(context.Transforms.Categorical.OneHotEncoding(outputColumnName: "VendorIdEncoded", inputColumnName: "VendorId")) + .Append(context.Transforms.Categorical.OneHotEncoding(outputColumnName: "RateCodeEncoded", inputColumnName: "RateCode")) + .Append(context.Transforms.Categorical.OneHotEncoding(outputColumnName: "PaymentTypeEncoded", inputColumnName: "PaymentType")) + .Append(context.Transforms.NormalizeMeanVariance(outputColumnName: "PassengerCount")) + .Append(context.Transforms.NormalizeMeanVariance(outputColumnName: "TripTime")) + .Append(context.Transforms.NormalizeMeanVariance(outputColumnName: "TripDistance")) + .Append(context.Transforms.Concatenate("Features", "VendorIdEncoded", "RateCodeEncoded", "PaymentTypeEncoded", "PassengerCount", + "TripTime", "TripDistance")); + + var trainer = context.Regression.Trainers.Sdca(labelColumnName: "Label", featureColumnName: "Features"); + var trainingPipeline = dataProcessPipeline.Append(trainer); + + var model = trainingPipeline.Fit(trainingDataView); + + var predictions = model.Transform(splitData.TestSet); + + var metrics = context.Regression.Evaluate(predictions); + + Assert.True(metrics.RSquared > .9); + Assert.True(metrics.RootMeanSquaredError > 2); + } + } +} From 302c4d335e414706691c00b06687620640114b80 Mon Sep 17 00:00:00 2001 From: J W Date: Wed, 29 Jul 2020 12:31:23 -0400 Subject: [PATCH 4/4] Update from feedback --- .../SamplesDatasetUtils.cs | 43 ------------------- .../Scenarios/RegressionTest.cs | 18 ++++++-- 2 files changed, 14 insertions(+), 47 deletions(-) diff --git a/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs b/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs index 2fba1b33b2..51ce75b3af 100644 --- a/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs +++ b/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs @@ -185,49 +185,6 @@ public static string DownloadTensorFlowSentimentModel() return path; } - public static string DownloadTaxiFareData() - { - string githubPath = "https://raw.githubusercontent.com/dotnet/machinelearning-samples/master/samples/csharp/getting-started/Regression_TaxiFarePrediction/TaxiFarePrediction/Data/taxi-fare-train.csv"; - string dataFile = "taxi-fare-train.csv"; - - Download(githubPath, dataFile); - - return dataFile; - } - - public static IDataView LoadTaxiFareDataset(MLContext context) - { - var dataFile = DownloadTaxiFareData(); - - var dataView = context.Data.LoadFromTextFile(dataFile, hasHeader: true, separatorChar: ','); - - return dataView; - } - - public class TaxiData - { - [LoadColumn(0)] - public string VendorId; - - [LoadColumn(1)] - public string RateCode; - - [LoadColumn(2)] - public float PassengerCount; - - [LoadColumn(3)] - public float TripTime; - - [LoadColumn(4)] - public float TripDistance; - - [LoadColumn(5)] - public string PaymentType; - - [LoadColumn(6)] - public float FareAmount; - } - private static string Download(string baseGitPath, string dataFile) { if (File.Exists(dataFile)) diff --git a/test/Microsoft.ML.Tests/Scenarios/RegressionTest.cs b/test/Microsoft.ML.Tests/Scenarios/RegressionTest.cs index e371432a54..c179a6f341 100644 --- a/test/Microsoft.ML.Tests/Scenarios/RegressionTest.cs +++ b/test/Microsoft.ML.Tests/Scenarios/RegressionTest.cs @@ -1,4 +1,9 @@ -using Xunit; +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using Microsoft.ML.Tests; +using Xunit; namespace Microsoft.ML.Scenarios { @@ -9,8 +14,13 @@ public void TestRegressionScenario() { var context = new MLContext(); - IDataView taxiData = Microsoft.ML.SamplesUtils.DatasetUtils.LoadTaxiFareDataset(context); - var splitData = context.Data.TrainTestSplit(taxiData, testFraction: 0.2); + string taxiDataPath = GetDataPath("taxi-fare-train.csv"); + + var taxiData = + context.Data.LoadFromTextFile(taxiDataPath, hasHeader: true, + separatorChar: ','); + + var splitData = context.Data.TrainTestSplit(taxiData, testFraction: 0.1); IDataView trainingDataView = context.Data.FilterRowsByColumn(splitData.TrainSet, "FareAmount", lowerBound: 1, upperBound: 150); @@ -33,7 +43,7 @@ public void TestRegressionScenario() var metrics = context.Regression.Evaluate(predictions); - Assert.True(metrics.RSquared > .9); + Assert.True(metrics.RSquared > .8); Assert.True(metrics.RootMeanSquaredError > 2); } }