Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 44 additions & 0 deletions docs/samples/Microsoft.ML.AutoML.Samples/Program.cs
Original file line number Diff line number Diff line change
@@ -1,11 +1,33 @@
using System;
using Microsoft.ML.Data;

namespace Microsoft.ML.AutoML.Samples
{
public class Program
{
public static void Main(string[] args)
{
var context = new MLContext(1);
context.Log += (o, e) =>
{
if (e.Source.StartsWith("AutoMLExperiment"))
{
Console.WriteLine(e.RawMessage);
}
};
var trainPath = @"D:/large_csv.csv";
var dataset = context.Data.LoadFromTextFile<ModelInput>(trainPath, ',', hasHeader: false);
var experiment = context.Auto().CreateExperiment();
var label = "Entry(Text)";
var pipeline = context.Transforms.Conversion.MapValueToKey(label, label)
.Append(context.Auto().MultiClassification(label, featureColumnName: "_data", useFastTree: true, useFastForest: false, useLgbm: false, fastTreeOption: new CodeGen.FastTreeOption { DiskTranspose = true, }));

experiment.SetDataset(context.Data.TrainTestSplit(dataset))
.SetMulticlassClassificationMetric(MulticlassClassificationMetric.MacroAccuracy, label)
.SetPipeline(pipeline)
.SetTrainingTimeInSeconds(10000);

experiment.Run();
try
{
RecommendationExperiment.Run();
Expand Down Expand Up @@ -36,4 +58,26 @@ public static void Main(string[] args)
Console.ReadLine();
}
}

/// <summary>
/// Row layout of the wide, header-less, comma-separated file that <c>Main</c> reads through
/// <c>LoadFromTextFile&lt;ModelInput&gt;</c>. Only <c>_data</c> (used as the feature column) and
/// the <c>Entry(Text)</c> label are referenced by the pipeline; the remaining members are
/// tagged <c>[NoColumn]</c>.
/// </summary>
class ModelInput
{
    // Text column 0. Tagged [NoColumn] - presumably kept out of the loaded schema.
    [LoadColumn(0)]
    [NoColumn]
    public string _data0 { get; set; }

    // Text column 1, likewise tagged [NoColumn].
    [LoadColumn(1)]
    [NoColumn]
    public float ignoreData1 { get; set; }

    // Text columns 2 through 4205, loaded together as one float vector.
    [LoadColumn(2, 4205)]
    public float[] _data { get; set; }

    // Text columns 4206, 4207 and 4208 are mapped one by one and each tagged [NoColumn].
    [LoadColumn(4206)]
    [NoColumn]
    public float _ignoreData4206 { get; set; }

    [LoadColumn(4207)]
    [NoColumn]
    public float _ignoreData4207 { get; set; }

    [LoadColumn(4208)]
    [NoColumn]
    public float _ignoreData4208 { get; set; }

    // Text column 4209, exposed under the column name "Entry(Text)" that Main uses as the label.
    [LoadColumn(4209)]
    [ColumnName("Entry(Text)")]
    public string _label { get; set; }
}
}
1 change: 1 addition & 0 deletions src/Microsoft.ML.AutoML/AutoMLExperiment/IMonitor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ public virtual void ReportCompletedTrial(TrialResult result)

/// <summary>
/// Reports a failed trial: traces the exception's message and stack trace when an exception
/// is supplied, then logs an info line with the trial id and the pipeline rendered for the
/// trial's parameter.
/// </summary>
/// <param name="settings">Settings of the failed trial; <c>TrialId</c> and <c>Parameter</c> are logged.</param>
/// <param name="exception">Cause of the failure. Optional; may be <c>null</c>.</param>
public virtual void ReportFailTrial(TrialSettings settings, Exception exception = null)
{
    // The parameter defaults to null, so dereferencing it unconditionally would make the
    // failure reporter itself throw NullReferenceException and hide the failed-trial log line.
    if (exception != null)
    {
        _logger.Trace(exception.Message + exception.StackTrace);
    }

    _logger.Info($"Update Failed Trial - Id: {settings.TrialId} - Pipeline: {_pipeline.ToString(settings.Parameter)}");
}

Expand Down
5 changes: 5 additions & 0 deletions src/Microsoft.ML.AutoML/CodeGen/fast_tree_search_space.json
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,11 @@
{
"name": "ExampleWeightColumnName",
"type": "string"
},
{
"name": "DiskTranspose",
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This option allows users to override DiskTranspose when necessary, which avoids out-of-memory (OOM) errors when the dataset is large.

"type": "boolean",
"default": false
}
]
}
3 changes: 2 additions & 1 deletion src/Microsoft.ML.AutoML/CodeGen/search-space-schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,8 @@
"NumberOfIterations",
"Quiet",
"OutputAsFloatArray",
"ModelFactory"
"ModelFactory",
"DiskTranspose"
]
},
"option_type": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ public override IEstimator<ITransformer> BuildFromOption(MLContext context, Fast
NumberOfThreads = AutoMlUtils.GetNumberOfThreadFromEnvrionment(),
MaximumBinCountPerFeature = param.MaximumBinCountPerFeature,
FeatureFraction = param.FeatureFraction,
DiskTranspose = param.DiskTranspose,
};

return context.MulticlassClassification.Trainers.OneVersusAll(context.BinaryClassification.Trainers.FastTree(option), labelColumnName: param.LabelColumnName);
Expand All @@ -43,6 +44,7 @@ public override IEstimator<ITransformer> BuildFromOption(MLContext context, Fast
ExampleWeightColumnName = param.ExampleWeightColumnName,
NumberOfThreads = AutoMlUtils.GetNumberOfThreadFromEnvrionment(),
MaximumBinCountPerFeature = param.MaximumBinCountPerFeature,
DiskTranspose = param.DiskTranspose,
FeatureFraction = param.FeatureFraction,
};

Expand All @@ -65,6 +67,7 @@ public override IEstimator<ITransformer> BuildFromOption(MLContext context, Fast
ExampleWeightColumnName = param.ExampleWeightColumnName,
NumberOfThreads = AutoMlUtils.GetNumberOfThreadFromEnvrionment(),
MaximumBinCountPerFeature = param.MaximumBinCountPerFeature,
DiskTranspose = param.DiskTranspose,
FeatureFraction = param.FeatureFraction,
};

Expand All @@ -87,6 +90,7 @@ public override IEstimator<ITransformer> BuildFromOption(MLContext context, Fast
ExampleWeightColumnName = param.ExampleWeightColumnName,
NumberOfThreads = AutoMlUtils.GetNumberOfThreadFromEnvrionment(),
MaximumBinCountPerFeature = param.MaximumBinCountPerFeature,
DiskTranspose = param.DiskTranspose,
FeatureFraction = param.FeatureFraction,
};

Expand Down
113 changes: 113 additions & 0 deletions test/Microsoft.ML.AutoML.Tests/AutoMLExperimentTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,20 @@
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using FluentAssertions;
using Microsoft.Data.Analysis;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.ML.Data;
using Microsoft.ML.AutoML.CodeGen;
using Microsoft.ML.Runtime;
using Microsoft.ML.TestFramework;
using Microsoft.ML.TestFramework.Attributes;
using Tensorflow;
using Xunit;
using Xunit.Abstractions;

Expand Down Expand Up @@ -327,6 +330,94 @@ public async Task AutoMLExperiment_Taxi_Fare_CV_5_Test()
var result = await experiment.RunAsync();
result.Metric.Should().BeGreaterThan(0.5);
}

// Generates the random CSV fixture that Large_csv_test loads from the same path.
// Appends to D:/large_csv.csv in batches of 10 tasks x 10,000 rows, stopping after 100 batches
// or once the file has reached 300 GiB, whichever comes first.
// NOTE(review): hard-coded drive path and very large output - presumably meant to be run by
// hand rather than as part of a normal test pass; confirm before enabling in CI.
[Fact]
public async Task Generate_300GB_csv()
{
    // Rows are produced concurrently by several tasks. A private Random instance is not
    // thread-safe, so use the shared, thread-safe generator instead.
    var rnd = Random.Shared;

    // Format numbers culture-invariantly: under a culture whose decimal separator is ','
    // ToString("F2") would emit extra commas and corrupt the comma-separated layout.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    ModelInput GenerateRandomRow()
    {
        return new ModelInput
        {
            _data0 = rnd.NextSingle() > 0.5 ? "a" : "b",
            ignoreData1 = rnd.NextSingle(),
            // Draw a fresh value for every feature slot. Enumerable.Repeat(rnd.NextSingle(), n)
            // evaluates its argument once and copies that single value n times.
            _data = Enumerable.Range(0, 4204).Select(_ => rnd.NextSingle()).ToArray(),
            _ignoreData4206 = rnd.NextSingle(),
            _ignoreData4207 = rnd.NextSingle(),
            _ignoreData4208 = rnd.NextSingle(),
            _label = rnd.NextSingle() > 0.5 ? "True" : "False",
        };
    }

    const double targetBytes = 300.0 * 1024 * 1024 * 1024;
    const int maxBatches = 100;
    const int taskNum = 10;
    const int rowsPerTask = 10000;

    var filePath = @"D:/large_csv.csv";
    var fileInfo = new FileInfo(filePath);

    using (var fileStream = fileInfo.Open(FileMode.Append))
    using (var stream = new StreamWriter(fileStream))
    {
        for (var batch = 0; batch < maxBatches; batch++)
        {
            // FileInfo caches file metadata; refresh before the size check so the decision is
            // based on what earlier batches wrote rather than on a stale snapshot.
            fileInfo.Refresh();
            if (fileInfo.Exists && fileInfo.Length >= targetBytes)
            {
                break;
            }

            // Current file size in GiB.
            Output.WriteLine($"{fileInfo.Length / (1024 * 1024 * 1024 * 1.0)}");

            // Build the text of each chunk in parallel; only the writes below are sequential.
            var taskPool = new Task<string>[taskNum];
            for (var t = 0; t < taskNum; t++)
            {
                taskPool[t] = Task.Run(() =>
                {
                    var sb = new StringBuilder();
                    for (var r = 0; r < rowsPerTask; r++)
                    {
                        var row = GenerateRandomRow();
                        var features = string.Join(",", row._data.Select(d => d.ToString("F2", invariant)));
                        var line = $"\"{row._data0}\",{row.ignoreData1.ToString("F2", invariant)},{features}, {row._ignoreData4206.ToString("F2", invariant)}, {row._ignoreData4207.ToString("F2", invariant)}, {row._ignoreData4208.ToString("F2", invariant)},\"{row._label}\"";
                        sb.AppendLine(line);
                    }

                    return sb.ToString();
                });
            }

            var chunks = await Task.WhenAll(taskPool);
            foreach (var chunk in chunks)
            {
                await stream.WriteAsync(chunk);
            }

            await stream.FlushAsync();
        }
    }
}

// Runs an AutoML multiclass experiment over the CSV at D:/large_csv.csv (the path that
// Generate_300GB_csv writes to) and asserts on the resulting MacroAccuracy metric.
// NOTE(review): Generate_300GB_csv draws labels at random, independent of the features, so
// the "> 0.5" assertion below may not hold reliably on that fixture - confirm intended data.
[Fact]
public async Task Large_csv_test()
{
    var context = new MLContext(1);

    // Forward only the AutoML experiment's log messages to the test output.
    context.Log += (o, e) =>
    {
        // The source name is an identifier, not linguistic text, so compare ordinally
        // instead of relying on the current culture.
        if (e.Source.StartsWith("AutoMLExperiment", StringComparison.Ordinal))
        {
            this.Output.WriteLine(e.RawMessage);
        }
    };

    var trainPath = @"D:/large_csv.csv";
    var dataset = context.Data.LoadFromTextFile<ModelInput>(trainPath, ',', hasHeader: false);
    var experiment = context.Auto().CreateExperiment();
    var label = "Entry(Text)";

    // Featurize every column except the label, map the label to a key, then let AutoML
    // pick the multiclass trainer.
    var pipeline = context.Auto().Featurizer(dataset, excludeColumns: new[] { label })
        .Append(context.Transforms.Conversion.MapValueToKey(label, label))
        .Append(context.Auto().MultiClassification(label));

    experiment.SetDataset(context.Data.TrainTestSplit(dataset))
        .SetMulticlassClassificationMetric(MulticlassClassificationMetric.MacroAccuracy, label)
        .SetPipeline(pipeline)
        .SetTrainingTimeInSeconds(50);

    var result = await experiment.RunAsync();
    result.Metric.Should().BeGreaterThan(0.5);
}
}

class DummyTrialRunner : ITrialRunner
Expand Down Expand Up @@ -426,4 +517,26 @@ public void Stop()
_timer = null;
}
}

/// <summary>
/// Row layout shared by the large-CSV tests: <c>Generate_300GB_csv</c> fills instances of this
/// type to write rows, and <c>Large_csv_test</c> loads the file back through
/// <c>LoadFromTextFile&lt;ModelInput&gt;</c>.
/// </summary>
class ModelInput
{
    // Text column 0. Tagged [NoColumn] - presumably kept out of the loaded schema.
    [LoadColumn(0)]
    [NoColumn]
    public string _data0 { get; set; }

    // Text column 1, likewise tagged [NoColumn].
    [LoadColumn(1)]
    [NoColumn]
    public float ignoreData1 { get; set; }

    // Text columns 2 through 4205, loaded together as one float vector
    // (the generator writes 4204 values per row).
    [LoadColumn(2, 4205)]
    public float[] _data { get; set; }

    // Text columns 4206, 4207 and 4208 are mapped one by one and each tagged [NoColumn].
    [LoadColumn(4206)]
    [NoColumn]
    public float _ignoreData4206 { get; set; }

    [LoadColumn(4207)]
    [NoColumn]
    public float _ignoreData4207 { get; set; }

    [LoadColumn(4208)]
    [NoColumn]
    public float _ignoreData4208 { get; set; }

    // Text column 4209, exposed under the column name "Entry(Text)" that the test uses as the label.
    [LoadColumn(4209)]
    [ColumnName("Entry(Text)")]
    public string _label { get; set; }
}
}