Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
f880b7b
Added a comment.
sierralee51 Jun 26, 2019
4f06b84
reformatted ModelOperations samples to width 85
sierralee51 Jun 27, 2019
0c16632
Merge branch 'master' of https://github.com/dotnet/machinelearning in…
sierralee51 Jun 27, 2019
3ce38a3
Fixed commented-on parts of MachineOperations & reformatted DataOpera…
sierralee51 Jun 27, 2019
10f40fd
Update docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/Cache.cs
sierralee51 Jun 28, 2019
00acf2a
Update Program.cs
sierralee51 Jun 28, 2019
acdf665
Update DataViewEnumerable.tt
sierralee51 Jun 28, 2019
842765f
Update DataViewEnumerable.cs
sierralee51 Jun 28, 2019
fe9c188
Update DataViewEnumerable.tt
sierralee51 Jun 28, 2019
2f91a6d
Update FilterRowsByColumn.tt
sierralee51 Jun 28, 2019
7a27578
Update ShuffleRows.tt
sierralee51 Jun 28, 2019
cd05ce0
Update TakeRows.tt
sierralee51 Jun 28, 2019
61de1fb
Update TakeRows.cs
sierralee51 Jun 28, 2019
5d2e8af
Update SkipRows.cs
sierralee51 Jun 28, 2019
063a651
Update SkipRows.tt
sierralee51 Jun 28, 2019
958633b
Update ShuffleRows.cs
sierralee51 Jun 28, 2019
01a5390
Update ShuffleRows.cs
sierralee51 Jun 28, 2019
0eb6065
Update ShuffleRows.cs
sierralee51 Jun 28, 2019
2db2bde
Update ShuffleRows.tt
sierralee51 Jun 28, 2019
8c1989f
Update SkipRows.tt
sierralee51 Jun 28, 2019
f39823a
Update SkipRows.cs
sierralee51 Jun 28, 2019
90ed060
Update FilterRowsByColumn.cs
sierralee51 Jun 28, 2019
2a5e8c3
Update FilterRowsByColumn.cs
sierralee51 Jun 28, 2019
f9d91e6
Update DataViewEnumerable.cs
sierralee51 Jun 28, 2019
5daecb8
Update FilterRowsByColumn.cs
sierralee51 Jun 28, 2019
c436498
Update FilterRowsByColumn.tt
sierralee51 Jun 28, 2019
49d7a09
Update FilterRowsByColumn.cs
sierralee51 Jun 28, 2019
d2315f9
Update FilterRowsByColumn.tt
sierralee51 Jun 28, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@ public static class BootstrapSample
{
public static void Example()
{
// Create a new context for ML.NET operations. It can be used for exception tracking and logging,
// as a catalog of available operations and as the source of randomness.
// Create a new context for ML.NET operations. It can be used for
// exception tracking and logging, as a catalog of available operations
// and as the source of randomness.
var mlContext = new MLContext();

// Get a small dataset as an IEnumerable.
Expand All @@ -23,20 +24,27 @@ public static void Example()

var data = mlContext.Data.LoadFromEnumerable(rawData);

// Now take a bootstrap sample of this dataset to create a new dataset. The bootstrap is a resampling technique that
// creates a training set of the same size by picking with replacement from the original dataset. With the bootstrap,
// we expect that the resampled dataset will have about 63% of the rows of the original dataset (i.e. 1-e^-1), with some
// rows represented more than once.
// BootstrapSample is a streaming implementation of the bootstrap that enables sampling from a dataset too large to hold in memory.
// To enable streaming, BootstrapSample approximates the bootstrap by sampling each row according to a Poisson(1) distribution.
// Note that this streaming approximation treats each row independently, thus the resampled dataset is not guaranteed to be the
// same length as the input dataset.
// Let's take a look at the behavior of the BootstrapSample by examining a few draws:
// Now take a bootstrap sample of this dataset to create a new dataset.
// The bootstrap is a resampling technique that creates a training set
// of the same size by picking with replacement from the original
// dataset. With the bootstrap, we expect that the resampled dataset
// will have about 63% of the rows of the original dataset
// (i.e. 1-e^-1), with some rows represented more than once.
// BootstrapSample is a streaming implementation of the bootstrap that
// enables sampling from a dataset too large to hold in memory. To
// enable streaming, BootstrapSample approximates the bootstrap by
// sampling each row according to a Poisson(1) distribution. Note that
// this streaming approximation treats each row independently, thus the
// resampled dataset is not guaranteed to be the same length as the
// input dataset. Let's take a look at the behavior of the
// BootstrapSample by examining a few draws:
for (int i = 0; i < 3; i++)
{
var resample = mlContext.Data.BootstrapSample(data, seed: i);

var enumerable = mlContext.Data.CreateEnumerable<DataPoint>(resample, reuseRowObject: false);
var enumerable = mlContext.Data
.CreateEnumerable<DataPoint>(resample, reuseRowObject: false);

Console.WriteLine($"Label\tFeature");
foreach (var row in enumerable)
{
Expand Down
60 changes: 41 additions & 19 deletions docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/Cache.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,56 +8,78 @@ public static class Cache
{
public static void Example()
{
// Create a new context for ML.NET operations. It can be used for exception tracking and logging,
// as a catalog of available operations and as the source of randomness.
// Create a new context for ML.NET operations. It can be used for
// exception tracking and logging, as a catalog of available operations
// and as the source of randomness.
var mlContext = new MLContext();

var data = DatasetUtils.LoadHousingRegressionDataset(mlContext);

// Time how long it takes to page through the records if we don't cache.
(int lines, double columnAverage, double elapsedSeconds) = TimeToScanIDataView(mlContext, data);
Console.WriteLine($"Lines={lines}, averageOfColumn0={columnAverage:0.00} and took {elapsedSeconds} seconds.");
(int lines, double columnAverage, double elapsedSeconds) =
TimeToScanIDataView(mlContext, data);

Console.WriteLine($"Lines={lines}," +
$"averageOfColumn0={columnAverage:0.00} and took {elapsedSeconds}" +
$"seconds.");
// Expected output (time is approximate):
// Lines=506, averageOfColumn0=564.17 and took 0.314 seconds.

// Now create a cached view of the data.
var cachedData = mlContext.Data.Cache(data);

// Time how long it takes to page through the records the first time they're accessed after a cache is applied.
// This iteration will be longer than subsequent calls, as the dataset is being accessed and stored for later.
// Note that this operation may be relatively quick, as the system may have cached the file.
(lines, columnAverage, elapsedSeconds) = TimeToScanIDataView(mlContext, cachedData);
Console.WriteLine($"Lines={lines}, averageOfColumn0={columnAverage:0.00} and took {elapsedSeconds} seconds.");
// Time how long it takes to page through the records the first time
// they're accessed after a cache is applied. This iteration will be
// longer than subsequent calls, as the dataset is being accessed and
// stored for later. Note that this operation may be relatively quick,
// as the system may have cached the file.
(lines, columnAverage, elapsedSeconds) = TimeToScanIDataView(mlContext,
cachedData);

Console.WriteLine($"Lines={lines}," +
$"averageOfColumn0={columnAverage:0.00} and took {elapsedSeconds}" +
$"seconds.");
// Expected output (time is approximate):
// Lines=506, averageOfColumn0=564.17 and took 0.056 seconds.

// Time how long it takes to page through the records now that the data is cached. After the first iteration that caches the IDataView,
// future iterations, like this one, are faster because they are pulling from data cached in memory.
(lines, columnAverage, elapsedSeconds) = TimeToScanIDataView(mlContext, cachedData);
Console.WriteLine($"Lines={lines}, averageOfColumn0={columnAverage:0.00} and took {elapsedSeconds} seconds.");
// Time how long it takes to page through the records now that the data
// is cached. After the first iteration that caches the IDataView,
// future iterations, like this one, are faster because they are pulling
// from data cached in memory.
(lines, columnAverage, elapsedSeconds) = TimeToScanIDataView(mlContext,
cachedData);

Console.WriteLine(
$"Lines={lines}, averageOfColumn0={columnAverage:0.00} and took " +
$"{elapsedSeconds} seconds.");
// Expected output (time is approximate):
// Lines=506, averageOfColumn0=564.17 and took 0.006 seconds.
}

private static (int lines, double columnAverage, double elapsedSeconds) TimeToScanIDataView(MLContext mlContext, IDataView data)
private static (int lines, double columnAverage, double elapsedSeconds)
TimeToScanIDataView(MLContext mlContext, IDataView data)
{
int lines = 0;
double columnAverage = 0.0;
var enumerable = mlContext.Data.CreateEnumerable<HousingRegression>(data, reuseRowObject: true);
var enumerable = mlContext.Data
.CreateEnumerable<HousingRegression>(data, reuseRowObject: true);

var watch = System.Diagnostics.Stopwatch.StartNew();
foreach (var row in enumerable)
{
lines++;
columnAverage += row.MedianHomeValue + row.CrimesPerCapita + row.PercentResidental + row.PercentNonRetail + row.CharlesRiver
+ row.NitricOxides + row.RoomsPerDwelling + row.PercentPre40s + row.EmploymentDistance
+ row.HighwayDistance + row.TaxRate + row.TeacherRatio;
columnAverage += row.MedianHomeValue + row.CrimesPerCapita +
row.PercentResidental + row.PercentNonRetail + row.CharlesRiver
+ row.NitricOxides + row.RoomsPerDwelling + row.PercentPre40s +
row.EmploymentDistance + row.HighwayDistance + row.TaxRate +
row.TeacherRatio;
}
watch.Stop();
columnAverage /= lines;
var elapsed = watch.Elapsed;

return (lines, columnAverage, elapsed.Seconds);
}
}

/// <summary>
/// A class to hold the raw housing regression rows.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,28 @@ public static void Example()
// Generate some data points.
var examples = GenerateRandomDataPoints(10);

// Convert the examples list to an IDataView object, which is consumable by ML.NET API.
// Convert the examples list to an IDataView object, which is consumable
// by ML.NET API.
var dataview = mlContext.Data.LoadFromEnumerable(examples);

// Cross validation splits your data randomly into set of "folds", and creates groups of Train and Test sets,
// where for each group, one fold is the Test and the rest of the folds the Train.
// So below, we specify Group column as the column containing the sampling keys.
// If we pass that column to cross validation it would be used to break data into certain chunks.
var folds = mlContext.Data.CrossValidationSplit(dataview, numberOfFolds: 3, samplingKeyColumnName: "Group");
var trainSet = mlContext.Data.CreateEnumerable<DataPoint>(folds[0].TrainSet, reuseRowObject: false);
var testSet = mlContext.Data.CreateEnumerable<DataPoint>(folds[0].TestSet, reuseRowObject: false);
// Cross validation splits your data randomly into a set of "folds", and
// creates groups of Train and Test sets, where for each group, one fold
// is the Test and the rest of the folds are the Train. So below, we
// specify the Group column as the column containing the sampling keys. If
// we pass that column to cross validation it will be used to break the
// data into certain chunks.
var folds = mlContext.Data
.CrossValidationSplit(dataview, numberOfFolds:3,
samplingKeyColumnName: "Group");

var trainSet = mlContext.Data
.CreateEnumerable<DataPoint>(folds[0].TrainSet,
reuseRowObject: false);

var testSet = mlContext.Data
.CreateEnumerable<DataPoint>(folds[0].TestSet,
reuseRowObject: false);

PrintPreviewRows(trainSet, testSet);

// The data in the Train split.
Expand All @@ -43,8 +55,14 @@ public static void Example()
// [Group, 0], [Features, 0.9060271]
// [Group, 0], [Features, 0.2737045]

trainSet = mlContext.Data.CreateEnumerable<DataPoint>(folds[1].TrainSet, reuseRowObject: false);
testSet = mlContext.Data.CreateEnumerable<DataPoint>(folds[1].TestSet, reuseRowObject: false);
trainSet = mlContext.Data
.CreateEnumerable<DataPoint>(folds[1].TrainSet,
reuseRowObject: false);

testSet = mlContext.Data
.CreateEnumerable<DataPoint>(folds[1].TestSet,
reuseRowObject: false);

PrintPreviewRows(trainSet, testSet);
// The data in the Train split.
// [Group, 0], [Features, 0.7262433]
Expand All @@ -60,8 +78,14 @@ public static void Example()
// [Group, 1], [Features, 0.2060332]
// [Group, 1], [Features, 0.4421779]

trainSet = mlContext.Data.CreateEnumerable<DataPoint>(folds[2].TrainSet, reuseRowObject: false);
testSet = mlContext.Data.CreateEnumerable<DataPoint>(folds[2].TestSet, reuseRowObject: false);
trainSet = mlContext.Data
.CreateEnumerable<DataPoint>(folds[2].TrainSet,
reuseRowObject: false);

testSet = mlContext.Data
.CreateEnumerable<DataPoint>(folds[2].TestSet,
reuseRowObject: false);

PrintPreviewRows(trainSet, testSet);
// The data in the Train split.
// [Group, 0], [Features, 0.7262433]
Expand All @@ -79,8 +103,14 @@ public static void Example()

// Example of a split without specifying a sampling key column.
folds = mlContext.Data.CrossValidationSplit(dataview, numberOfFolds: 3);
trainSet = mlContext.Data.CreateEnumerable<DataPoint>(folds[0].TrainSet, reuseRowObject: false);
testSet = mlContext.Data.CreateEnumerable<DataPoint>(folds[0].TestSet, reuseRowObject: false);
trainSet = mlContext.Data
.CreateEnumerable<DataPoint>(folds[0].TrainSet,
reuseRowObject: false);

testSet = mlContext.Data
.CreateEnumerable<DataPoint>(folds[0].TestSet,
reuseRowObject: false);

PrintPreviewRows(trainSet, testSet);
// The data in the Train split.
// [Group, 0], [Features, 0.7262433]
Expand All @@ -96,8 +126,14 @@ public static void Example()
// [Group, 2], [Features, 0.5588848]
// [Group, 0], [Features, 0.9060271]

trainSet = mlContext.Data.CreateEnumerable<DataPoint>(folds[1].TrainSet, reuseRowObject: false);
testSet = mlContext.Data.CreateEnumerable<DataPoint>(folds[1].TestSet, reuseRowObject: false);
trainSet = mlContext.Data
.CreateEnumerable<DataPoint>(folds[1].TrainSet,
reuseRowObject: false);

testSet = mlContext.Data
.CreateEnumerable<DataPoint>(folds[1].TestSet,
reuseRowObject: false);

PrintPreviewRows(trainSet, testSet);
// The data in the Train split.
// [Group, 2], [Features, 0.7680227]
Expand All @@ -113,8 +149,13 @@ public static void Example()
// [Group, 2], [Features, 0.9775497]
// [Group, 0], [Features, 0.2737045]

trainSet = mlContext.Data.CreateEnumerable<DataPoint>(folds[2].TrainSet, reuseRowObject: false);
testSet = mlContext.Data.CreateEnumerable<DataPoint>(folds[2].TestSet, reuseRowObject: false);
trainSet = mlContext.Data
.CreateEnumerable<DataPoint>(folds[2].TrainSet,
reuseRowObject: false);

testSet = mlContext.Data.CreateEnumerable<DataPoint>(folds[2].TestSet,
reuseRowObject: false);

PrintPreviewRows(trainSet, testSet);
// The data in the Train split.
// [Group, 0], [Features, 0.7262433]
Expand All @@ -131,7 +172,9 @@ public static void Example()
// [Group, 1], [Features, 0.4421779]
}

private static IEnumerable<DataPoint> GenerateRandomDataPoints(int count, int seed = 0)
private static IEnumerable<DataPoint> GenerateRandomDataPoints(int count,
int seed = 0)

{
var random = new Random(seed);
for (int i = 0; i < count; i++)
Expand All @@ -146,7 +189,8 @@ private static IEnumerable<DataPoint> GenerateRandomDataPoints(int count, int se
}
}

// Example with features and group column. A data set is a collection of such examples.
// Example with features and group column. A data set is a collection of
// such examples.
private class DataPoint
{
public float Group { get; set; }
Expand All @@ -155,7 +199,9 @@ private class DataPoint
}

// print helper
private static void PrintPreviewRows(IEnumerable<DataPoint> trainSet, IEnumerable<DataPoint> testSet)
private static void PrintPreviewRows(IEnumerable<DataPoint> trainSet,
IEnumerable<DataPoint> testSet)

{

Console.WriteLine($"The data in the Train split.");
Expand Down
Loading