Skip to content
Merged
Show file tree
Hide file tree
Changes from 26 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
70e3480
count table transform
yaeldMS Nov 7, 2019
26169c8
Dracula with unit tests
yaeldMS Dec 2, 2019
b818101
Fix entry point catalog test
yaeldMS Dec 2, 2019
08a7108
Address code review comments
yaeldMS Dec 3, 2019
2b2428b
create estimator from trained transformer
yaeldMS Dec 10, 2019
0ef935b
switch from three dimensional array of counts to two dimensional arra…
yaeldMS Dec 11, 2019
097f2f1
change mechanism for loading a pre-trained count table
yaeldMS Dec 16, 2019
73fd1ff
Add a sample
yaeldMS Dec 18, 2019
741b5ad
fix entrypoint catalog
yaeldMS Dec 25, 2019
8d00f47
documentation
yaeldMS Dec 27, 2019
c13ff8d
count table transform
yaeldMS Nov 7, 2019
f756a20
Dracula with unit tests
yaeldMS Dec 2, 2019
8880cc5
Address code review comments
yaeldMS Dec 3, 2019
330c6c5
create estimator from trained transformer
yaeldMS Dec 10, 2019
5c33181
change mechanism for loading a pre-trained count table
yaeldMS Dec 16, 2019
c8a9df5
Add a sample
yaeldMS Dec 18, 2019
a93ea18
documentation
yaeldMS Dec 27, 2019
9c921ce
fix unit tests
yaeldMS Dec 28, 2019
d7616a7
Delete unused file
yaeldMS Jan 1, 2020
90803b7
make CountTable* classes internal
yaeldMS Jan 10, 2020
c9cf4ce
Possible solution for adding noise only when training a pipeline
yaeldMS Jan 28, 2020
3d23f80
Fix bug
yaeldMS Jan 29, 2020
96e0041
Make all APIs and classes internal.
yaeldMS Feb 5, 2020
3fc56ec
Exclude dracula sample.
yaeldMS Feb 9, 2020
66f7865
Switch to using HashingTransformer instead of HashJoiningTransform.
yaeldMS May 18, 2020
91887da
Fix EntryPointCatalog test
yaeldMS May 19, 2020
c36e5b3
Address code review comments.
yaeldMS Jun 1, 2020
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.ML.Transforms;

namespace Samples.Dynamic
{
    /// <summary>
    /// Sample that fits a count target encoding estimator on a small in-memory
    /// data set, prints the encoded features, saves the fitted transformer,
    /// reloads it, and continues training it on additional data.
    /// </summary>
    public static class CountTargetEncoding
    {
        // File the fitted transformer is saved to and then reloaded from.
        private const string ModelPath = "CountTargetEncoding.zip";

        /// <summary>
        /// Runs the sample end to end and writes the encoded feature vectors
        /// to the console.
        /// </summary>
        /// <exception cref="InvalidOperationException">
        /// The model reloaded from disk is not a
        /// <c>CountTargetEncodingTransformer</c>.
        /// </exception>
        public static void Example()
        {
            // Create a new ML context, for ML.NET operations. It can be used for
            // exception tracking and logging, as well as the source of randomness.
            var mlContext = new MLContext();
            var samples = new List<DataPoint>()
            {
                new DataPoint(){ Category = "a", Label = 0 },
                new DataPoint(){ Category = "a", Label = 1 },
                new DataPoint(){ Category = "a", Label = 0 },
                new DataPoint(){ Category = "a", Label = 0 },
                new DataPoint(){ Category = "b", Label = 1 },
                new DataPoint(){ Category = "b", Label = 2 },
                new DataPoint(){ Category = "b", Label = 2 },
                new DataPoint(){ Category = "b", Label = 1 },
                new DataPoint(){ Category = "c", Label = 0 },
                new DataPoint(){ Category = "c", Label = 0 },
                new DataPoint(){ Category = "c", Label = 0 },
                new DataPoint(){ Category = "c", Label = 0 },
                new DataPoint(){ Category = "d", Label = 0 },
                new DataPoint(){ Category = "d", Label = 1 },
                new DataPoint(){ Category = "d", Label = 2 },
                new DataPoint(){ Category = "d", Label = 3 },
            };

            // Convert training data to IDataView, the general data type used in
            // ML.NET.
            var data = mlContext.Data.LoadFromEnumerable(samples);

            // Define the CountTargetEncoding estimator.
            var count = mlContext.Transforms.CountTargetEncode("Features", "Category");

            // Now we can transform the data and look at the output to confirm the
            // behavior of the estimator. This operation doesn't actually evaluate
            // data until we read the data below.
            var transformer = count.Fit(data);
            PrintFeatures(transformer.Transform(data));
            // Expected output:
            // 3.0000, 1.0000, 0.0000, 0.0000, 0.8473, -1.0986, -3.2452, -4.3694, 0.0000
            // 3.0000, 1.0000, 0.0000, 0.0000, 0.8473, -1.0986, -3.2452, -4.3694, 0.0000
            // 3.0000, 1.0000, 0.0000, 0.0000, 0.8473, -1.0986, -3.2452, -4.3694, 0.0000
            // 3.0000, 1.0000, 0.0000, 0.0000, 0.8473, -1.0986, -3.2452, -4.3694, 0.0000
            // 0.0000, 2.0000, 2.0000, 0.0000, -2.1972, -0.2007, -0.2513, -4.3694, 0.0000
            // 0.0000, 2.0000, 2.0000, 0.0000, -2.1972, -0.2007, -0.2513, -4.3694, 0.0000
            // 0.0000, 2.0000, 2.0000, 0.0000, -2.1972, -0.2007, -0.2513, -4.3694, 0.0000
            // 0.0000, 2.0000, 2.0000, 0.0000, -2.1972, -0.2007, -0.2513, -4.3694, 0.0000
            // 4.0000, 0.0000, 0.0000, 0.0000, 2.1972, -2.9444, -3.2452, -4.3694, 0.0000
            // 4.0000, 0.0000, 0.0000, 0.0000, 2.1972, -2.9444, -3.2452, -4.3694, 0.0000
            // 4.0000, 0.0000, 0.0000, 0.0000, 2.1972, -2.9444, -3.2452, -4.3694, 0.0000
            // 4.0000, 0.0000, 0.0000, 0.0000, 2.1972, -2.9444, -3.2452, -4.3694, 0.0000
            // 1.0000, 1.0000, 1.0000, 1.0000, -0.8473, -1.0986, -1.1664, -1.3099, 0.0000
            // 1.0000, 1.0000, 1.0000, 1.0000, -0.8473, -1.0986, -1.1664, -1.3099, 0.0000
            // 1.0000, 1.0000, 1.0000, 1.0000, -0.8473, -1.0986, -1.1664, -1.3099, 0.0000
            // 1.0000, 1.0000, 1.0000, 1.0000, -0.8473, -1.0986, -1.1664, -1.3099, 0.0000

            // The count tables can be saved and be retrained later with additional data.
            mlContext.Model.Save(transformer, data.Schema, ModelPath);
            var loadedTransformer = mlContext.Model.Load(ModelPath, out _);

            // Fail loudly if the reloaded model has an unexpected type instead of
            // silently handing a null transformer to the estimator factory.
            if (!(loadedTransformer is CountTargetEncodingTransformer initialCounts))
            {
                throw new InvalidOperationException(
                    $"The model loaded from '{ModelPath}' is not a {nameof(CountTargetEncodingTransformer)}.");
            }

            var retrainingEstimator = mlContext.Transforms.CountTargetEncode("Features", initialCounts, "Category");

            var moreSamples = new List<DataPoint>()
            {
                new DataPoint(){ Category = "a", Label = 3 },
                new DataPoint(){ Category = "a", Label = 3 },
                new DataPoint(){ Category = "b", Label = 2 },
                new DataPoint(){ Category = "c", Label = 1 },
                new DataPoint(){ Category = "c", Label = 1 },
                new DataPoint(){ Category = "d", Label = 0 },
                new DataPoint(){ Category = "e", Label = 3 },
                new DataPoint(){ Category = "d", Label = 4 },
            };
            var moreData = mlContext.Data.LoadFromEnumerable(moreSamples);
            var retrainedTransformer = retrainingEstimator.Fit(moreData);
            PrintFeatures(retrainedTransformer.Transform(moreData));

            // Expected output:
            // 3.0000, 1.0000, 0.0000, 2.0000, 0.0000, -0.2151, -1.5261, -4.0073, -0.6665, -4.0073, 0.0000
            // 3.0000, 1.0000, 0.0000, 2.0000, 0.0000, -0.2151, -1.5261, -4.0073, -0.6665, -4.0073, 0.0000
            // 0.0000, 2.0000, 3.0000, 0.0000, 0.0000, -3.8501, -0.5108, 0.0834, -2.7081, -3.8501, 0.0000
            // 4.0000, 2.0000, 0.0000, 0.0000, 0.0000, 0.3610, -0.7472, -4.0073, -2.8717, -4.0073, 0.0000
            // 4.0000, 2.0000, 0.0000, 0.0000, 0.0000, 0.3610, -0.7472, -4.0073, -2.8717, -4.0073, 0.0000
            // 2.0000, 1.0000, 1.0000, 1.0000, 1.0000, -0.8303, -1.5261, -1.6529, -1.4088, -1.6529, 0.0000
            // 0.0000, 0.0000, 0.0000, 1.0000, 0.0000, -2.7081, -1.9459, -2.7081, 0.7885, -2.7081, 0.0000
            // 2.0000, 1.0000, 1.0000, 1.0000, 1.0000, -0.8303, -1.5261, -1.6529, -1.4088, -1.6529, 0.0000
        }

        /// <summary>
        /// Reads the "Features" column of <paramref name="transformedData"/> and
        /// writes one comma-separated line per row, four decimals per value.
        /// </summary>
        /// <param name="transformedData">Data view containing a "Features" column of float vectors.</param>
        private static void PrintFeatures(IDataView transformedData)
        {
            var rows = transformedData.GetColumn<float[]>("Features").ToArray();
            foreach (var row in rows)
            {
                // Format with the invariant culture so the decimal separator is
                // always '.', matching the expected output on every locale.
                var formatted = row.Select(
                    x => x.ToString("f4", System.Globalization.CultureInfo.InvariantCulture));
                Console.WriteLine(string.Join(", ", formatted));
            }
        }

        /// <summary>
        /// One input row: a categorical value and its label.
        /// </summary>
        private class DataPoint
        {
            public string Category;
            public float Label;
        }
    }
}
4 changes: 4 additions & 0 deletions docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@
<RootNamespace>Samples</RootNamespace>
</PropertyGroup>

<ItemGroup>
<Compile Remove="Dynamic\Transforms\CountTargetEncoding.cs" />
</ItemGroup>

<ItemGroup>
<ProjectReference Include="..\..\..\src\Microsoft.ML.Vision\Microsoft.ML.Vision.csproj" />
<ProjectReference Include="..\..\..\src\Microsoft.ML.Featurizers\Microsoft.ML.Featurizers.csproj" />
Expand Down
6 changes: 6 additions & 0 deletions src/Microsoft.ML.Core/Data/IEstimator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,12 @@ public interface ITransformer : ICanSaveModel
IRowToRowMapper GetRowToRowMapper(DataViewSchema inputSchema);
}

/// <summary>
/// An <see cref="ITransformer"/> that additionally exposes
/// <see cref="TransformForTrainingPipeline"/>, a mapping to be used in place of
/// <see cref="ITransformer.Transform"/> while an estimator chain is being fitted.
/// </summary>
/// <remarks>
/// NOTE(review): how the training-time mapping differs from the regular one is
/// up to each implementation; confirm against the implementing transformer.
/// </remarks>
[BestFriend]
internal interface ITransformerWithDifferentMappingAtTrainingTime : ITransformer
{
/// <summary>
/// Transforms <paramref name="input"/> using the mapping intended for use while
/// a pipeline is being trained.
/// </summary>
/// <param name="input">The data to transform.</param>
/// <returns>The transformed data.</returns>
IDataView TransformForTrainingPipeline(IDataView input);
}

/// <summary>
/// The estimator (in Spark terminology) is an 'untrained transformer'. It needs to 'fit' on the data to manufacture
/// a transformer.
Expand Down
5 changes: 4 additions & 1 deletion src/Microsoft.ML.Data/DataLoadSave/EstimatorChain.cs
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,10 @@ public TransformerChain<TLastTransformer> Fit(IDataView input)
{
var est = _estimators[i];
xfs[i] = est.Fit(current);
current = xfs[i].Transform(current);
if (xfs[i] is ITransformerWithDifferentMappingAtTrainingTime xf)
current = xf.TransformForTrainingPipeline(current);
else
current = xfs[i].Transform(current);
if (_needCacheAfter[i] && i < _estimators.Length - 1)
{
Contracts.AssertValue(_host);
Expand Down
4 changes: 4 additions & 0 deletions src/Microsoft.ML.Data/Transforms/Hashing.cs
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,8 @@ private static VersionInfo GetVersionInfo()
}

private readonly HashingEstimator.ColumnOptions[] _columns;
[BestFriend]
internal IReadOnlyCollection<HashingEstimator.ColumnOptions> Columns => _columns;
private readonly VBuffer<ReadOnlyMemory<char>>[] _keyValues;
private readonly VectorDataViewType[] _kvTypes;
private readonly bool _nonOnnxExportableVersion;
Expand Down Expand Up @@ -1740,8 +1742,10 @@ public override void Process()
public sealed class HashingEstimator : IEstimator<HashingTransformer>
{
internal const int NumBitsMin = 1;
[BestFriend]
internal const int NumBitsLim = 32;

[BestFriend]
internal static class Defaults
{
public const int NumberOfBits = NumBitsLim - 1;
Expand Down
Loading