Skip to content
Merged
8 changes: 4 additions & 4 deletions src/Microsoft.ML.Core/Utilities/Hashing.cs
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,8 @@ public static uint HashSequence(uint[] sequence, int min, int lim)

/// <summary>
/// Combines the given hash value with a uint value, using the murmur hash 3 algorithm.
/// Make certain to also use <see cref="MixHash"/> on the final hashed value, if you
/// depend upon having distinct bits.
/// Make certain to also use <see cref="MixHash(uint)"/> or <see cref="MixHash(uint, int)"/> on the final hashed value,
/// if you depend upon having distinct bits.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static uint MurmurRound(uint hash, uint chunk)
Expand Down Expand Up @@ -257,7 +257,7 @@ public static uint MurmurHashV2(uint hash, ReadOnlySpan<char> span, bool toUpper
}

// Final mixing ritual for the hash.
hash = MixHashV2(hash, len);
hash = MixHash(hash, len);

return hash;
}
Expand Down Expand Up @@ -373,7 +373,7 @@ public static uint MixHash(uint hash)
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static uint MixHashV2(uint hash, int len)
public static uint MixHash(uint hash, int len)
{
hash ^= (uint)len;
hash ^= hash >> 16;
Expand Down
13 changes: 8 additions & 5 deletions src/Microsoft.ML.Data/Commands/CrossValidationCommand.cs
Original file line number Diff line number Diff line change
Expand Up @@ -328,11 +328,14 @@ private string GetSplitColumn(IChannel ch, IDataView input, ref IDataView output
{
ch.Info("Hashing the stratification column");
var origStratCol = stratificationColumn;
int tmp;
int inc = 0;
while (input.Schema.TryGetColumnIndex(stratificationColumn, out tmp))
stratificationColumn = string.Format("{0}_{1:000}", origStratCol, ++inc);
output = new HashingEstimator(Host, origStratCol, stratificationColumn, 30).Fit(input).Transform(input);
stratificationColumn = input.Schema.GetTempColumnName("strat");

// HashingEstimator currently handles all primitive types except for DateTime, DateTimeOffset and TimeSpan.
var itemType = type.GetItemType();
if (itemType is DateTimeDataViewType || itemType is DateTimeOffsetDataViewType || itemType is TimeSpanDataViewType)
input = new TypeConvertingTransformer(Host, origStratCol, DataKind.Int64, origStratCol).Transform(input);

output = new HashingEstimator(Host, stratificationColumn, origStratCol, 30).Fit(input).Transform(input);
}

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I realize the previous code too used 30 bits. But the default seems to be 31 bits. Any reason why we have 30 here? If there is a specific reason for 30 bits, can you please add a note to that in the code?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am actually not sure why 30 was chosen. If you think it's important I can try changing it back to 31.


In reply to: 386729669 [](ancestors = 386729669)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have no reason to say it should change to 31 either :-). But since it is is used in a couple of places, can you please make it a named constant?


In reply to: 425102125 [](ancestors = 425102125,386729669)

}

Expand Down
21 changes: 10 additions & 11 deletions src/Microsoft.ML.Data/DataLoadSave/DataOperationsCatalog.cs
Original file line number Diff line number Diff line change
Expand Up @@ -511,19 +511,18 @@ internal static void EnsureGroupPreservationColumn(IHostEnvironment env, ref IDa
var type = data.Schema[stratCol].Type;
if (!RangeFilter.IsValidRangeFilterColumnType(env, type))
{
// Hash the samplingKeyColumn.
// REVIEW: this could currently crash, since Hash only accepts a limited set
// of column types. It used to be HashJoin, but we should probably extend Hash
// instead of having two hash transformations.
var origStratCol = samplingKeyColumn;
samplingKeyColumn = data.Schema.GetTempColumnName(samplingKeyColumn);
HashingEstimator.ColumnOptionsInternal columnOptions;
if (seed.HasValue)
columnOptions = new HashingEstimator.ColumnOptionsInternal(samplingKeyColumn, origStratCol, 30, (uint)seed.Value);
else if (((ISeededEnvironment)env).Seed.HasValue)
columnOptions = new HashingEstimator.ColumnOptionsInternal(samplingKeyColumn, origStratCol, 30, (uint)((ISeededEnvironment)env).Seed.Value);
else
columnOptions = new HashingEstimator.ColumnOptionsInternal(samplingKeyColumn, origStratCol, 30);
// HashingEstimator currently handles all primitive types except for DateTime, DateTimeOffset and TimeSpan.
var itemType = type.GetItemType();
if (itemType is DateTimeDataViewType || itemType is DateTimeOffsetDataViewType || itemType is TimeSpanDataViewType)
data = new TypeConvertingTransformer(env, origStratCol, DataKind.Int64, origStratCol).Transform(data);

var localSeed = seed.HasValue ? seed : ((ISeededEnvironment)env).Seed.HasValue ? ((ISeededEnvironment)env).Seed : null;
var columnOptions =
localSeed.HasValue ?
new HashingEstimator.ColumnOptions(samplingKeyColumn, origStratCol, 30, (uint)localSeed.Value, combine: true) :
new HashingEstimator.ColumnOptions(samplingKeyColumn, origStratCol, 30, combine: true);

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Similar comment as before. If the 30bits hashing is standard, can we define it as some global constant somewhere?

data = new HashingEstimator(env, columnOptions).Fit(data).Transform(data);
}
else
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,7 @@ public static HashingEstimator Hash(this TransformsCatalog.ConversionTransforms
/// ]]></format>
/// </example>
public static HashingEstimator Hash(this TransformsCatalog.ConversionTransforms catalog, params ColumnOptions[] columns)
=> new HashingEstimator(CatalogUtils.GetEnvironment(catalog),
columns.Select(x => new ColumnOptionsInternal(x.Name, x.InputColumnName, x.NumberOfBits, x.Seed,
x.UseOrderedHashing, x.MaximumNumberOfInverts)).ToArray());
=> new HashingEstimator(CatalogUtils.GetEnvironment(catalog), columns);

/// <summary>
/// Create a <see cref="TypeConvertingEstimator"/>, which converts the type of the data to the type specified in <paramref name="outputKind"/>.
Expand Down
Loading