-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Add Seed property to MLContext and use as default for data splits #4775
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 4 commits
9bbccaf
9fd29ac
2080dc0
218bebd
d098fee
66fc849
0ee3b9a
67d69c0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||
|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -17,7 +17,7 @@ namespace Microsoft.ML | |||||||||
| public sealed class DataOperationsCatalog : IInternalCatalog | ||||||||||
| { | ||||||||||
| IHostEnvironment IInternalCatalog.Environment => _env; | ||||||||||
| private readonly IHostEnvironment _env; | ||||||||||
| private readonly ISeededEnvironment _env; | ||||||||||
|
|
||||||||||
| /// <summary> | ||||||||||
| /// A pair of datasets, for the train and test set. | ||||||||||
|
|
@@ -44,7 +44,7 @@ internal TrainTestData(IDataView trainSet, IDataView testSet) | |||||||||
| } | ||||||||||
| } | ||||||||||
|
|
||||||||||
| internal DataOperationsCatalog(IHostEnvironment env) | ||||||||||
| internal DataOperationsCatalog(ISeededEnvironment env) | ||||||||||
| { | ||||||||||
| Contracts.AssertValue(env); | ||||||||||
| _env = env; | ||||||||||
|
|
@@ -493,16 +493,15 @@ internal static IEnumerable<TrainTestData> CrossValidationSplit(IHostEnvironment | |||||||||
| /// <summary> | ||||||||||
| /// Ensures the provided <paramref name="samplingKeyColumn"/> is valid for <see cref="RangeFilter"/>, hashing it if necessary, or creates a new column if <paramref name="samplingKeyColumn"/> is null. | ||||||||||
| /// </summary> | ||||||||||
| internal static void EnsureGroupPreservationColumn(IHostEnvironment env, ref IDataView data, ref string samplingKeyColumn, int? seed = null) | ||||||||||
| internal static void EnsureGroupPreservationColumn(ISeededEnvironment env, ref IDataView data, ref string samplingKeyColumn, int? seed = null) | ||||||||||
| { | ||||||||||
| Contracts.CheckValue(env, nameof(env)); | ||||||||||
| var host = env.Register("rand"); | ||||||||||
| // We need to handle two cases: if samplingKeyColumn is provided, we use hashJoin to | ||||||||||
| // build a single hash of it. If it is not, we generate a random number. | ||||||||||
| if (samplingKeyColumn == null) | ||||||||||
| { | ||||||||||
| samplingKeyColumn = data.Schema.GetTempColumnName("SamplingKeyColumn"); | ||||||||||
| data = new GenerateNumberTransform(env, data, samplingKeyColumn, (uint?)(seed ?? host.Rand.Next())); | ||||||||||
| data = new GenerateNumberTransform(env, data, samplingKeyColumn, (uint?)(seed ?? env.Seed)); | ||||||||||
|
najeeb-kazmi marked this conversation as resolved.
Outdated
|
||||||||||
| } | ||||||||||
| else | ||||||||||
| { | ||||||||||
|
|
@@ -518,7 +517,7 @@ internal static void EnsureGroupPreservationColumn(IHostEnvironment env, ref IDa | |||||||||
| // instead of having two hash transformations. | ||||||||||
| var origStratCol = samplingKeyColumn; | ||||||||||
| samplingKeyColumn = data.Schema.GetTempColumnName(samplingKeyColumn); | ||||||||||
| var columnOptions = new HashingEstimator.ColumnOptionsInternal(samplingKeyColumn, origStratCol, 30, (uint)(seed ?? host.Rand.Next())); | ||||||||||
| var columnOptions = new HashingEstimator.ColumnOptionsInternal(samplingKeyColumn, origStratCol, 30, (uint)(seed ?? env.Seed)); | ||||||||||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
This can also be null, can't it? #Resolved
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, if machinelearning/src/Microsoft.ML.Data/Transforms/Hashing.cs Lines 1129 to 1132 in 46e7dc6
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. But you are casting it to `uint`. In reply to: 375521058 [](ancestors = 375521058)
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ah yes, you're right. Thanks for catching that @yaeldekel. I've changed the handling back to how it was before I added |
||||||||||
| data = new HashingEstimator(env, columnOptions).Fit(data).Transform(data); | ||||||||||
| } | ||||||||||
| else | ||||||||||
|
|
||||||||||
Uh oh!
There was an error while loading. Please reload this page.