diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/LdaTransform.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/LdaTransform.cs index e564b68cb8..239e7d93ac 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/LdaTransform.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/LdaTransform.cs @@ -4,7 +4,7 @@ namespace Microsoft.ML.Samples.Dynamic { - public static class LdaTransform + public static class LatentDirichletAllocationTransform { public static void Example() { @@ -30,7 +30,7 @@ public static void Example() // A pipeline for featurizing the "Review" column var pipeline = ml.Transforms.Text.ProduceWordBags(review). - Append(ml.Transforms.Text.LatentDirichletAllocation(review, ldaFeatures, numTopic:3)); + Append(ml.Transforms.Text.LatentDirichletAllocation(review, ldaFeatures, numberOfTopics: 3)); // The transformed data var transformer = pipeline.Fit(trainData); diff --git a/src/Microsoft.ML.StaticPipe/LdaStaticExtensions.cs b/src/Microsoft.ML.StaticPipe/LdaStaticExtensions.cs index d094d444ba..1d1df76477 100644 --- a/src/Microsoft.ML.StaticPipe/LdaStaticExtensions.cs +++ b/src/Microsoft.ML.StaticPipe/LdaStaticExtensions.cs @@ -12,65 +12,65 @@ namespace Microsoft.ML.StaticPipe /// /// Information on the result of fitting a LDA transform. /// - public sealed class LdaFitResult + public sealed class LatentDirichletAllocationFitResult { /// /// For user defined delegates that accept instances of the containing type. /// /// - public delegate void OnFit(LdaFitResult result); + public delegate void OnFit(LatentDirichletAllocationFitResult result); public LatentDirichletAllocationTransformer.LdaSummary LdaTopicSummary; - public LdaFitResult(LatentDirichletAllocationTransformer.LdaSummary ldaTopicSummary) + public LatentDirichletAllocationFitResult(LatentDirichletAllocationTransformer.LdaSummary ldaTopicSummary) { LdaTopicSummary = ldaTopicSummary; } } - public static class LdaStaticExtensions + public static class LatentDirichletAllocationStaticExtensions { private struct Config { - public readonly int NumTopic; + public readonly int NumberOfTopics; public readonly Single AlphaSum; public readonly Single Beta; - public readonly int MHStep; - public readonly int NumIter; + public readonly int SamplingStepCount; + public readonly int MaximumNumberOfIterations; public readonly int LikelihoodInterval; - public readonly int NumThread; - public readonly int NumMaxDocToken; - public readonly int NumSummaryTermPerTopic; - public readonly int NumBurninIter; + public readonly int NumberOfThreads; + public readonly int MaximumTokenCountPerDocument; + public readonly int NumberOfSummaryTermsPerTopic; + public readonly int NumberOfBurninIterations; public readonly bool ResetRandomGenerator; public readonly Action OnFit; - public Config(int numTopic, Single alphaSum, Single beta, int mhStep, int numIter, int likelihoodInterval, - int numThread, int numMaxDocToken, int numSummaryTermPerTopic, int numBurninIter, bool resetRandomGenerator, + public Config(int numberOfTopics, Single alphaSum, Single beta, int samplingStepCount, int maximumNumberOfIterations, int likelihoodInterval, + int numberOfThreads, int maximumTokenCountPerDocument, int numberOfSummaryTermsPerTopic, int numberOfBurninIterations, bool resetRandomGenerator, Action onFit) { - NumTopic = numTopic; + NumberOfTopics = numberOfTopics; AlphaSum = alphaSum; Beta = beta; - MHStep = mhStep; - NumIter = numIter; + SamplingStepCount = samplingStepCount; + MaximumNumberOfIterations = maximumNumberOfIterations; LikelihoodInterval = likelihoodInterval; - NumThread = numThread; - NumMaxDocToken = numMaxDocToken; - NumSummaryTermPerTopic = numSummaryTermPerTopic; - NumBurninIter = numBurninIter; + NumberOfThreads = numberOfThreads; + MaximumTokenCountPerDocument = maximumTokenCountPerDocument; + NumberOfSummaryTermsPerTopic = numberOfSummaryTermsPerTopic; + NumberOfBurninIterations = numberOfBurninIterations; ResetRandomGenerator = resetRandomGenerator; OnFit = onFit; } } - private static Action Wrap(LdaFitResult.OnFit onFit) + private static Action Wrap(LatentDirichletAllocationFitResult.OnFit onFit) { if (onFit == null) return null; - return ldaTopicSummary => onFit(new LdaFitResult(ldaTopicSummary)); + return ldaTopicSummary => onFit(new LatentDirichletAllocationFitResult(ldaTopicSummary)); } private interface ILdaCol @@ -108,16 +108,16 @@ public override IEstimator Reconcile(IHostEnvironment env, infos[i] = new LatentDirichletAllocationEstimator.ColumnOptions(outputNames[toOutput[i]], inputNames[tcol.Input], - tcol.Config.NumTopic, + tcol.Config.NumberOfTopics, tcol.Config.AlphaSum, tcol.Config.Beta, - tcol.Config.MHStep, - tcol.Config.NumIter, + tcol.Config.SamplingStepCount, + tcol.Config.MaximumNumberOfIterations, tcol.Config.LikelihoodInterval, - tcol.Config.NumThread, - tcol.Config.NumMaxDocToken, - tcol.Config.NumSummaryTermPerTopic, - tcol.Config.NumBurninIter, + tcol.Config.NumberOfThreads, + tcol.Config.MaximumTokenCountPerDocument, + tcol.Config.NumberOfSummaryTermsPerTopic, + tcol.Config.NumberOfBurninIterations, tcol.Config.ResetRandomGenerator); if (tcol.Config.OnFit != null) @@ -137,36 +137,36 @@ public override IEstimator Reconcile(IHostEnvironment env, /// /// A vector of floats representing the document. - /// The number of topics. + /// The number of topics. /// Dirichlet prior on document-topic vectors. /// Dirichlet prior on vocab-topic vectors. - /// Number of Metropolis Hasting step. - /// Number of iterations. + /// Number of Metropolis Hasting step. + /// Number of iterations. /// Compute log likelihood over local dataset on this iteration interval. - /// The number of training threads. Default value depends on number of logical processors. - /// The threshold of maximum count of tokens per doc. - /// The number of words to summarize the topic. - /// The number of burn-in iterations. + /// The number of training threads. Default value depends on number of logical processors. + /// The threshold of maximum count of tokens per doc. + /// The number of words to summarize the topic. + /// The number of burn-in iterations. /// Reset the random number generator for each document. /// Called upon fitting with the learnt enumeration on the dataset. - public static Vector ToLdaTopicVector(this Vector input, - int numTopic = LatentDirichletAllocationEstimator.Defaults.NumTopic, + public static Vector LatentDirichletAllocation(this Vector input, + int numberOfTopics = LatentDirichletAllocationEstimator.Defaults.NumberOfTopics, Single alphaSum = LatentDirichletAllocationEstimator.Defaults.AlphaSum, Single beta = LatentDirichletAllocationEstimator.Defaults.Beta, - int mhstep = LatentDirichletAllocationEstimator.Defaults.Mhstep, - int numIterations = LatentDirichletAllocationEstimator.Defaults.NumIterations, + int samplingStepCount = LatentDirichletAllocationEstimator.Defaults.SamplingStepCount, + int maximumNumberOfIterations = LatentDirichletAllocationEstimator.Defaults.MaximumNumberOfIterations, int likelihoodInterval = LatentDirichletAllocationEstimator.Defaults.LikelihoodInterval, - int numThreads = LatentDirichletAllocationEstimator.Defaults.NumThreads, - int numMaxDocToken = LatentDirichletAllocationEstimator.Defaults.NumMaxDocToken, - int numSummaryTermPerTopic = LatentDirichletAllocationEstimator.Defaults.NumSummaryTermPerTopic, - int numBurninIterations = LatentDirichletAllocationEstimator.Defaults.NumBurninIterations, + int numberOfThreads = LatentDirichletAllocationEstimator.Defaults.NumberOfThreads, + int maximumTokenCountPerDocument = LatentDirichletAllocationEstimator.Defaults.MaximumTokenCountPerDocument, + int numberOfSummaryTermsPerTopic = LatentDirichletAllocationEstimator.Defaults.NumberOfSummaryTermsPerTopic, + int numberOfBurninIterations = LatentDirichletAllocationEstimator.Defaults.NumberOfBurninIterations, bool resetRandomGenerator = LatentDirichletAllocationEstimator.Defaults.ResetRandomGenerator, - LdaFitResult.OnFit onFit = null) + LatentDirichletAllocationFitResult.OnFit onFit = null) { Contracts.CheckValue(input, nameof(input)); return new ImplVector(input, - new Config(numTopic, alphaSum, beta, mhstep, numIterations, likelihoodInterval, numThreads, numMaxDocToken, numSummaryTermPerTopic, - numBurninIterations, resetRandomGenerator, Wrap(onFit))); + new Config(numberOfTopics, alphaSum, beta, samplingStepCount, maximumNumberOfIterations, likelihoodInterval, numberOfThreads, maximumTokenCountPerDocument, numberOfSummaryTermsPerTopic, + numberOfBurninIterations, resetRandomGenerator, Wrap(onFit))); } } } \ No newline at end of file diff --git a/src/Microsoft.ML.Transforms/Text/LdaTransform.cs b/src/Microsoft.ML.Transforms/Text/LdaTransform.cs index 3c247ab1fd..aec1bfe763 100644 --- a/src/Microsoft.ML.Transforms/Text/LdaTransform.cs +++ b/src/Microsoft.ML.Transforms/Text/LdaTransform.cs @@ -58,7 +58,7 @@ internal sealed class Options : TransformInputBase [Argument(ArgumentType.AtMostOnce, HelpText = "The number of topics", SortOrder = 50)] [TGUI(SuggestedSweeps = "20,40,100,200")] [TlcModule.SweepableDiscreteParam("NumTopic", new object[] { 20, 40, 100, 200 })] - public int NumTopic = LatentDirichletAllocationEstimator.Defaults.NumTopic; + public int NumTopic = LatentDirichletAllocationEstimator.Defaults.NumberOfTopics; [Argument(ArgumentType.AtMostOnce, HelpText = "Dirichlet prior on document-topic vectors")] [TGUI(SuggestedSweeps = "1,10,100,200")] @@ -73,30 +73,30 @@ internal sealed class Options : TransformInputBase [Argument(ArgumentType.Multiple, HelpText = "Number of Metropolis Hasting step")] [TGUI(SuggestedSweeps = "2,4,8,16")] [TlcModule.SweepableDiscreteParam("Mhstep", new object[] { 2, 4, 8, 16 })] - public int Mhstep = LatentDirichletAllocationEstimator.Defaults.Mhstep; + public int Mhstep = LatentDirichletAllocationEstimator.Defaults.SamplingStepCount; [Argument(ArgumentType.AtMostOnce, HelpText = "Number of iterations", ShortName = "iter")] [TGUI(SuggestedSweeps = "100,200,300,400")] [TlcModule.SweepableDiscreteParam("NumIterations", new object[] { 100, 200, 300, 400 })] - public int NumIterations = LatentDirichletAllocationEstimator.Defaults.NumIterations; + public int NumIterations = LatentDirichletAllocationEstimator.Defaults.MaximumNumberOfIterations; [Argument(ArgumentType.AtMostOnce, HelpText = "Compute log likelihood over local dataset on this iteration interval", ShortName = "llInterval")] public int LikelihoodInterval = LatentDirichletAllocationEstimator.Defaults.LikelihoodInterval; // REVIEW: Should change the default when multi-threading support is optimized. [Argument(ArgumentType.AtMostOnce, HelpText = "The number of training threads. Default value depends on number of logical processors.", ShortName = "t", SortOrder = 50)] - public int NumThreads = LatentDirichletAllocationEstimator.Defaults.NumThreads; + public int NumThreads = LatentDirichletAllocationEstimator.Defaults.NumberOfThreads; [Argument(ArgumentType.AtMostOnce, HelpText = "The threshold of maximum count of tokens per doc", ShortName = "maxNumToken", SortOrder = 50)] - public int NumMaxDocToken = LatentDirichletAllocationEstimator.Defaults.NumMaxDocToken; + public int NumMaxDocToken = LatentDirichletAllocationEstimator.Defaults.MaximumTokenCountPerDocument; [Argument(ArgumentType.AtMostOnce, HelpText = "The number of words to summarize the topic", ShortName = "ns")] - public int NumSummaryTermPerTopic = LatentDirichletAllocationEstimator.Defaults.NumSummaryTermPerTopic; + public int NumSummaryTermPerTopic = LatentDirichletAllocationEstimator.Defaults.NumberOfSummaryTermsPerTopic; [Argument(ArgumentType.AtMostOnce, HelpText = "The number of burn-in iterations", ShortName = "burninIter")] [TGUI(SuggestedSweeps = "10,20,30,40")] [TlcModule.SweepableDiscreteParam("NumBurninIterations", new object[] { 10, 20, 30, 40 })] - public int NumBurninIterations = LatentDirichletAllocationEstimator.Defaults.NumBurninIterations; + public int NumBurninIterations = LatentDirichletAllocationEstimator.Defaults.NumberOfBurninIterations; [Argument(ArgumentType.AtMostOnce, HelpText = "Reset the random number generator for each document", ShortName = "reset")] public bool ResetRandomGenerator = LatentDirichletAllocationEstimator.Defaults.ResetRandomGenerator; @@ -219,17 +219,17 @@ internal LdaState(IExceptionContext ectx, LatentDirichletAllocationEstimator.Col _numVocab = numVocab; _ldaTrainer = new LdaSingleBox( - InfoEx.NumTopic, + InfoEx.NumberOfTopics, numVocab, /* Need to set number of vocabulary here */ InfoEx.AlphaSum, InfoEx.Beta, - InfoEx.NumIter, + InfoEx.NumberOfIterations, InfoEx.LikelihoodInterval, - InfoEx.NumThread, - InfoEx.MHStep, - InfoEx.NumSummaryTermPerTopic, + InfoEx.NumberOfThreads, + InfoEx.SamplingStepCount, + InfoEx.NumberOfSummaryTermsPerTopic, false, - InfoEx.NumMaxDocToken); + InfoEx.MaximumTokenCountPerDocument); } internal LdaState(IExceptionContext ectx, ModelLoadContext ctx) @@ -257,19 +257,19 @@ internal LdaState(IExceptionContext ectx, ModelLoadContext ctx) ectx.CheckDecode(aliasMemBlockSize > 0); _ldaTrainer = new LdaSingleBox( - InfoEx.NumTopic, + InfoEx.NumberOfTopics, _numVocab, /* Need to set number of vocabulary here */ InfoEx.AlphaSum, InfoEx.Beta, - InfoEx.NumIter, + InfoEx.NumberOfIterations, InfoEx.LikelihoodInterval, - InfoEx.NumThread, - InfoEx.MHStep, - InfoEx.NumSummaryTermPerTopic, + InfoEx.NumberOfThreads, + InfoEx.SamplingStepCount, + InfoEx.NumberOfSummaryTermsPerTopic, false, - InfoEx.NumMaxDocToken); + InfoEx.MaximumTokenCountPerDocument); - _ldaTrainer.AllocateModelMemory(_numVocab, InfoEx.NumTopic, memBlockSize, aliasMemBlockSize); + _ldaTrainer.AllocateModelMemory(_numVocab, InfoEx.NumberOfTopics, memBlockSize, aliasMemBlockSize); for (int i = 0; i < _numVocab; i++) { @@ -400,7 +400,7 @@ public int FeedTrain(IExceptionContext ectx, in VBuffer input) // Ignore this row. return 0; } - if (docSize >= InfoEx.NumMaxDocToken - termFreq) + if (docSize >= InfoEx.MaximumTokenCountPerDocument - termFreq) break; // If legal then add the term. @@ -448,7 +448,7 @@ public void Output(in VBuffer src, ref VBuffer dst, int numBurnin } } - int len = InfoEx.NumTopic; + int len = InfoEx.NumberOfTopics; var srcValues = src.GetValues(); if (srcValues.Length == 0) { @@ -476,7 +476,7 @@ public void Output(in VBuffer src, ref VBuffer dst, int numBurnin return; } - if (docSize >= InfoEx.NumMaxDocToken - termFreq) + if (docSize >= InfoEx.MaximumTokenCountPerDocument - termFreq) break; docSize += termFreq; @@ -557,7 +557,7 @@ protected override DataViewSchema.DetachedColumn[] GetOutputColumnsCore() for (int i = 0; i < _parent.ColumnPairs.Length; i++) { var info = _parent._columns[i]; - result[i] = new DataViewSchema.DetachedColumn(_parent.ColumnPairs[i].outputColumnName, new VectorType(NumberDataViewType.Single, info.NumTopic), null); + result[i] = new DataViewSchema.DetachedColumn(_parent.ColumnPairs[i].outputColumnName, new VectorType(NumberDataViewType.Single, info.NumberOfTopics), null); } return result; } @@ -576,7 +576,7 @@ private ValueGetter> GetTopic(DataViewRow input, int iinfo) var getSrc = RowCursorUtils.GetVecGetterAs(NumberDataViewType.Double, input, _srcCols[iinfo]); var src = default(VBuffer); var lda = _parent._ldas[iinfo]; - int numBurninIter = lda.InfoEx.NumBurninIter; + int numBurninIter = lda.InfoEx.NumberOfBurninIterations; bool reset = lda.InfoEx.ResetRandomGenerator; return (ref VBuffer dst) => @@ -831,7 +831,7 @@ private static List>> Train(IHostEnvironment env, I break; } - if (docSize >= columns[i].NumMaxDocToken - termFreq) + if (docSize >= columns[i].MaximumTokenCountPerDocument - termFreq) break; //control the document length //if legal then add the term @@ -920,16 +920,16 @@ public sealed class LatentDirichletAllocationEstimator : IEstimatorThe environment. /// Name of the column resulting from the transformation of . /// Name of the column to transform. If set to , the value of the will be used as source. - /// The number of topics. + /// The number of topics. /// Dirichlet prior on document-topic vectors. /// Dirichlet prior on vocab-topic vectors. - /// Number of Metropolis Hasting step. - /// Number of iterations. + /// Number of Metropolis Hasting step. + /// Number of iterations. + /// The number of training threads. Default value depends on number of logical processors. + /// The threshold of maximum count of tokens per doc. + /// The number of words to summarize the topic. /// Compute log likelihood over local dataset on this iteration interval. - /// The number of training threads. Default value depends on number of logical processors. - /// The threshold of maximum count of tokens per doc. - /// The number of words to summarize the topic. - /// The number of burn-in iterations. + /// The number of burn-in iterations. /// Reset the random number generator for each document. internal LatentDirichletAllocationEstimator(IHostEnvironment env, string outputColumnName, string inputColumnName = null, - int numTopic = Defaults.NumTopic, + int numberOfTopics = Defaults.NumberOfTopics, float alphaSum = Defaults.AlphaSum, float beta = Defaults.Beta, - int mhstep = Defaults.Mhstep, - int numIterations = Defaults.NumIterations, + int samplingStepCount = Defaults.SamplingStepCount, + int maximumNumberOfIterations = Defaults.MaximumNumberOfIterations, + int numberOfThreads = Defaults.NumberOfThreads, + int maximumTokenCountPerDocument = Defaults.MaximumTokenCountPerDocument, + int numberOfSummaryTermsPerTopic = Defaults.NumberOfSummaryTermsPerTopic, int likelihoodInterval = Defaults.LikelihoodInterval, - int numThreads = Defaults.NumThreads, - int numMaxDocToken = Defaults.NumMaxDocToken, - int numSummaryTermPerTopic = Defaults.NumSummaryTermPerTopic, - int numBurninIterations = Defaults.NumBurninIterations, + int numberOfBurninIterations = Defaults.NumberOfBurninIterations, bool resetRandomGenerator = Defaults.ResetRandomGenerator) : this(env, new[] { new ColumnOptions(outputColumnName, inputColumnName ?? outputColumnName, - numTopic, alphaSum, beta, mhstep, numIterations, likelihoodInterval, numThreads, numMaxDocToken, - numSummaryTermPerTopic, numBurninIterations, resetRandomGenerator) }) + numberOfTopics, alphaSum, beta, samplingStepCount, maximumNumberOfIterations, likelihoodInterval, numberOfThreads, maximumTokenCountPerDocument, + numberOfSummaryTermsPerTopic, numberOfBurninIterations, resetRandomGenerator) }) { } /// @@ -995,7 +995,7 @@ public sealed class ColumnOptions /// /// The number of topics. /// - public readonly int NumTopic; + public readonly int NumberOfTopics; /// /// Dirichlet prior on document-topic vectors. /// @@ -1007,11 +1007,11 @@ public sealed class ColumnOptions /// /// Number of Metropolis Hasting step. /// - public readonly int MHStep; + public readonly int SamplingStepCount; /// /// Number of iterations. /// - public readonly int NumIter; + public readonly int NumberOfIterations; /// /// Compute log likelihood over local dataset on this iteration interval. /// @@ -1019,19 +1019,19 @@ public sealed class ColumnOptions /// /// The number of training threads. /// - public readonly int NumThread; + public readonly int NumberOfThreads; /// /// The threshold of maximum count of tokens per doc. /// - public readonly int NumMaxDocToken; + public readonly int MaximumTokenCountPerDocument; /// /// The number of words to summarize the topic. /// - public readonly int NumSummaryTermPerTopic; + public readonly int NumberOfSummaryTermsPerTopic; /// /// The number of burn-in iterations. /// - public readonly int NumBurninIter; + public readonly int NumberOfBurninIterations; /// /// Reset the random number generator for each document. /// @@ -1042,54 +1042,54 @@ public sealed class ColumnOptions /// /// The column containing the output scores over a set of topics, represented as a vector of floats. /// The column representing the document as a vector of floats.A null value for the column means is replaced. - /// The number of topics. + /// The number of topics. /// Dirichlet prior on document-topic vectors. /// Dirichlet prior on vocab-topic vectors. - /// Number of Metropolis Hasting step. - /// Number of iterations. + /// Number of Metropolis Hasting step. + /// Number of iterations. /// Compute log likelihood over local dataset on this iteration interval. - /// The number of training threads. Default value depends on number of logical processors. - /// The threshold of maximum count of tokens per doc. - /// The number of words to summarize the topic. - /// The number of burn-in iterations. + /// The number of training threads. Default value depends on number of logical processors. + /// The threshold of maximum count of tokens per doc. + /// The number of words to summarize the topic. + /// The number of burn-in iterations. /// Reset the random number generator for each document. public ColumnOptions(string name, string inputColumnName = null, - int numTopic = LatentDirichletAllocationEstimator.Defaults.NumTopic, + int numberOfTopics = LatentDirichletAllocationEstimator.Defaults.NumberOfTopics, float alphaSum = LatentDirichletAllocationEstimator.Defaults.AlphaSum, float beta = LatentDirichletAllocationEstimator.Defaults.Beta, - int mhStep = LatentDirichletAllocationEstimator.Defaults.Mhstep, - int numIter = LatentDirichletAllocationEstimator.Defaults.NumIterations, + int samplingStepCount = LatentDirichletAllocationEstimator.Defaults.SamplingStepCount, + int maximumNumberOfIterations = LatentDirichletAllocationEstimator.Defaults.MaximumNumberOfIterations, int likelihoodInterval = LatentDirichletAllocationEstimator.Defaults.LikelihoodInterval, - int numThread = LatentDirichletAllocationEstimator.Defaults.NumThreads, - int numMaxDocToken = LatentDirichletAllocationEstimator.Defaults.NumMaxDocToken, - int numSummaryTermPerTopic = LatentDirichletAllocationEstimator.Defaults.NumSummaryTermPerTopic, - int numBurninIter = LatentDirichletAllocationEstimator.Defaults.NumBurninIterations, + int numberOfThreads = LatentDirichletAllocationEstimator.Defaults.NumberOfThreads, + int maximumTokenCountPerDocument = LatentDirichletAllocationEstimator.Defaults.MaximumTokenCountPerDocument, + int numberOfSummaryTermsPerTopic = LatentDirichletAllocationEstimator.Defaults.NumberOfSummaryTermsPerTopic, + int numberOfBurninIterations = LatentDirichletAllocationEstimator.Defaults.NumberOfBurninIterations, bool resetRandomGenerator = LatentDirichletAllocationEstimator.Defaults.ResetRandomGenerator) { Contracts.CheckValue(name, nameof(name)); Contracts.CheckValueOrNull(inputColumnName); - Contracts.CheckParam(numTopic > 0, nameof(numTopic), "Must be positive."); - Contracts.CheckParam(mhStep > 0, nameof(mhStep), "Must be positive."); - Contracts.CheckParam(numIter > 0, nameof(numIter), "Must be positive."); + Contracts.CheckParam(numberOfTopics > 0, nameof(numberOfTopics), "Must be positive."); + Contracts.CheckParam(samplingStepCount > 0, nameof(samplingStepCount), "Must be positive."); + Contracts.CheckParam(maximumNumberOfIterations > 0, nameof(maximumNumberOfIterations), "Must be positive."); Contracts.CheckParam(likelihoodInterval > 0, nameof(likelihoodInterval), "Must be positive."); - Contracts.CheckParam(numThread >= 0, nameof(numThread), "Must be positive or zero."); - Contracts.CheckParam(numMaxDocToken > 0, nameof(numMaxDocToken), "Must be positive."); - Contracts.CheckParam(numSummaryTermPerTopic > 0, nameof(numSummaryTermPerTopic), "Must be positive"); - Contracts.CheckParam(numBurninIter >= 0, nameof(numBurninIter), "Must be non-negative."); + Contracts.CheckParam(numberOfThreads >= 0, nameof(numberOfThreads), "Must be positive or zero."); + Contracts.CheckParam(maximumTokenCountPerDocument > 0, nameof(maximumTokenCountPerDocument), "Must be positive."); + Contracts.CheckParam(numberOfSummaryTermsPerTopic > 0, nameof(numberOfSummaryTermsPerTopic), "Must be positive"); + Contracts.CheckParam(numberOfBurninIterations >= 0, nameof(numberOfBurninIterations), "Must be non-negative."); Name = name; InputColumnName = inputColumnName ?? name; - NumTopic = numTopic; + NumberOfTopics = numberOfTopics; AlphaSum = alphaSum; Beta = beta; - MHStep = mhStep; - NumIter = numIter; + SamplingStepCount = samplingStepCount; + NumberOfIterations = maximumNumberOfIterations; LikelihoodInterval = likelihoodInterval; - NumThread = numThread; - NumMaxDocToken = numMaxDocToken; - NumSummaryTermPerTopic = numSummaryTermPerTopic; - NumBurninIter = numBurninIter; + NumberOfThreads = numberOfThreads; + MaximumTokenCountPerDocument = maximumTokenCountPerDocument; + NumberOfSummaryTermsPerTopic = numberOfSummaryTermsPerTopic; + NumberOfBurninIterations = numberOfBurninIterations; ResetRandomGenerator = resetRandomGenerator; } @@ -1128,33 +1128,33 @@ internal ColumnOptions(IExceptionContext ectx, ModelLoadContext ctx) // int NumBurninIter; // byte ResetRandomGenerator; - NumTopic = ctx.Reader.ReadInt32(); - ectx.CheckDecode(NumTopic > 0); + NumberOfTopics = ctx.Reader.ReadInt32(); + ectx.CheckDecode(NumberOfTopics > 0); AlphaSum = ctx.Reader.ReadSingle(); Beta = ctx.Reader.ReadSingle(); - MHStep = ctx.Reader.ReadInt32(); - ectx.CheckDecode(MHStep > 0); + SamplingStepCount = ctx.Reader.ReadInt32(); + ectx.CheckDecode(SamplingStepCount > 0); - NumIter = ctx.Reader.ReadInt32(); - ectx.CheckDecode(NumIter > 0); + NumberOfIterations = ctx.Reader.ReadInt32(); + ectx.CheckDecode(NumberOfIterations > 0); LikelihoodInterval = ctx.Reader.ReadInt32(); ectx.CheckDecode(LikelihoodInterval > 0); - NumThread = ctx.Reader.ReadInt32(); - ectx.CheckDecode(NumThread >= 0); + NumberOfThreads = ctx.Reader.ReadInt32(); + ectx.CheckDecode(NumberOfThreads >= 0); - NumMaxDocToken = ctx.Reader.ReadInt32(); - ectx.CheckDecode(NumMaxDocToken > 0); + MaximumTokenCountPerDocument = ctx.Reader.ReadInt32(); + ectx.CheckDecode(MaximumTokenCountPerDocument > 0); - NumSummaryTermPerTopic = ctx.Reader.ReadInt32(); - ectx.CheckDecode(NumSummaryTermPerTopic > 0); + NumberOfSummaryTermsPerTopic = ctx.Reader.ReadInt32(); + ectx.CheckDecode(NumberOfSummaryTermsPerTopic > 0); - NumBurninIter = ctx.Reader.ReadInt32(); - ectx.CheckDecode(NumBurninIter >= 0); + NumberOfBurninIterations = ctx.Reader.ReadInt32(); + ectx.CheckDecode(NumberOfBurninIterations >= 0); ResetRandomGenerator = ctx.Reader.ReadBoolByte(); } @@ -1176,16 +1176,16 @@ internal void Save(ModelSaveContext ctx) // int NumBurninIter; // byte ResetRandomGenerator; - ctx.Writer.Write(NumTopic); + ctx.Writer.Write(NumberOfTopics); ctx.Writer.Write(AlphaSum); ctx.Writer.Write(Beta); - ctx.Writer.Write(MHStep); - ctx.Writer.Write(NumIter); + ctx.Writer.Write(SamplingStepCount); + ctx.Writer.Write(NumberOfIterations); ctx.Writer.Write(LikelihoodInterval); - ctx.Writer.Write(NumThread); - ctx.Writer.Write(NumMaxDocToken); - ctx.Writer.Write(NumSummaryTermPerTopic); - ctx.Writer.Write(NumBurninIter); + ctx.Writer.Write(NumberOfThreads); + ctx.Writer.Write(MaximumTokenCountPerDocument); + ctx.Writer.Write(NumberOfSummaryTermsPerTopic); + ctx.Writer.Write(NumberOfBurninIterations); ctx.Writer.WriteBoolByte(ResetRandomGenerator); } } diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs index 92647a4ab0..230ed970d4 100644 --- a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs +++ b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs @@ -579,17 +579,10 @@ public static NgramHashingEstimator ProduceHashedNgrams(this TransformsCatalog.T /// The transform's catalog. /// Name of the column resulting from the transformation of . /// Name of the column to transform. If set to , the value of the will be used as source. - /// The number of topics. - /// Dirichlet prior on document-topic vectors. - /// Dirichlet prior on vocab-topic vectors. - /// Number of Metropolis Hasting step. - /// Number of iterations. - /// Compute log likelihood over local dataset on this iteration interval. - /// The number of training threads. Default value depends on number of logical processors. - /// The threshold of maximum count of tokens per doc. - /// The number of words to summarize the topic. - /// The number of burn-in iterations. - /// Reset the random number generator for each document. + /// The number of topics. + /// Number of iterations. + /// The threshold of maximum count of tokens per doc. + /// The number of words to summarize the topic. /// /// /// new LatentDirichletAllocationEstimator(CatalogUtils.GetEnvironment(catalog), - outputColumnName, inputColumnName, numTopic, alphaSum, beta, mhstep, numIterations, likelihoodInterval, numThreads, - numMaxDocToken, numSummaryTermPerTopic, numBurninIterations, resetRandomGenerator); + outputColumnName, inputColumnName, numberOfTopics, + LatentDirichletAllocationEstimator.Defaults.AlphaSum, + LatentDirichletAllocationEstimator.Defaults.Beta, + LatentDirichletAllocationEstimator.Defaults.SamplingStepCount, + maximumNumberOfIterations, + LatentDirichletAllocationEstimator.Defaults.NumberOfThreads, + maximumTokenCountPerDocument, + numberOfSummaryTermsPerTopic, + LatentDirichletAllocationEstimator.Defaults.LikelihoodInterval, + LatentDirichletAllocationEstimator.Defaults.NumberOfBurninIterations, + LatentDirichletAllocationEstimator.Defaults.ResetRandomGenerator); /// /// Uses LightLDA to transform a document (represented as a vector of floats) diff --git a/test/Microsoft.ML.Functional.Tests/IntrospectiveTraining.cs b/test/Microsoft.ML.Functional.Tests/IntrospectiveTraining.cs index 08f5431a55..933eab22e3 100644 --- a/test/Microsoft.ML.Functional.Tests/IntrospectiveTraining.cs +++ b/test/Microsoft.ML.Functional.Tests/IntrospectiveTraining.cs @@ -183,7 +183,7 @@ public void InspectLdaModelParameters() // Define the pipeline. var pipeline = mlContext.Transforms.Text.ProduceWordBags("SentimentBag", "SentimentText") - .Append(mlContext.Transforms.Text.LatentDirichletAllocation("Features", "SentimentBag", numTopic: numTopics, numIterations: 10)); + .Append(mlContext.Transforms.Text.LatentDirichletAllocation("Features", "SentimentBag", numberOfTopics: numTopics, maximumNumberOfIterations: 10)); // Fit the pipeline. var model = pipeline.Fit(data); diff --git a/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs b/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs index 813ec4c352..9796e8021d 100644 --- a/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs +++ b/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs @@ -675,7 +675,7 @@ public void LdaTopicModel() var est = data.MakeNewEstimator() .Append(r => ( r.label, - topics: r.text.ToBagofWords().ToLdaTopicVector(numTopic: 3, numSummaryTermPerTopic:5, alphaSum: 10, onFit: m => ldaSummary = m.LdaTopicSummary))); + topics: r.text.ToBagofWords().LatentDirichletAllocation(numberOfTopics: 3, numberOfSummaryTermsPerTopic:5, alphaSum: 10, onFit: m => ldaSummary = m.LdaTopicSummary))); var transformer = est.Fit(data); var tdata = transformer.Transform(data); diff --git a/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipe.cs b/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipe.cs index 56ae2fbc0e..dc2e5a1584 100644 --- a/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipe.cs +++ b/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipe.cs @@ -11,6 +11,7 @@ using Microsoft.ML.Internal.Utilities; using Microsoft.ML.Runtime; using Microsoft.ML.Transforms; +using Microsoft.ML.Transforms.Text; using Xunit; namespace Microsoft.ML.RunTests @@ -1318,7 +1319,9 @@ public void TestLDATransform() builder.AddColumn("F1V", NumberDataViewType.Single, data); var srcView = builder.GetDataView(); - var est = ML.Transforms.Text.LatentDirichletAllocation("F1V", numTopic: 3, numSummaryTermPerTopic: 3, alphaSum: 3, numThreads: 1, resetRandomGenerator: true); + var opt = new LatentDirichletAllocationEstimator.ColumnOptions(name: "F1V", numberOfTopics: 3, + numberOfSummaryTermsPerTopic: 3, alphaSum: 3, numberOfThreads: 1, resetRandomGenerator: true); + var est = ML.Transforms.Text.LatentDirichletAllocation(opt); var ldaTransformer = est.Fit(srcView); var transformedData = ldaTransformer.Transform(srcView); diff --git a/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs b/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs index 13c97bbcfb..ecaf5cd36e 100644 --- a/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs @@ -288,7 +288,7 @@ public void LdaWorkout() .Load(sentimentDataPath); var est = new WordBagEstimator(env, "bag_of_words", "text"). - Append(new LatentDirichletAllocationEstimator(env, "topics", "bag_of_words", 10, numIterations: 10, + Append(new LatentDirichletAllocationEstimator(env, "topics", "bag_of_words", 10, maximumNumberOfIterations: 10, resetRandomGenerator: true)); // The following call fails because of the following issue