From fc9a73a8cc93dd898b0e76a129de8d319998bf6f Mon Sep 17 00:00:00 2001 From: Ivan Matantsev Date: Tue, 5 Jun 2018 16:08:13 -0700 Subject: [PATCH 1/4] Add Cluster evaluator --- .../Models/ClassificationEvaluator.cs | 4 +- src/Microsoft.ML/Models/ClusterEvaluator.cs | 71 ++++++++++++++ src/Microsoft.ML/Models/ClusterMetrics.cs | 94 +++++++++++++++++++ src/Microsoft.ML/Models/CrossValidator.cs | 7 ++ src/Microsoft.ML/Models/TrainTestEvaluator.cs | 7 ++ .../Scenarios/ClusteringTests.cs | 11 +++ 6 files changed, 192 insertions(+), 2 deletions(-) create mode 100644 src/Microsoft.ML/Models/ClusterEvaluator.cs create mode 100644 src/Microsoft.ML/Models/ClusterMetrics.cs diff --git a/src/Microsoft.ML/Models/ClassificationEvaluator.cs b/src/Microsoft.ML/Models/ClassificationEvaluator.cs index 8fedc3fb4f..bc97a372a0 100644 --- a/src/Microsoft.ML/Models/ClassificationEvaluator.cs +++ b/src/Microsoft.ML/Models/ClassificationEvaluator.cs @@ -57,13 +57,13 @@ public ClassificationMetrics Evaluate(PredictionModel model, ILearningPipelineLo IDataView overallMetrics = experiment.GetOutput(evaluteOutput.OverallMetrics); if (overallMetrics == null) { - throw environment.Except($"Could not find OverallMetrics in the results returned in {nameof(BinaryClassificationEvaluator)} Evaluate."); + throw environment.Except($"Could not find OverallMetrics in the results returned in {nameof(ClassificationEvaluator)} Evaluate."); } IDataView confusionMatrix = experiment.GetOutput(evaluteOutput.ConfusionMatrix); if (confusionMatrix == null) { - throw environment.Except($"Could not find ConfusionMatrix in the results returned in {nameof(BinaryClassificationEvaluator)} Evaluate."); + throw environment.Except($"Could not find ConfusionMatrix in the results returned in {nameof(ClassificationEvaluator)} Evaluate."); } var metric = ClassificationMetrics.FromMetrics(environment, overallMetrics, confusionMatrix); diff --git a/src/Microsoft.ML/Models/ClusterEvaluator.cs 
b/src/Microsoft.ML/Models/ClusterEvaluator.cs new file mode 100644 index 0000000000..5aceca16f7 --- /dev/null +++ b/src/Microsoft.ML/Models/ClusterEvaluator.cs @@ -0,0 +1,71 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using Microsoft.ML.Runtime; +using Microsoft.ML.Runtime.Data; +using Microsoft.ML.Transforms; + +namespace Microsoft.ML.Models +{ + public sealed partial class ClusterEvaluator + { + /// + /// Computes the quality metrics for the PredictionModel using the specified data set. + /// + /// + /// The trained PredictionModel to be evaluated. + /// + /// + /// The test data that will be predicted and used to evaluate the model. + /// + /// + /// A ClusterMetrics instance that describes how well the model performed against the test data. + /// + public ClusterMetrics Evaluate(PredictionModel model, ILearningPipelineLoader testData) + { + using (var environment = new TlcEnvironment()) + { + environment.CheckValue(model, nameof(model)); + environment.CheckValue(testData, nameof(testData)); + + Experiment experiment = environment.CreateExperiment(); + + ILearningPipelineStep testDataStep = testData.ApplyStep(previousStep: null, experiment); + if (!(testDataStep is ILearningPipelineDataStep testDataOutput)) + { + throw environment.Except($"The {nameof(ILearningPipelineLoader)} did not return a {nameof(ILearningPipelineDataStep)} from ApplyStep."); + } + + var datasetScorer = new DatasetTransformScorer + { + Data = testDataOutput.Data, + }; + DatasetTransformScorer.Output scoreOutput = experiment.Add(datasetScorer); + + Data = scoreOutput.ScoredData; + Output evaluteOutput = experiment.Add(this); + + experiment.Compile(); + + experiment.SetInput(datasetScorer.TransformModel, model.PredictorModel); + testData.SetInput(environment, experiment); + + experiment.Run(); + + IDataView overallMetrics = 
experiment.GetOutput(evaluteOutput.OverallMetrics); + + if (overallMetrics == null) + { + throw environment.Except($"Could not find OverallMetrics in the results returned in {nameof(ClusterEvaluator)} Evaluate."); + } + + var metric = ClusterMetrics.FromOverallMetrics(environment, overallMetrics); + + Contracts.Assert(metric.Count == 1, $"Exactly one metric set was expected but found {metric.Count} metrics"); + + return metric[0]; + } + } + } +} diff --git a/src/Microsoft.ML/Models/ClusterMetrics.cs b/src/Microsoft.ML/Models/ClusterMetrics.cs new file mode 100644 index 0000000000..0f39784c97 --- /dev/null +++ b/src/Microsoft.ML/Models/ClusterMetrics.cs @@ -0,0 +1,94 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using Microsoft.ML.Runtime; +using Microsoft.ML.Runtime.Api; +using Microsoft.ML.Runtime.Data; +using System; +using System.Collections.Generic; + +namespace Microsoft.ML.Models +{ + /// + /// This class contains the overall metrics computed by regression evaluators. + /// + public sealed class ClusterMetrics + { + private ClusterMetrics() + { + } + + internal static List FromOverallMetrics(IHostEnvironment env, IDataView overallMetrics) + { + Contracts.AssertValue(env); + env.AssertValue(overallMetrics); + + var metricsEnumerable = overallMetrics.AsEnumerable(env, true, ignoreMissingColumns: true); + if (!metricsEnumerable.GetEnumerator().MoveNext()) + { + throw env.Except("The overall ClusteringMetrics didn't have any rows."); + } + + var metrics = new List(); + foreach (var metric in metricsEnumerable) + { + metrics.Add(new ClusterMetrics() + { + AvgMinScore = metric.AvgMinScore, + Nmi = metric.Nmi, + Dbi = metric.Dbi, + }); + } + + return metrics; + } + + /// + /// Davies-Bouldin Index. + /// + /// + /// DBI is a measure of how much scatter is in the cluster and the cluster separation. 
+ /// + public double Dbi { get; private set; } + + /// + /// Normalized Mutual Information + /// + /// + /// NMI is a measure of the mutual dependence of the variables. + /// Normalized variants work on data that already has cluster labels. + /// It returns values from 0 to 1, where higher numbers are better. + /// + public double Nmi { get; private set; } + + /// + /// Average minimum score. + /// + /// + /// This makes sense for K-Means algorithm, where the 'score' is the distance from the centroid to the example. + /// The average score is, therefore, a measure of proximity of the examples to cluster centroids. + /// In other words, it's the 'cluster tightness' measure. + /// Note however, that this metric will only decrease if the number of clusters is increased, and in the extreme case (where each distinct example is its own cluster) it will be equal to zero. + /// + public double AvgMinScore { get; private set; } + + /// + /// This class contains the public fields necessary to deserialize from IDataView. 
+ /// + private sealed class SerializationClass + { +#pragma warning disable 649 // never assigned + [ColumnName(Runtime.Data.ClusteringEvaluator.Dbi)] + public Double Dbi; + + [ColumnName(Runtime.Data.ClusteringEvaluator.Nmi)] + public Double Nmi; + + [ColumnName(Runtime.Data.ClusteringEvaluator.AvgMinScore)] + public Double AvgMinScore; + +#pragma warning restore 649 // never assigned + } + } +} diff --git a/src/Microsoft.ML/Models/CrossValidator.cs b/src/Microsoft.ML/Models/CrossValidator.cs index 173e03916c..f95bb4b3b3 100644 --- a/src/Microsoft.ML/Models/CrossValidator.cs +++ b/src/Microsoft.ML/Models/CrossValidator.cs @@ -142,6 +142,12 @@ public CrossValidationOutput CrossValidate(Lea environment, experiment.GetOutput(crossValidateOutput.OverallMetrics)); } + else if( Kind== MacroUtilsTrainerKinds.SignatureClusteringTrainer) + { + cvOutput.ClusterMetrics = ClusterMetrics.FromOverallMetrics( + environment, + experiment.GetOutput(crossValidateOutput.OverallMetrics)); + } else { //Implement metrics for ranking, clustering and anomaly detection. 
@@ -174,6 +180,7 @@ public class CrossValidationOutput public List BinaryClassificationMetrics; public List ClassificationMetrics; public List RegressionMetrics; + public List ClusterMetrics; public PredictionModel[] PredictorModels; //REVIEW: Add warnings and per instance results and implement diff --git a/src/Microsoft.ML/Models/TrainTestEvaluator.cs b/src/Microsoft.ML/Models/TrainTestEvaluator.cs index 19261e82de..03fde75dd4 100644 --- a/src/Microsoft.ML/Models/TrainTestEvaluator.cs +++ b/src/Microsoft.ML/Models/TrainTestEvaluator.cs @@ -140,6 +140,12 @@ public TrainTestEvaluatorOutput TrainTestEvaluate public BinaryClassificationMetrics BinaryClassificationMetrics; public ClassificationMetrics ClassificationMetrics; public RegressionMetrics RegressionMetrics; + public ClusterMetrics ClusterMetrics; public PredictionModel PredictorModels; //REVIEW: Add warnings and per instance results and implement diff --git a/test/Microsoft.ML.Tests/Scenarios/ClusteringTests.cs b/test/Microsoft.ML.Tests/Scenarios/ClusteringTests.cs index 560ee11d28..ad1027bb1a 100644 --- a/test/Microsoft.ML.Tests/Scenarios/ClusteringTests.cs +++ b/test/Microsoft.ML.Tests/Scenarios/ClusteringTests.cs @@ -1,4 +1,5 @@ using Microsoft.ML.Data; +using Microsoft.ML.Models; using Microsoft.ML.Runtime; using Microsoft.ML.Runtime.Api; using Microsoft.ML.Trainers; @@ -116,6 +117,16 @@ public void PredictClusters() Assert.True(!labels.Contains(scores.SelectedClusterId)); labels.Add(scores.SelectedClusterId); } + + var evaluator = new ClusterEvaluator(); + var testData = CollectionDataSource.Create(clusters); + ClusterMetrics metrics = evaluator.Evaluate(model, testData); + + //Label is not specified, so NMI would be equal to NaN + Assert.Equal(metrics.Nmi, double.NaN); + //Calculate dbi is false by default so Dbi would be 0 + Assert.Equal(metrics.Dbi, (double)0.0); + Assert.Equal(metrics.AvgMinScore, (double)0.0, 5); } } } From 0196e933d56e4dfe57f737caa1ec0d5c67e5617b Mon Sep 17 00:00:00 2001 From: Ivan 
Matantsev Date: Tue, 5 Jun 2018 16:29:14 -0700 Subject: [PATCH 2/4] fix copypaste --- src/Microsoft.ML/Models/ClusterMetrics.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Microsoft.ML/Models/ClusterMetrics.cs b/src/Microsoft.ML/Models/ClusterMetrics.cs index 0f39784c97..77795a565a 100644 --- a/src/Microsoft.ML/Models/ClusterMetrics.cs +++ b/src/Microsoft.ML/Models/ClusterMetrics.cs @@ -11,7 +11,7 @@ namespace Microsoft.ML.Models { /// - /// This class contains the overall metrics computed by regression evaluators. + /// This class contains the overall metrics computed by cluster evaluators. /// public sealed class ClusterMetrics { From e416826437b92e3ab26c5c410b018d48c8bd5a39 Mon Sep 17 00:00:00 2001 From: Ivan Matantsev Date: Wed, 6 Jun 2018 08:42:48 -0700 Subject: [PATCH 3/4] address comments --- src/Microsoft.ML/Models/ClusterMetrics.cs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/Microsoft.ML/Models/ClusterMetrics.cs b/src/Microsoft.ML/Models/ClusterMetrics.cs index 77795a565a..7f88784ef8 100644 --- a/src/Microsoft.ML/Models/ClusterMetrics.cs +++ b/src/Microsoft.ML/Models/ClusterMetrics.cs @@ -56,9 +56,8 @@ internal static List FromOverallMetrics(IHostEnvironment env, ID /// Normalized Mutual Information /// /// - /// NMI is a measure of the mutual dependence of the variables. - /// Normalized variants work on data that already has cluster labels. - /// It returns values from 0 to 1, where higher numbers are better. + /// NMI is a measure of the mutual dependence between the true and predicted cluster labels for instances in the dataset. + /// NMI ranges between 0 and 1 where "0" indicates clustering is random and "1" indicates clustering is perfect w.r.t true labels. /// public double Nmi { get; private set; } @@ -66,10 +65,11 @@ internal static List FromOverallMetrics(IHostEnvironment env, ID /// Average minimum score. 
/// /// - /// This makes sense for K-Means algorithm, where the 'score' is the distance from the centroid to the example. - /// The average score is, therefore, a measure of proximity of the examples to cluster centroids. - /// In other words, it's the 'cluster tightness' measure. - /// Note however, that this metric will only decrease if the number of clusters is increased, and in the extreme case (where each distinct example is its own cluster) it will be equal to zero. + /// AvgMinScore is the average squared-distance of examples from the respective cluster centroids. + /// It is defined as + /// AvgMinScore = (1/m) * sum ((xi - c(xi))^2) + /// where m is the number of instances in the dataset. + /// xi is the i'th instance and c(xi) is the centroid of the predicted cluster for xi. + /// public double AvgMinScore { get; private set; } From 53daa75ed27fc8505c822bd0236ed62b68f0725c Mon Sep 17 00:00:00 2001 From: Ivan Matantsev Date: Wed, 6 Jun 2018 10:17:18 -0700 Subject: [PATCH 4/4] formatting --- src/Microsoft.ML/Models/CrossValidator.cs | 8 ++++---- src/Microsoft.ML/Models/TrainTestEvaluator.cs | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/Microsoft.ML/Models/CrossValidator.cs b/src/Microsoft.ML/Models/CrossValidator.cs index f95bb4b3b3..ab84f8a715 100644 --- a/src/Microsoft.ML/Models/CrossValidator.cs +++ b/src/Microsoft.ML/Models/CrossValidator.cs @@ -19,7 +19,7 @@ public sealed partial class CrossValidator /// Class type that represents prediction schema. /// Machine learning pipeline may contain loader, transforms and at least one trainer. 
/// List containing metrics and predictor model for each fold - public CrossValidationOutput CrossValidate(LearningPipeline pipeline) + public CrossValidationOutput CrossValidate(LearningPipeline pipeline) where TInput : class where TOutput : class, new() { @@ -76,7 +76,7 @@ public CrossValidationOutput CrossValidate(Lea { PredictorModel = predictorModel }; - + var scorerOutput = subGraph.Add(scorer); lastTransformModel = scorerOutput.ScoringTransform; step = new ScorerPipelineStep(scorerOutput.ScoredData, scorerOutput.ScoringTransform); @@ -129,7 +129,7 @@ public CrossValidationOutput CrossValidate(Lea experiment.GetOutput(crossValidateOutput.OverallMetrics), experiment.GetOutput(crossValidateOutput.ConfusionMatrix), 2); } - else if(Kind == MacroUtilsTrainerKinds.SignatureMultiClassClassifierTrainer) + else if (Kind == MacroUtilsTrainerKinds.SignatureMultiClassClassifierTrainer) { cvOutput.ClassificationMetrics = ClassificationMetrics.FromMetrics( environment, @@ -142,7 +142,7 @@ public CrossValidationOutput CrossValidate(Lea environment, experiment.GetOutput(crossValidateOutput.OverallMetrics)); } - else if( Kind== MacroUtilsTrainerKinds.SignatureClusteringTrainer) + else if (Kind == MacroUtilsTrainerKinds.SignatureClusteringTrainer) { cvOutput.ClusterMetrics = ClusterMetrics.FromOverallMetrics( environment, diff --git a/src/Microsoft.ML/Models/TrainTestEvaluator.cs b/src/Microsoft.ML/Models/TrainTestEvaluator.cs index 03fde75dd4..ae00a34de6 100644 --- a/src/Microsoft.ML/Models/TrainTestEvaluator.cs +++ b/src/Microsoft.ML/Models/TrainTestEvaluator.cs @@ -102,7 +102,7 @@ public TrainTestEvaluatorOutput TrainTestEvaluate TrainTestEvaluate TrainTestEvaluate(predictor, memoryStream); } - + return trainTestOutput; } }