8 changes: 4 additions & 4 deletions src/Microsoft.ML.AutoML/Experiment/Runners/CrossValRunner.cs
@@ -66,11 +66,11 @@ public CrossValRunner(MLContext context,

        private static double CalcAverageScore(IEnumerable<double> scores)
        {
-            if (scores.Any(s => double.IsNaN(s)))
-            {
+            var newScores = scores.Where(r => !double.IsNaN(r));
+            // Return NaN iff all scores are NaN
+            if (newScores.Count() == 0)
                 return double.NaN;
-            }
-            return scores.Average();
+            return newScores.Average();
        }
    }
}
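
For illustration, the effect of this change on a couple of inputs (a standalone sketch mirroring the new logic, not part of the PR):

using System;
using System.Collections.Generic;
using System.Linq;

class AverageScoreDemo
{
    // Mirrors the new CalcAverageScore logic, for illustration only.
    static double CalcAverageScore(IEnumerable<double> scores)
    {
        var newScores = scores.Where(s => !double.IsNaN(s));
        // Return NaN iff all scores are NaN.
        return newScores.Any() ? newScores.Average() : double.NaN;
    }

    static void Main()
    {
        Console.WriteLine(CalcAverageScore(new[] { 0.8, double.NaN, 0.6 }));   // 0.7 (NaN fold excluded)
        Console.WriteLine(CalcAverageScore(new[] { double.NaN, double.NaN })); // NaN (all folds NaN)
    }
}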
CrossValSummaryRunner.cs

@@ -6,6 +6,7 @@
using System.Collections.Generic;
using System.IO;
using System.Linq;
+using Microsoft.ML.Data;
using Microsoft.ML.Runtime;

namespace Microsoft.ML.AutoML
@@ -70,27 +71,105 @@ public CrossValSummaryRunner(MLContext context,

            // Get the model from the best fold
            var bestFoldIndex = BestResultUtil.GetIndexOfBestScore(trainResults.Select(r => r.score), _optimizingMetricInfo.IsMaximizing);
+            // bestFoldIndex will be -1 if the optimization metric for all folds is NaN.
+            // In this case, return model from the first fold.
+            bestFoldIndex = bestFoldIndex != -1 ? bestFoldIndex : 0;
justinormont (Contributor) commented on Apr 16, 2020:
I'd also look into places where the metric gets compared.

For instance, below in GetIndexClosestToAverage():

private static int GetIndexClosestToAverage(IEnumerable<double> values, double average)
{
    int avgFoldIndex = -1;
    var smallestDistFromAvg = double.PositiveInfinity;
    for (var i = 0; i < values.Count(); i++)
    {
        var distFromAvg = Math.Abs(values.ElementAt(i) - average);
        if (distFromAvg < smallestDistFromAvg || smallestDistFromAvg == double.PositiveInfinity)
        {
            smallestDistFromAvg = distFromAvg;
            avgFoldIndex = i;
        }
    }
    return avgFoldIndex;
}

While GetIndexClosestToAverage() could be adjusted to handle CV folds returning NaN, it would be better to remove the function and instead return a new instance of the metric class with the actual averages. The current function was created before the AutoML code could create new instances of the metric classes, so it just returned the fold closest to the average of the folds.

I would also look a bit further at where AutoML compares the metrics returned from each model in the sweep, when it chooses which is best. The comparison to NaN may also be there. #Resolved

najeeb-kazmi (Member Author) commented on Apr 17, 2020:

I have changed the calculation of the average to exclude any folds with NaN metrics. In finding the best run, there are no comparisons to NaN, but an index of -1 is returned if all the runs have a NaN metric. I have added a warning there checking for that case.

As for returning a Metrics object containing the averages of the metrics across the folds, I think this would be out of scope for this particular issue. Also, since we only have the type TMetrics here, creating the right metric would be fairly involved. #Resolved

justinormont (Contributor):

Thanks. Looks good.

In the future, we can use the MetricsStatistics class to do the averaging so that we're not duplicating functionality. It has the benefit of handling the PerClassLogLoss, though I suspect it's not properly handling the class ordering differences.

/// <summary>
/// The <see cref="RegressionMetricsStatistics"/> class holds summary
/// statistics over multiple observations of <see cref="RegressionMetrics"/>.
/// </summary>
public sealed class RegressionMetricsStatistics : IMetricsStatistics<RegressionMetrics>
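
For illustration, the core of what such a statistics class does (a standalone sketch of the pattern; the type and member names here are hypothetical stand-ins, not the Microsoft.ML.Data API):

using System;
using System.Collections.Generic;
using System.Linq;

// Hypothetical stand-in for a MetricStatistics-style accumulator: collect one
// metric across CV folds and expose summary statistics rather than a bare average.
public sealed class MetricStatisticsSketch
{
    private readonly List<double> _observations = new List<double>();

    public void Add(double value) => _observations.Add(value);

    public double Mean => _observations.Average();

    public double StandardDeviation =>
        _observations.Count > 1
            ? Math.Sqrt(_observations.Sum(x => Math.Pow(x - Mean, 2)) / (_observations.Count - 1))
            : 0;
}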

            var bestModel = trainResults.ElementAt(bestFoldIndex).model;

-            // Get the metrics from the fold whose score is closest to avg of all fold scores
-            var avgScore = trainResults.Average(r => r.score);
+            // Get the average metrics across all folds
+            var avgScore = GetAverageOfNonNaNScores(trainResults.Select(x => x.score));
            var indexClosestToAvg = GetIndexClosestToAverage(trainResults.Select(r => r.score), avgScore);
            var metricsClosestToAvg = trainResults[indexClosestToAvg].metrics;
+            var avgMetrics = GetAverageMetrics(trainResults.Select(x => x.metrics), metricsClosestToAvg);

            // Build result objects
-            var suggestedPipelineRunDetail = new SuggestedPipelineRunDetail<TMetrics>(pipeline, avgScore, allRunsSucceeded, metricsClosestToAvg, bestModel, null);
+            var suggestedPipelineRunDetail = new SuggestedPipelineRunDetail<TMetrics>(pipeline, avgScore, allRunsSucceeded, avgMetrics, bestModel, null);
            var runDetail = suggestedPipelineRunDetail.ToIterationResult(_preFeaturizer);
            return (suggestedPipelineRunDetail, runDetail);
        }

+        private static TMetrics GetAverageMetrics(IEnumerable<TMetrics> metrics, TMetrics metricsClosestToAvg)
+        {
+            if (typeof(TMetrics) == typeof(BinaryClassificationMetrics))
+            {
+                var newMetrics = metrics.Select(x => x as BinaryClassificationMetrics);
+                Contracts.Assert(newMetrics != null);
+
+                var result = new BinaryClassificationMetrics(
+                    auc: GetAverageOfNonNaNScores(newMetrics.Select(x => x.AreaUnderRocCurve)),
+                    accuracy: GetAverageOfNonNaNScores(newMetrics.Select(x => x.Accuracy)),
+                    positivePrecision: GetAverageOfNonNaNScores(newMetrics.Select(x => x.PositivePrecision)),
+                    positiveRecall: GetAverageOfNonNaNScores(newMetrics.Select(x => x.PositiveRecall)),
+                    negativePrecision: GetAverageOfNonNaNScores(newMetrics.Select(x => x.NegativePrecision)),
+                    negativeRecall: GetAverageOfNonNaNScores(newMetrics.Select(x => x.NegativeRecall)),
+                    f1Score: GetAverageOfNonNaNScores(newMetrics.Select(x => x.F1Score)),
+                    auprc: GetAverageOfNonNaNScores(newMetrics.Select(x => x.AreaUnderPrecisionRecallCurve)),
+                    // Return ConfusionMatrix from the fold closest to average score
+                    confusionMatrix: (metricsClosestToAvg as BinaryClassificationMetrics).ConfusionMatrix);
+                return result as TMetrics;
+            }
+
+            if (typeof(TMetrics) == typeof(MulticlassClassificationMetrics))
+            {
+                var newMetrics = metrics.Select(x => x as MulticlassClassificationMetrics);
+                Contracts.Assert(newMetrics != null);
+
+                var result = new MulticlassClassificationMetrics(
+                    accuracyMicro: GetAverageOfNonNaNScores(newMetrics.Select(x => x.MicroAccuracy)),
+                    accuracyMacro: GetAverageOfNonNaNScores(newMetrics.Select(x => x.MacroAccuracy)),
+                    logLoss: GetAverageOfNonNaNScores(newMetrics.Select(x => x.LogLoss)),
+                    logLossReduction: GetAverageOfNonNaNScores(newMetrics.Select(x => x.LogLossReduction)),
+                    topKPredictionCount: newMetrics.ElementAt(0).TopKPredictionCount,
+                    topKAccuracy: GetAverageOfNonNaNScores(newMetrics.Select(x => x.TopKAccuracy)),
+                    // Return PerClassLogLoss and ConfusionMatrix from the fold closest to average score
+                    perClassLogLoss: (metricsClosestToAvg as MulticlassClassificationMetrics).PerClassLogLoss.ToArray(),
+                    confusionMatrix: (metricsClosestToAvg as MulticlassClassificationMetrics).ConfusionMatrix);
+                return result as TMetrics;
+            }
+
+            if (typeof(TMetrics) == typeof(RegressionMetrics))
+            {
+                var newMetrics = metrics.Select(x => x as RegressionMetrics);
+                Contracts.Assert(newMetrics != null);
+
+                var result = new RegressionMetrics(
+                    l1: GetAverageOfNonNaNScores(newMetrics.Select(x => x.MeanAbsoluteError)),
+                    l2: GetAverageOfNonNaNScores(newMetrics.Select(x => x.MeanSquaredError)),
+                    rms: GetAverageOfNonNaNScores(newMetrics.Select(x => x.RootMeanSquaredError)),
+                    lossFunction: GetAverageOfNonNaNScores(newMetrics.Select(x => x.LossFunction)),
+                    rSquared: GetAverageOfNonNaNScores(newMetrics.Select(x => x.RSquared)));
+                return result as TMetrics;
+            }
+
+            throw new NotImplementedException($"Metric {typeof(TMetrics)} not implemented");
+        }
+
+        private static double GetAverageOfNonNaNScores(IEnumerable<double> results)
+        {
+            var newResults = results.Where(r => !double.IsNaN(r));
justinormont (Contributor):

What are your thoughts on metrics that can be +/-Infinity? Should they be included in the average or not? Any trade-offs?

Current behavior:

  • If the input includes only double.PositiveInfinity xor double.NegativeInfinity, GetAverageOfNonNaNScores() returns average = +/-Inf
  • If the input includes both double.PositiveInfinity and double.NegativeInfinity, GetAverageOfNonNaNScores() returns average = NaN
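
A quick standalone check of that behavior (this is just LINQ's Average over doubles, nothing AutoML-specific):

using System;
using System.Linq;

class InfinityAverageDemo
{
    static void Main()
    {
        // Only +Inf present: the sum, and hence the average, is +Inf.
        Console.WriteLine(new[] { 1.0, double.PositiveInfinity }.Average());                     // Infinity

        // Both +Inf and -Inf present: +Inf + -Inf is NaN, so the average is NaN.
        Console.WriteLine(new[] { double.PositiveInfinity, double.NegativeInfinity }.Average()); // NaN
    }
}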

najeeb-kazmi (Member Author):

So I've looked at the calculation of LogLoss, which is the main metric I would be concerned about being infinity. It looks to me that it can only be Double.PositiveInfinity if the aggregation over all the rows in the evaluation set overflows Double.MaxValue.

Here's the code for binary classification:

return Double.IsNaN(_logLoss) ? Double.NaN : (_numLogLossPositives + _numLogLossNegatives > 0)
    ? _logLoss / (_numLogLossPositives + _numLogLossNegatives) : 0;

This is only Double.PositiveInfinity if _logLoss is, which will only be the case if it overflows here, because logloss itself cannot be Double.PositiveInfinity: prob will never be 0 if the label is positive, and prob will never be 1 if the label is negative.

Double logloss;
if (!Single.IsNaN(prob))
{
    if (_label > 0)
    {
        // REVIEW: Should we bring back the option to use ln instead of log2?
        logloss = -Math.Log(prob, 2);
    }
    else
        logloss = -Math.Log(1.0 - prob, 2);
}
else
    logloss = Double.NaN;
UnweightedCounters.Update(_score, prob, _label, logloss, 1);

This does mean that if prob is NaN for any row, then logloss will be NaN for that row, and _logLoss for the entire evaluation set will be NaN.
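
That propagation is plain IEEE 754 arithmetic; a minimal illustration:

using System;
using System.Linq;

class NaNPropagationDemo
{
    static void Main()
    {
        // A single NaN term poisons the running sum, and therefore the final metric.
        var perRowLogLoss = new[] { 0.32, double.NaN, 0.15 };
        Console.WriteLine(perRowLogLoss.Sum()); // NaN
    }
}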

najeeb-kazmi (Member Author):

Similarly for multiclass classification:

LogLoss will only be Double.PositiveInfinity if _totalLogLoss is:

public double LogLoss { get { return _numInstances > 0 ? _totalLogLoss / _numInstances : 0; } }

and that will only be the case if it overflows here, because loglossCurr itself will never be Double.PositiveInfinity, as it can only be as large as -Math.Log(Epsilon), which is approximately 34.54 (see the quick check after the snippet below):

double logloss;
if (intLabel < _scoresArr.Length)
{
    // REVIEW: This assumes that the predictions are probabilities, not just relative scores
    // for the classes. Is this a correct assumption?
    float p = Math.Min(1, Math.Max(Epsilon, _scoresArr[intLabel]));
    logloss = -Math.Log(p);
}
else
{
    // Penalize logloss if the label was not seen during training
    logloss = -Math.Log(Epsilon);
    _numUnknownClassInstances++;
}
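
A quick numeric check of that bound, assuming the evaluator's Epsilon is 1e-15 (the constant value is an assumption here, chosen to be consistent with the 34.54 quoted above):

using System;

class LogLossBoundCheck
{
    static void Main()
    {
        const double Epsilon = 1e-15; // assumed evaluator constant
        Console.WriteLine(-Math.Log(Epsilon)); // 34.538776394910684
    }
}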

najeeb-kazmi (Member Author):

I agree with the principle of reporting NaN log loss if a single row has NaN probability and log loss: if there is a problem, the user needs to know via the metric. While it may be fine to suppress this for a handful of rows, it leads to questions about what to do when many or most rows have this problem. So I think it is better not to suppress NaNs in the calculation of total log loss.

By the same principle, we should not be suppressing infinities in the metrics across the folds. If one fold has an infinity, the average returned should be infinity.

justinormont (Contributor):

For log-loss, the Infinity occurs when the model is perfectly confident (p=0 or p=1) and wrong, for any of the predicted labels in the scoring dataset.

I created a dotnet fiddle to show this:
https://dotnetfiddle.net/DPIn5Y

It's demonstrating the code for binary log-loss:

Double logloss;
if (!Single.IsNaN(prob))
{
    if (_label > 0)
    {
        // REVIEW: Should we bring back the option to use ln instead of log2?
        logloss = -Math.Log(prob, 2);
    }
    else
        logloss = -Math.Log(1.0 - prob, 2);
}
else
    logloss = Double.NaN;
UnweightedCounters.Update(_score, prob, _label, logloss, 1);
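
A minimal standalone reproduction of the Infinity case (perfectly confident and wrong), along the lines of the fiddle:

using System;

class LogLossInfinityDemo
{
    static void Main()
    {
        // Positive true label, but the model is perfectly confident it is negative (prob = 0).
        float prob = 0f;
        double logloss = -Math.Log(prob, 2); // -log2(0) = +Infinity
        Console.WriteLine(logloss);          // Infinity
    }
}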

najeeb-kazmi (Member Author):

Ah yes, that's the true label, not the predicted label.

justinormont (Contributor):

I filed an issue to investigate whether we should threshold the probability for log-loss in binary classification. This would keep the AutoML code (discussed above) from receiving an Infinity value.

Issue: #5055 Investigate thresholding binary log-loss
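
A hypothetical sketch of what such thresholding could look like (the clamp and epsilon value are illustrative assumptions, not necessarily the approach #5055 will take):

using System;

class ThresholdedLogLossSketch
{
    // Hypothetical: clamp the probability away from 0 and 1 before taking the log,
    // so a perfectly confident wrong prediction yields a large finite penalty.
    static double BinaryLogLoss(double prob, bool positiveLabel, double epsilon = 1e-15)
    {
        var p = Math.Min(1 - epsilon, Math.Max(epsilon, prob));
        return positiveLabel ? -Math.Log(p, 2) : -Math.Log(1 - p, 2);
    }

    static void Main()
    {
        Console.WriteLine(BinaryLogLoss(0.0, positiveLabel: true)); // ~49.8 instead of Infinity
    }
}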

+            // Return NaN iff all scores are NaN
+            if (newResults.Count() == 0)
+                return double.NaN;
+            // Return average of non-NaN scores otherwise
+            return newResults.Average(r => r);
+        }

        private static int GetIndexClosestToAverage(IEnumerable<double> values, double average)
        {
+            // Average will be NaN iff all values are NaN.
+            // Return the first index in this case.
+            if (double.IsNaN(average))
+                return 0;
+
            int avgFoldIndex = -1;
            var smallestDistFromAvg = double.PositiveInfinity;
            for (var i = 0; i < values.Count(); i++)
            {
-                var distFromAvg = Math.Abs(values.ElementAt(i) - average);
-                if (distFromAvg < smallestDistFromAvg || smallestDistFromAvg == double.PositiveInfinity)
+                var value = values.ElementAt(i);
+                if (double.IsNaN(value))
+                    continue;
+                var distFromAvg = Math.Abs(value - average);
+                if (distFromAvg < smallestDistFromAvg)
                {
                    smallestDistFromAvg = distFromAvg;
                    avgFoldIndex = i;
6 changes: 6 additions & 0 deletions src/Microsoft.ML.AutoML/Utils/BestResultUtil.cs
@@ -41,6 +41,9 @@ public static RunDetail<TMetrics> GetBestRun<TMetrics>(IEnumerable<RunDetail<TMe
            if (!results.Any()) { return null; }
            var scores = results.Select(r => metricsAgent.GetScore(r.ValidationMetrics));
            var indexOfBestScore = GetIndexOfBestScore(scores, isMetricMaximizing);
+            // indexOfBestScore will be -1 if the optimization metric for all models is NaN.
+            // In this case, return the first model.
+            indexOfBestScore = indexOfBestScore != -1 ? indexOfBestScore : 0;
            return results.ElementAt(indexOfBestScore);
        }

@@ -51,6 +54,9 @@ public static CrossValidationRunDetail<TMetrics> GetBestRun<TMetrics>(IEnumerabl
            if (!results.Any()) { return null; }
            var scores = results.Select(r => r.Results.Average(x => metricsAgent.GetScore(x.ValidationMetrics)));
            var indexOfBestScore = GetIndexOfBestScore(scores, isMetricMaximizing);
+            // indexOfBestScore will be -1 if the optimization metric for all models is NaN.
+            // In this case, return the first model.
+            indexOfBestScore = indexOfBestScore != -1 ? indexOfBestScore : 0;
            return results.ElementAt(indexOfBestScore);
        }
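
GetIndexOfBestScore itself is not shown in this diff; a plausible shape consistent with the comments above (an assumption, not the actual implementation) would skip NaN scores and fall back to -1:

using System;
using System.Collections.Generic;

static class BestScoreSketch
{
    // Assumed behavior, inferred from the comments above: NaN scores never win a
    // comparison, so the method returns -1 when every score is NaN.
    public static int GetIndexOfBestScore(IEnumerable<double> scores, bool isMetricMaximizing)
    {
        var bestIndex = -1;
        var bestScore = double.NaN;
        var i = 0;
        foreach (var score in scores)
        {
            if (!double.IsNaN(score) &&
                (bestIndex == -1 || (isMetricMaximizing ? score > bestScore : score < bestScore)))
            {
                bestScore = score;
                bestIndex = i;
            }
            i++;
        }
        return bestIndex;
    }
}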

BinaryClassificationMetrics.cs

@@ -122,5 +122,12 @@ internal BinaryClassificationMetrics(double auc, double accuracy, double positiv
            F1Score = f1Score;
            AreaUnderPrecisionRecallCurve = auprc;
        }
+
+        internal BinaryClassificationMetrics(double auc, double accuracy, double positivePrecision, double positiveRecall,
+            double negativePrecision, double negativeRecall, double f1Score, double auprc, ConfusionMatrix confusionMatrix)
+            : this(auc, accuracy, positivePrecision, positiveRecall, negativePrecision, negativeRecall, f1Score, auprc)
+        {
+            ConfusionMatrix = confusionMatrix;
+        }
    }
}
MulticlassClassificationMetrics.cs

@@ -134,5 +134,12 @@ internal MulticlassClassificationMetrics(double accuracyMicro, double accuracyMa
            TopKAccuracy = topKAccuracy;
            PerClassLogLoss = perClassLogLoss.ToImmutableArray();
        }
+
+        internal MulticlassClassificationMetrics(double accuracyMicro, double accuracyMacro, double logLoss, double logLossReduction,
+            int topKPredictionCount, double topKAccuracy, double[] perClassLogLoss, ConfusionMatrix confusionMatrix)
+            : this(accuracyMicro, accuracyMacro, logLoss, logLossReduction, topKPredictionCount, topKAccuracy, perClassLogLoss)
+        {
+            ConfusionMatrix = confusionMatrix;
+        }
    }
}