Add Cluster evaluator #316

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged 5 commits on Jun 6, 2018
4 changes: 2 additions & 2 deletions src/Microsoft.ML/Models/ClassificationEvaluator.cs
@@ -57,13 +57,13 @@ public ClassificationMetrics Evaluate(PredictionModel model, ILearningPipelineLo
IDataView overallMetrics = experiment.GetOutput(evaluteOutput.OverallMetrics);
if (overallMetrics == null)
{
throw environment.Except($"Could not find OverallMetrics in the results returned in {nameof(BinaryClassificationEvaluator)} Evaluate.");
throw environment.Except($"Could not find OverallMetrics in the results returned in {nameof(ClassificationEvaluator)} Evaluate.");
}

IDataView confusionMatrix = experiment.GetOutput(evaluteOutput.ConfusionMatrix);
if (confusionMatrix == null)
{
throw environment.Except($"Could not find ConfusionMatrix in the results returned in {nameof(BinaryClassificationEvaluator)} Evaluate.");
throw environment.Except($"Could not find ConfusionMatrix in the results returned in {nameof(ClassificationEvaluator)} Evaluate.");
}

var metric = ClassificationMetrics.FromMetrics(environment, overallMetrics, confusionMatrix);
71 changes: 71 additions & 0 deletions src/Microsoft.ML/Models/ClusterEvaluator.cs
@@ -0,0 +1,71 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using Microsoft.ML.Runtime;
using Microsoft.ML.Runtime.Data;
using Microsoft.ML.Transforms;

namespace Microsoft.ML.Models
{
public sealed partial class ClusterEvaluator
{
/// <summary>
/// Computes the quality metrics for the PredictionModel using the specified data set.
/// </summary>
/// <param name="model">
/// The trained PredictionModel to be evaluated.
/// </param>
/// <param name="testData">
/// The test data that will be predicted and used to evaluate the model.
/// </param>
/// <returns>
/// A ClusterMetrics instance that describes how well the model performed against the test data.
/// </returns>
public ClusterMetrics Evaluate(PredictionModel model, ILearningPipelineLoader testData)
{
using (var environment = new TlcEnvironment())
{
environment.CheckValue(model, nameof(model));
environment.CheckValue(testData, nameof(testData));

Experiment experiment = environment.CreateExperiment();

ILearningPipelineStep testDataStep = testData.ApplyStep(previousStep: null, experiment);
if (!(testDataStep is ILearningPipelineDataStep testDataOutput))
{
throw environment.Except($"The {nameof(ILearningPipelineLoader)} did not return a {nameof(ILearningPipelineDataStep)} from ApplyStep.");
}

var datasetScorer = new DatasetTransformScorer
{
Data = testDataOutput.Data,
};
DatasetTransformScorer.Output scoreOutput = experiment.Add(datasetScorer);

Data = scoreOutput.ScoredData;
                Output evaluateOutput = experiment.Add(this);

experiment.Compile();

experiment.SetInput(datasetScorer.TransformModel, model.PredictorModel);
testData.SetInput(environment, experiment);

experiment.Run();

                IDataView overallMetrics = experiment.GetOutput(evaluateOutput.OverallMetrics);

if (overallMetrics == null)
{
throw environment.Except($"Could not find OverallMetrics in the results returned in {nameof(ClusterEvaluator)} Evaluate.");
}

var metric = ClusterMetrics.FromOverallMetrics(environment, overallMetrics);

Contracts.Assert(metric.Count == 1, $"Exactly one metric set was expected but found {metric.Count} metrics");

return metric[0];
}
}
}
}
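For quick reference, a minimal usage sketch of the new evaluator, mirroring the PredictClusters test updated at the bottom of this PR. Here `model` and `testData` are assumed to already exist as a trained clustering PredictionModel and an ILearningPipelineLoader over the held-out set:

// Requires: using Microsoft.ML.Models; using System;
// `model` and `testData` are assumed, as in the test below.
var evaluator = new ClusterEvaluator();
ClusterMetrics metrics = evaluator.Evaluate(model, testData);
Console.WriteLine($"NMI: {metrics.Nmi}, DBI: {metrics.Dbi}, AvgMinScore: {metrics.AvgMinScore}");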
94 changes: 94 additions & 0 deletions src/Microsoft.ML/Models/ClusterMetrics.cs
@@ -0,0 +1,94 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using Microsoft.ML.Runtime;
using Microsoft.ML.Runtime.Api;
using Microsoft.ML.Runtime.Data;
using System;
using System.Collections.Generic;

namespace Microsoft.ML.Models
{
/// <summary>
/// This class contains the overall metrics computed by cluster evaluators.
/// </summary>
public sealed class ClusterMetrics
{
private ClusterMetrics()
{
}

internal static List<ClusterMetrics> FromOverallMetrics(IHostEnvironment env, IDataView overallMetrics)
{
Contracts.AssertValue(env);
env.AssertValue(overallMetrics);

var metricsEnumerable = overallMetrics.AsEnumerable<SerializationClass>(env, true, ignoreMissingColumns: true);
if (!metricsEnumerable.GetEnumerator().MoveNext())
{
throw env.Except("The overall ClusteringMetrics didn't have any rows.");
}

var metrics = new List<ClusterMetrics>();
foreach (var metric in metricsEnumerable)
{
metrics.Add(new ClusterMetrics()
{
AvgMinScore = metric.AvgMinScore,
Nmi = metric.Nmi,
Dbi = metric.Dbi,
});
}

return metrics;
}

/// <summary>
/// Davies-Bouldin Index.
/// </summary>
/// <remarks>
        /// DBI is a measure of how much scatter there is within clusters and how well separated the clusters are; lower values indicate better clustering.
/// </remarks>
public double Dbi { get; private set; }

/// <summary>
        /// Normalized Mutual Information.
/// </summary>
/// <remarks>
        /// NMI is a measure of the mutual dependence between the true and predicted cluster labels for instances in the dataset.
        /// NMI ranges between 0 and 1, where 0 indicates random clustering and 1 indicates perfect clustering with respect to the true labels.
/// </remarks>
public double Nmi { get; private set; }

/// <summary>
/// Average minimum score.
/// </summary>
/// <remarks>
        /// AvgMinScore is the average squared distance of examples from their respective cluster centroids.
        /// It is defined as
        /// AvgMinScore = (1/m) * sum ((xi - c(xi))^2)
        /// where m is the number of instances in the dataset,
        /// xi is the i-th instance, and c(xi) is the centroid of the predicted cluster for xi.
/// </remarks>
public double AvgMinScore { get; private set; }

/// <summary>
/// This class contains the public fields necessary to deserialize from IDataView.
/// </summary>
private sealed class SerializationClass
{
#pragma warning disable 649 // never assigned
[ColumnName(Runtime.Data.ClusteringEvaluator.Dbi)]
public Double Dbi;

[ColumnName(Runtime.Data.ClusteringEvaluator.Nmi)]
public Double Nmi;

[ColumnName(Runtime.Data.ClusteringEvaluator.AvgMinScore)]
public Double AvgMinScore;

#pragma warning restore 649 // never assigned
}
}
}
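To make the AvgMinScore remark above concrete, here is a small standalone sketch (illustrative only, not part of this change) that computes the same quantity for instances and centroids represented as plain double arrays:

// Illustration of the AvgMinScore definition: the average squared distance
// from each instance to the centroid of its assigned cluster.
static double ComputeAvgMinScore(double[][] instances, double[][] centroids, int[] assignments)
{
    double total = 0.0;
    for (int i = 0; i < instances.Length; i++)
    {
        double[] x = instances[i];
        double[] c = centroids[assignments[i]]; // c(xi): centroid of xi's predicted cluster
        double distSq = 0.0;
        for (int d = 0; d < x.Length; d++)
        {
            double diff = x[d] - c[d];
            distSq += diff * diff;
        }
        total += distSq;
    }
    return total / instances.Length; // divide by m, the number of instances
}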
13 changes: 10 additions & 3 deletions src/Microsoft.ML/Models/CrossValidator.cs
@@ -19,7 +19,7 @@ public sealed partial class CrossValidator
/// <typeparam name="TOutput">Class type that represents prediction schema.</typeparam>
/// <param name="pipeline">Machine learning pipeline may contain loader, transforms and at least one trainer.</param>
/// <returns>List containing metrics and predictor model for each fold</returns>
public CrossValidationOutput<TInput, TOutput> CrossValidate<TInput, TOutput>(LearningPipeline pipeline)
public CrossValidationOutput<TInput, TOutput> CrossValidate<TInput, TOutput>(LearningPipeline pipeline)
where TInput : class
where TOutput : class, new()
{
@@ -76,7 +76,7 @@ public CrossValidationOutput<TInput, TOutput> CrossValidate<TInput, TOutput>(Lea
{
PredictorModel = predictorModel
};

var scorerOutput = subGraph.Add(scorer);
lastTransformModel = scorerOutput.ScoringTransform;
step = new ScorerPipelineStep(scorerOutput.ScoredData, scorerOutput.ScoringTransform);
@@ -129,7 +129,7 @@ public CrossValidationOutput<TInput, TOutput> CrossValidate<TInput, TOutput>(Lea
experiment.GetOutput(crossValidateOutput.OverallMetrics),
experiment.GetOutput(crossValidateOutput.ConfusionMatrix), 2);
}
-                else if(Kind == MacroUtilsTrainerKinds.SignatureMultiClassClassifierTrainer)
+                else if (Kind == MacroUtilsTrainerKinds.SignatureMultiClassClassifierTrainer)
{
cvOutput.ClassificationMetrics = ClassificationMetrics.FromMetrics(
environment,
@@ -142,6 +142,12 @@ public CrossValidationOutput<TInput, TOutput> CrossValidate<TInput, TOutput>(Lea
environment,
experiment.GetOutput(crossValidateOutput.OverallMetrics));
}
else if (Kind == MacroUtilsTrainerKinds.SignatureClusteringTrainer)
{
cvOutput.ClusterMetrics = ClusterMetrics.FromOverallMetrics(
environment,
experiment.GetOutput(crossValidateOutput.OverallMetrics));
}
else
{
//Implement metrics for ranking, clustering and anomaly detection.
@@ -174,6 +180,7 @@ public class CrossValidationOutput<TInput, TOutput>
public List<BinaryClassificationMetrics> BinaryClassificationMetrics;
public List<ClassificationMetrics> ClassificationMetrics;
public List<RegressionMetrics> RegressionMetrics;
public List<ClusterMetrics> ClusterMetrics;
public PredictionModel<TInput, TOutput>[] PredictorModels;

//REVIEW: Add warnings and per instance results and implement
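With the new branch in place, cross-validating a clustering pipeline surfaces these metrics per fold. A rough sketch follows; it is hypothetical: `ClusteringData`/`ClusteringPrediction` are illustrative schema classes, `pipeline` is an assembled LearningPipeline ending in a clustering trainer, and `Kind` is assumed to be settable through the object initializer, as the comparisons above suggest:

// Hypothetical sketch, not part of this change.
var cv = new CrossValidator
{
    Kind = MacroUtilsTrainerKinds.SignatureClusteringTrainer
};
CrossValidationOutput<ClusteringData, ClusteringPrediction> cvOutput =
    cv.CrossValidate<ClusteringData, ClusteringPrediction>(pipeline);

// One ClusterMetrics entry per fold.
foreach (ClusterMetrics foldMetrics in cvOutput.ClusterMetrics)
{
    Console.WriteLine($"NMI: {foldMetrics.Nmi}, AvgMinScore: {foldMetrics.AvgMinScore}");
}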
11 changes: 9 additions & 2 deletions src/Microsoft.ML/Models/TrainTestEvaluator.cs
@@ -102,7 +102,7 @@ public TrainTestEvaluatorOutput<TInput, TOutput> TrainTestEvaluate<TInput, TOutp
}

var experiment = environment.CreateExperiment();

TrainingData = (loaders[0].ApplyStep(null, experiment) as ILearningPipelineDataStep).Data;
TestingData = (testData.ApplyStep(null, experiment) as ILearningPipelineDataStep).Data;
Nodes = subGraph;
@@ -140,6 +140,12 @@ public TrainTestEvaluatorOutput<TInput, TOutput> TrainTestEvaluate<TInput, TOutp
environment,
experiment.GetOutput(crossValidateOutput.OverallMetrics)).FirstOrDefault();
}
else if (Kind == MacroUtilsTrainerKinds.SignatureClusteringTrainer)
{
trainTestOutput.ClusterMetrics = ClusterMetrics.FromOverallMetrics(
environment,
experiment.GetOutput(crossValidateOutput.OverallMetrics)).FirstOrDefault();
}
else
{
//Implement metrics for ranking, clustering and anomaly detection.
@@ -158,7 +164,7 @@ public TrainTestEvaluatorOutput<TInput, TOutput> TrainTestEvaluate<TInput, TOutp

trainTestOutput.PredictorModels = new PredictionModel<TInput, TOutput>(predictor, memoryStream);
}

return trainTestOutput;
}
}
@@ -171,6 +177,7 @@ public class TrainTestEvaluatorOutput<TInput, TOutput>
public BinaryClassificationMetrics BinaryClassificationMetrics;
public ClassificationMetrics ClassificationMetrics;
public RegressionMetrics RegressionMetrics;
public ClusterMetrics ClusterMetrics;
public PredictionModel<TInput, TOutput> PredictorModels;

//REVIEW: Add warnings and per instance results and implement
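Unlike cross-validation, the train/test path produces a single ClusterMetrics (note the FirstOrDefault() above) rather than one per fold. Reading the result might look like this, assuming `trainTestOutput` came back from TrainTestEvaluate on a clustering pipeline:

// Sketch only: `trainTestOutput` is an assumed TrainTestEvaluatorOutput instance.
ClusterMetrics m = trainTestOutput.ClusterMetrics;
Console.WriteLine($"NMI: {m.Nmi}, DBI: {m.Dbi}, AvgMinScore: {m.AvgMinScore}");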
11 changes: 11 additions & 0 deletions test/Microsoft.ML.Tests/Scenarios/ClusteringTests.cs
@@ -1,4 +1,5 @@
using Microsoft.ML.Data;
using Microsoft.ML.Models;
using Microsoft.ML.Runtime;
using Microsoft.ML.Runtime.Api;
using Microsoft.ML.Trainers;
@@ -116,6 +117,16 @@ public void PredictClusters()
Assert.True(!labels.Contains(scores.SelectedClusterId));
labels.Add(scores.SelectedClusterId);
}

var evaluator = new ClusterEvaluator();
var testData = CollectionDataSource.Create(clusters);
ClusterMetrics metrics = evaluator.Evaluate(model, testData);

            // Label is not specified, so NMI evaluates to NaN.
            Assert.Equal(metrics.Nmi, double.NaN);
            // DBI calculation is disabled by default, so Dbi is 0.
            Assert.Equal(metrics.Dbi, (double)0.0);
Assert.Equal(metrics.AvgMinScore, (double)0.0, 5);
}
}
}