From 7cfc918e6b1861a63370a4342dac2a46c9f45722 Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Thu, 26 Sep 2019 10:41:27 -0700 Subject: [PATCH 01/14] Fixed the issue of using PFI with a loaded model, in the cases of Regression, Ranking and Multiclass Classification. Still working in Binary Classification. --- ...mutationFeatureImportanceLoadedFromDisk.cs | 133 +++++++++++++ ...ermutationFeatureImportanceLoadFromDisk.cs | 136 +++++++++++++ ...ermutationFeatureImportanceLoadFromDisk.cs | 141 ++++++++++++++ ...ermutationFeatureImportanceLoadFromDisk.cs | 118 ++++++++++++ .../Scorers/PredictionTransformer.cs | 178 ++++++++++++++++-- 5 files changed, 693 insertions(+), 13 deletions(-) create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/PermutationFeatureImportanceLoadedFromDisk.cs create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/PermutationFeatureImportanceLoadFromDisk.cs create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/PermutationFeatureImportanceLoadFromDisk.cs create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/PermutationFeatureImportanceLoadFromDisk.cs diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/PermutationFeatureImportanceLoadedFromDisk.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/PermutationFeatureImportanceLoadedFromDisk.cs new file mode 100644 index 0000000000..d45f528ecc --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/PermutationFeatureImportanceLoadedFromDisk.cs @@ -0,0 +1,133 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML; +using Microsoft.ML.Calibrators; +using Microsoft.ML.Data; +using Microsoft.ML.Trainers; + +namespace Samples.Dynamic.Trainers.BinaryClassification +{ + public static class PermutationFeatureImportance2 + { + public static void Example() + { + // Create a new context for ML.NET operations. It can be used for + // exception tracking and logging, as a catalog of available operations + // and as the source of randomness. + var mlContext = new MLContext(seed: 1); + + // Create sample data. + var samples = GenerateData(); + + // Load the sample data as an IDataView. + var data = mlContext.Data.LoadFromEnumerable(samples); + + // Define a training pipeline that concatenates features into a vector, + // normalizes them, and then trains a linear model. + var featureColumns = + new string[] { nameof(Data.Feature1), nameof(Data.Feature2) }; + var pipeline = mlContext.Transforms + .Concatenate("Features", featureColumns) + .Append(mlContext.Transforms.NormalizeMinMax("Features")) + .Append(mlContext.BinaryClassification.Trainers + .SdcaLogisticRegression()); + + // Fit the pipeline to the data. + var model0 = pipeline.Fit(data); + + var modelPath = "./model.zip"; + mlContext.Model.Save(model0, data.Schema, modelPath); + + var model = mlContext.Model.Load(modelPath, out var schema); + + // Transform the dataset. 
+ var transformedData = model.Transform(data); + + // What we got originally: BinaryPredictionTransformer> + // What we get after the fix: BinaryPredictionTransformer, ICalibrator> + // What we should be getting: BinaryPredictionTransformer> + + var linearPredictor = (model as TransformerChain).LastTransformer as BinaryPredictionTransformer>; + // var linearPredictor = (model as TransformerChain).LastTransformer as BinaryPredictionTransformer, ICalibrator>>; + // var linearPredictor = model.LastTransformer; + + // Compute the permutation metrics for the linear model using the + // normalized data. + var permutationMetrics = mlContext.BinaryClassification + .PermutationFeatureImportance(linearPredictor, transformedData, + permutationCount: 30); + + // Now let's look at which features are most important to the model + // overall. Get the feature indices sorted by their impact on AUC. + var sortedIndices = permutationMetrics + .Select((metrics, index) => new { index, metrics.AreaUnderRocCurve }) + .OrderByDescending( + feature => Math.Abs(feature.AreaUnderRocCurve.Mean)) + .Select(feature => feature.index); + + Console.WriteLine("Feature\tModel Weight\tChange in AUC" + + "\t95% Confidence in the Mean Change in AUC"); + var auc = permutationMetrics.Select(x => x.AreaUnderRocCurve).ToArray(); + foreach (int i in sortedIndices) + { + Console.WriteLine("{0}\t{1:0.00}\t{2:G4}\t{3:G4}", + featureColumns[i], + linearPredictor.Model.SubModel.Weights[i], + auc[i].Mean, + 1.96 * auc[i].StandardError); + } + + // Expected output: + // Feature Model Weight Change in AUC 95% Confidence in the Mean Change in AUC + // Feature2 35.15 -0.387 0.002015 + // Feature1 17.94 -0.1514 0.0008963 + } + + private class Data + { + public bool Label { get; set; } + + public float Feature1 { get; set; } + + public float Feature2 { get; set; } + } + + /// + /// Generate an enumerable of Data objects, creating the label as a simple + /// linear combination of the features. + /// + /// The number of examples. + /// The bias, or offset, in the calculation of the label. + /// + /// The weight to multiply the first feature with to + /// compute the label. + /// The weight to multiply the second feature with to + /// compute the label. + /// The seed for generating feature values and label + /// noise. + /// An enumerable of Data objects. + private static IEnumerable GenerateData(int nExamples = 10000, + double bias = 0, double weight1 = 1, double weight2 = 2, int seed = 1) + { + var rng = new Random(seed); + for (int i = 0; i < nExamples; i++) + { + var data = new Data + { + Feature1 = (float)(rng.Next(10) * (rng.NextDouble() - 0.5)), + Feature2 = (float)(rng.Next(10) * (rng.NextDouble() - 0.5)), + }; + + // Create a noisy label. 
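+ // Sigmoid maps the noisy linear score into (0, 1), so the label computed below is true exactly when the score is positive.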
+ var value = (float)(bias + weight1 * data.Feature1 + weight2 * + data.Feature2 + rng.NextDouble() - 0.5); + + data.Label = Sigmoid(value) > 0.5; + yield return data; + } + } + + private static double Sigmoid(double x) => 1.0 / (1.0 + Math.Exp(-1 * x)); + } +} diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/PermutationFeatureImportanceLoadFromDisk.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/PermutationFeatureImportanceLoadFromDisk.cs new file mode 100644 index 0000000000..a2f6efa711 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/PermutationFeatureImportanceLoadFromDisk.cs @@ -0,0 +1,136 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML; +using Microsoft.ML.Calibrators; +using Microsoft.ML.Data; +using Microsoft.ML.Trainers; + +namespace Samples.Dynamic.Trainers.MulticlassClassification +{ + public static class PermutationFeatureImportance2 + { + public static void Example() + { + // Create a new context for ML.NET operations. It can be used for + // exception tracking and logging, as a catalog of available operations + // and as the source of randomness. + var mlContext = new MLContext(seed: 1); + + // Create sample data. + var samples = GenerateData(); + + // Load the sample data as an IDataView. + var data = mlContext.Data.LoadFromEnumerable(samples); + + // Define a training pipeline that concatenates features into a vector, + // normalizes them, and then trains a linear model. + var featureColumns = + new string[] { nameof(Data.Feature1), nameof(Data.Feature2) }; + + var pipeline = mlContext.Transforms + .Concatenate("Features", featureColumns) + .Append(mlContext.Transforms.Conversion.MapValueToKey("Label")) + .Append(mlContext.Transforms.NormalizeMinMax("Features")) + .Append(mlContext.MulticlassClassification.Trainers + .SdcaMaximumEntropy()); + + var model0 = pipeline.Fit(data); + + var modelPath = "./model0.zip"; + mlContext.Model.Save(model0, data.Schema, modelPath); + + var model = mlContext.Model.Load(modelPath, out var schema); + + // Transform the dataset. + var transformedData = model.Transform(data); + + // Extract the predictor. + var linearPredictor = (model as TransformerChain).LastTransformer as MulticlassPredictionTransformer; + + // Compute the permutation metrics for the linear model using the + // normalized data. + var permutationMetrics = mlContext.MulticlassClassification + .PermutationFeatureImportance(linearPredictor, transformedData, + permutationCount: 30); + + // Now let's look at which features are most important to the model + // overall. Get the feature indices sorted by their impact on + // microaccuracy. 
+ var sortedIndices = permutationMetrics + .Select((metrics, index) => new { index, metrics.MicroAccuracy }) + .OrderByDescending(feature => Math.Abs(feature.MicroAccuracy.Mean)) + .Select(feature => feature.index); + + Console.WriteLine("Feature\tChange in MicroAccuracy\t95% Confidence in " + + "the Mean Change in MicroAccuracy"); + + var microAccuracy = permutationMetrics.Select(x => x.MicroAccuracy) + .ToArray(); + + foreach (int i in sortedIndices) + { + Console.WriteLine("{0}\t{1:G4}\t{2:G4}", + featureColumns[i], + microAccuracy[i].Mean, + 1.96 * microAccuracy[i].StandardError); + } + + // Expected output: + //Feature Change in MicroAccuracy 95% Confidence in the Mean Change in MicroAccuracy + //Feature2 -0.1395 0.0006567 + //Feature1 -0.05367 0.0006908 + } + + private class Data + { + public float Label { get; set; } + + public float Feature1 { get; set; } + + public float Feature2 { get; set; } + } + + /// + /// Generate an enumerable of Data objects, creating the label as a simple + /// linear combination of the features. + /// + /// The number of examples. + /// The bias, or offset, in the calculation of the + /// label. + /// The weight to multiply the first feature with to + /// compute the label. + /// The weight to multiply the second feature with to + /// compute the label. + /// The seed for generating feature values and label + /// noise. + /// An enumerable of Data objects. + private static IEnumerable GenerateData(int nExamples = 10000, + double bias = 0, double weight1 = 1, double weight2 = 2, int seed = 1) + { + var rng = new Random(seed); + var max = bias + 4.5 * weight1 + 4.5 * weight2 + 0.5; + for (int i = 0; i < nExamples; i++) + { + var data = new Data + { + Feature1 = (float)(rng.Next(10) * (rng.NextDouble() - 0.5)), + Feature2 = (float)(rng.Next(10) * (rng.NextDouble() - 0.5)), + }; + + // Create a noisy label. + var value = (float) + (bias + weight1 * data.Feature1 + weight2 * data.Feature2 + + rng.NextDouble() - 0.5); + + if (value < max / 3) + data.Label = 0; + else if (value < 2 * max / 3) + data.Label = 1; + else + data.Label = 2; + yield return data; + } + } + } +} \ No newline at end of file diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/PermutationFeatureImportanceLoadFromDisk.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/PermutationFeatureImportanceLoadFromDisk.cs new file mode 100644 index 0000000000..633e5a286b --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/PermutationFeatureImportanceLoadFromDisk.cs @@ -0,0 +1,141 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML; +using Microsoft.ML.Data; +using Microsoft.ML.Trainers.FastTree; + +namespace Samples.Dynamic.Trainers.Ranking +{ + public static class PermutationFeatureImportance2 + { + public static void Example() + { + // Create a new context for ML.NET operations. It can be used for + // exception tracking and logging, as a catalog of available operations + // and as the source of randomness. + var mlContext = new MLContext(seed: 1); + + // Create sample data. + var samples = GenerateData(); + + // Load the sample data as an IDataView. + var data = mlContext.Data.LoadFromEnumerable(samples); + + // Define a training pipeline that concatenates features into a vector, + // normalizes them, and then trains a linear model. 
+ var featureColumns = new string[] { nameof(Data.Feature1), nameof( + Data.Feature2) }; + var pipeline = mlContext.Transforms.Concatenate("Features", + featureColumns) + .Append(mlContext.Transforms.Conversion.MapValueToKey("Label")) + .Append(mlContext.Transforms.Conversion.MapValueToKey( + "GroupId")) + .Append(mlContext.Transforms.NormalizeMinMax("Features")) + .Append(mlContext.Ranking.Trainers.FastTree()); + + var model0 = pipeline.Fit(data); + + var modelPath = "./model0.zip"; + mlContext.Model.Save(model0, data.Schema, modelPath); + + var model = mlContext.Model.Load(modelPath, out var schema); + + var transformedData = model.Transform(data); + + var linearPredictor = (model as TransformerChain).LastTransformer as RankingPredictionTransformer; + + // Compute the permutation metrics for the linear model using the + // normalized data. + var permutationMetrics = mlContext.Ranking.PermutationFeatureImportance( + linearPredictor, transformedData, permutationCount: 30); + + // Now let's look at which features are most important to the model + // overall. Get the feature indices sorted by their impact on NDCG@1. + var sortedIndices = permutationMetrics.Select((metrics, index) => new { + index, + metrics.NormalizedDiscountedCumulativeGains + }) + .OrderByDescending(feature => Math.Abs( + feature.NormalizedDiscountedCumulativeGains[0].Mean)) + + .Select(feature => feature.index); + + Console.WriteLine("Feature\tChange in NDCG@1\t95% Confidence in the" + + "Mean Change in NDCG@1"); + var ndcg = permutationMetrics.Select( + x => x.NormalizedDiscountedCumulativeGains).ToArray(); + foreach (int i in sortedIndices) + { + Console.WriteLine("{0}\t{1:G4}\t{2:G4}", + featureColumns[i], + ndcg[i][0].Mean, + 1.96 * ndcg[i][0].StandardError); + } + + // Expected output: + // Feature Change in NDCG@1 95% Confidence in the Mean Change in NDCG@1 + // Feature2 -0.2421 0.001748 + // Feature1 -0.0513 0.001184 + } + + private class Data + { + public float Label { get; set; } + + public int GroupId { get; set; } + + public float Feature1 { get; set; } + + public float Feature2 { get; set; } + } + + /// + /// Generate an enumerable of Data objects, creating the label as a simple + /// linear combination of the features. + /// + /// + /// The number of examples. + /// + /// The bias, or offset, in the calculation of the label. + /// + /// + /// The weight to multiply the first feature with to + /// compute the label. + /// + /// The weight to multiply the second feature with to + /// compute the label. + /// + /// The seed for generating feature values and label + /// noise. + /// + /// An enumerable of Data objects. + private static IEnumerable GenerateData(int nExamples = 10000, + double bias = 0, double weight1 = 1, double weight2 = 2, int seed = 1, + int groupSize = 5) + { + var rng = new Random(seed); + var max = bias + 4.5 * weight1 + 4.5 * weight2 + 0.5; + for (int i = 0; i < nExamples; i++) + { + var data = new Data + { + GroupId = i / groupSize, + Feature1 = (float)(rng.Next(10) * (rng.NextDouble() - 0.5)), + Feature2 = (float)(rng.Next(10) * (rng.NextDouble() - 0.5)), + }; + + // Create a noisy label. 
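+ // The noisy linear score computed below is bucketed into three relevance grades (0, 1, 2) by thirds of its maximum possible value.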
+ var value = (float)(bias + weight1 * data.Feature1 + weight2 * + data.Feature2 + rng.NextDouble() - 0.5); + if (value < max / 3) + data.Label = 0; + else if (value < 2 * max / 3) + data.Label = 1; + else + data.Label = 2; + yield return data; + } + } + } +} diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/PermutationFeatureImportanceLoadFromDisk.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/PermutationFeatureImportanceLoadFromDisk.cs new file mode 100644 index 0000000000..cde0799f02 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/PermutationFeatureImportanceLoadFromDisk.cs @@ -0,0 +1,118 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML; +using Microsoft.ML.Data; +using Microsoft.ML.Trainers; + +namespace Samples.Dynamic.Trainers.Regression +{ + class PermutationFeatureImportance2 + { + public static void Example() + { + Console.WriteLine("ORIGINAL MODEL"); + var mlContext = new MLContext(seed: 1); + var samples = GenerateData(); + var data = mlContext.Data.LoadFromEnumerable(samples); + + var featureColumns = new string[] { nameof(Data.Feature1), + nameof(Data.Feature2) }; + + var pipeline = mlContext.Transforms.Concatenate( + "Features", + featureColumns) + .Append(mlContext.Transforms.NormalizeMinMax("Features")) + .Append(mlContext.Regression.Trainers.Ols()); + + var model = pipeline.Fit(data); + Console.WriteLine("LOADED MODEL FROM DISK"); + + var modelPath = "./model.zip"; + mlContext.Model.Save(model, data.Schema, modelPath); + + var loadedModel = mlContext.Model.Load(modelPath, out var schema); + var transformedData = loadedModel.Transform(data); + var linearPredictor = (loadedModel as TransformerChain).LastTransformer as RegressionPredictionTransformer; + + var permutationMetrics = mlContext.Regression + .PermutationFeatureImportance( + linearPredictor, transformedData, permutationCount: 30); + + var sortedIndices = permutationMetrics + .Select((metrics, index) => new + { + index, + metrics.RootMeanSquaredError + }) + + .OrderByDescending(feature => Math.Abs( + feature.RootMeanSquaredError.Mean)) + + .Select(feature => feature.index); + + Console.WriteLine("Feature\tModel Weight\tChange in RMSE\t95% " + + "Confidence in the Mean Change in RMSE"); + + var rmse = permutationMetrics.Select(x => x.RootMeanSquaredError) + .ToArray(); + + foreach (int i in sortedIndices) + { + Console.WriteLine("{0}\t{1:0.00}\t{2:G4}\t{3:G4}\t{4:G4}", + featureColumns[i], + linearPredictor.Model.Weights[i], + rmse[i].Mean, + 1.96 * rmse[i].StandardError, + rmse[i].StandardDeviation); + } + + // EXPECTED OUTPUT + //Feature Model Weight Change in RMSE 95 % Confidence in the Mean Change in RMSE + //Feature2 9.00 4.01 0.006723 0.01879 + //Feature1 4.48 1.901 0.003235 0.00904 + } + + private class Data + { + public float Label { get; set; } + + public float Feature1 { get; set; } + + public float Feature2 { get; set; } + } + + /// + /// Generate an enumerable of Data objects, creating the label as a simple + /// linear combination of the features. + /// + /// The number of examples. + /// The bias, or offset, in the calculation of the label. + /// + /// The weight to multiply the first feature with to + /// compute the label. + /// The weight to multiply the second feature with to + /// compute the label. + /// The seed for generating feature values and label + /// noise. + /// An enumerable of Data objects. 
+ private static IEnumerable GenerateData(int nExamples = 10000, + double bias = 0, double weight1 = 1, double weight2 = 2, int seed = 1) + { + var rng = new Random(seed); + for (int i = 0; i < nExamples; i++) + { + var data = new Data + { + Feature1 = (float)(rng.Next(10) * (rng.NextDouble() - 0.5)), + Feature2 = (float)(rng.Next(10) * (rng.NextDouble() - 0.5)), + }; + + // Create a noisy label. + data.Label = (float)(bias + weight1 * data.Feature1 + weight2 * + data.Feature2 + rng.NextDouble() - 0.5); + yield return data; + } + } + } +} diff --git a/src/Microsoft.ML.Data/Scorers/PredictionTransformer.cs b/src/Microsoft.ML.Data/Scorers/PredictionTransformer.cs index a3b3ee10c1..95bf7ce449 100644 --- a/src/Microsoft.ML.Data/Scorers/PredictionTransformer.cs +++ b/src/Microsoft.ML.Data/Scorers/PredictionTransformer.cs @@ -2,22 +2,24 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. +using System; using System.IO; +using System.Reflection; using Microsoft.ML; using Microsoft.ML.Data; using Microsoft.ML.Data.IO; using Microsoft.ML.Runtime; -[assembly: LoadableClass(typeof(BinaryPredictionTransformer>), typeof(BinaryPredictionTransformer), null, typeof(SignatureLoadModel), +[assembly: LoadableClass(typeof(object), typeof(BinaryPredictionTransformer), null, typeof(SignatureLoadModel), "", BinaryPredictionTransformer.LoaderSignature)] -[assembly: LoadableClass(typeof(MulticlassPredictionTransformer>>), typeof(MulticlassPredictionTransformer), null, typeof(SignatureLoadModel), +[assembly: LoadableClass(typeof(object), typeof(MulticlassPredictionTransformer), null, typeof(SignatureLoadModel), "", MulticlassPredictionTransformer.LoaderSignature)] -[assembly: LoadableClass(typeof(RegressionPredictionTransformer>), typeof(RegressionPredictionTransformer), null, typeof(SignatureLoadModel), +[assembly: LoadableClass(typeof(object), typeof(RegressionPredictionTransformer), null, typeof(SignatureLoadModel), "", RegressionPredictionTransformer.LoaderSignature)] -[assembly: LoadableClass(typeof(RankingPredictionTransformer>), typeof(RankingPredictionTransformer), null, typeof(SignatureLoadModel), +[assembly: LoadableClass(typeof(object), typeof(RankingPredictionTransformer), null, typeof(SignatureLoadModel), "", RankingPredictionTransformer.LoaderSignature)] [assembly: LoadableClass(typeof(AnomalyPredictionTransformer>), typeof(AnomalyPredictionTransformer), null, typeof(SignatureLoadModel), @@ -28,7 +30,6 @@ namespace Microsoft.ML.Data { - /// /// Base class for transformers with no feature column, or more than one feature columns. /// @@ -109,6 +110,30 @@ private protected PredictionTransformerBase(IHost host, ModelLoadContext ctx) TrainSchema = loader.Schema; } + [BestFriend] + private protected PredictionTransformerBase(IHost host, ModelLoadContext ctx, TModel model) + { + //MYMARSHALINVOKE + Host = host; + + // *** Binary format *** + // model: prediction model. + // stream: empty data view that contains train schema. + // id of string: feature column. + Model = model; + + // Clone the stream with the schema into memory. 
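+ // The training schema was serialized into the model file as an empty binary IDataView (the DirTransSchema stream);
+ // copying that stream into memory and re-reading it with a BinaryLoader recovers TrainSchema for this transformer.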
+ var ms = new MemoryStream(); + ctx.TryLoadBinaryStream(DirTransSchema, reader => + { + reader.BaseStream.CopyTo(ms); + }); + + ms.Position = 0; + var loader = new BinaryLoader(host, new BinaryLoader.Arguments(), ms); + TrainSchema = loader.Schema; + } + /// /// Gets the output schema resulting from the /// @@ -215,6 +240,22 @@ private protected SingleFeaturePredictionTransformerBase(IHost host, ModelLoadCo BindableMapper = ScoreUtils.GetSchemaBindableMapper(Host, ModelAsPredictor); } + private protected SingleFeaturePredictionTransformerBase(IHost host, ModelLoadContext ctx, TModel model) + : base(host, ctx, model) + { + //MYMARSHALINVOKE + FeatureColumnName = ctx.LoadStringOrNull(); + + if (FeatureColumnName == null) + FeatureColumnType = null; + else if (!TrainSchema.TryGetColumnIndex(FeatureColumnName, out int col)) + throw Host.ExceptSchemaMismatch(nameof(FeatureColumnName), "feature", FeatureColumnName); + else + FeatureColumnType = TrainSchema[col].Type; + + BindableMapper = ScoreUtils.GetSchemaBindableMapper(Host, ModelAsPredictor); + } + /// /// Schema propagation for this prediction transformer. /// @@ -359,6 +400,15 @@ internal BinaryPredictionTransformer(IHostEnvironment env, ModelLoadContext ctx) SetScorer(); } + internal BinaryPredictionTransformer(IHostEnvironment env, ModelLoadContext ctx, IHost host, TModel model) + : base(host, ctx, model) + { + //MYMARSHALINVOKE + Threshold = ctx.Reader.ReadSingle(); + ThresholdColumn = ctx.LoadString(); + SetScorer(); + } + private void SetScorer() { var schema = new RoleMappedSchema(TrainSchema, null, FeatureColumnName); @@ -423,6 +473,14 @@ internal MulticlassPredictionTransformer(IHostEnvironment env, ModelLoadContext SetScorer(); } + internal MulticlassPredictionTransformer(IHostEnvironment env, ModelLoadContext ctx, IHost host, TModel model) + : base(host, ctx, model) + { + //MYMARSHALINVOKE + _trainLabelColumn = ctx.LoadStringOrNull(); + SetScorer(); + } + private void SetScorer() { var schema = new RoleMappedSchema(TrainSchema, _trainLabelColumn, FeatureColumnName); @@ -475,6 +533,13 @@ internal RegressionPredictionTransformer(IHostEnvironment env, ModelLoadContext Scorer = GetGenericScorer(); } + internal RegressionPredictionTransformer(IHostEnvironment env, ModelLoadContext ctx, IHost host, TModel model) + : base(host, ctx, model) + { + //MYMARSHALINVOKE + Scorer = GetGenericScorer(); + } + private protected override void SaveCore(ModelSaveContext ctx) { Contracts.AssertValue(ctx); @@ -517,6 +582,13 @@ internal RankingPredictionTransformer(IHostEnvironment env, ModelLoadContext ctx Scorer = GetGenericScorer(); } + internal RankingPredictionTransformer(IHostEnvironment env, ModelLoadContext ctx, IHost host, TModel model) + : base(host, ctx, model) + { + //MYMARSHALINVOKE + Scorer = GetGenericScorer(); + } + private protected override void SaveCore(ModelSaveContext ctx) { Contracts.AssertValue(ctx); @@ -596,32 +668,112 @@ internal static class BinaryPredictionTransformer { public const string LoaderSignature = "BinaryPredXfer"; - public static BinaryPredictionTransformer> Create(IHostEnvironment env, ModelLoadContext ctx) - => new BinaryPredictionTransformer>(env, ctx); + public static object Create(IHostEnvironment env, ModelLoadContext ctx) + { + //MYMARSHALINVOKE + var host = Contracts.CheckRef(env, nameof(env)).Register(nameof(BinaryPredictionTransformer>)); + ctx.LoadModel, SignatureLoadModel>(host, out IPredictorProducing model, "Model"); // MYTODO: don't hardcode the DirModel + + Type generic = 
typeof(BinaryPredictionTransformer<>); + Type[] genericTypeArgs = { model.GetType() }; + Type constructed = generic.MakeGenericType(genericTypeArgs); + + Type[] constructorArgs = { + typeof(IHostEnvironment), + typeof(ModelLoadContext), + typeof(IHost), + model.GetType() + }; + + var genericCtor = constructed.GetConstructor(BindingFlags.NonPublic | BindingFlags.Instance, null, constructorArgs, null); + var genericInstance = genericCtor.Invoke(new object[] { env, ctx, host, model }); + + return genericInstance; + } } internal static class MulticlassPredictionTransformer { public const string LoaderSignature = "MulticlassPredXfer"; - public static MulticlassPredictionTransformer>> Create(IHostEnvironment env, ModelLoadContext ctx) - => new MulticlassPredictionTransformer>>(env, ctx); + public static object Create(IHostEnvironment env, ModelLoadContext ctx) + { + //MYMARSHALINVOKE + var host = Contracts.CheckRef(env, nameof(env)).Register(nameof(MulticlassPredictionTransformer>>)); + ctx.LoadModel>, SignatureLoadModel>(host, out IPredictorProducing> model, "Model"); // MYTODO: don't hardcode the DirModel + + Type generic = typeof(MulticlassPredictionTransformer<>); + Type[] genericTypeArgs = { model.GetType() }; + Type constructed = generic.MakeGenericType(genericTypeArgs); + + Type[] constructorArgs = { + typeof(IHostEnvironment), + typeof(ModelLoadContext), + typeof(IHost), + model.GetType() + }; + + var genericCtor = constructed.GetConstructor(BindingFlags.NonPublic | BindingFlags.Instance, null, constructorArgs, null); + var genericInstance = genericCtor.Invoke(new object[] { env, ctx, host, model }); + + return genericInstance; + } } internal static class RegressionPredictionTransformer { public const string LoaderSignature = "RegressionPredXfer"; - public static RegressionPredictionTransformer> Create(IHostEnvironment env, ModelLoadContext ctx) - => new RegressionPredictionTransformer>(env, ctx); + public static object Create(IHostEnvironment env, ModelLoadContext ctx) + { + //MYMARSHALINVOKE + var host = Contracts.CheckRef(env, nameof(env)).Register(nameof(RegressionPredictionTransformer>)); + ctx.LoadModel, SignatureLoadModel>(host, out IPredictorProducing model, "Model"); // MYTODO: don't hardcode the DirModel + + Type generic = typeof(RegressionPredictionTransformer<>); + Type[] genericTypeArgs = { model.GetType() }; + Type constructed = generic.MakeGenericType(genericTypeArgs); + + Type[] constructorArgs = { + typeof(IHostEnvironment), + typeof(ModelLoadContext), + typeof(IHost), + model.GetType() + }; + + var genericCtor = constructed.GetConstructor(BindingFlags.NonPublic | BindingFlags.Instance, null, constructorArgs, null); + var genericInstance = genericCtor.Invoke(new object[] { env, ctx, host, model }); + + return genericInstance; + } } internal static class RankingPredictionTransformer { public const string LoaderSignature = "RankingPredXfer"; - public static RankingPredictionTransformer> Create(IHostEnvironment env, ModelLoadContext ctx) - => new RankingPredictionTransformer>(env, ctx); + public static object Create(IHostEnvironment env, ModelLoadContext ctx) + { + //MYMARSHALINVOKE + var host = Contracts.CheckRef(env, nameof(env)).Register(nameof(RankingPredictionTransformer>)); + ctx.LoadModel, SignatureLoadModel>(host, out IPredictorProducing model, "Model"); // MYTODO: don't hardcode the DirModel + + Type generic = typeof(RankingPredictionTransformer<>); + Type[] genericTypeArgs = { model.GetType() }; + Type constructed = generic.MakeGenericType(genericTypeArgs); + + 
Type[] constructorArgs = { + typeof(IHostEnvironment), + typeof(ModelLoadContext), + typeof(IHost), + model.GetType() + }; + + var genericCtor = constructed.GetConstructor(BindingFlags.NonPublic | BindingFlags.Instance, null, constructorArgs, null); + var genericInstance = genericCtor.Invoke(new object[] { env, ctx, host, model }); + + return genericInstance; + } } internal static class AnomalyPredictionTransformer From 1a3e0d646dbe166a8f7c94a09edd44f602cfa2ef Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Fri, 27 Sep 2019 15:07:29 -0700 Subject: [PATCH 02/14] Added tests using PFI when loading from disk. Also modified LbfgsTests so that it uses the appropiate casts now that the PredictionTransformers have been updated. --- .../Explainability.cs | 39 +++ .../PermutationFeatureImportanceTests.cs | 328 +++++++++++++++++- .../TrainerEstimators/LbfgsTests.cs | 6 +- 3 files changed, 369 insertions(+), 4 deletions(-) diff --git a/test/Microsoft.ML.Functional.Tests/Explainability.cs b/test/Microsoft.ML.Functional.Tests/Explainability.cs index 11729c3959..30269262a4 100644 --- a/test/Microsoft.ML.Functional.Tests/Explainability.cs +++ b/test/Microsoft.ML.Functional.Tests/Explainability.cs @@ -2,10 +2,12 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. +using System.IO; using Microsoft.ML.Data; using Microsoft.ML.Functional.Tests.Datasets; using Microsoft.ML.RunTests; using Microsoft.ML.TestFramework; +using Microsoft.ML.Trainers; using Microsoft.ML.Trainers.FastTree; using Xunit; using Xunit.Abstractions; @@ -49,6 +51,43 @@ public void GlobalFeatureImportanceWithPermutationFeatureImportance() Common.AssertMetricsStatistics(metricsStatistics); } + /// + /// GlobalFeatureImportance: PFI can be used to compute global feature importance. Here it is used with a model loaded from disk. + /// + [Fact] + public void GlobalFeatureImportanceWithPermutationFeatureImportanceWithLoadedModel() + { + var mlContext = new MLContext(seed: 1); + + // Get the dataset + var data = mlContext.Data.LoadFromTextFile(GetDataPath(TestDatasets.housing.trainFilename), hasHeader: true); + + // Create a pipeline to train on the housing data. + var pipeline = mlContext.Transforms.Concatenate("Features", HousingRegression.Features) + .Append(mlContext.Regression.Trainers.Sdca()); + + // Fit the pipeline and transform the data. + var model = pipeline.Fit(data); + + var modelAndSchemaPath = GetOutputPath("TestFunctionalTestPFI.zip"); + mlContext.Model.Save(model, data.Schema, modelAndSchemaPath); + + ITransformer loadedModel; + using (var fs = File.OpenRead(modelAndSchemaPath)) + loadedModel = mlContext.Model.Load(modelAndSchemaPath, out var schema); + + var transformedData = loadedModel.Transform(data); + var linearPredictor = (loadedModel as TransformerChain).LastTransformer as RegressionPredictionTransformer; + + // Compute the permutation feature importance to look at global feature importance. + var permutationMetrics = mlContext.Regression.PermutationFeatureImportance(linearPredictor, transformedData); + + // Make sure the correct number of features came back. + Assert.Equal(HousingRegression.Features.Length, permutationMetrics.Length); + foreach (var metricsStatistics in permutationMetrics) + Common.AssertMetricsStatistics(metricsStatistics); + } + /// /// GlobalFeatureImportance: A linear model's feature importance can be viewed through its weight coefficients. 
/// diff --git a/test/Microsoft.ML.Tests/PermutationFeatureImportanceTests.cs b/test/Microsoft.ML.Tests/PermutationFeatureImportanceTests.cs index ac86a8703f..44ebe8dcbb 100644 --- a/test/Microsoft.ML.Tests/PermutationFeatureImportanceTests.cs +++ b/test/Microsoft.ML.Tests/PermutationFeatureImportanceTests.cs @@ -4,11 +4,13 @@ using System; using System.Collections.Immutable; +using System.IO; using System.Linq; using Microsoft.ML.Data; using Microsoft.ML.Internal.Utilities; using Microsoft.ML.RunTests; using Microsoft.ML.Trainers; +using Microsoft.ML.Trainers.FastTree; using Xunit; using Xunit.Abstractions; @@ -55,6 +57,48 @@ public void TestPfiRegressionOnDenseFeatures() Done(); } + /// + /// Test PFI Regression for Dense Features with a model loaded from disk + /// + [Fact] + public void TestPfiRegressionOnDenseFeaturesWithLoadedModel() + { + var data = GetDenseDataset(); + var model = ML.Regression.Trainers.OnlineGradientDescent().Fit(data); + + var modelAndSchemaPath = GetOutputPath("TestPfiRegressionOnDenseFeatures.zip"); + ML.Model.Save(model, data.Schema, modelAndSchemaPath); + + ITransformer loadedModel; + using (var fs = File.OpenRead(modelAndSchemaPath)) + loadedModel = ML.Model.Load(modelAndSchemaPath, out var schema); + + var castedModel = loadedModel as RegressionPredictionTransformer; + var pfi = ML.Regression.PermutationFeatureImportance(castedModel, data); + + // Pfi Indices: + // X1: 0 + // X2Important: 1 + // X3: 2 + // X4Rand: 3 + + // For the following metrics lower is better, so maximum delta means more important feature, and vice versa + Assert.Equal(3, MinDeltaIndex(pfi, m => m.MeanAbsoluteError.Mean)); + Assert.Equal(1, MaxDeltaIndex(pfi, m => m.MeanAbsoluteError.Mean)); + + Assert.Equal(3, MinDeltaIndex(pfi, m => m.MeanSquaredError.Mean)); + Assert.Equal(1, MaxDeltaIndex(pfi, m => m.MeanSquaredError.Mean)); + + Assert.Equal(3, MinDeltaIndex(pfi, m => m.RootMeanSquaredError.Mean)); + Assert.Equal(1, MaxDeltaIndex(pfi, m => m.RootMeanSquaredError.Mean)); + + // For the following metrics higher is better, so minimum delta means more important feature, and vice versa + Assert.Equal(1, MinDeltaIndex(pfi, m => m.RSquared.Mean)); + Assert.Equal(3, MaxDeltaIndex(pfi, m => m.RSquared.Mean)); + + Done(); + } + /// /// Test PFI Regression Standard Deviation and Standard Error for Dense Features /// @@ -107,6 +151,68 @@ public void TestPfiRegressionStandardDeviationAndErrorOnDenseFeatures() Done(); } + /// + /// Test PFI Regression Standard Deviation and Standard Error for Dense Features with a model loaded from disk + /// + [Fact] + public void TestPfiRegressionStandardDeviationAndErrorOnDenseFeaturesWithLoadedModel() + { + var data = GetDenseDataset(); + var model = ML.Regression.Trainers.OnlineGradientDescent().Fit(data); + + var modelAndSchemaPath = GetOutputPath("TestPfiRegressionStandardDeviationAndErrorOnDenseFeatures.zip"); + ML.Model.Save(model, data.Schema, modelAndSchemaPath); + + ITransformer loadedModel; + using (var fs = File.OpenRead(modelAndSchemaPath)) + loadedModel = ML.Model.Load(modelAndSchemaPath, out var schema); + + var castedModel = loadedModel as RegressionPredictionTransformer; + var pfi = ML.Regression.PermutationFeatureImportance(castedModel, data, permutationCount: 20); + + // Keep the permutation count high so fluctuations are kept to a minimum + // but not high enough to slow down the tests + // (fluctuations lead to random test failures) + + // Pfi Indices: + // X1: 0 + // X2Important: 1 + // X3: 2 + // X4Rand: 3 + + // For these metrics, the 
magnitude of the difference will be greatest for 1, least for 3 + // Stardard Deviation will scale with the magnitude of the measure + Assert.Equal(3, MinDeltaIndex(pfi, m => m.MeanAbsoluteError.StandardDeviation)); + Assert.Equal(1, MaxDeltaIndex(pfi, m => m.MeanAbsoluteError.StandardDeviation)); + + Assert.Equal(3, MinDeltaIndex(pfi, m => m.MeanSquaredError.StandardDeviation)); + Assert.Equal(1, MaxDeltaIndex(pfi, m => m.MeanSquaredError.StandardDeviation)); + + Assert.Equal(3, MinDeltaIndex(pfi, m => m.RootMeanSquaredError.StandardDeviation)); + Assert.Equal(1, MaxDeltaIndex(pfi, m => m.RootMeanSquaredError.StandardDeviation)); + + Assert.Equal(3, MinDeltaIndex(pfi, m => m.RSquared.StandardDeviation)); + Assert.Equal(1, MaxDeltaIndex(pfi, m => m.RSquared.StandardDeviation)); + + // Stardard Error will scale with the magnitude of the measure (as it's SD/sqrt(N)) + Assert.Equal(3, MinDeltaIndex(pfi, m => m.MeanAbsoluteError.StandardError)); + Assert.Equal(1, MaxDeltaIndex(pfi, m => m.MeanAbsoluteError.StandardError)); + + Assert.Equal(3, MinDeltaIndex(pfi, m => m.MeanSquaredError.StandardError)); + Assert.Equal(1, MaxDeltaIndex(pfi, m => m.MeanSquaredError.StandardError)); + + Assert.Equal(3, MinDeltaIndex(pfi, m => m.RootMeanSquaredError.StandardError)); + Assert.Equal(1, MaxDeltaIndex(pfi, m => m.RootMeanSquaredError.StandardError)); + + Assert.Equal(3, MinDeltaIndex(pfi, m => m.RSquared.StandardError)); + Assert.Equal(1, MaxDeltaIndex(pfi, m => m.RSquared.StandardError)); + + // And test that the Standard Deviation and Standard Error are related as we expect + Assert.Equal(pfi[0].RootMeanSquaredError.StandardError, pfi[0].RootMeanSquaredError.StandardDeviation / Math.Sqrt(pfi[0].RootMeanSquaredError.Count)); + + Done(); + } + /// /// Test PFI Regression for Sparse Features /// @@ -141,6 +247,49 @@ public void TestPfiRegressionOnSparseFeatures() Assert.Equal(5, MinDeltaIndex(results, m => m.RSquared.Mean)); } + /// + /// Test PFI Regression for Sparse Features with a model loaded from disk + /// + [Fact] + public void TestPfiRegressionOnSparseFeaturesWithLoadedModel() + { + var data = GetSparseDataset(); + var model = ML.Regression.Trainers.OnlineGradientDescent().Fit(data); + + var modelAndSchemaPath = GetOutputPath("TestPfiRegressionOnSparseFeatures.zip"); + ML.Model.Save(model, data.Schema, modelAndSchemaPath); + + ITransformer loadedModel; + using (var fs = File.OpenRead(modelAndSchemaPath)) + loadedModel = ML.Model.Load(modelAndSchemaPath, out var schema); + + var castedModel = loadedModel as RegressionPredictionTransformer; + var results = ML.Regression.PermutationFeatureImportance(castedModel, data); + + // Pfi Indices: + // X1: 0 + // X2VBuffer-Slot-0: 1 + // X2VBuffer-Slot-1: 2 + // X2VBuffer-Slot-2: 3 + // X2VBuffer-Slot-3: 4 + // X3Important: 5 + + // Permuted X2VBuffer-Slot-1 lot (f2) should have min impact on SGD metrics, X3Important -- max impact. 
+ // For the following metrics lower is better, so maximum delta means more important feature, and vice versa + Assert.Equal(2, MinDeltaIndex(results, m => m.MeanAbsoluteError.Mean)); + Assert.Equal(5, MaxDeltaIndex(results, m => m.MeanAbsoluteError.Mean)); + + Assert.Equal(2, MinDeltaIndex(results, m => m.MeanSquaredError.Mean)); + Assert.Equal(5, MaxDeltaIndex(results, m => m.MeanSquaredError.Mean)); + + Assert.Equal(2, MinDeltaIndex(results, m => m.RootMeanSquaredError.Mean)); + Assert.Equal(5, MaxDeltaIndex(results, m => m.RootMeanSquaredError.Mean)); + + // For the following metrics higher is better, so minimum delta means more important feature, and vice versa + Assert.Equal(2, MaxDeltaIndex(results, m => m.RSquared.Mean)); + Assert.Equal(5, MinDeltaIndex(results, m => m.RSquared.Mean)); + } + #endregion #region Binary Classification Tests @@ -261,6 +410,52 @@ public void TestPfiMulticlassClassificationOnDenseFeatures() Done(); } + /// + /// Test PFI Multiclass Classification for Dense Features using a model loaded from disk + /// + [Fact] + public void TestPfiMulticlassClassificationOnDenseFeaturesWithLoadedModel() + { + var data = GetDenseDataset(TaskType.MulticlassClassification); + var model = ML.MulticlassClassification.Trainers.LbfgsMaximumEntropy().Fit(data); + + var modelAndSchemaPath = GetOutputPath("TestPfiMulticlassClassificationOnDenseFeatures.zip"); + ML.Model.Save(model, data.Schema, modelAndSchemaPath); + + ITransformer loadedModel; + using (var fs = File.OpenRead(modelAndSchemaPath)) + loadedModel = ML.Model.Load(modelAndSchemaPath, out var schema); + + var castedModel = loadedModel as MulticlassPredictionTransformer; + var pfi = ML.MulticlassClassification.PermutationFeatureImportance(castedModel, data); + + // Pfi Indices: + // X1: 0 + // X2Important: 1 + // X3: 2 + // X4Rand: 3 + + // For the following metrics higher is better, so minimum delta means more important feature, and vice versa + Assert.Equal(3, MaxDeltaIndex(pfi, m => m.MicroAccuracy.Mean)); + Assert.Equal(1, MinDeltaIndex(pfi, m => m.MicroAccuracy.Mean)); + Assert.Equal(3, MaxDeltaIndex(pfi, m => m.MacroAccuracy.Mean)); + Assert.Equal(1, MinDeltaIndex(pfi, m => m.MacroAccuracy.Mean)); + Assert.Equal(3, MaxDeltaIndex(pfi, m => m.LogLossReduction.Mean)); + Assert.Equal(1, MinDeltaIndex(pfi, m => m.LogLossReduction.Mean)); + + // For the following metrics-delta lower is better, so maximum delta means more important feature, and vice versa + // Because they are _negative_, the difference will be positive for worse classifiers. 
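+ // Hence permuting the most important feature (X2Important, index 1) should give the largest increase in LogLoss,
+ // while permuting the random feature (X4Rand, index 3) should give the smallest.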
+ Assert.Equal(1, MaxDeltaIndex(pfi, m => m.LogLoss.Mean)); + Assert.Equal(3, MinDeltaIndex(pfi, m => m.LogLoss.Mean)); + for (int i = 0; i < pfi[0].PerClassLogLoss.Count; i++) + { + Assert.True(MaxDeltaIndex(pfi, m => m.PerClassLogLoss[i].Mean) == 1); + Assert.True(MinDeltaIndex(pfi, m => m.PerClassLogLoss[i].Mean) == 3); + } + + Done(); + } + /// /// Test PFI Multiclass Classification for Sparse Features /// @@ -301,11 +496,60 @@ public void TestPfiMulticlassClassificationOnSparseFeatures() Done(); } + + /// + /// Test PFI Multiclass Classification for Sparse Features using a model loaded from disk + /// + [Fact] + public void TestPfiMulticlassClassificationOnSparseFeaturesWithLoadedModel() + { + var data = GetSparseDataset(TaskType.MulticlassClassification); + var model = ML.MulticlassClassification.Trainers.LbfgsMaximumEntropy( + new LbfgsMaximumEntropyMulticlassTrainer.Options { MaximumNumberOfIterations = 1000 }).Fit(data); + + var modelAndSchemaPath = GetOutputPath("TestPfiMulticlassClassificationOnSparseFeatures.zip"); + ML.Model.Save(model, data.Schema, modelAndSchemaPath); + + ITransformer loadedModel; + using (var fs = File.OpenRead(modelAndSchemaPath)) + loadedModel = ML.Model.Load(modelAndSchemaPath, out var schema); + + var castedModel = loadedModel as MulticlassPredictionTransformer; + var pfi = ML.MulticlassClassification.PermutationFeatureImportance(castedModel, data); + + // Pfi Indices: + // X1: 0 + // X2VBuffer-Slot-0: 1 + // X2VBuffer-Slot-1: 2 // Least important + // X2VBuffer-Slot-2: 3 + // X2VBuffer-Slot-3: 4 + // X3Important: 5 // Most important + + // For the following metrics higher is better, so minimum delta means more important feature, and vice versa + Assert.Equal(2, MaxDeltaIndex(pfi, m => m.MicroAccuracy.Mean)); + Assert.Equal(5, MinDeltaIndex(pfi, m => m.MicroAccuracy.Mean)); + Assert.Equal(2, MaxDeltaIndex(pfi, m => m.MacroAccuracy.Mean)); + Assert.Equal(5, MinDeltaIndex(pfi, m => m.MacroAccuracy.Mean)); + Assert.Equal(2, MaxDeltaIndex(pfi, m => m.LogLossReduction.Mean)); + Assert.Equal(5, MinDeltaIndex(pfi, m => m.LogLossReduction.Mean)); + + // For the following metrics-delta lower is better, so maximum delta means more important feature, and vice versa + // Because they are negative metrics, the _difference_ will be positive for worse classifiers. 
+ Assert.Equal(5, MaxDeltaIndex(pfi, m => m.LogLoss.Mean)); + Assert.Equal(2, MinDeltaIndex(pfi, m => m.LogLoss.Mean)); + for (int i = 0; i < pfi[0].PerClassLogLoss.Count; i++) + { + Assert.Equal(5, MaxDeltaIndex(pfi, m => m.PerClassLogLoss[i].Mean)); + Assert.Equal(2, MinDeltaIndex(pfi, m => m.PerClassLogLoss[i].Mean)); + } + + Done(); + } #endregion #region Ranking Tests /// - /// Test PFI Multiclass Classification for Dense Features + /// Test PFI Ranking Classification for Dense Features /// [Fact] public void TestPfiRankingOnDenseFeatures() @@ -335,6 +579,46 @@ public void TestPfiRankingOnDenseFeatures() Done(); } + /// + /// Test PFI Multiclass Classification for Dense Features using model loaded from disk + /// + [Fact] + public void TestPfiRankingOnDenseFeaturesWithLoadedModel() + { + var data = GetDenseDataset(TaskType.Ranking); + var model = ML.Ranking.Trainers.FastTree().Fit(data); + + var modelAndSchemaPath = GetOutputPath("TestPfiRankingOnDenseFeatures.zip"); + ML.Model.Save(model, data.Schema, modelAndSchemaPath); + + ITransformer loadedModel; + using (var fs = File.OpenRead(modelAndSchemaPath)) + loadedModel = ML.Model.Load(modelAndSchemaPath, out var schema); + + var castedModel = loadedModel as RankingPredictionTransformer; + var pfi = ML.Ranking.PermutationFeatureImportance(castedModel, data); + + // Pfi Indices: + // X1: 0 // For Ranking, this column won't result in misorderings + // X2Important: 1 + // X3: 2 + // X4Rand: 3 + + // For the following metrics higher is better, so minimum delta means more important feature, and vice versa + for (int i = 0; i < pfi[0].DiscountedCumulativeGains.Count; i++) + { + Assert.Equal(0, MaxDeltaIndex(pfi, m => m.DiscountedCumulativeGains[i].Mean)); + Assert.Equal(1, MinDeltaIndex(pfi, m => m.DiscountedCumulativeGains[i].Mean)); + } + for (int i = 0; i < pfi[0].NormalizedDiscountedCumulativeGains.Count; i++) + { + Assert.Equal(0, MaxDeltaIndex(pfi, m => m.NormalizedDiscountedCumulativeGains[i].Mean)); + Assert.Equal(1, MinDeltaIndex(pfi, m => m.NormalizedDiscountedCumulativeGains[i].Mean)); + } + + Done(); + } + /// /// Test PFI Multiclass Classification for Sparse Features /// @@ -367,6 +651,48 @@ public void TestPfiRankingOnSparseFeatures() Done(); } + + /// + /// Test PFI Multiclass Classification for Sparse Features with model loaded from disk + /// + [Fact] + public void TestPfiRankingOnSparseFeaturesWithLoadedModel() + { + var data = GetSparseDataset(TaskType.Ranking); + var model = ML.Ranking.Trainers.FastTree().Fit(data); + + var modelAndSchemaPath = GetOutputPath("TestPfiRankingOnSparseFeatures.zip"); + ML.Model.Save(model, data.Schema, modelAndSchemaPath); + + ITransformer loadedModel; + using (var fs = File.OpenRead(modelAndSchemaPath)) + loadedModel = ML.Model.Load(modelAndSchemaPath, out var schema); + + var castedModel = loadedModel as RankingPredictionTransformer; + var pfi = ML.Ranking.PermutationFeatureImportance(castedModel, data); + + // Pfi Indices: + // X1: 0 + // X2VBuffer-Slot-0: 1 + // X2VBuffer-Slot-1: 2 // Least important + // X2VBuffer-Slot-2: 3 + // X2VBuffer-Slot-3: 4 + // X3Important: 5 // Most important + + // For the following metrics higher is better, so minimum delta means more important feature, and vice versa + for (int i = 0; i < pfi[0].DiscountedCumulativeGains.Count; i++) + { + Assert.Equal(2, MaxDeltaIndex(pfi, m => m.DiscountedCumulativeGains[i].Mean)); + Assert.Equal(5, MinDeltaIndex(pfi, m => m.DiscountedCumulativeGains[i].Mean)); + } + for (int i = 0; i < 
pfi[0].NormalizedDiscountedCumulativeGains.Count; i++) + { + Assert.Equal(2, MaxDeltaIndex(pfi, m => m.NormalizedDiscountedCumulativeGains[i].Mean)); + Assert.Equal(5, MinDeltaIndex(pfi, m => m.NormalizedDiscountedCumulativeGains[i].Mean)); + } + + Done(); + } #endregion #region Helpers diff --git a/test/Microsoft.ML.Tests/TrainerEstimators/LbfgsTests.cs b/test/Microsoft.ML.Tests/TrainerEstimators/LbfgsTests.cs index 1174c87aca..de8f8bade3 100644 --- a/test/Microsoft.ML.Tests/TrainerEstimators/LbfgsTests.cs +++ b/test/Microsoft.ML.Tests/TrainerEstimators/LbfgsTests.cs @@ -125,7 +125,7 @@ public void TestLRWithStats() using (var fs = File.OpenRead(modelAndSchemaPath)) transformerChain = ML.Model.Load(fs, out var schema); - var lastTransformer = ((TransformerChain)transformerChain).LastTransformer as BinaryPredictionTransformer>; + var lastTransformer = ((TransformerChain)transformerChain).LastTransformer as BinaryPredictionTransformer, ICalibrator>>; var model = lastTransformer.Model as ParameterMixingCalibratedModelParameters, ICalibrator>; linearModel = model.SubModel as LinearBinaryModelParameters; @@ -215,8 +215,8 @@ public void TestMLRWithStats() using (var fs = File.OpenRead(modelAndSchemaPath)) transformerChain = ML.Model.Load(fs, out var schema); - var lastTransformer = ((TransformerChain)transformerChain).LastTransformer as MulticlassPredictionTransformer>>; - model = lastTransformer.Model as MaximumEntropyModelParameters; + var lastTransformer = ((TransformerChain)transformerChain).LastTransformer as MulticlassPredictionTransformer; + model = lastTransformer.Model; validateStats(model); From 78f9dbb57808360a7bf652ab30a264717eb0b06a Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Fri, 27 Sep 2019 15:50:33 -0700 Subject: [PATCH 03/14] Cleaning up comments --- ...ermutationFeatureImportanceLoadFromDisk.cs | 8 +-- ...ermutationFeatureImportanceLoadFromDisk.cs | 9 ++-- ...ermutationFeatureImportanceLoadFromDisk.cs | 54 ++++++++++++------- .../Scorers/PredictionTransformer.cs | 44 ++++++++++----- 4 files changed, 75 insertions(+), 40 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/PermutationFeatureImportanceLoadFromDisk.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/PermutationFeatureImportanceLoadFromDisk.cs index a2f6efa711..ca7a06954d 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/PermutationFeatureImportanceLoadFromDisk.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/PermutationFeatureImportanceLoadFromDisk.cs @@ -35,11 +35,12 @@ public static void Example() .Append(mlContext.MulticlassClassification.Trainers .SdcaMaximumEntropy()); + // Fit the pipeline to the data and save the model var model0 = pipeline.Fit(data); - var modelPath = "./model0.zip"; mlContext.Model.Save(model0, data.Schema, modelPath); + // Load the model var model = mlContext.Model.Load(modelPath, out var schema); // Transform the dataset. 
@@ -78,8 +79,9 @@ public static void Example() // Expected output: //Feature Change in MicroAccuracy 95% Confidence in the Mean Change in MicroAccuracy - //Feature2 -0.1395 0.0006567 - //Feature1 -0.05367 0.0006908 + //Feature2 -0.1396 0.0008036 + //Feature1 -0.05421 0.0006154 + } private class Data diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/PermutationFeatureImportanceLoadFromDisk.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/PermutationFeatureImportanceLoadFromDisk.cs index 633e5a286b..dc1bc4e240 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/PermutationFeatureImportanceLoadFromDisk.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/PermutationFeatureImportanceLoadFromDisk.cs @@ -34,15 +34,18 @@ public static void Example() .Append(mlContext.Transforms.NormalizeMinMax("Features")) .Append(mlContext.Ranking.Trainers.FastTree()); + // Train the model and save to disk var model0 = pipeline.Fit(data); - var modelPath = "./model0.zip"; mlContext.Model.Save(model0, data.Schema, modelPath); + // Load model var model = mlContext.Model.Load(modelPath, out var schema); + // Transform Data var transformedData = model.Transform(data); + // Extract the predictor var linearPredictor = (model as TransformerChain).LastTransformer as RankingPredictionTransformer; // Compute the permutation metrics for the linear model using the @@ -75,8 +78,8 @@ public static void Example() // Expected output: // Feature Change in NDCG@1 95% Confidence in the Mean Change in NDCG@1 - // Feature2 -0.2421 0.001748 - // Feature1 -0.0513 0.001184 + // Feature2 -0.2432 0.001762 + // Feature1 -0.05235 0.001116 } private class Data diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/PermutationFeatureImportanceLoadFromDisk.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/PermutationFeatureImportanceLoadFromDisk.cs index cde0799f02..5e825915c9 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/PermutationFeatureImportanceLoadFromDisk.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/PermutationFeatureImportanceLoadFromDisk.cs @@ -7,15 +7,23 @@ namespace Samples.Dynamic.Trainers.Regression { - class PermutationFeatureImportance2 + public static class PermutationFeatureImportance2 { public static void Example() { - Console.WriteLine("ORIGINAL MODEL"); + // Create a new context for ML.NET operations. It can be used for + // exception tracking and logging, as a catalog of available operations + // and as the source of randomness. var mlContext = new MLContext(seed: 1); + + // Create sample data. var samples = GenerateData(); + + // Load the sample data as an IDataView. var data = mlContext.Data.LoadFromEnumerable(samples); + // Define a training pipeline that concatenates features into a vector, + // normalizes them, and then trains a linear model. 
var featureColumns = new string[] { nameof(Data.Feature1), nameof(Data.Feature2) }; @@ -25,23 +33,30 @@ public static void Example() .Append(mlContext.Transforms.NormalizeMinMax("Features")) .Append(mlContext.Regression.Trainers.Ols()); - var model = pipeline.Fit(data); - Console.WriteLine("LOADED MODEL FROM DISK"); + // Train the model and save to disk + var model0 = pipeline.Fit(data); + var modelPath = "./model0.zip"; + mlContext.Model.Save(model0, data.Schema, modelPath); - var modelPath = "./model.zip"; - mlContext.Model.Save(model, data.Schema, modelPath); + // Load model + var model = mlContext.Model.Load(modelPath, out var schema); - var loadedModel = mlContext.Model.Load(modelPath, out var schema); - var transformedData = loadedModel.Transform(data); - var linearPredictor = (loadedModel as TransformerChain).LastTransformer as RegressionPredictionTransformer; + // Transform Data + var transformedData = model.Transform(data); + // Extract the predictor. + var linearPredictor = (model as TransformerChain).LastTransformer as RegressionPredictionTransformer; + + // Compute the permutation metrics for the linear model using the + // normalized data. var permutationMetrics = mlContext.Regression .PermutationFeatureImportance( linearPredictor, transformedData, permutationCount: 30); + // Now let's look at which features are most important to the model + // overall. Get the feature indices sorted by their impact on RMSE. var sortedIndices = permutationMetrics - .Select((metrics, index) => new - { + .Select((metrics, index) => new { index, metrics.RootMeanSquaredError }) @@ -51,7 +66,7 @@ public static void Example() .Select(feature => feature.index); - Console.WriteLine("Feature\tModel Weight\tChange in RMSE\t95% " + + Console.WriteLine("Feature\tModel Weight\tChange in RMSE\t95%" + "Confidence in the Mean Change in RMSE"); var rmse = permutationMetrics.Select(x => x.RootMeanSquaredError) @@ -59,18 +74,17 @@ public static void Example() foreach (int i in sortedIndices) { - Console.WriteLine("{0}\t{1:0.00}\t{2:G4}\t{3:G4}\t{4:G4}", + Console.WriteLine("{0}\t{1:0.00}\t{2:G4}\t{3:G4}", featureColumns[i], linearPredictor.Model.Weights[i], rmse[i].Mean, - 1.96 * rmse[i].StandardError, - rmse[i].StandardDeviation); + 1.96 * rmse[i].StandardError); } - // EXPECTED OUTPUT - //Feature Model Weight Change in RMSE 95 % Confidence in the Mean Change in RMSE - //Feature2 9.00 4.01 0.006723 0.01879 - //Feature1 4.48 1.901 0.003235 0.00904 + // Expected output: + // Feature Model Weight Change in RMSE 95% Confidence in the Mean Change in RMSE + // Feature2 9.00 4.01 0.006723 + // Feature1 4.48 1.901 0.003235 } private class Data @@ -115,4 +129,4 @@ private static IEnumerable GenerateData(int nExamples = 10000, } } } -} +} \ No newline at end of file diff --git a/src/Microsoft.ML.Data/Scorers/PredictionTransformer.cs b/src/Microsoft.ML.Data/Scorers/PredictionTransformer.cs index 95bf7ce449..749c8e8c52 100644 --- a/src/Microsoft.ML.Data/Scorers/PredictionTransformer.cs +++ b/src/Microsoft.ML.Data/Scorers/PredictionTransformer.cs @@ -113,13 +113,13 @@ private protected PredictionTransformerBase(IHost host, ModelLoadContext ctx) [BestFriend] private protected PredictionTransformerBase(IHost host, ModelLoadContext ctx, TModel model) { - //MYMARSHALINVOKE Host = host; // *** Binary format *** // model: prediction model. // stream: empty data view that contains train schema. // id of string: feature column. + Model = model; // Clone the stream with the schema into memory. 
@@ -243,7 +243,6 @@ private protected SingleFeaturePredictionTransformerBase(IHost host, ModelLoadCo private protected SingleFeaturePredictionTransformerBase(IHost host, ModelLoadContext ctx, TModel model) : base(host, ctx, model) { - //MYMARSHALINVOKE FeatureColumnName = ctx.LoadStringOrNull(); if (FeatureColumnName == null) @@ -403,7 +402,11 @@ internal BinaryPredictionTransformer(IHostEnvironment env, ModelLoadContext ctx) internal BinaryPredictionTransformer(IHostEnvironment env, ModelLoadContext ctx, IHost host, TModel model) : base(host, ctx, model) { - //MYMARSHALINVOKE + // *** Binary format *** + // + // float: scorer threshold + // id of string: scorer threshold column + Threshold = ctx.Reader.ReadSingle(); ThresholdColumn = ctx.LoadString(); SetScorer(); @@ -476,7 +479,10 @@ internal MulticlassPredictionTransformer(IHostEnvironment env, ModelLoadContext internal MulticlassPredictionTransformer(IHostEnvironment env, ModelLoadContext ctx, IHost host, TModel model) : base(host, ctx, model) { - //MYMARSHALINVOKE + // *** Binary format *** + // + // id of string: train label column + _trainLabelColumn = ctx.LoadStringOrNull(); SetScorer(); } @@ -536,7 +542,6 @@ internal RegressionPredictionTransformer(IHostEnvironment env, ModelLoadContext internal RegressionPredictionTransformer(IHostEnvironment env, ModelLoadContext ctx, IHost host, TModel model) : base(host, ctx, model) { - //MYMARSHALINVOKE Scorer = GetGenericScorer(); } @@ -585,7 +590,6 @@ internal RankingPredictionTransformer(IHostEnvironment env, ModelLoadContext ctx internal RankingPredictionTransformer(IHostEnvironment env, ModelLoadContext ctx, IHost host, TModel model) : base(host, ctx, model) { - //MYMARSHALINVOKE Scorer = GetGenericScorer(); } @@ -667,13 +671,16 @@ private static VersionInfo GetVersionInfo() internal static class BinaryPredictionTransformer { public const string LoaderSignature = "BinaryPredXfer"; + private const string DirModel = "Model"; // This should match "DirModel" in PredictionTransformerBase<> public static object Create(IHostEnvironment env, ModelLoadContext ctx) { - //MYMARSHALINVOKE + // Load internal model to be used as TModel of BinaryPredictionTransformer var host = Contracts.CheckRef(env, nameof(env)).Register(nameof(BinaryPredictionTransformer>)); - ctx.LoadModel, SignatureLoadModel>(host, out IPredictorProducing model, "Model"); // MYTODO: don't hardcode the DirModel + ctx.LoadModel, SignatureLoadModel>(host, out IPredictorProducing model, DirModel); + // Create generic type of BinaryPredictionTransformer using the correct TModel. 
+ // Return an instance of that type, passing the previously loaded model to the constructor Type generic = typeof(BinaryPredictionTransformer<>); Type[] genericTypeArgs = { model.GetType() }; Type constructed = generic.MakeGenericType(genericTypeArgs); @@ -695,13 +702,16 @@ public static object Create(IHostEnvironment env, ModelLoadContext ctx) internal static class MulticlassPredictionTransformer { public const string LoaderSignature = "MulticlassPredXfer"; + private const string DirModel = "Model"; // This should match "DirModel" in PredictionTransformerBase<> public static object Create(IHostEnvironment env, ModelLoadContext ctx) { - //MYMARSHALINVOKE + // Load internal model to be used as TModel of MulticlassPredictionTransformer var host = Contracts.CheckRef(env, nameof(env)).Register(nameof(MulticlassPredictionTransformer>>)); - ctx.LoadModel>, SignatureLoadModel>(host, out IPredictorProducing> model, "Model"); // MYTODO: don't hardcode the DirModel + ctx.LoadModel>, SignatureLoadModel>(host, out IPredictorProducing> model, DirModel); + // Create generic type of MulticlassPredictionTransformer using the correct TModel. + // Return an instance of that type, passing the previously loaded model to the constructor Type generic = typeof(MulticlassPredictionTransformer<>); Type[] genericTypeArgs = { model.GetType() }; Type constructed = generic.MakeGenericType(genericTypeArgs); @@ -723,13 +733,16 @@ public static object Create(IHostEnvironment env, ModelLoadContext ctx) internal static class RegressionPredictionTransformer { public const string LoaderSignature = "RegressionPredXfer"; + private const string DirModel = "Model"; // This should match "DirModel" in PredictionTransformerBase<> public static object Create(IHostEnvironment env, ModelLoadContext ctx) { - //MYMARSHALINVOKE + // Load internal model to be used as TModel of RegressionPredictionTransformer var host = Contracts.CheckRef(env, nameof(env)).Register(nameof(RegressionPredictionTransformer>)); - ctx.LoadModel, SignatureLoadModel>(host, out IPredictorProducing model, "Model"); // MYTODO: don't hardcode the DirModel + ctx.LoadModel, SignatureLoadModel>(host, out IPredictorProducing model, DirModel); // MYTODO: don't hardcode the DirModel + // Create generic type of RegressionPredictionTransformer using the correct TModel. + // Return an instance of that type, passing the previously loaded model to the constructor Type generic = typeof(RegressionPredictionTransformer<>); Type[] genericTypeArgs = { model.GetType() }; Type constructed = generic.MakeGenericType(genericTypeArgs); @@ -751,13 +764,16 @@ public static object Create(IHostEnvironment env, ModelLoadContext ctx) internal static class RankingPredictionTransformer { public const string LoaderSignature = "RankingPredXfer"; + private const string DirModel = "Model"; // This should match "DirModel" in PredictionTransformerBase<> public static object Create(IHostEnvironment env, ModelLoadContext ctx) { - //MYMARSHALINVOKE + // Load internal model to be used as TModel of RankingPredictionTransformer var host = Contracts.CheckRef(env, nameof(env)).Register(nameof(RankingPredictionTransformer>)); - ctx.LoadModel, SignatureLoadModel>(host, out IPredictorProducing model, "Model"); // MYTODO: don't hardcode the DirModel + ctx.LoadModel, SignatureLoadModel>(host, out IPredictorProducing model, DirModel); + // Create generic type of RankingPredictionTransformer using the correct TModel. 
+ // Return an instance of that type, passing the previously loaded model to the constructor Type generic = typeof(RankingPredictionTransformer<>); Type[] genericTypeArgs = { model.GetType() }; Type constructed = generic.MakeGenericType(genericTypeArgs); From 0cd704e25b873c6218980333068adab1573028bc Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Fri, 27 Sep 2019 15:51:08 -0700 Subject: [PATCH 04/14] Minor update of LbgfgsTests removing a cast that is now unnecessary --- test/Microsoft.ML.Tests/TrainerEstimators/LbfgsTests.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/Microsoft.ML.Tests/TrainerEstimators/LbfgsTests.cs b/test/Microsoft.ML.Tests/TrainerEstimators/LbfgsTests.cs index de8f8bade3..f80ee68d2d 100644 --- a/test/Microsoft.ML.Tests/TrainerEstimators/LbfgsTests.cs +++ b/test/Microsoft.ML.Tests/TrainerEstimators/LbfgsTests.cs @@ -126,7 +126,7 @@ public void TestLRWithStats() transformerChain = ML.Model.Load(fs, out var schema); var lastTransformer = ((TransformerChain)transformerChain).LastTransformer as BinaryPredictionTransformer, ICalibrator>>; - var model = lastTransformer.Model as ParameterMixingCalibratedModelParameters, ICalibrator>; + var model = lastTransformer.Model; linearModel = model.SubModel as LinearBinaryModelParameters; From 931defd2ec235ec00147e3696ff0e8b311de6e58 Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Fri, 27 Sep 2019 16:03:20 -0700 Subject: [PATCH 05/14] Deleting sample of using BinaryPredictionTransformer with a loaded model, since problems arise because of the ParameterMixingCalibratedModelParameters --- ...mutationFeatureImportanceLoadedFromDisk.cs | 133 ------------------ 1 file changed, 133 deletions(-) delete mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/PermutationFeatureImportanceLoadedFromDisk.cs diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/PermutationFeatureImportanceLoadedFromDisk.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/PermutationFeatureImportanceLoadedFromDisk.cs deleted file mode 100644 index d45f528ecc..0000000000 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/PermutationFeatureImportanceLoadedFromDisk.cs +++ /dev/null @@ -1,133 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using Microsoft.ML; -using Microsoft.ML.Calibrators; -using Microsoft.ML.Data; -using Microsoft.ML.Trainers; - -namespace Samples.Dynamic.Trainers.BinaryClassification -{ - public static class PermutationFeatureImportance2 - { - public static void Example() - { - // Create a new context for ML.NET operations. It can be used for - // exception tracking and logging, as a catalog of available operations - // and as the source of randomness. - var mlContext = new MLContext(seed: 1); - - // Create sample data. - var samples = GenerateData(); - - // Load the sample data as an IDataView. - var data = mlContext.Data.LoadFromEnumerable(samples); - - // Define a training pipeline that concatenates features into a vector, - // normalizes them, and then trains a linear model. - var featureColumns = - new string[] { nameof(Data.Feature1), nameof(Data.Feature2) }; - var pipeline = mlContext.Transforms - .Concatenate("Features", featureColumns) - .Append(mlContext.Transforms.NormalizeMinMax("Features")) - .Append(mlContext.BinaryClassification.Trainers - .SdcaLogisticRegression()); - - // Fit the pipeline to the data. 
- var model0 = pipeline.Fit(data); - - var modelPath = "./model.zip"; - mlContext.Model.Save(model0, data.Schema, modelPath); - - var model = mlContext.Model.Load(modelPath, out var schema); - - // Transform the dataset. - var transformedData = model.Transform(data); - - // What we got originally: BinaryPredictionTransformer> - // What we get after the fix: BinaryPredictionTransformer, ICalibrator> - // What we should be getting: BinaryPredictionTransformer> - - var linearPredictor = (model as TransformerChain).LastTransformer as BinaryPredictionTransformer>; - // var linearPredictor = (model as TransformerChain).LastTransformer as BinaryPredictionTransformer, ICalibrator>>; - // var linearPredictor = model.LastTransformer; - - // Compute the permutation metrics for the linear model using the - // normalized data. - var permutationMetrics = mlContext.BinaryClassification - .PermutationFeatureImportance(linearPredictor, transformedData, - permutationCount: 30); - - // Now let's look at which features are most important to the model - // overall. Get the feature indices sorted by their impact on AUC. - var sortedIndices = permutationMetrics - .Select((metrics, index) => new { index, metrics.AreaUnderRocCurve }) - .OrderByDescending( - feature => Math.Abs(feature.AreaUnderRocCurve.Mean)) - .Select(feature => feature.index); - - Console.WriteLine("Feature\tModel Weight\tChange in AUC" - + "\t95% Confidence in the Mean Change in AUC"); - var auc = permutationMetrics.Select(x => x.AreaUnderRocCurve).ToArray(); - foreach (int i in sortedIndices) - { - Console.WriteLine("{0}\t{1:0.00}\t{2:G4}\t{3:G4}", - featureColumns[i], - linearPredictor.Model.SubModel.Weights[i], - auc[i].Mean, - 1.96 * auc[i].StandardError); - } - - // Expected output: - // Feature Model Weight Change in AUC 95% Confidence in the Mean Change in AUC - // Feature2 35.15 -0.387 0.002015 - // Feature1 17.94 -0.1514 0.0008963 - } - - private class Data - { - public bool Label { get; set; } - - public float Feature1 { get; set; } - - public float Feature2 { get; set; } - } - - /// - /// Generate an enumerable of Data objects, creating the label as a simple - /// linear combination of the features. - /// - /// The number of examples. - /// The bias, or offset, in the calculation of the label. - /// - /// The weight to multiply the first feature with to - /// compute the label. - /// The weight to multiply the second feature with to - /// compute the label. - /// The seed for generating feature values and label - /// noise. - /// An enumerable of Data objects. - private static IEnumerable GenerateData(int nExamples = 10000, - double bias = 0, double weight1 = 1, double weight2 = 2, int seed = 1) - { - var rng = new Random(seed); - for (int i = 0; i < nExamples; i++) - { - var data = new Data - { - Feature1 = (float)(rng.Next(10) * (rng.NextDouble() - 0.5)), - Feature2 = (float)(rng.Next(10) * (rng.NextDouble() - 0.5)), - }; - - // Create a noisy label. - var value = (float)(bias + weight1 * data.Feature1 + weight2 * - data.Feature2 + rng.NextDouble() - 0.5); - - data.Label = Sigmoid(value) > 0.5; - yield return data; - } - } - - private static double Sigmoid(double x) => 1.0 / (1.0 + Math.Exp(-1 * x)); - } -} From 9d0f7a43667ee9b148d8251e096f4f60b1985e58 Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Mon, 30 Sep 2019 15:32:47 -0700 Subject: [PATCH 06/14] Removed unnecessary comment. 
--- src/Microsoft.ML.Data/Scorers/PredictionTransformer.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Microsoft.ML.Data/Scorers/PredictionTransformer.cs b/src/Microsoft.ML.Data/Scorers/PredictionTransformer.cs index 749c8e8c52..b99fd8b216 100644 --- a/src/Microsoft.ML.Data/Scorers/PredictionTransformer.cs +++ b/src/Microsoft.ML.Data/Scorers/PredictionTransformer.cs @@ -739,7 +739,7 @@ public static object Create(IHostEnvironment env, ModelLoadContext ctx) { // Load internal model to be used as TModel of RegressionPredictionTransformer var host = Contracts.CheckRef(env, nameof(env)).Register(nameof(RegressionPredictionTransformer>)); - ctx.LoadModel, SignatureLoadModel>(host, out IPredictorProducing model, DirModel); // MYTODO: don't hardcode the DirModel + ctx.LoadModel, SignatureLoadModel>(host, out IPredictorProducing model, DirModel); // Create generic type of RegressionPredictionTransformer using the correct TModel. // Return an instance of that type, passing the previously loaded model to the constructor From 9997b2cf74c7e7fbcca3f0f166a8ac2960e8b190 Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Mon, 30 Sep 2019 16:38:13 -0700 Subject: [PATCH 07/14] Refactor constructors with Initialize methods --- .../Scorers/PredictionTransformer.cs | 59 +++++++++---------- 1 file changed, 27 insertions(+), 32 deletions(-) diff --git a/src/Microsoft.ML.Data/Scorers/PredictionTransformer.cs b/src/Microsoft.ML.Data/Scorers/PredictionTransformer.cs index b99fd8b216..18a7620000 100644 --- a/src/Microsoft.ML.Data/Scorers/PredictionTransformer.cs +++ b/src/Microsoft.ML.Data/Scorers/PredictionTransformer.cs @@ -92,36 +92,27 @@ private protected PredictionTransformerBase(IHost host, ModelLoadContext ctx) // *** Binary format *** // model: prediction model. - // stream: empty data view that contains train schema. - // id of string: feature column. ctx.LoadModel(host, out TModel model, DirModel); Model = model; - // Clone the stream with the schema into memory. - var ms = new MemoryStream(); - ctx.TryLoadBinaryStream(DirTransSchema, reader => - { - reader.BaseStream.CopyTo(ms); - }); - - ms.Position = 0; - var loader = new BinaryLoader(host, new BinaryLoader.Arguments(), ms); - TrainSchema = loader.Schema; + InitializeLogic(host, ctx); } [BestFriend] private protected PredictionTransformerBase(IHost host, ModelLoadContext ctx, TModel model) { Host = host; + Model = model; // prediction model + InitializeLogic(host, ctx); + } + private protected void InitializeLogic(IHost host, ModelLoadContext ctx) + { // *** Binary format *** - // model: prediction model. // stream: empty data view that contains train schema. // id of string: feature column. - Model = model; - // Clone the stream with the schema into memory. 
var ms = new MemoryStream(); ctx.TryLoadBinaryStream(DirTransSchema, reader => @@ -134,6 +125,11 @@ private protected PredictionTransformerBase(IHost host, ModelLoadContext ctx, TM TrainSchema = loader.Schema; } + private protected void InitializeLogic(IHost host, ModelLoadContext ctx, TModel model) + { + + } + /// /// Gets the output schema resulting from the /// @@ -389,26 +385,24 @@ internal BinaryPredictionTransformer(IHostEnvironment env, TModel model, DataVie internal BinaryPredictionTransformer(IHostEnvironment env, ModelLoadContext ctx) : base(Contracts.CheckRef(env, nameof(env)).Register(nameof(BinaryPredictionTransformer)), ctx) { - // *** Binary format *** - // - // float: scorer threshold - // id of string: scorer threshold column - - Threshold = ctx.Reader.ReadSingle(); - ThresholdColumn = ctx.LoadString(); - SetScorer(); + InitializationLogic(ctx, out Threshold, out ThresholdColumn); } internal BinaryPredictionTransformer(IHostEnvironment env, ModelLoadContext ctx, IHost host, TModel model) : base(host, ctx, model) + { + InitializationLogic(ctx, out Threshold, out ThresholdColumn); + } + + internal void InitializationLogic(ModelLoadContext ctx, out float threshold, out string thresholdcolumn) { // *** Binary format *** // // float: scorer threshold // id of string: scorer threshold column - Threshold = ctx.Reader.ReadSingle(); - ThresholdColumn = ctx.LoadString(); + threshold = ctx.Reader.ReadSingle(); + thresholdcolumn = ctx.LoadString(); SetScorer(); } @@ -468,22 +462,23 @@ internal MulticlassPredictionTransformer(IHostEnvironment env, TModel model, Dat internal MulticlassPredictionTransformer(IHostEnvironment env, ModelLoadContext ctx) : base(Contracts.CheckRef(env, nameof(env)).Register(nameof(MulticlassPredictionTransformer)), ctx) { - // *** Binary format *** - // - // id of string: train label column - - _trainLabelColumn = ctx.LoadStringOrNull(); - SetScorer(); + InitializationLogic(ctx, out _trainLabelColumn); } internal MulticlassPredictionTransformer(IHostEnvironment env, ModelLoadContext ctx, IHost host, TModel model) : base(host, ctx, model) + { + + InitializationLogic(ctx, out _trainLabelColumn); + } + + internal void InitializationLogic(ModelLoadContext ctx, out string trainLabelColumn) { // *** Binary format *** // // id of string: train label column - _trainLabelColumn = ctx.LoadStringOrNull(); + trainLabelColumn = ctx.LoadStringOrNull(); SetScorer(); } From efd1ce2745d62167b4edf919a1e466fc546bb9a0 Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Mon, 30 Sep 2019 17:05:09 -0700 Subject: [PATCH 08/14] - Refactor create methods of prediction transformers into new static class. - Added static class to hold DirModel string --- .../Scorers/PredictionTransformer.cs | 96 ++++++++----------- 1 file changed, 40 insertions(+), 56 deletions(-) diff --git a/src/Microsoft.ML.Data/Scorers/PredictionTransformer.cs b/src/Microsoft.ML.Data/Scorers/PredictionTransformer.cs index 18a7620000..625a913d03 100644 --- a/src/Microsoft.ML.Data/Scorers/PredictionTransformer.cs +++ b/src/Microsoft.ML.Data/Scorers/PredictionTransformer.cs @@ -30,6 +30,11 @@ namespace Microsoft.ML.Data { + internal class PredictionTransformerBase + { + internal const string DirModel = "Model"; + } + /// /// Base class for transformers with no feature column, or more than one feature columns. 
/// @@ -45,7 +50,7 @@ public abstract class PredictionTransformerBase : IPredictionTransformer private protected IPredictor ModelAsPredictor => (IPredictor)Model; [BestFriend] - private protected const string DirModel = "Model"; + private protected const string DirModel = PredictionTransformerBase.DirModel; [BestFriend] private protected const string DirTransSchema = "TrainSchema"; [BestFriend] @@ -125,11 +130,6 @@ private protected void InitializeLogic(IHost host, ModelLoadContext ctx) TrainSchema = loader.Schema; } - private protected void InitializeLogic(IHost host, ModelLoadContext ctx, TModel model) - { - - } - /// /// Gets the output schema resulting from the /// @@ -666,7 +666,7 @@ private static VersionInfo GetVersionInfo() internal static class BinaryPredictionTransformer { public const string LoaderSignature = "BinaryPredXfer"; - private const string DirModel = "Model"; // This should match "DirModel" in PredictionTransformerBase<> + private const string DirModel = PredictionTransformerBase.DirModel; public static object Create(IHostEnvironment env, ModelLoadContext ctx) { @@ -674,30 +674,15 @@ public static object Create(IHostEnvironment env, ModelLoadContext ctx) var host = Contracts.CheckRef(env, nameof(env)).Register(nameof(BinaryPredictionTransformer>)); ctx.LoadModel, SignatureLoadModel>(host, out IPredictorProducing model, DirModel); - // Create generic type of BinaryPredictionTransformer using the correct TModel. - // Return an instance of that type, passing the previously loaded model to the constructor Type generic = typeof(BinaryPredictionTransformer<>); - Type[] genericTypeArgs = { model.GetType() }; - Type constructed = generic.MakeGenericType(genericTypeArgs); - - Type[] constructorArgs = { - typeof(IHostEnvironment), - typeof(ModelLoadContext), - typeof(IHost), - model.GetType() - }; - - var genericCtor = constructed.GetConstructor(BindingFlags.NonPublic | BindingFlags.Instance, null, constructorArgs, null); - var genericInstance = genericCtor.Invoke(new object[] { env, ctx, host, model }); - - return genericInstance; + return CreatePredictionTransformer.Create(env, ctx, host, model, generic); } } internal static class MulticlassPredictionTransformer { public const string LoaderSignature = "MulticlassPredXfer"; - private const string DirModel = "Model"; // This should match "DirModel" in PredictionTransformerBase<> + private const string DirModel = PredictionTransformerBase.DirModel; public static object Create(IHostEnvironment env, ModelLoadContext ctx) { @@ -705,30 +690,15 @@ public static object Create(IHostEnvironment env, ModelLoadContext ctx) var host = Contracts.CheckRef(env, nameof(env)).Register(nameof(MulticlassPredictionTransformer>>)); ctx.LoadModel>, SignatureLoadModel>(host, out IPredictorProducing> model, DirModel); - // Create generic type of MulticlassPredictionTransformer using the correct TModel. 
- // Return an instance of that type, passing the previously loaded model to the constructor Type generic = typeof(MulticlassPredictionTransformer<>); - Type[] genericTypeArgs = { model.GetType() }; - Type constructed = generic.MakeGenericType(genericTypeArgs); - - Type[] constructorArgs = { - typeof(IHostEnvironment), - typeof(ModelLoadContext), - typeof(IHost), - model.GetType() - }; - - var genericCtor = constructed.GetConstructor(BindingFlags.NonPublic | BindingFlags.Instance, null, constructorArgs, null); - var genericInstance = genericCtor.Invoke(new object[] { env, ctx, host, model }); - - return genericInstance; + return CreatePredictionTransformer.Create(env, ctx, host, model, generic); } } internal static class RegressionPredictionTransformer { public const string LoaderSignature = "RegressionPredXfer"; - private const string DirModel = "Model"; // This should match "DirModel" in PredictionTransformerBase<> + private const string DirModel = PredictionTransformerBase.DirModel; public static object Create(IHostEnvironment env, ModelLoadContext ctx) { @@ -736,9 +706,34 @@ public static object Create(IHostEnvironment env, ModelLoadContext ctx) var host = Contracts.CheckRef(env, nameof(env)).Register(nameof(RegressionPredictionTransformer>)); ctx.LoadModel, SignatureLoadModel>(host, out IPredictorProducing model, DirModel); - // Create generic type of RegressionPredictionTransformer using the correct TModel. - // Return an instance of that type, passing the previously loaded model to the constructor Type generic = typeof(RegressionPredictionTransformer<>); + return CreatePredictionTransformer.Create(env, ctx, host, model, generic); + + } + } + + internal static class RankingPredictionTransformer + { + public const string LoaderSignature = "RankingPredXfer"; + private const string DirModel = PredictionTransformerBase.DirModel; + + public static object Create(IHostEnvironment env, ModelLoadContext ctx) + { + // Load internal model to be used as TModel of RankingPredictionTransformer + var host = Contracts.CheckRef(env, nameof(env)).Register(nameof(RankingPredictionTransformer>)); + ctx.LoadModel, SignatureLoadModel>(host, out IPredictorProducing model, DirModel); + + Type generic = typeof(RankingPredictionTransformer<>); + return CreatePredictionTransformer.Create(env, ctx, host, model, generic); + } + } + + internal static class CreatePredictionTransformer + { + internal static object Create(IHostEnvironment env, ModelLoadContext ctx, IHost host, IPredictorProducing model, Type generic) + { + // Create generic type of the prediction transformer using the correct TModel. 
+ // Return an instance of that type, passing the previously loaded model to the constructor Type[] genericTypeArgs = { model.GetType() }; Type constructed = generic.MakeGenericType(genericTypeArgs); @@ -754,22 +749,11 @@ public static object Create(IHostEnvironment env, ModelLoadContext ctx) return genericInstance; } - } - internal static class RankingPredictionTransformer - { - public const string LoaderSignature = "RankingPredXfer"; - private const string DirModel = "Model"; // This should match "DirModel" in PredictionTransformerBase<> - - public static object Create(IHostEnvironment env, ModelLoadContext ctx) + internal static object Create(IHostEnvironment env, ModelLoadContext ctx, IHost host, IPredictorProducing> model, Type generic) { - // Load internal model to be used as TModel of RankingPredictionTransformer - var host = Contracts.CheckRef(env, nameof(env)).Register(nameof(RankingPredictionTransformer>)); - ctx.LoadModel, SignatureLoadModel>(host, out IPredictorProducing model, DirModel); - - // Create generic type of RankingPredictionTransformer using the correct TModel. + // Create generic type of the prediction transformer using the correct TModel. // Return an instance of that type, passing the previously loaded model to the constructor - Type generic = typeof(RankingPredictionTransformer<>); Type[] genericTypeArgs = { model.GetType() }; Type constructed = generic.MakeGenericType(genericTypeArgs); From 8e190e480ae2270da6ee0f587f1a9cb7a52976fe Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Mon, 30 Sep 2019 17:39:14 -0700 Subject: [PATCH 09/14] Make non-generic PredictionTransformerBase static --- src/Microsoft.ML.Data/Scorers/PredictionTransformer.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Microsoft.ML.Data/Scorers/PredictionTransformer.cs b/src/Microsoft.ML.Data/Scorers/PredictionTransformer.cs index 625a913d03..8efff4145d 100644 --- a/src/Microsoft.ML.Data/Scorers/PredictionTransformer.cs +++ b/src/Microsoft.ML.Data/Scorers/PredictionTransformer.cs @@ -30,7 +30,7 @@ namespace Microsoft.ML.Data { - internal class PredictionTransformerBase + internal static class PredictionTransformerBase { internal const string DirModel = "Model"; } From 46fbd1a3b7a744534b7f154f6a3c60f12d76b87e Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Tue, 1 Oct 2019 13:03:14 -0700 Subject: [PATCH 10/14] Changed LoadableClassAttribute instType of prediction transformers --- .../Scorers/PredictionTransformer.cs | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/Microsoft.ML.Data/Scorers/PredictionTransformer.cs b/src/Microsoft.ML.Data/Scorers/PredictionTransformer.cs index 8efff4145d..93d5a73736 100644 --- a/src/Microsoft.ML.Data/Scorers/PredictionTransformer.cs +++ b/src/Microsoft.ML.Data/Scorers/PredictionTransformer.cs @@ -10,16 +10,16 @@ using Microsoft.ML.Data.IO; using Microsoft.ML.Runtime; -[assembly: LoadableClass(typeof(object), typeof(BinaryPredictionTransformer), null, typeof(SignatureLoadModel), +[assembly: LoadableClass(typeof(ISingleFeaturePredictionTransformer), typeof(BinaryPredictionTransformer), null, typeof(SignatureLoadModel), "", BinaryPredictionTransformer.LoaderSignature)] -[assembly: LoadableClass(typeof(object), typeof(MulticlassPredictionTransformer), null, typeof(SignatureLoadModel), +[assembly: LoadableClass(typeof(ISingleFeaturePredictionTransformer), typeof(MulticlassPredictionTransformer), null, typeof(SignatureLoadModel), "", MulticlassPredictionTransformer.LoaderSignature)] 
-[assembly: LoadableClass(typeof(object), typeof(RegressionPredictionTransformer), null, typeof(SignatureLoadModel), +[assembly: LoadableClass(typeof(ISingleFeaturePredictionTransformer), typeof(RegressionPredictionTransformer), null, typeof(SignatureLoadModel), "", RegressionPredictionTransformer.LoaderSignature)] -[assembly: LoadableClass(typeof(object), typeof(RankingPredictionTransformer), null, typeof(SignatureLoadModel), +[assembly: LoadableClass(typeof(ISingleFeaturePredictionTransformer), typeof(RankingPredictionTransformer), null, typeof(SignatureLoadModel), "", RankingPredictionTransformer.LoaderSignature)] [assembly: LoadableClass(typeof(AnomalyPredictionTransformer>), typeof(AnomalyPredictionTransformer), null, typeof(SignatureLoadModel), @@ -668,14 +668,14 @@ internal static class BinaryPredictionTransformer public const string LoaderSignature = "BinaryPredXfer"; private const string DirModel = PredictionTransformerBase.DirModel; - public static object Create(IHostEnvironment env, ModelLoadContext ctx) + public static ISingleFeaturePredictionTransformer Create(IHostEnvironment env, ModelLoadContext ctx) { // Load internal model to be used as TModel of BinaryPredictionTransformer var host = Contracts.CheckRef(env, nameof(env)).Register(nameof(BinaryPredictionTransformer>)); ctx.LoadModel, SignatureLoadModel>(host, out IPredictorProducing model, DirModel); Type generic = typeof(BinaryPredictionTransformer<>); - return CreatePredictionTransformer.Create(env, ctx, host, model, generic); + return (ISingleFeaturePredictionTransformer) CreatePredictionTransformer.Create(env, ctx, host, model, generic); } } @@ -684,14 +684,14 @@ internal static class MulticlassPredictionTransformer public const string LoaderSignature = "MulticlassPredXfer"; private const string DirModel = PredictionTransformerBase.DirModel; - public static object Create(IHostEnvironment env, ModelLoadContext ctx) + public static ISingleFeaturePredictionTransformer Create(IHostEnvironment env, ModelLoadContext ctx) { // Load internal model to be used as TModel of MulticlassPredictionTransformer var host = Contracts.CheckRef(env, nameof(env)).Register(nameof(MulticlassPredictionTransformer>>)); ctx.LoadModel>, SignatureLoadModel>(host, out IPredictorProducing> model, DirModel); Type generic = typeof(MulticlassPredictionTransformer<>); - return CreatePredictionTransformer.Create(env, ctx, host, model, generic); + return (ISingleFeaturePredictionTransformer) CreatePredictionTransformer.Create(env, ctx, host, model, generic); } } @@ -700,14 +700,14 @@ internal static class RegressionPredictionTransformer public const string LoaderSignature = "RegressionPredXfer"; private const string DirModel = PredictionTransformerBase.DirModel; - public static object Create(IHostEnvironment env, ModelLoadContext ctx) + public static ISingleFeaturePredictionTransformer Create(IHostEnvironment env, ModelLoadContext ctx) { // Load internal model to be used as TModel of RegressionPredictionTransformer var host = Contracts.CheckRef(env, nameof(env)).Register(nameof(RegressionPredictionTransformer>)); ctx.LoadModel, SignatureLoadModel>(host, out IPredictorProducing model, DirModel); Type generic = typeof(RegressionPredictionTransformer<>); - return CreatePredictionTransformer.Create(env, ctx, host, model, generic); + return (ISingleFeaturePredictionTransformer) CreatePredictionTransformer.Create(env, ctx, host, model, generic); } } @@ -717,14 +717,14 @@ internal static class RankingPredictionTransformer public const string LoaderSignature = 
"RankingPredXfer"; private const string DirModel = PredictionTransformerBase.DirModel; - public static object Create(IHostEnvironment env, ModelLoadContext ctx) + public static ISingleFeaturePredictionTransformer Create(IHostEnvironment env, ModelLoadContext ctx) { // Load internal model to be used as TModel of RankingPredictionTransformer var host = Contracts.CheckRef(env, nameof(env)).Register(nameof(RankingPredictionTransformer>)); ctx.LoadModel, SignatureLoadModel>(host, out IPredictorProducing model, DirModel); Type generic = typeof(RankingPredictionTransformer<>); - return CreatePredictionTransformer.Create(env, ctx, host, model, generic); + return (ISingleFeaturePredictionTransformer) CreatePredictionTransformer.Create(env, ctx, host, model, generic); } } From e6dad0e58a8e6f82330eb3f4dc73a9450223ac8b Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Tue, 1 Oct 2019 13:13:42 -0700 Subject: [PATCH 11/14] Removed unused file stream variables --- .../Explainability.cs | 4 +-- .../PermutationFeatureImportanceTests.cs | 35 ++++--------------- 2 files changed, 8 insertions(+), 31 deletions(-) diff --git a/test/Microsoft.ML.Functional.Tests/Explainability.cs b/test/Microsoft.ML.Functional.Tests/Explainability.cs index 30269262a4..8f030efe1a 100644 --- a/test/Microsoft.ML.Functional.Tests/Explainability.cs +++ b/test/Microsoft.ML.Functional.Tests/Explainability.cs @@ -2,7 +2,6 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. -using System.IO; using Microsoft.ML.Data; using Microsoft.ML.Functional.Tests.Datasets; using Microsoft.ML.RunTests; @@ -73,8 +72,7 @@ public void GlobalFeatureImportanceWithPermutationFeatureImportanceWithLoadedMod mlContext.Model.Save(model, data.Schema, modelAndSchemaPath); ITransformer loadedModel; - using (var fs = File.OpenRead(modelAndSchemaPath)) - loadedModel = mlContext.Model.Load(modelAndSchemaPath, out var schema); + loadedModel = mlContext.Model.Load(modelAndSchemaPath, out var schema); var transformedData = loadedModel.Transform(data); var linearPredictor = (loadedModel as TransformerChain).LastTransformer as RegressionPredictionTransformer; diff --git a/test/Microsoft.ML.Tests/PermutationFeatureImportanceTests.cs b/test/Microsoft.ML.Tests/PermutationFeatureImportanceTests.cs index 44ebe8dcbb..779b1c52bc 100644 --- a/test/Microsoft.ML.Tests/PermutationFeatureImportanceTests.cs +++ b/test/Microsoft.ML.Tests/PermutationFeatureImportanceTests.cs @@ -69,10 +69,7 @@ public void TestPfiRegressionOnDenseFeaturesWithLoadedModel() var modelAndSchemaPath = GetOutputPath("TestPfiRegressionOnDenseFeatures.zip"); ML.Model.Save(model, data.Schema, modelAndSchemaPath); - ITransformer loadedModel; - using (var fs = File.OpenRead(modelAndSchemaPath)) - loadedModel = ML.Model.Load(modelAndSchemaPath, out var schema); - + var loadedModel = ML.Model.Load(modelAndSchemaPath, out var schema); var castedModel = loadedModel as RegressionPredictionTransformer; var pfi = ML.Regression.PermutationFeatureImportance(castedModel, data); @@ -163,10 +160,7 @@ public void TestPfiRegressionStandardDeviationAndErrorOnDenseFeaturesWithLoadedM var modelAndSchemaPath = GetOutputPath("TestPfiRegressionStandardDeviationAndErrorOnDenseFeatures.zip"); ML.Model.Save(model, data.Schema, modelAndSchemaPath); - ITransformer loadedModel; - using (var fs = File.OpenRead(modelAndSchemaPath)) - loadedModel = ML.Model.Load(modelAndSchemaPath, out var schema); - + var loadedModel = 
ML.Model.Load(modelAndSchemaPath, out var schema); var castedModel = loadedModel as RegressionPredictionTransformer; var pfi = ML.Regression.PermutationFeatureImportance(castedModel, data, permutationCount: 20); @@ -259,10 +253,7 @@ public void TestPfiRegressionOnSparseFeaturesWithLoadedModel() var modelAndSchemaPath = GetOutputPath("TestPfiRegressionOnSparseFeatures.zip"); ML.Model.Save(model, data.Schema, modelAndSchemaPath); - ITransformer loadedModel; - using (var fs = File.OpenRead(modelAndSchemaPath)) - loadedModel = ML.Model.Load(modelAndSchemaPath, out var schema); - + var loadedModel = ML.Model.Load(modelAndSchemaPath, out var schema); var castedModel = loadedModel as RegressionPredictionTransformer; var results = ML.Regression.PermutationFeatureImportance(castedModel, data); @@ -422,10 +413,7 @@ public void TestPfiMulticlassClassificationOnDenseFeaturesWithLoadedModel() var modelAndSchemaPath = GetOutputPath("TestPfiMulticlassClassificationOnDenseFeatures.zip"); ML.Model.Save(model, data.Schema, modelAndSchemaPath); - ITransformer loadedModel; - using (var fs = File.OpenRead(modelAndSchemaPath)) - loadedModel = ML.Model.Load(modelAndSchemaPath, out var schema); - + var loadedModel = ML.Model.Load(modelAndSchemaPath, out var schema); var castedModel = loadedModel as MulticlassPredictionTransformer; var pfi = ML.MulticlassClassification.PermutationFeatureImportance(castedModel, data); @@ -510,10 +498,7 @@ public void TestPfiMulticlassClassificationOnSparseFeaturesWithLoadedModel() var modelAndSchemaPath = GetOutputPath("TestPfiMulticlassClassificationOnSparseFeatures.zip"); ML.Model.Save(model, data.Schema, modelAndSchemaPath); - ITransformer loadedModel; - using (var fs = File.OpenRead(modelAndSchemaPath)) - loadedModel = ML.Model.Load(modelAndSchemaPath, out var schema); - + var loadedModel = ML.Model.Load(modelAndSchemaPath, out var schema); var castedModel = loadedModel as MulticlassPredictionTransformer; var pfi = ML.MulticlassClassification.PermutationFeatureImportance(castedModel, data); @@ -591,10 +576,7 @@ public void TestPfiRankingOnDenseFeaturesWithLoadedModel() var modelAndSchemaPath = GetOutputPath("TestPfiRankingOnDenseFeatures.zip"); ML.Model.Save(model, data.Schema, modelAndSchemaPath); - ITransformer loadedModel; - using (var fs = File.OpenRead(modelAndSchemaPath)) - loadedModel = ML.Model.Load(modelAndSchemaPath, out var schema); - + var loadedModel = ML.Model.Load(modelAndSchemaPath, out var schema); var castedModel = loadedModel as RankingPredictionTransformer; var pfi = ML.Ranking.PermutationFeatureImportance(castedModel, data); @@ -664,10 +646,7 @@ public void TestPfiRankingOnSparseFeaturesWithLoadedModel() var modelAndSchemaPath = GetOutputPath("TestPfiRankingOnSparseFeatures.zip"); ML.Model.Save(model, data.Schema, modelAndSchemaPath); - ITransformer loadedModel; - using (var fs = File.OpenRead(modelAndSchemaPath)) - loadedModel = ML.Model.Load(modelAndSchemaPath, out var schema); - + var loadedModel = ML.Model.Load(modelAndSchemaPath, out var schema); var castedModel = loadedModel as RankingPredictionTransformer; var pfi = ML.Ranking.PermutationFeatureImportance(castedModel, data); From af764ff1595fa18a2835faeb291f4598c0084f01 Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Tue, 1 Oct 2019 13:15:34 -0700 Subject: [PATCH 12/14] Update name of Sample classes to match their file's name --- .../PermutationFeatureImportanceLoadFromDisk.cs | 2 +- .../Ranking/PermutationFeatureImportanceLoadFromDisk.cs | 2 +- 
.../Regression/PermutationFeatureImportanceLoadFromDisk.cs | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/PermutationFeatureImportanceLoadFromDisk.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/PermutationFeatureImportanceLoadFromDisk.cs index ca7a06954d..3a94a113c4 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/PermutationFeatureImportanceLoadFromDisk.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/PermutationFeatureImportanceLoadFromDisk.cs @@ -8,7 +8,7 @@ namespace Samples.Dynamic.Trainers.MulticlassClassification { - public static class PermutationFeatureImportance2 + public static class PermutationFeatureImportanceLoadFromDisk { public static void Example() { diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/PermutationFeatureImportanceLoadFromDisk.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/PermutationFeatureImportanceLoadFromDisk.cs index dc1bc4e240..e12e17554b 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/PermutationFeatureImportanceLoadFromDisk.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/PermutationFeatureImportanceLoadFromDisk.cs @@ -7,7 +7,7 @@ namespace Samples.Dynamic.Trainers.Ranking { - public static class PermutationFeatureImportance2 + public static class PermutationFeatureImportanceLoadFromDisk { public static void Example() { diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/PermutationFeatureImportanceLoadFromDisk.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/PermutationFeatureImportanceLoadFromDisk.cs index 5e825915c9..1a783b6c2a 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/PermutationFeatureImportanceLoadFromDisk.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/PermutationFeatureImportanceLoadFromDisk.cs @@ -7,7 +7,7 @@ namespace Samples.Dynamic.Trainers.Regression { - public static class PermutationFeatureImportance2 + public static class PermutationFeatureImportanceLoadFromDisk { public static void Example() { From b37ce6ec810642ee8783b75daeffdae242066394 Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Tue, 1 Oct 2019 14:23:55 -0700 Subject: [PATCH 13/14] Refactored tests to reuse existing code --- .../Explainability.cs | 60 ++- .../PermutationFeatureImportanceTests.cs | 408 +++++------------- 2 files changed, 136 insertions(+), 332 deletions(-) diff --git a/test/Microsoft.ML.Functional.Tests/Explainability.cs b/test/Microsoft.ML.Functional.Tests/Explainability.cs index 8f030efe1a..a63da1abea 100644 --- a/test/Microsoft.ML.Functional.Tests/Explainability.cs +++ b/test/Microsoft.ML.Functional.Tests/Explainability.cs @@ -25,8 +25,10 @@ public Explainability(ITestOutputHelper output) : base(output) /// /// GlobalFeatureImportance: PFI can be used to compute global feature importance. 
/// - [Fact] - public void GlobalFeatureImportanceWithPermutationFeatureImportance() + [Theory] + [InlineData(true)] + [InlineData(false)] + public void GlobalFeatureImportanceWithPermutationFeatureImportance(bool saveModel) { var mlContext = new MLContext(seed: 1); @@ -37,45 +39,35 @@ public void GlobalFeatureImportanceWithPermutationFeatureImportance() var pipeline = mlContext.Transforms.Concatenate("Features", HousingRegression.Features) .Append(mlContext.Regression.Trainers.Sdca()); - // Fit the pipeline and transform the data. + // Fit the pipeline var model = pipeline.Fit(data); - var transformedData = model.Transform(data); - - // Compute the permutation feature importance to look at global feature importance. - var permutationMetrics = mlContext.Regression.PermutationFeatureImportance(model.LastTransformer, transformedData); - // Make sure the correct number of features came back. - Assert.Equal(HousingRegression.Features.Length, permutationMetrics.Length); - foreach (var metricsStatistics in permutationMetrics) - Common.AssertMetricsStatistics(metricsStatistics); - } + IDataView transformedData; + RegressionPredictionTransformer linearPredictor; - /// - /// GlobalFeatureImportance: PFI can be used to compute global feature importance. Here it is used with a model loaded from disk. - /// - [Fact] - public void GlobalFeatureImportanceWithPermutationFeatureImportanceWithLoadedModel() - { - var mlContext = new MLContext(seed: 1); - - // Get the dataset - var data = mlContext.Data.LoadFromTextFile(GetDataPath(TestDatasets.housing.trainFilename), hasHeader: true); - - // Create a pipeline to train on the housing data. - var pipeline = mlContext.Transforms.Concatenate("Features", HousingRegression.Features) - .Append(mlContext.Regression.Trainers.Sdca()); + if(saveModel) + { + ITransformer loadedModel; - // Fit the pipeline and transform the data. - var model = pipeline.Fit(data); + // Load and save the model + var modelAndSchemaPath = GetOutputPath("TestFunctionalTestPFI.zip"); + mlContext.Model.Save(model, data.Schema, modelAndSchemaPath); + loadedModel = mlContext.Model.Load(modelAndSchemaPath, out var schema); - var modelAndSchemaPath = GetOutputPath("TestFunctionalTestPFI.zip"); - mlContext.Model.Save(model, data.Schema, modelAndSchemaPath); + // Transform the data + transformedData = loadedModel.Transform(data); - ITransformer loadedModel; - loadedModel = mlContext.Model.Load(modelAndSchemaPath, out var schema); + // Extract linear predictor + linearPredictor = (loadedModel as TransformerChain).LastTransformer as RegressionPredictionTransformer; + } + else + { + // Transform the data + transformedData = model.Transform(data); - var transformedData = loadedModel.Transform(data); - var linearPredictor = (loadedModel as TransformerChain).LastTransformer as RegressionPredictionTransformer; + // Extract linear predictor + linearPredictor = model.LastTransformer; + } // Compute the permutation feature importance to look at global feature importance. 
var permutationMetrics = mlContext.Regression.PermutationFeatureImportance(linearPredictor, transformedData); diff --git a/test/Microsoft.ML.Tests/PermutationFeatureImportanceTests.cs b/test/Microsoft.ML.Tests/PermutationFeatureImportanceTests.cs index 779b1c52bc..ac04e5452a 100644 --- a/test/Microsoft.ML.Tests/PermutationFeatureImportanceTests.cs +++ b/test/Microsoft.ML.Tests/PermutationFeatureImportanceTests.cs @@ -27,51 +27,28 @@ public PermutationFeatureImportanceTests(ITestOutputHelper output) : base(output /// /// Test PFI Regression for Dense Features /// - [Fact] - public void TestPfiRegressionOnDenseFeatures() - { - var data = GetDenseDataset(); - var model = ML.Regression.Trainers.OnlineGradientDescent().Fit(data); - var pfi = ML.Regression.PermutationFeatureImportance(model, data); - - // Pfi Indices: - // X1: 0 - // X2Important: 1 - // X3: 2 - // X4Rand: 3 - - // For the following metrics lower is better, so maximum delta means more important feature, and vice versa - Assert.Equal(3, MinDeltaIndex(pfi, m => m.MeanAbsoluteError.Mean)); - Assert.Equal(1, MaxDeltaIndex(pfi, m => m.MeanAbsoluteError.Mean)); - - Assert.Equal(3, MinDeltaIndex(pfi, m => m.MeanSquaredError.Mean)); - Assert.Equal(1, MaxDeltaIndex(pfi, m => m.MeanSquaredError.Mean)); - - Assert.Equal(3, MinDeltaIndex(pfi, m => m.RootMeanSquaredError.Mean)); - Assert.Equal(1, MaxDeltaIndex(pfi, m => m.RootMeanSquaredError.Mean)); - - // For the following metrics higher is better, so minimum delta means more important feature, and vice versa - Assert.Equal(1, MinDeltaIndex(pfi, m => m.RSquared.Mean)); - Assert.Equal(3, MaxDeltaIndex(pfi, m => m.RSquared.Mean)); - - Done(); - } - - /// - /// Test PFI Regression for Dense Features with a model loaded from disk - /// - [Fact] - public void TestPfiRegressionOnDenseFeaturesWithLoadedModel() + [Theory] + [InlineData(true)] + [InlineData(false)] + public void TestPfiRegressionOnDenseFeatures(bool saveModel) { var data = GetDenseDataset(); var model = ML.Regression.Trainers.OnlineGradientDescent().Fit(data); - var modelAndSchemaPath = GetOutputPath("TestPfiRegressionOnDenseFeatures.zip"); - ML.Model.Save(model, data.Schema, modelAndSchemaPath); + ImmutableArray pfi; + if(saveModel) + { + var modelAndSchemaPath = GetOutputPath("TestPfiRegressionOnDenseFeatures.zip"); + ML.Model.Save(model, data.Schema, modelAndSchemaPath); - var loadedModel = ML.Model.Load(modelAndSchemaPath, out var schema); - var castedModel = loadedModel as RegressionPredictionTransformer; - var pfi = ML.Regression.PermutationFeatureImportance(castedModel, data); + var loadedModel = ML.Model.Load(modelAndSchemaPath, out var schema); + var castedModel = loadedModel as RegressionPredictionTransformer; + pfi = ML.Regression.PermutationFeatureImportance(castedModel, data); + } + else + { + pfi = ML.Regression.PermutationFeatureImportance(model, data); + } // Pfi Indices: // X1: 0 @@ -99,70 +76,29 @@ public void TestPfiRegressionOnDenseFeaturesWithLoadedModel() /// /// Test PFI Regression Standard Deviation and Standard Error for Dense Features /// - [Fact] - public void TestPfiRegressionStandardDeviationAndErrorOnDenseFeatures() + [Theory] + [InlineData(true)] + [InlineData(false)] + public void TestPfiRegressionStandardDeviationAndErrorOnDenseFeatures(bool saveModel) { var data = GetDenseDataset(); var model = ML.Regression.Trainers.OnlineGradientDescent().Fit(data); - var pfi = ML.Regression.PermutationFeatureImportance(model, data, permutationCount: 20); - // Keep the permutation count high so fluctuations are 
kept to a minimum - // but not high enough to slow down the tests - // (fluctuations lead to random test failures) - - // Pfi Indices: - // X1: 0 - // X2Important: 1 - // X3: 2 - // X4Rand: 3 - - // For these metrics, the magnitude of the difference will be greatest for 1, least for 3 - // Stardard Deviation will scale with the magnitude of the measure - Assert.Equal(3, MinDeltaIndex(pfi, m => m.MeanAbsoluteError.StandardDeviation)); - Assert.Equal(1, MaxDeltaIndex(pfi, m => m.MeanAbsoluteError.StandardDeviation)); - - Assert.Equal(3, MinDeltaIndex(pfi, m => m.MeanSquaredError.StandardDeviation)); - Assert.Equal(1, MaxDeltaIndex(pfi, m => m.MeanSquaredError.StandardDeviation)); - - Assert.Equal(3, MinDeltaIndex(pfi, m => m.RootMeanSquaredError.StandardDeviation)); - Assert.Equal(1, MaxDeltaIndex(pfi, m => m.RootMeanSquaredError.StandardDeviation)); - Assert.Equal(3, MinDeltaIndex(pfi, m => m.RSquared.StandardDeviation)); - Assert.Equal(1, MaxDeltaIndex(pfi, m => m.RSquared.StandardDeviation)); - - // Stardard Error will scale with the magnitude of the measure (as it's SD/sqrt(N)) - Assert.Equal(3, MinDeltaIndex(pfi, m => m.MeanAbsoluteError.StandardError)); - Assert.Equal(1, MaxDeltaIndex(pfi, m => m.MeanAbsoluteError.StandardError)); + ImmutableArray pfi; - Assert.Equal(3, MinDeltaIndex(pfi, m => m.MeanSquaredError.StandardError)); - Assert.Equal(1, MaxDeltaIndex(pfi, m => m.MeanSquaredError.StandardError)); - - Assert.Equal(3, MinDeltaIndex(pfi, m => m.RootMeanSquaredError.StandardError)); - Assert.Equal(1, MaxDeltaIndex(pfi, m => m.RootMeanSquaredError.StandardError)); - - Assert.Equal(3, MinDeltaIndex(pfi, m => m.RSquared.StandardError)); - Assert.Equal(1, MaxDeltaIndex(pfi, m => m.RSquared.StandardError)); - - // And test that the Standard Deviation and Standard Error are related as we expect - Assert.Equal(pfi[0].RootMeanSquaredError.StandardError, pfi[0].RootMeanSquaredError.StandardDeviation / Math.Sqrt(pfi[0].RootMeanSquaredError.Count)); - - Done(); - } - - /// - /// Test PFI Regression Standard Deviation and Standard Error for Dense Features with a model loaded from disk - /// - [Fact] - public void TestPfiRegressionStandardDeviationAndErrorOnDenseFeaturesWithLoadedModel() - { - var data = GetDenseDataset(); - var model = ML.Regression.Trainers.OnlineGradientDescent().Fit(data); - - var modelAndSchemaPath = GetOutputPath("TestPfiRegressionStandardDeviationAndErrorOnDenseFeatures.zip"); - ML.Model.Save(model, data.Schema, modelAndSchemaPath); + if(saveModel) + { + var modelAndSchemaPath = GetOutputPath("TestPfiRegressionStandardDeviationAndErrorOnDenseFeatures.zip"); + ML.Model.Save(model, data.Schema, modelAndSchemaPath); - var loadedModel = ML.Model.Load(modelAndSchemaPath, out var schema); - var castedModel = loadedModel as RegressionPredictionTransformer; - var pfi = ML.Regression.PermutationFeatureImportance(castedModel, data, permutationCount: 20); + var loadedModel = ML.Model.Load(modelAndSchemaPath, out var schema); + var castedModel = loadedModel as RegressionPredictionTransformer; + pfi = ML.Regression.PermutationFeatureImportance(castedModel, data, permutationCount: 20); + } + else + { + pfi = ML.Regression.PermutationFeatureImportance(model, data, permutationCount: 20); + } // Keep the permutation count high so fluctuations are kept to a minimum // but not high enough to slow down the tests @@ -210,52 +146,28 @@ public void TestPfiRegressionStandardDeviationAndErrorOnDenseFeaturesWithLoadedM /// /// Test PFI Regression for Sparse Features /// - [Fact] - public void 
TestPfiRegressionOnSparseFeatures() + [Theory] + [InlineData(true)] + [InlineData(false)] + public void TestPfiRegressionOnSparseFeatures(bool saveModel) { var data = GetSparseDataset(); var model = ML.Regression.Trainers.OnlineGradientDescent().Fit(data); - var results = ML.Regression.PermutationFeatureImportance(model, data); - // Pfi Indices: - // X1: 0 - // X2VBuffer-Slot-0: 1 - // X2VBuffer-Slot-1: 2 - // X2VBuffer-Slot-2: 3 - // X2VBuffer-Slot-3: 4 - // X3Important: 5 - - // Permuted X2VBuffer-Slot-1 lot (f2) should have min impact on SGD metrics, X3Important -- max impact. - // For the following metrics lower is better, so maximum delta means more important feature, and vice versa - Assert.Equal(2, MinDeltaIndex(results, m => m.MeanAbsoluteError.Mean)); - Assert.Equal(5, MaxDeltaIndex(results, m => m.MeanAbsoluteError.Mean)); - - Assert.Equal(2, MinDeltaIndex(results, m => m.MeanSquaredError.Mean)); - Assert.Equal(5, MaxDeltaIndex(results, m => m.MeanSquaredError.Mean)); - - Assert.Equal(2, MinDeltaIndex(results, m => m.RootMeanSquaredError.Mean)); - Assert.Equal(5, MaxDeltaIndex(results, m => m.RootMeanSquaredError.Mean)); - - // For the following metrics higher is better, so minimum delta means more important feature, and vice versa - Assert.Equal(2, MaxDeltaIndex(results, m => m.RSquared.Mean)); - Assert.Equal(5, MinDeltaIndex(results, m => m.RSquared.Mean)); - } - - /// - /// Test PFI Regression for Sparse Features with a model loaded from disk - /// - [Fact] - public void TestPfiRegressionOnSparseFeaturesWithLoadedModel() - { - var data = GetSparseDataset(); - var model = ML.Regression.Trainers.OnlineGradientDescent().Fit(data); - - var modelAndSchemaPath = GetOutputPath("TestPfiRegressionOnSparseFeatures.zip"); - ML.Model.Save(model, data.Schema, modelAndSchemaPath); + ImmutableArray results; + if(saveModel) + { + var modelAndSchemaPath = GetOutputPath("TestPfiRegressionOnSparseFeatures.zip"); + ML.Model.Save(model, data.Schema, modelAndSchemaPath); - var loadedModel = ML.Model.Load(modelAndSchemaPath, out var schema); - var castedModel = loadedModel as RegressionPredictionTransformer; - var results = ML.Regression.PermutationFeatureImportance(castedModel, data); + var loadedModel = ML.Model.Load(modelAndSchemaPath, out var schema); + var castedModel = loadedModel as RegressionPredictionTransformer; + results = ML.Regression.PermutationFeatureImportance(castedModel, data); + } + else + { + results = ML.Regression.PermutationFeatureImportance(model, data); + } // Pfi Indices: // X1: 0 @@ -367,56 +279,29 @@ public void TestPfiBinaryClassificationOnSparseFeatures() /// /// Test PFI Multiclass Classification for Dense Features /// - [Fact] - public void TestPfiMulticlassClassificationOnDenseFeatures() + [Theory] + [InlineData(true)] + [InlineData(false)] + public void TestPfiMulticlassClassificationOnDenseFeatures(bool saveModel) { var data = GetDenseDataset(TaskType.MulticlassClassification); var model = ML.MulticlassClassification.Trainers.LbfgsMaximumEntropy().Fit(data); - var pfi = ML.MulticlassClassification.PermutationFeatureImportance(model, data); - - // Pfi Indices: - // X1: 0 - // X2Important: 1 - // X3: 2 - // X4Rand: 3 - // For the following metrics higher is better, so minimum delta means more important feature, and vice versa - Assert.Equal(3, MaxDeltaIndex(pfi, m => m.MicroAccuracy.Mean)); - Assert.Equal(1, MinDeltaIndex(pfi, m => m.MicroAccuracy.Mean)); - Assert.Equal(3, MaxDeltaIndex(pfi, m => m.MacroAccuracy.Mean)); - Assert.Equal(1, MinDeltaIndex(pfi, m => 
m.MacroAccuracy.Mean)); - Assert.Equal(3, MaxDeltaIndex(pfi, m => m.LogLossReduction.Mean)); - Assert.Equal(1, MinDeltaIndex(pfi, m => m.LogLossReduction.Mean)); + ImmutableArray pfi; + if(saveModel) + { + var modelAndSchemaPath = GetOutputPath("TestPfiMulticlassClassificationOnDenseFeatures.zip"); + ML.Model.Save(model, data.Schema, modelAndSchemaPath); - // For the following metrics-delta lower is better, so maximum delta means more important feature, and vice versa - // Because they are _negative_, the difference will be positive for worse classifiers. - Assert.Equal(1, MaxDeltaIndex(pfi, m => m.LogLoss.Mean)); - Assert.Equal(3, MinDeltaIndex(pfi, m => m.LogLoss.Mean)); - for (int i = 0; i < pfi[0].PerClassLogLoss.Count; i++) + var loadedModel = ML.Model.Load(modelAndSchemaPath, out var schema); + var castedModel = loadedModel as MulticlassPredictionTransformer; + pfi = ML.MulticlassClassification.PermutationFeatureImportance(castedModel, data); + } + else { - Assert.True(MaxDeltaIndex(pfi, m => m.PerClassLogLoss[i].Mean) == 1); - Assert.True(MinDeltaIndex(pfi, m => m.PerClassLogLoss[i].Mean) == 3); + pfi = ML.MulticlassClassification.PermutationFeatureImportance(model, data); } - Done(); - } - - /// - /// Test PFI Multiclass Classification for Dense Features using a model loaded from disk - /// - [Fact] - public void TestPfiMulticlassClassificationOnDenseFeaturesWithLoadedModel() - { - var data = GetDenseDataset(TaskType.MulticlassClassification); - var model = ML.MulticlassClassification.Trainers.LbfgsMaximumEntropy().Fit(data); - - var modelAndSchemaPath = GetOutputPath("TestPfiMulticlassClassificationOnDenseFeatures.zip"); - ML.Model.Save(model, data.Schema, modelAndSchemaPath); - - var loadedModel = ML.Model.Load(modelAndSchemaPath, out var schema); - var castedModel = loadedModel as MulticlassPredictionTransformer; - var pfi = ML.MulticlassClassification.PermutationFeatureImportance(castedModel, data); - // Pfi Indices: // X1: 0 // X2Important: 1 @@ -447,61 +332,30 @@ public void TestPfiMulticlassClassificationOnDenseFeaturesWithLoadedModel() /// /// Test PFI Multiclass Classification for Sparse Features /// - [Fact] - public void TestPfiMulticlassClassificationOnSparseFeatures() + [Theory] + [InlineData(true)] + [InlineData(false)] + public void TestPfiMulticlassClassificationOnSparseFeatures(bool saveModel) { var data = GetSparseDataset(TaskType.MulticlassClassification); var model = ML.MulticlassClassification.Trainers.LbfgsMaximumEntropy( new LbfgsMaximumEntropyMulticlassTrainer.Options { MaximumNumberOfIterations = 1000 }).Fit(data); - var pfi = ML.MulticlassClassification.PermutationFeatureImportance(model, data); - - // Pfi Indices: - // X1: 0 - // X2VBuffer-Slot-0: 1 - // X2VBuffer-Slot-1: 2 // Least important - // X2VBuffer-Slot-2: 3 - // X2VBuffer-Slot-3: 4 - // X3Important: 5 // Most important - - // For the following metrics higher is better, so minimum delta means more important feature, and vice versa - Assert.Equal(2, MaxDeltaIndex(pfi, m => m.MicroAccuracy.Mean)); - Assert.Equal(5, MinDeltaIndex(pfi, m => m.MicroAccuracy.Mean)); - Assert.Equal(2, MaxDeltaIndex(pfi, m => m.MacroAccuracy.Mean)); - Assert.Equal(5, MinDeltaIndex(pfi, m => m.MacroAccuracy.Mean)); - Assert.Equal(2, MaxDeltaIndex(pfi, m => m.LogLossReduction.Mean)); - Assert.Equal(5, MinDeltaIndex(pfi, m => m.LogLossReduction.Mean)); + ImmutableArray pfi; + if(saveModel) + { + var modelAndSchemaPath = GetOutputPath("TestPfiMulticlassClassificationOnSparseFeatures.zip"); + ML.Model.Save(model, data.Schema, 
modelAndSchemaPath); - // For the following metrics-delta lower is better, so maximum delta means more important feature, and vice versa - // Because they are negative metrics, the _difference_ will be positive for worse classifiers. - Assert.Equal(5, MaxDeltaIndex(pfi, m => m.LogLoss.Mean)); - Assert.Equal(2, MinDeltaIndex(pfi, m => m.LogLoss.Mean)); - for (int i = 0; i < pfi[0].PerClassLogLoss.Count; i++) + var loadedModel = ML.Model.Load(modelAndSchemaPath, out var schema); + var castedModel = loadedModel as MulticlassPredictionTransformer; + pfi = ML.MulticlassClassification.PermutationFeatureImportance(castedModel, data); + } + else { - Assert.Equal(5, MaxDeltaIndex(pfi, m => m.PerClassLogLoss[i].Mean)); - Assert.Equal(2, MinDeltaIndex(pfi, m => m.PerClassLogLoss[i].Mean)); + pfi = ML.MulticlassClassification.PermutationFeatureImportance(model, data); } - Done(); - } - - /// - /// Test PFI Multiclass Classification for Sparse Features using a model loaded from disk - /// - [Fact] - public void TestPfiMulticlassClassificationOnSparseFeaturesWithLoadedModel() - { - var data = GetSparseDataset(TaskType.MulticlassClassification); - var model = ML.MulticlassClassification.Trainers.LbfgsMaximumEntropy( - new LbfgsMaximumEntropyMulticlassTrainer.Options { MaximumNumberOfIterations = 1000 }).Fit(data); - - var modelAndSchemaPath = GetOutputPath("TestPfiMulticlassClassificationOnSparseFeatures.zip"); - ML.Model.Save(model, data.Schema, modelAndSchemaPath); - - var loadedModel = ML.Model.Load(modelAndSchemaPath, out var schema); - var castedModel = loadedModel as MulticlassPredictionTransformer; - var pfi = ML.MulticlassClassification.PermutationFeatureImportance(castedModel, data); - // Pfi Indices: // X1: 0 // X2VBuffer-Slot-0: 1 @@ -536,49 +390,29 @@ public void TestPfiMulticlassClassificationOnSparseFeaturesWithLoadedModel() /// /// Test PFI Ranking Classification for Dense Features /// - [Fact] - public void TestPfiRankingOnDenseFeatures() + [Theory] + [InlineData(true)] + [InlineData(false)] + public void TestPfiRankingOnDenseFeatures(bool saveModel) { var data = GetDenseDataset(TaskType.Ranking); var model = ML.Ranking.Trainers.FastTree().Fit(data); - var pfi = ML.Ranking.PermutationFeatureImportance(model, data); - // Pfi Indices: - // X1: 0 // For Ranking, this column won't result in misorderings - // X2Important: 1 - // X3: 2 - // X4Rand: 3 - - // For the following metrics higher is better, so minimum delta means more important feature, and vice versa - for (int i = 0; i < pfi[0].DiscountedCumulativeGains.Count; i++) + ImmutableArray pfi; + if(saveModel) { - Assert.Equal(0, MaxDeltaIndex(pfi, m => m.DiscountedCumulativeGains[i].Mean)); - Assert.Equal(1, MinDeltaIndex(pfi, m => m.DiscountedCumulativeGains[i].Mean)); + var modelAndSchemaPath = GetOutputPath("TestPfiRankingOnDenseFeatures.zip"); + ML.Model.Save(model, data.Schema, modelAndSchemaPath); + + var loadedModel = ML.Model.Load(modelAndSchemaPath, out var schema); + var castedModel = loadedModel as RankingPredictionTransformer; + pfi = ML.Ranking.PermutationFeatureImportance(castedModel, data); } - for (int i = 0; i < pfi[0].NormalizedDiscountedCumulativeGains.Count; i++) + else { - Assert.Equal(0, MaxDeltaIndex(pfi, m => m.NormalizedDiscountedCumulativeGains[i].Mean)); - Assert.Equal(1, MinDeltaIndex(pfi, m => m.NormalizedDiscountedCumulativeGains[i].Mean)); + pfi = ML.Ranking.PermutationFeatureImportance(model, data); } - Done(); - } - - /// - /// Test PFI Multiclass Classification for Dense Features using model loaded from 
@@ -536,49 +390,29 @@ public void TestPfiMulticlassClassificationOnSparseFeaturesWithLoadedModel()
         /// <summary>
         /// Test PFI Ranking Classification for Dense Features
         /// </summary>
-        [Fact]
-        public void TestPfiRankingOnDenseFeatures()
+        [Theory]
+        [InlineData(true)]
+        [InlineData(false)]
+        public void TestPfiRankingOnDenseFeatures(bool saveModel)
         {
             var data = GetDenseDataset(TaskType.Ranking);
             var model = ML.Ranking.Trainers.FastTree().Fit(data);
-            var pfi = ML.Ranking.PermutationFeatureImportance(model, data);
-            // Pfi Indices:
-            // X1: 0 // For Ranking, this column won't result in misorderings
-            // X2Important: 1
-            // X3: 2
-            // X4Rand: 3
-
-            // For the following metrics higher is better, so minimum delta means more important feature, and vice versa
-            for (int i = 0; i < pfi[0].DiscountedCumulativeGains.Count; i++)
+            ImmutableArray<RankingMetricsStatistics> pfi;
+            if (saveModel)
             {
-                Assert.Equal(0, MaxDeltaIndex(pfi, m => m.DiscountedCumulativeGains[i].Mean));
-                Assert.Equal(1, MinDeltaIndex(pfi, m => m.DiscountedCumulativeGains[i].Mean));
+                var modelAndSchemaPath = GetOutputPath("TestPfiRankingOnDenseFeatures.zip");
+                ML.Model.Save(model, data.Schema, modelAndSchemaPath);
+
+                var loadedModel = ML.Model.Load(modelAndSchemaPath, out var schema);
+                var castedModel = loadedModel as RankingPredictionTransformer<FastTreeRankingModelParameters>;
+                pfi = ML.Ranking.PermutationFeatureImportance(castedModel, data);
             }
-            for (int i = 0; i < pfi[0].NormalizedDiscountedCumulativeGains.Count; i++)
+            else
             {
-                Assert.Equal(0, MaxDeltaIndex(pfi, m => m.NormalizedDiscountedCumulativeGains[i].Mean));
-                Assert.Equal(1, MinDeltaIndex(pfi, m => m.NormalizedDiscountedCumulativeGains[i].Mean));
+                pfi = ML.Ranking.PermutationFeatureImportance(model, data);
             }
-            Done();
-        }
-
-        /// <summary>
-        /// Test PFI Multiclass Classification for Dense Features using model loaded from disk
-        /// </summary>
-        [Fact]
-        public void TestPfiRankingOnDenseFeaturesWithLoadedModel()
-        {
-            var data = GetDenseDataset(TaskType.Ranking);
-            var model = ML.Ranking.Trainers.FastTree().Fit(data);
-
-            var modelAndSchemaPath = GetOutputPath("TestPfiRankingOnDenseFeatures.zip");
-            ML.Model.Save(model, data.Schema, modelAndSchemaPath);
-
-            var loadedModel = ML.Model.Load(modelAndSchemaPath, out var schema);
-            var castedModel = loadedModel as RankingPredictionTransformer<FastTreeRankingModelParameters>;
-            var pfi = ML.Ranking.PermutationFeatureImportance(castedModel, data);
             // Pfi Indices:
             // X1: 0 // For Ranking, this column won't result in misorderings
             // X2Important: 1
@@ -601,55 +435,33 @@ public void TestPfiRankingOnDenseFeaturesWithLoadedModel()
             Done();
         }
+
         /// <summary>
         /// Test PFI Multiclass Classification for Sparse Features
         /// </summary>
-        [Fact]
-        public void TestPfiRankingOnSparseFeatures()
+        [Theory]
+        [InlineData(true)]
+        [InlineData(false)]
+        public void TestPfiRankingOnSparseFeatures(bool saveModel)
         {
             var data = GetSparseDataset(TaskType.Ranking);
             var model = ML.Ranking.Trainers.FastTree().Fit(data);
-            var pfi = ML.Ranking.PermutationFeatureImportance(model, data);
-
-            // Pfi Indices:
-            // X1: 0
-            // X2VBuffer-Slot-0: 1
-            // X2VBuffer-Slot-1: 2 // Least important
-            // X2VBuffer-Slot-2: 3
-            // X2VBuffer-Slot-3: 4
-            // X3Important: 5 // Most important
-            // For the following metrics higher is better, so minimum delta means more important feature, and vice versa
-            for (int i = 0; i < pfi[0].DiscountedCumulativeGains.Count; i++)
+            ImmutableArray<RankingMetricsStatistics> pfi;
+            if (saveModel)
             {
-                Assert.Equal(2, MaxDeltaIndex(pfi, m => m.DiscountedCumulativeGains[i].Mean));
-                Assert.Equal(5, MinDeltaIndex(pfi, m => m.DiscountedCumulativeGains[i].Mean));
+                var modelAndSchemaPath = GetOutputPath("TestPfiRankingOnSparseFeatures.zip");
+                ML.Model.Save(model, data.Schema, modelAndSchemaPath);
+
+                var loadedModel = ML.Model.Load(modelAndSchemaPath, out var schema);
+                var castedModel = loadedModel as RankingPredictionTransformer<FastTreeRankingModelParameters>;
+                pfi = ML.Ranking.PermutationFeatureImportance(castedModel, data);
             }
-            for (int i = 0; i < pfi[0].NormalizedDiscountedCumulativeGains.Count; i++)
+            else
             {
-                Assert.Equal(2, MaxDeltaIndex(pfi, m => m.NormalizedDiscountedCumulativeGains[i].Mean));
-                Assert.Equal(5, MinDeltaIndex(pfi, m => m.NormalizedDiscountedCumulativeGains[i].Mean));
+                pfi = ML.Ranking.PermutationFeatureImportance(model, data);
             }
-            Done();
-        }
-
-        /// <summary>
-        /// Test PFI Multiclass Classification for Sparse Features with model loaded from disk
-        /// </summary>
-        [Fact]
-        public void TestPfiRankingOnSparseFeaturesWithLoadedModel()
-        {
-            var data = GetSparseDataset(TaskType.Ranking);
-            var model = ML.Ranking.Trainers.FastTree().Fit(data);
-
-            var modelAndSchemaPath = GetOutputPath("TestPfiRankingOnSparseFeatures.zip");
-            ML.Model.Save(model, data.Schema, modelAndSchemaPath);
-
-            var loadedModel = ML.Model.Load(modelAndSchemaPath, out var schema);
-            var castedModel = loadedModel as RankingPredictionTransformer<FastTreeRankingModelParameters>;
-            var pfi = ML.Ranking.PermutationFeatureImportance(castedModel, data);
-
             // Pfi Indices:
             // X1: 0
             // X2VBuffer-Slot-0: 1
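The final commit in the series, below, then tightens encapsulation in PredictionTransformer.cs: the load-time initialization helpers that the deserialization constructors delegate to go from private protected/internal to private, so the read logic can only run from those constructors. A simplified stand-in for that shape is sketched here; the class and member names are illustrative only, and BinaryReader is used in place of ML.NET's ModelLoadContext.

    // Sketch only; not the actual ML.NET types from PredictionTransformer.cs.
    using System.IO;

    internal sealed class ExampleThresholdPredictor
    {
        internal float Threshold { get; }
        internal string ThresholdColumn { get; }

        // Deserialization constructor: the only caller of the initializer below.
        internal ExampleThresholdPredictor(BinaryReader reader)
        {
            InitializationLogic(reader, out float threshold, out string thresholdColumn);
            Threshold = threshold;
            ThresholdColumn = thresholdColumn;
        }

        // Private so the deserialization logic cannot be invoked from outside the type.
        private void InitializationLogic(BinaryReader reader,
            out float threshold, out string thresholdColumn)
        {
            // *** Binary format ***
            // float: threshold
            // string: threshold column name
            threshold = reader.ReadSingle();
            thresholdColumn = reader.ReadString();
        }
    }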
From e1e4671c4da634c6627b518957eaaecd55096814 Mon Sep 17 00:00:00 2001
From: Antonio Velazquez
Date: Tue, 1 Oct 2019 15:44:54 -0700
Subject: [PATCH 14/14] Changed initialization methods to private

---
 src/Microsoft.ML.Data/Scorers/PredictionTransformer.cs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/Microsoft.ML.Data/Scorers/PredictionTransformer.cs b/src/Microsoft.ML.Data/Scorers/PredictionTransformer.cs
index 93d5a73736..53751614c2 100644
--- a/src/Microsoft.ML.Data/Scorers/PredictionTransformer.cs
+++ b/src/Microsoft.ML.Data/Scorers/PredictionTransformer.cs
@@ -112,7 +112,7 @@ private protected PredictionTransformerBase(IHost host, ModelLoadContext ctx, TM
             InitializeLogic(host, ctx);
         }
 
-        private protected void InitializeLogic(IHost host, ModelLoadContext ctx)
+        private void InitializeLogic(IHost host, ModelLoadContext ctx)
         {
             // *** Binary format ***
             // stream: empty data view that contains train schema.
@@ -394,7 +394,7 @@ internal BinaryPredictionTransformer(IHostEnvironment env, ModelLoadContext ctx,
             InitializationLogic(ctx, out Threshold, out ThresholdColumn);
         }
 
-        internal void InitializationLogic(ModelLoadContext ctx, out float threshold, out string thresholdcolumn)
+        private void InitializationLogic(ModelLoadContext ctx, out float threshold, out string thresholdcolumn)
         {
             // *** Binary format ***
             //
@@ -472,7 +472,7 @@ internal MulticlassPredictionTransformer(IHostEnvironment env, ModelLoadContext
             InitializationLogic(ctx, out _trainLabelColumn);
         }
 
-        internal void InitializationLogic(ModelLoadContext ctx, out string trainLabelColumn)
+        private void InitializationLogic(ModelLoadContext ctx, out string trainLabelColumn)
         {
             // *** Binary format ***
             //