From becf7d6a059af033c4a79cc60d0c43fa8bd0cae6 Mon Sep 17 00:00:00 2001 From: Rogan Carr Date: Mon, 8 Apr 2019 15:03:38 -0700 Subject: [PATCH 1/4] Updating PFI Docs. --- .../PermutationFeatureImportance.cs | 94 ++++++++++++++++++ .../PermutationFeatureImportance.cs | 95 ++++++++++++++++++ .../Ranking/PermutationFeatureImportance.cs | 99 +++++++++++++++++++ .../PermutationFeatureImportance.cs | 88 +++++++++++++++++ 4 files changed, 376 insertions(+) create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/PermutationFeatureImportance.cs create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/PermutationFeatureImportance.cs create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/PermutationFeatureImportance.cs create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/PermutationFeatureImportance.cs diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/PermutationFeatureImportance.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/PermutationFeatureImportance.cs new file mode 100644 index 0000000000..5d527359ac --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/PermutationFeatureImportance.cs @@ -0,0 +1,94 @@ +using System; +using System.Collections.Generic; +using System.Linq; + +namespace Microsoft.ML.Samples.Dynamic.Trainers.BinaryClassification +{ + public static class PermutationFeatureImportance + { + public static void Example() + { + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, + // as a catalog of available operations and as the source of randomness. + var mlContext = new MLContext(seed:1); + + // Create sample data. + var samples = Data.GenerateData(); + + // Load the sample data as an IDataView. + var data = mlContext.Data.LoadFromEnumerable(samples); + + // Define a training pipeline that concatenates features into a vector, normalizes them, and then + // trains a linear model. + var pipeline = mlContext.Transforms.Concatenate("Features", Data.FeatureColumns) + .Append(mlContext.Transforms.NormalizeMinMax("Features")) + .Append(mlContext.BinaryClassification.Trainers.SdcaLogisticRegression()); + + // Fit the pipeline to the data. + var model = pipeline.Fit(data); + + // Compute the permutation metrics for the linear model using the normalized data. + var transformedData = model.Transform(data); + var linearPredictor = model.LastTransformer; + var permutationMetrics = mlContext.BinaryClassification.PermutationFeatureImportance( + linearPredictor, transformedData, permutationCount: 30); + + // Now let's look at which features are most important to the model overall. + // Get the feature indices sorted by their impact on AUC. 
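+            // Each element of permutationMetrics holds summary statistics (mean, standard error)
+            // of the change in every metric across the 30 permutations of one feature column.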
+            var sortedIndices = permutationMetrics.Select((metrics, index) => new { index, metrics.AreaUnderRocCurve})
+                .OrderByDescending(feature => Math.Abs(feature.AreaUnderRocCurve.Mean))
+                .Select(feature => feature.index);
+
+            Console.WriteLine("Feature\tModel Weight\tChange in AUC\t95% Confidence in the Mean Change in AUC");
+            var auc = permutationMetrics.Select(x => x.AreaUnderRocCurve).ToArray();
+            foreach (int i in sortedIndices)
+            {
+                Console.WriteLine("{0}\t{1:0.00}\t{2:G4}\t{3:G4}",
+                    Data.FeatureColumns[i],
+                    linearPredictor.Model.SubModel.Weights[i],
+                    auc[i].Mean,
+                    1.96 * auc[i].StandardError);
+            }
+
+            // Expected output:
+            //  Feature     Model Weight    Change in AUC   95% Confidence in the Mean Change in AUC
+            //  Feature2    35.15           -0.387          0.002015
+            //  Feature1    17.94           -0.1514         0.0008963
+        }
+
+        private class Data
+        {
+            public bool Label { get; set; }
+
+            public float Feature1 { get; set; }
+
+            public float Feature2 { get; set; }
+
+            public static readonly string[] FeatureColumns = new string[] { nameof(Feature1), nameof(Feature2) };
+
+            public static IEnumerable<Data> GenerateData(int nExamples = 10000,
+                double bias = 0, double weight1 = 1, double weight2 = 2, int seed = 1)
+            {
+                var rng = new Random(seed);
+                for (int i = 0; i < nExamples; i++)
+                {
+                    var data = new Data
+                    {
+                        Feature1 = (float)(rng.Next(10) * (rng.NextDouble() - 0.5)),
+                        Feature2 = (float)(rng.Next(10) * (rng.NextDouble() - 0.5)),
+                    };
+
+                    // Create a noisy label.
+                    var value = (float)(bias + weight1 * data.Feature1 + weight2 * data.Feature2 + rng.NextDouble() - 0.5);
+                    data.Label = Sigmoid(value) > 0.5;
+                    yield return data;
+                }
+            }
+
+            private static double Sigmoid(double x)
+            {
+                return 1.0 / (1.0 + Math.Exp(-1 * x));
+            }
+        }
+    }
+}
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/PermutationFeatureImportance.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/PermutationFeatureImportance.cs
new file mode 100644
index 0000000000..0aba5d60fe
--- /dev/null
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/PermutationFeatureImportance.cs
@@ -0,0 +1,95 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+
+namespace Microsoft.ML.Samples.Dynamic.Trainers.MulticlassClassification
+{
+    public static class PermutationFeatureImportance
+    {
+        public static void Example()
+        {
+            // Create a new context for ML.NET operations. It can be used for exception tracking and logging,
+            // as a catalog of available operations and as the source of randomness.
+            var mlContext = new MLContext(seed:1);
+
+            // Create sample data.
+            var samples = Data.GenerateData();
+
+            // Load the sample data as an IDataView.
+            var data = mlContext.Data.LoadFromEnumerable(samples);
+
+            // Define a training pipeline that concatenates features into a vector, normalizes them, and then
+            // trains a linear model.
+            var pipeline = mlContext.Transforms.Concatenate("Features", Data.FeatureColumns)
+                .Append(mlContext.Transforms.Conversion.MapValueToKey("Label"))
+                .Append(mlContext.Transforms.NormalizeMinMax("Features"))
+                .Append(mlContext.MulticlassClassification.Trainers.SdcaMaximumEntropy());
+
+            // Fit the pipeline to the data.
+            var model = pipeline.Fit(data);
+
+            // Compute the permutation metrics for the linear model using the normalized data.
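+            // PFI is computed on the featurized, transformed data, so the fitted pipeline is applied
+            // to the data first and the final predictor is pulled out of the chain.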
+            var transformedData = model.Transform(data);
+            var linearPredictor = model.LastTransformer;
+            var permutationMetrics = mlContext.MulticlassClassification.PermutationFeatureImportance(
+                linearPredictor, transformedData, permutationCount: 30);
+
+            // Now let's look at which features are most important to the model overall.
+            // Get the feature indices sorted by their impact on microaccuracy.
+            var sortedIndices = permutationMetrics.Select((metrics, index) => new { index, metrics.MicroAccuracy})
+                .OrderByDescending(feature => Math.Abs(feature.MicroAccuracy.Mean))
+                .Select(feature => feature.index);
+
+            Console.WriteLine("Feature\tChange in MicroAccuracy\t95% Confidence in the Mean Change in MicroAccuracy");
+            var microAccuracy = permutationMetrics.Select(x => x.MicroAccuracy).ToArray();
+            foreach (int i in sortedIndices)
+            {
+                Console.WriteLine("{0}\t{1:G4}\t{2:G4}",
+                    Data.FeatureColumns[i],
+                    microAccuracy[i].Mean,
+                    1.96 * microAccuracy[i].StandardError);
+            }
+
+            // Expected output:
+            //Feature     Change in MicroAccuracy     95% Confidence in the Mean Change in MicroAccuracy
+            //Feature2    -0.1395                     0.0006567
+            //Feature1    -0.05367                    0.0006908
+        }
+
+        private class Data
+        {
+            public float Label { get; set; }
+
+            public float Feature1 { get; set; }
+
+            public float Feature2 { get; set; }
+
+            public static readonly string[] FeatureColumns = new string[] { nameof(Feature1), nameof(Feature2) };
+
+            public static IEnumerable<Data> GenerateData(int nExamples = 10000,
+                double bias = 0, double weight1 = 1, double weight2 = 2, int seed = 1)
+            {
+                var rng = new Random(seed);
+                var max = bias + 4.5*weight1 + 4.5*weight2 + 0.5;
+                for (int i = 0; i < nExamples; i++)
+                {
+                    var data = new Data
+                    {
+                        Feature1 = (float)(rng.Next(10) * (rng.NextDouble() - 0.5)),
+                        Feature2 = (float)(rng.Next(10) * (rng.NextDouble() - 0.5)),
+                    };
+
+                    // Create a noisy label.
+                    var value = (float)(bias + weight1 * data.Feature1 + weight2 * data.Feature2 + rng.NextDouble() - 0.5);
+                    if (value < max / 3)
+                        data.Label = 0;
+                    else if (value < 2 * max / 3)
+                        data.Label = 1;
+                    else
+                        data.Label = 2;
+                    yield return data;
+                }
+            }
+        }
+    }
+}
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/PermutationFeatureImportance.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/PermutationFeatureImportance.cs
new file mode 100644
index 0000000000..b868a469dc
--- /dev/null
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/PermutationFeatureImportance.cs
@@ -0,0 +1,99 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+
+namespace Microsoft.ML.Samples.Dynamic.Trainers.Ranking
+{
+    public static class PermutationFeatureImportance
+    {
+        public static void Example()
+        {
+            // Create a new context for ML.NET operations. It can be used for exception tracking and logging,
+            // as a catalog of available operations and as the source of randomness.
+            var mlContext = new MLContext(seed:1);
+
+            // Create sample data.
+            var samples = Data.GenerateData();
+
+            // Load the sample data as an IDataView.
+            var data = mlContext.Data.LoadFromEnumerable(samples);
+
+            // Define a training pipeline that concatenates features into a vector, normalizes them, and then
+            // trains a linear model.
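+            // (Note: the trainer appended below is FastTree, a boosted-tree ranker; ranking also
+            // requires the label and group id to be key types, hence the MapValueToKey transforms.)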
+            var pipeline = mlContext.Transforms.Concatenate("Features", Data.FeatureColumns)
+                .Append(mlContext.Transforms.Conversion.MapValueToKey("Label"))
+                .Append(mlContext.Transforms.Conversion.MapValueToKey("GroupId"))
+                .Append(mlContext.Transforms.NormalizeMinMax("Features"))
+                .Append(mlContext.Ranking.Trainers.FastTree());
+
+            // Fit the pipeline to the data.
+            var model = pipeline.Fit(data);
+
+            // Compute the permutation metrics for the linear model using the normalized data.
+            var transformedData = model.Transform(data);
+            var linearPredictor = model.LastTransformer;
+            var permutationMetrics = mlContext.Ranking.PermutationFeatureImportance(
+                linearPredictor, transformedData, permutationCount: 30);
+
+            // Now let's look at which features are most important to the model overall.
+            // Get the feature indices sorted by their impact on NDCG@1.
+            var sortedIndices = permutationMetrics.Select((metrics, index) => new { index, metrics.NormalizedDiscountedCumulativeGains})
+                .OrderByDescending(feature => Math.Abs(feature.NormalizedDiscountedCumulativeGains[0].Mean))
+                .Select(feature => feature.index);
+
+            Console.WriteLine("Feature\tChange in NDCG@1\t95% Confidence in the Mean Change in NDCG@1");
+            var ndcg = permutationMetrics.Select(x => x.NormalizedDiscountedCumulativeGains).ToArray();
+            foreach (int i in sortedIndices)
+            {
+                Console.WriteLine("{0}\t{1:G4}\t{2:G4}",
+                    Data.FeatureColumns[i],
+                    ndcg[i][0].Mean,
+                    1.96 * ndcg[i][0].StandardError);
+            }
+
+            // Expected output:
+            //  Feature     Change in NDCG@1    95% Confidence in the Mean Change in NDCG@1
+            //  Feature2    -0.2421             0.001748
+            //  Feature1    -0.0513             0.001184
+        }
+
+        private class Data
+        {
+            public float Label { get; set; }
+
+            public int GroupId { get; set; }
+
+            public float Feature1 { get; set; }
+
+            public float Feature2 { get; set; }
+
+            public static readonly string[] FeatureColumns = new string[] { nameof(Feature1), nameof(Feature2) };
+
+            public static IEnumerable<Data> GenerateData(int nExamples = 10000,
+                double bias = 0, double weight1 = 1, double weight2 = 2, int seed = 1, int groupSize = 5)
+            {
+                var rng = new Random(seed);
+                var max = bias + 4.5*weight1 + 4.5*weight2 + 0.5;
+                for (int i = 0; i < nExamples; i++)
+                {
+                    var data = new Data
+                    {
+                        GroupId = i / groupSize,
+                        Feature1 = (float)(rng.Next(10) * (rng.NextDouble() - 0.5)),
+                        Feature2 = (float)(rng.Next(10) * (rng.NextDouble() - 0.5)),
+                    };
+
+                    // Create a noisy label.
+                    var value = (float)(bias + weight1 * data.Feature1 + weight2 * data.Feature2 + rng.NextDouble() - 0.5);
+                    if (value < max / 3)
+                        data.Label = 0;
+                    else if (value < 2 * max / 3)
+                        data.Label = 1;
+                    else
+                        data.Label = 2;
+                    yield return data;
+                }
+            }
+        }
+    }
+}
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/PermutationFeatureImportance.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/PermutationFeatureImportance.cs
new file mode 100644
index 0000000000..6c4f4bc8d5
--- /dev/null
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/PermutationFeatureImportance.cs
@@ -0,0 +1,88 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+
+namespace Microsoft.ML.Samples.Dynamic.Trainers.Regression
+{
+    public static class PermutationFeatureImportance
+    {
+        public static void Example()
+        {
+            // Create a new context for ML.NET operations. It can be used for exception tracking and logging,
+            // as a catalog of available operations and as the source of randomness.
+            var mlContext = new MLContext(seed:1);
+
+            // Create sample data.
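+            // The data is synthetic: GenerateData below builds the label as a known linear
+            // combination of the two features plus uniform noise.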
+            var samples = Data.GenerateData();
+
+            // Load the sample data as an IDataView.
+            var data = mlContext.Data.LoadFromEnumerable(samples);
+
+            // Define a training pipeline that concatenates features into a vector, normalizes them, and then
+            // trains a linear model.
+            var pipeline = mlContext.Transforms.Concatenate("Features", Data.FeatureColumns)
+                .Append(mlContext.Transforms.NormalizeMinMax("Features"))
+                .Append(mlContext.Regression.Trainers.Ols());
+
+            // Fit the pipeline to the data.
+            var model = pipeline.Fit(data);
+
+            // Compute the permutation metrics for the linear model using the normalized data.
+            var transformedData = model.Transform(data);
+            var linearPredictor = model.LastTransformer;
+            var permutationMetrics = mlContext.Regression.PermutationFeatureImportance(
+                linearPredictor, transformedData, permutationCount: 30);
+
+            // Now let's look at which features are most important to the model overall.
+            // Get the feature indices sorted by their impact on RMSE.
+            var sortedIndices = permutationMetrics.Select((metrics, index) => new { index, metrics.RootMeanSquaredError})
+                .OrderByDescending(feature => Math.Abs(feature.RootMeanSquaredError.Mean))
+                .Select(feature => feature.index);
+
+            Console.WriteLine("Feature\tModel Weight\tChange in RMSE\t95% Confidence in the Mean Change in RMSE");
+            var rmse = permutationMetrics.Select(x => x.RootMeanSquaredError).ToArray();
+            foreach (int i in sortedIndices)
+            {
+                Console.WriteLine("{0}\t{1:0.00}\t{2:G4}\t{3:G4}",
+                    Data.FeatureColumns[i],
+                    linearPredictor.Model.Weights[i],
+                    rmse[i].Mean,
+                    1.96 * rmse[i].StandardError);
+            }
+
+            // Expected output:
+            //  Feature     Model Weight    Change in RMSE  95% Confidence in the Mean Change in RMSE
+            //  Feature2    9.00            4.009           0.008304
+            //  Feature1    4.48            1.901           0.003351
+        }
+
+        private class Data
+        {
+            public float Label { get; set; }
+
+            public float Feature1 { get; set; }
+
+            public float Feature2 { get; set; }
+
+            public static readonly string[] FeatureColumns = new string[] { nameof(Feature1), nameof(Feature2) };
+
+            public static IEnumerable<Data> GenerateData(int nExamples = 10000,
+                double bias = 0, double weight1 = 1, double weight2 = 2, int seed = 1)
+            {
+                var rng = new Random(seed);
+                for (int i = 0; i < nExamples; i++)
+                {
+                    var data = new Data
+                    {
+                        Feature1 = (float)(rng.Next(10) * (rng.NextDouble() - 0.5)),
+                        Feature2 = (float)(rng.Next(10) * (rng.NextDouble() - 0.5)),
+                    };
+
+                    // Create a noisy label.
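+                    // Label = bias + weight1 * Feature1 + weight2 * Feature2 + Uniform(-0.5, 0.5).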
+ data.Label = (float)(bias + weight1 * data.Feature1 + weight2 * data.Feature2 + rng.NextDouble() - 0.5); + yield return data; + } + } + } + } +} From abd640403bab618d805f06f6757d7caa9b8d2f67 Mon Sep 17 00:00:00 2001 From: Rogan Carr Date: Mon, 8 Apr 2019 15:07:18 -0700 Subject: [PATCH 2/4] Adding links to samples --- .../PermutationFeatureImportance/PFIHelper.cs | 60 --------------- .../PFIRegressionExample.cs | 77 ------------------- .../PfiBinaryClassificationExample.cs | 76 ------------------ .../PermutationFeatureImportanceExtensions.cs | 18 ++++- 4 files changed, 16 insertions(+), 215 deletions(-) delete mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance/PFIHelper.cs delete mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance/PFIRegressionExample.cs delete mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance/PfiBinaryClassificationExample.cs diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance/PFIHelper.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance/PFIHelper.cs deleted file mode 100644 index 95c64e629c..0000000000 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance/PFIHelper.cs +++ /dev/null @@ -1,60 +0,0 @@ -using System; -using System.Linq; -using Microsoft.ML.Trainers; -using Microsoft.ML.SamplesUtils; - -namespace Microsoft.ML.Samples.Dynamic.PermutationFeatureImportance -{ - public static class PfiHelper - { - public static IDataView GetHousingRegressionIDataView(MLContext mlContext, out string labelName, out string[] featureNames, bool binaryPrediction = false) - { - // Read the Housing regression dataset - var data = DatasetUtils.LoadHousingRegressionDataset(mlContext); - - // Define the label column - var labelColumn = "MedianHomeValue"; - - if (binaryPrediction) - { - labelColumn = nameof(BinaryOutputRow.AboveAverage); - data = mlContext.Transforms.CustomMapping(GreaterThanAverage, null).Fit(data).Transform(data); - data = mlContext.Transforms.DropColumns("MedianHomeValue").Fit(data).Transform(data); - } - - labelName = labelColumn; - featureNames = data.Schema.AsEnumerable() - .Select(column => column.Name) // Get the column names - .Where(name => name != labelColumn) // Drop the Label - .ToArray(); - - return data; - } - - // Define a class for all the input columns that we intend to consume. - private class ContinuousInputRow - { - public float MedianHomeValue { get; set; } - } - - // Define a class for all output columns that we intend to produce. 
- private class BinaryOutputRow - { - public bool AboveAverage { get; set; } - } - - // Define an Action to apply a custom mapping from one object to the other - private readonly static Action GreaterThanAverage = (input, output) - => output.AboveAverage = input.MedianHomeValue > 22.6; - - public static float[] GetLinearModelWeights(OlsModelParameters linearModel) - { - return linearModel.Weights.ToArray(); - } - - public static float[] GetLinearModelWeights(LinearBinaryModelParameters linearModel) - { - return linearModel.Weights.ToArray(); - } - } -} diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance/PFIRegressionExample.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance/PFIRegressionExample.cs deleted file mode 100644 index 46b5bc65a6..0000000000 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance/PFIRegressionExample.cs +++ /dev/null @@ -1,77 +0,0 @@ -using System; -using System.Linq; - -namespace Microsoft.ML.Samples.Dynamic.PermutationFeatureImportance -{ - public static class PfiRegression - { - public static void Example() - { - // Create a new context for ML.NET operations. It can be used for exception tracking and logging, - // as a catalog of available operations and as the source of randomness. - var mlContext = new MLContext(); - - // Step 1: Read the data - var data = PfiHelper.GetHousingRegressionIDataView(mlContext, out string labelName, out string[] featureNames); - - // Step 2: Pipeline - // Concatenate the features to create a Feature vector. - // Normalize the data set so that for each feature, its maximum value is 1 while its minimum value is 0. - // Then append a linear regression trainer. - var pipeline = mlContext.Transforms.Concatenate("Features", featureNames) - .Append(mlContext.Transforms.NormalizeMinMax("Features")) - .Append(mlContext.Regression.Trainers.Ols( - labelColumnName: labelName, featureColumnName: "Features")); - var model = pipeline.Fit(data); - - // Extract the model from the pipeline - var linearPredictor = model.LastTransformer; - var weights = PfiHelper.GetLinearModelWeights(linearPredictor.Model); - - // Compute the permutation metrics using the properly normalized data. - var transformedData = model.Transform(data); - var permutationMetrics = mlContext.Regression.PermutationFeatureImportance( - linearPredictor, transformedData, labelColumnName: labelName, permutationCount: 3); - - // Now let's look at which features are most important to the model overall - // Get the feature indices sorted by their impact on R-Squared - var sortedIndices = permutationMetrics.Select((metrics, index) => new { index, metrics.RSquared }) - .OrderByDescending(feature => Math.Abs(feature.RSquared.Mean)) - .Select(feature => feature.index); - - // Print out the permutation results, with the model weights, in order of their impact: - // Expected console output for 100 permutations: - // Feature Model Weight Change in R-Squared 95% Confidence Interval of the Mean - // RoomsPerDwelling 53.35 -0.4298 0.005705 - // EmploymentDistance -19.21 -0.2609 0.004591 - // NitricOxides -19.32 -0.1569 0.003701 - // HighwayDistance 6.11 -0.1173 0.0025 - // TeacherRatio -21.92 -0.1106 0.002207 - // TaxRate -8.68 -0.1008 0.002083 - // CrimesPerCapita -16.37 -0.05988 0.00178 - // PercentPre40s -4.52 -0.03836 0.001432 - // PercentResidental 3.91 -0.02006 0.001079 - // CharlesRiver 3.49 -0.01839 0.000841 - // PercentNonRetail -1.17 -0.002111 0.0003176 - // - // Let's dig into these results a little bit. 
First, if you look at the weights of the model, they generally correlate
-            // with the results of PFI, but there are some significant misorderings. For example, "Tax Rate" and "Highway Distance"
-            // have relatively small model weights, but the permutation analysis shows these features to have a larger effect
-            // on the accuracy of the model than higher-weighted features. To understand why the weights don't reflect the same
-            // feature importance as PFI, we need to go back to the basics of linear models: one of the assumptions of a linear
-            // model is that the features are uncorrelated. Now, the features in this dataset are clearly correlated: the tax rate
-            // for a house and the student-to-teacher ratio at the nearest school, for example, are often coupled through school
-            // levies. The tax rate, distance to a highway, and the crime rate would also seem to be correlated through social
-            // dynamics. We could draw out similar relationships for all variables in this dataset. The reason why the linear
-            // model weights don't reflect the same feature importance as PFI is that the solution to the linear model redistributes
-            // weights between correlated variables in unpredictable ways, so that the weights themselves are no longer a good
-            // measure of feature importance.
-            Console.WriteLine("Feature\tModel Weight\tChange in R-Squared\t95% Confidence Interval of the Mean");
-            var rSquared = permutationMetrics.Select(x => x.RSquared).ToArray(); // Fetch r-squared as an array
-            foreach (int i in sortedIndices)
-            {
-                Console.WriteLine($"{featureNames[i]}\t{weights[i]:0.00}\t{rSquared[i].Mean:G4}\t{1.96 * rSquared[i].StandardError:G4}");
-            }
-        }
-    }
-}
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance/PfiBinaryClassificationExample.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance/PfiBinaryClassificationExample.cs
deleted file mode 100644
index 8e109890e1..0000000000
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance/PfiBinaryClassificationExample.cs
+++ /dev/null
@@ -1,76 +0,0 @@
-using System;
-using System.Linq;
-using Microsoft.ML.Trainers;
-
-namespace Microsoft.ML.Samples.Dynamic.PermutationFeatureImportance
-{
-    public static class PfiBinaryClassification
-    {
-        public static void Example()
-        {
-            // Create a new context for ML.NET operations. It can be used for exception tracking and logging,
-            // as a catalog of available operations and as the source of randomness.
-            var mlContext = new MLContext(seed:999123);
-
-            // Step 1: Read the data
-            var data = PfiHelper.GetHousingRegressionIDataView(mlContext,
-                out string labelName, out string[] featureNames, binaryPrediction: true);
-
-            // Step 2: Pipeline
-            // Concatenate the features to create a Feature vector.
-            // Normalize the data set so that for each feature, its maximum value is 1 while its minimum value is 0.
-            // Then append a logistic regression trainer.
- var pipeline = mlContext.Transforms.Concatenate("Features", featureNames) - .Append(mlContext.Transforms.NormalizeMinMax("Features")) - .Append(mlContext.BinaryClassification.Trainers.LbfgsLogisticRegression( - labelColumnName: labelName, featureColumnName: "Features")); - var model = pipeline.Fit(data); - - // Extract the model from the pipeline - var linearPredictor = model.LastTransformer; - // Linear models for binary classification are wrapped by a calibrator as a generic predictor - // To access it directly, we must extract it out and cast it to the proper class - var weights = PfiHelper.GetLinearModelWeights(linearPredictor.Model.SubModel as LinearBinaryModelParameters); - - // Compute the permutation metrics using the properly normalized data. - var transformedData = model.Transform(data); - var permutationMetrics = mlContext.BinaryClassification.PermutationFeatureImportance( - linearPredictor, transformedData, labelColumnName: labelName, permutationCount: 3); - - // Now let's look at which features are most important to the model overall. - // Get the feature indices sorted by their impact on AreaUnderRocCurve. - var sortedIndices = permutationMetrics.Select((metrics, index) => new { index, metrics.AreaUnderRocCurve }) - .OrderByDescending(feature => Math.Abs(feature.AreaUnderRocCurve.Mean)) - .Select(feature => feature.index); - - // Print out the permutation results, with the model weights, in order of their impact: - // Expected console output (for 100 permutations): - // Feature Model Weight Change in AUC 95% Confidence in the Mean Change in AUC - // PercentPre40s -1.96 -0.06316 0.002377 - // RoomsPerDwelling 3.71 -0.04385 0.001245 - // EmploymentDistance -1.31 -0.02139 0.0006867 - // TeacherRatio -2.46 -0.0203 0.0009566 - // PercentNonRetail -1.58 -0.01846 0.001586 - // CharlesRiver 0.66 -0.008605 0.0005136 - // PercentResidental 0.60 0.002483 0.0004818 - // TaxRate -0.95 -0.00221 0.0007394 - // NitricOxides -0.32 0.00101 0.0001428 - // CrimesPerCapita -0.04 -3.029E-05 1.678E-05 - // HighwayDistance 0.00 0 0 - // Let's look at these results. - // First, if you look at the weights of the model, they generally correlate with the results of PFI, - // but there are some significant misorderings. See the discussion in the Regression example for an - // explanation of why this happens and how to interpret it. - // Second, the logistic regression learner uses L1 regularization by default. Here, it causes the "HighWay Distance" - // feature to be zeroed out from the model. PFI assigns zero importance to this variable, as expected. - // Third, some features show an *increase* in AUC. This means that the model actually improved - // when these features were shuffled. This is a sign to investigate these features further. 
- Console.WriteLine("Feature\tModel Weight\tChange in AUC\t95% Confidence in the Mean Change in AUC"); - var auc = permutationMetrics.Select(x => x.AreaUnderRocCurve).ToArray(); // Fetch AUC as an array - foreach (int i in sortedIndices) - { - Console.WriteLine($"{featureNames[i]}\t{weights[i]:0.00}\t{auc[i].Mean:G4}\t{1.96 * auc[i].StandardError:G4}"); - } - } - } -} diff --git a/src/Microsoft.ML.Transforms/PermutationFeatureImportanceExtensions.cs b/src/Microsoft.ML.Transforms/PermutationFeatureImportanceExtensions.cs index 9e111dac5a..1c9cebd602 100644 --- a/src/Microsoft.ML.Transforms/PermutationFeatureImportanceExtensions.cs +++ b/src/Microsoft.ML.Transforms/PermutationFeatureImportanceExtensions.cs @@ -41,7 +41,7 @@ public static class PermutationFeatureImportanceExtensions /// /// /// /// /// @@ -117,7 +117,7 @@ private static RegressionMetrics RegressionDelta( /// /// /// /// /// @@ -194,6 +194,13 @@ private static BinaryClassificationMetrics BinaryClassifierDelta( /// example of working with these results to analyze the feature importance of a model. /// /// + /// + /// + /// + /// + /// /// The clustering catalog. /// The model on which to evaluate feature importance. /// The evaluation data set. @@ -272,6 +279,13 @@ private static MulticlassClassificationMetrics MulticlassClassificationDelta( /// example of working with these results to analyze the feature importance of a model. /// /// + /// + /// + /// + /// + /// /// The clustering catalog. /// The model on which to evaluate feature importance. /// The evaluation data set. From 628e812e0d073f9455ad51e87ac0749c275e5b8a Mon Sep 17 00:00:00 2001 From: Rogan Carr Date: Tue, 9 Apr 2019 15:02:32 -0700 Subject: [PATCH 3/4] Addressing PR comments. --- .../BinaryClassification/PermutationFeatureImportance.cs | 9 +++++++-- .../PermutationFeatureImportance.cs | 9 +++++++-- .../Trainers/Ranking/PermutationFeatureImportance.cs | 9 +++++++-- .../Trainers/Regression/PermutationFeatureImportance.cs | 9 +++++++-- 4 files changed, 28 insertions(+), 8 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/PermutationFeatureImportance.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/PermutationFeatureImportance.cs index 5d527359ac..894ac521bc 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/PermutationFeatureImportance.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/PermutationFeatureImportance.cs @@ -1,8 +1,9 @@ using System; using System.Collections.Generic; using System.Linq; +using Microsoft.ML; -namespace Microsoft.ML.Samples.Dynamic.Trainers.BinaryClassification +namespace Samples.Dynamic.Trainers.BinaryClassification { public static class PermutationFeatureImportance { @@ -27,9 +28,13 @@ public static void Example() // Fit the pipeline to the data. var model = pipeline.Fit(data); - // Compute the permutation metrics for the linear model using the normalized data. + // Transform the dataset. var transformedData = model.Transform(data); + + // Extract the predictor. var linearPredictor = model.LastTransformer; + + // Compute the permutation metrics for the linear model using the normalized data. 
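+            // Each feature is shuffled permutationCount times; the reported statistics are the
+            // mean change in each metric and its standard error over those shuffles.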
var permutationMetrics = mlContext.BinaryClassification.PermutationFeatureImportance( linearPredictor, transformedData, permutationCount: 30); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/PermutationFeatureImportance.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/PermutationFeatureImportance.cs index 0aba5d60fe..08a7c1c49d 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/PermutationFeatureImportance.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/PermutationFeatureImportance.cs @@ -1,8 +1,9 @@ using System; using System.Collections.Generic; using System.Linq; +using Microsoft.ML; -namespace Microsoft.ML.Samples.Dynamic.Trainers.MulticlassClassification +namespace Samples.Dynamic.Trainers.MulticlassClassification { public static class PermutationFeatureImportance { @@ -28,9 +29,13 @@ public static void Example() // Fit the pipeline to the data. var model = pipeline.Fit(data); - // Compute the permutation metrics for the linear model using the normalized data. + // Transform the dataset. var transformedData = model.Transform(data); + + // Extract the predictor. var linearPredictor = model.LastTransformer; + + // Compute the permutation metrics for the linear model using the normalized data. var permutationMetrics = mlContext.MulticlassClassification.PermutationFeatureImportance( linearPredictor, transformedData, permutationCount: 30); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/PermutationFeatureImportance.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/PermutationFeatureImportance.cs index b868a469dc..53c038046e 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/PermutationFeatureImportance.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/PermutationFeatureImportance.cs @@ -1,8 +1,9 @@ using System; using System.Collections.Generic; using System.Linq; +using Microsoft.ML; -namespace Microsoft.ML.Samples.Dynamic.Trainers.Ranking +namespace Samples.Dynamic.Trainers.Ranking { public static class PermutationFeatureImportance { @@ -29,9 +30,13 @@ public static void Example() // Fit the pipeline to the data. var model = pipeline.Fit(data); - // Compute the permutation metrics for the linear model using the normalized data. + // Transform the dataset. var transformedData = model.Transform(data); + + // Extract the predictor. var linearPredictor = model.LastTransformer; + + // Compute the permutation metrics for the linear model using the normalized data. var permutationMetrics = mlContext.Ranking.PermutationFeatureImportance( linearPredictor, transformedData, permutationCount: 30); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/PermutationFeatureImportance.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/PermutationFeatureImportance.cs index 6c4f4bc8d5..599bed55a7 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/PermutationFeatureImportance.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/PermutationFeatureImportance.cs @@ -1,8 +1,9 @@ using System; using System.Collections.Generic; using System.Linq; +using Microsoft.ML; -namespace Microsoft.ML.Samples.Dynamic.Trainers.Regression +namespace Samples.Dynamic.Trainers.Regression { public static class PermutationFeatureImportance { @@ -27,9 +28,13 @@ public static void Example() // Fit the pipeline to the data. 
var model = pipeline.Fit(data); - // Compute the permutation metrics for the linear model using the normalized data. + // Transform the dataset. var transformedData = model.Transform(data); + + // Extract the predictor. var linearPredictor = model.LastTransformer; + + // Compute the permutation metrics for the linear model using the normalized data. var permutationMetrics = mlContext.Regression.PermutationFeatureImportance( linearPredictor, transformedData, permutationCount: 30); From ee63a9aba9c29dbeecc6e9185c43ec0ee198bfce Mon Sep 17 00:00:00 2001 From: Rogan Carr Date: Wed, 10 Apr 2019 11:13:30 -0700 Subject: [PATCH 4/4] Addressing PR comments. --- .../PermutationFeatureImportance.cs | 56 +++++++++-------- .../PermutationFeatureImportance.cs | 61 ++++++++++-------- .../Ranking/PermutationFeatureImportance.cs | 63 +++++++++++-------- .../PermutationFeatureImportance.cs | 51 ++++++++------- 4 files changed, 132 insertions(+), 99 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/PermutationFeatureImportance.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/PermutationFeatureImportance.cs index 894ac521bc..55d6c54cc4 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/PermutationFeatureImportance.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/PermutationFeatureImportance.cs @@ -14,14 +14,15 @@ public static void Example() var mlContext = new MLContext(seed:1); // Create sample data. - var samples = Data.GenerateData(); + var samples = GenerateData(); // Load the sample data as an IDataView. var data = mlContext.Data.LoadFromEnumerable(samples); // Define a training pipeline that concatenates features into a vector, normalizes them, and then // trains a linear model. - var pipeline = mlContext.Transforms.Concatenate("Features", Data.FeatureColumns) + var featureColumns = new string[] { nameof(Data.Feature1), nameof(Data.Feature2) }; + var pipeline = mlContext.Transforms.Concatenate("Features", featureColumns) .Append(mlContext.Transforms.NormalizeMinMax("Features")) .Append(mlContext.BinaryClassification.Trainers.SdcaLogisticRegression()); @@ -49,7 +50,7 @@ public static void Example() foreach (int i in sortedIndices) { Console.WriteLine("{0}\t{1:0.00}\t{2:G4}\t{3:G4}", - Data.FeatureColumns[i], + featureColumns[i], linearPredictor.Model.SubModel.Weights[i], auc[i].Mean, 1.96 * auc[i].StandardError); @@ -68,32 +69,37 @@ private class Data public float Feature1 { get; set; } public float Feature2 { get; set; } + } - public static readonly string[] FeatureColumns = new string[] { nameof(Feature1), nameof(Feature2) }; - - public static IEnumerable GenerateData(int nExamples = 10000, - double bias = 0, double weight1 = 1, double weight2 = 2, int seed = 1) + /// + /// Generate an enumerable of Data objects, creating the label as a simple + /// linear combination of the features. + /// + /// The number of examples. + /// The bias, or offset, in the calculation of the label. + /// The weight to multiply the first feature with to compute the label. + /// The weight to multiply the second feature with to compute the label. + /// The seed for generating feature values and label noise. + /// An enumerable of Data objects. 
+ private static IEnumerable GenerateData(int nExamples = 10000, + double bias = 0, double weight1 = 1, double weight2 = 2, int seed = 1) + { + var rng = new Random(seed); + for (int i = 0; i < nExamples; i++) { - var rng = new Random(seed); - for (int i = 0; i < nExamples; i++) + var data = new Data { - var data = new Data - { - Feature1 = (float)(rng.Next(10) * (rng.NextDouble() - 0.5)), - Feature2 = (float)(rng.Next(10) * (rng.NextDouble() - 0.5)), - }; - - // Create a noisy label. - var value = (float)(bias + weight1 * data.Feature1 + weight2 * data.Feature2 + rng.NextDouble() - 0.5); - data.Label = Sigmoid(value) > 0.5; - yield return data; - } - } - - private static double Sigmoid(double x) - { - return 1.0 / (1.0 + Math.Exp(-1 * x)); + Feature1 = (float)(rng.Next(10) * (rng.NextDouble() - 0.5)), + Feature2 = (float)(rng.Next(10) * (rng.NextDouble() - 0.5)), + }; + + // Create a noisy label. + var value = (float)(bias + weight1 * data.Feature1 + weight2 * data.Feature2 + rng.NextDouble() - 0.5); + data.Label = Sigmoid(value) > 0.5; + yield return data; } } + + private static double Sigmoid(double x) => 1.0 / (1.0 + Math.Exp(-1 * x)); } } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/PermutationFeatureImportance.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/PermutationFeatureImportance.cs index 08a7c1c49d..963fd238ca 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/PermutationFeatureImportance.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/MulticlassClassification/PermutationFeatureImportance.cs @@ -14,14 +14,15 @@ public static void Example() var mlContext = new MLContext(seed:1); // Create sample data. - var samples = Data.GenerateData(); + var samples = GenerateData(); // Load the sample data as an IDataView. var data = mlContext.Data.LoadFromEnumerable(samples); // Define a training pipeline that concatenates features into a vector, normalizes them, and then // trains a linear model. - var pipeline = mlContext.Transforms.Concatenate("Features", Data.FeatureColumns) + var featureColumns = new string[] { nameof(Data.Feature1), nameof(Data.Feature2) }; + var pipeline = mlContext.Transforms.Concatenate("Features", featureColumns) .Append(mlContext.Transforms.Conversion.MapValueToKey("Label")) .Append(mlContext.Transforms.NormalizeMinMax("Features")) .Append(mlContext.MulticlassClassification.Trainers.SdcaMaximumEntropy()); @@ -50,7 +51,7 @@ public static void Example() foreach (int i in sortedIndices) { Console.WriteLine("{0}\t{1:G4}\t{2:G4}", - Data.FeatureColumns[i], + featureColumns[i], microAccuracy[i].Mean, 1.96 * microAccuracy[i].StandardError); } @@ -68,32 +69,40 @@ private class Data public float Feature1 { get; set; } public float Feature2 { get; set; } + } - public static readonly string[] FeatureColumns = new string[] { nameof(Feature1), nameof(Feature2) }; - - public static IEnumerable GenerateData(int nExamples = 10000, - double bias = 0, double weight1 = 1, double weight2 = 2, int seed = 1) + /// + /// Generate an enumerable of Data objects, creating the label as a simple + /// linear combination of the features. + /// + /// The number of examples. + /// The bias, or offset, in the calculation of the label. + /// The weight to multiply the first feature with to compute the label. + /// The weight to multiply the second feature with to compute the label. + /// The seed for generating feature values and label noise. 
+ /// An enumerable of Data objects. + private static IEnumerable GenerateData(int nExamples = 10000, + double bias = 0, double weight1 = 1, double weight2 = 2, int seed = 1) + { + var rng = new Random(seed); + var max = bias + 4.5 * weight1 + 4.5 * weight2 + 0.5; + for (int i = 0; i < nExamples; i++) { - var rng = new Random(seed); - var max = bias + 4.5*weight1 + 4.5*weight2 + 0.5; - for (int i = 0; i < nExamples; i++) + var data = new Data { - var data = new Data - { - Feature1 = (float)(rng.Next(10) * (rng.NextDouble() - 0.5)), - Feature2 = (float)(rng.Next(10) * (rng.NextDouble() - 0.5)), - }; - - // Create a noisy label. - var value = (float)(bias + weight1 * data.Feature1 + weight2 * data.Feature2 + rng.NextDouble() - 0.5); - if (value < max / 3) - data.Label = 0; - else if (value < 2 * max / 3) - data.Label = 1; - else - data.Label = 2; - yield return data; - } + Feature1 = (float)(rng.Next(10) * (rng.NextDouble() - 0.5)), + Feature2 = (float)(rng.Next(10) * (rng.NextDouble() - 0.5)), + }; + + // Create a noisy label. + var value = (float)(bias + weight1 * data.Feature1 + weight2 * data.Feature2 + rng.NextDouble() - 0.5); + if (value < max / 3) + data.Label = 0; + else if (value < 2 * max / 3) + data.Label = 1; + else + data.Label = 2; + yield return data; } } } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/PermutationFeatureImportance.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/PermutationFeatureImportance.cs index 53c038046e..41928a70ee 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/PermutationFeatureImportance.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Ranking/PermutationFeatureImportance.cs @@ -14,14 +14,15 @@ public static void Example() var mlContext = new MLContext(seed:1); // Create sample data. - var samples = Data.GenerateData(); + var samples = GenerateData(); // Load the sample data as an IDataView. var data = mlContext.Data.LoadFromEnumerable(samples); // Define a training pipeline that concatenates features into a vector, normalizes them, and then // trains a linear model. - var pipeline = mlContext.Transforms.Concatenate("Features", Data.FeatureColumns) + var featureColumns = new string[] { nameof(Data.Feature1), nameof(Data.Feature2) }; + var pipeline = mlContext.Transforms.Concatenate("Features", featureColumns) .Append(mlContext.Transforms.Conversion.MapValueToKey("Label")) .Append(mlContext.Transforms.Conversion.MapValueToKey("GroupId")) .Append(mlContext.Transforms.NormalizeMinMax("Features")) @@ -51,7 +52,7 @@ public static void Example() foreach (int i in sortedIndices) { Console.WriteLine("{0}\t{1:G4}\t{2:G4}", - Data.FeatureColumns[i], + featureColumns[i], ndcg[i][0].Mean, 1.96 * ndcg[i][0].StandardError); } @@ -71,33 +72,41 @@ private class Data public float Feature1 { get; set; } public float Feature2 { get; set; } + } - public static readonly string[] FeatureColumns = new string[] { nameof(Feature1), nameof(Feature2) }; - - public static IEnumerable GenerateData(int nExamples = 10000, - double bias = 0, double weight1 = 1, double weight2 = 2, int seed = 1, int groupSize = 5) + /// + /// Generate an enumerable of Data objects, creating the label as a simple + /// linear combination of the features. + /// + /// The number of examples. + /// The bias, or offset, in the calculation of the label. + /// The weight to multiply the first feature with to compute the label. + /// The weight to multiply the second feature with to compute the label. 
+ /// The seed for generating feature values and label noise. + /// An enumerable of Data objects. + private static IEnumerable GenerateData(int nExamples = 10000, + double bias = 0, double weight1 = 1, double weight2 = 2, int seed = 1, int groupSize = 5) + { + var rng = new Random(seed); + var max = bias + 4.5 * weight1 + 4.5 * weight2 + 0.5; + for (int i = 0; i < nExamples; i++) { - var rng = new Random(seed); - var max = bias + 4.5*weight1 + 4.5*weight2 + 0.5; - for (int i = 0; i < nExamples; i++) + var data = new Data { - var data = new Data - { - GroupId = i / groupSize, - Feature1 = (float)(rng.Next(10) * (rng.NextDouble() - 0.5)), - Feature2 = (float)(rng.Next(10) * (rng.NextDouble() - 0.5)), - }; - - // Create a noisy label. - var value = (float)(bias + weight1 * data.Feature1 + weight2 * data.Feature2 + rng.NextDouble() - 0.5); - if (value < max / 3) - data.Label = 0; - else if (value < 2 * max / 3) - data.Label = 1; - else - data.Label = 2; - yield return data; - } + GroupId = i / groupSize, + Feature1 = (float)(rng.Next(10) * (rng.NextDouble() - 0.5)), + Feature2 = (float)(rng.Next(10) * (rng.NextDouble() - 0.5)), + }; + + // Create a noisy label. + var value = (float)(bias + weight1 * data.Feature1 + weight2 * data.Feature2 + rng.NextDouble() - 0.5); + if (value < max / 3) + data.Label = 0; + else if (value < 2 * max / 3) + data.Label = 1; + else + data.Label = 2; + yield return data; } } } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/PermutationFeatureImportance.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/PermutationFeatureImportance.cs index 599bed55a7..90cf94db2a 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/PermutationFeatureImportance.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Regression/PermutationFeatureImportance.cs @@ -14,16 +14,17 @@ public static void Example() var mlContext = new MLContext(seed:1); // Create sample data. - var samples = Data.GenerateData(); + var samples = GenerateData(); // Load the sample data as an IDataView. var data = mlContext.Data.LoadFromEnumerable(samples); // Define a training pipeline that concatenates features into a vector, normalizes them, and then // trains a linear model. - var pipeline = mlContext.Transforms.Concatenate("Features", Data.FeatureColumns) - .Append(mlContext.Transforms.NormalizeMinMax("Features")) - .Append(mlContext.Regression.Trainers.Ols()); + var featureColumns = new string[] { nameof(Data.Feature1), nameof(Data.Feature2) }; + var pipeline = mlContext.Transforms.Concatenate("Features", featureColumns) + .Append(mlContext.Transforms.NormalizeMinMax("Features")) + .Append(mlContext.Regression.Trainers.Ols()); // Fit the pipeline to the data. var model = pipeline.Fit(data); @@ -49,7 +50,7 @@ public static void Example() foreach (int i in sortedIndices) { Console.WriteLine("{0}\t{1:0.00}\t{2:G4}\t{3:G4}", - Data.FeatureColumns[i], + featureColumns[i], linearPredictor.Model.Weights[i], rmse[i].Mean, 1.96 * rmse[i].StandardError); @@ -68,25 +69,33 @@ private class Data public float Feature1 { get; set; } public float Feature2 { get; set; } + } - public static readonly string[] FeatureColumns = new string[] { nameof(Feature1), nameof(Feature2) }; - - public static IEnumerable GenerateData(int nExamples = 10000, - double bias = 0, double weight1 = 1, double weight2 = 2, int seed = 1) + /// + /// Generate an enumerable of Data objects, creating the label as a simple + /// linear combination of the features. 
+ /// + /// The number of examples. + /// The bias, or offset, in the calculation of the label. + /// The weight to multiply the first feature with to compute the label. + /// The weight to multiply the second feature with to compute the label. + /// The seed for generating feature values and label noise. + /// An enumerable of Data objects. + private static IEnumerable GenerateData(int nExamples = 10000, + double bias = 0, double weight1 = 1, double weight2 = 2, int seed = 1) + { + var rng = new Random(seed); + for (int i = 0; i < nExamples; i++) { - var rng = new Random(seed); - for (int i = 0; i < nExamples; i++) + var data = new Data { - var data = new Data - { - Feature1 = (float)(rng.Next(10) * (rng.NextDouble() - 0.5)), - Feature2 = (float)(rng.Next(10) * (rng.NextDouble() - 0.5)), - }; - - // Create a noisy label. - data.Label = (float)(bias + weight1 * data.Feature1 + weight2 * data.Feature2 + rng.NextDouble() - 0.5); - yield return data; - } + Feature1 = (float)(rng.Next(10) * (rng.NextDouble() - 0.5)), + Feature2 = (float)(rng.Next(10) * (rng.NextDouble() - 0.5)), + }; + + // Create a noisy label. + data.Label = (float)(bias + weight1 * data.Feature1 + weight2 * data.Feature2 + rng.NextDouble() - 0.5); + yield return data; } } }
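
For reference, a minimal driver for exercising the four samples added by this patch series might look like the sketch below. The RunPfiSamples wrapper and its Main entry point are illustrative assumptions for local experimentation, not part of the change; the sample project normally invokes each Example() through its own runner.

using System;

namespace Samples.Dynamic.Trainers
{
    // Hypothetical harness: calls each PFI sample's entry point in turn.
    public static class RunPfiSamples
    {
        public static void Main()
        {
            BinaryClassification.PermutationFeatureImportance.Example();
            MulticlassClassification.PermutationFeatureImportance.Example();
            Ranking.PermutationFeatureImportance.Example();
            Regression.PermutationFeatureImportance.Example();

            Console.WriteLine("All permutation feature importance samples completed.");
        }
    }
}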