From ee252181d96fa6bf64f5190eb875803df33141c7 Mon Sep 17 00:00:00 2001 From: Rogan Carr Date: Mon, 4 Mar 2019 10:16:30 -0800 Subject: [PATCH 1/9] work in progress --- .../IntrospectiveTraining.cs | 116 ++++++++++++++++++ test/Microsoft.ML.TestFramework/Datasets.cs | 2 + 2 files changed, 118 insertions(+) create mode 100644 test/Microsoft.ML.Functional.Tests/IntrospectiveTraining.cs diff --git a/test/Microsoft.ML.Functional.Tests/IntrospectiveTraining.cs b/test/Microsoft.ML.Functional.Tests/IntrospectiveTraining.cs new file mode 100644 index 0000000000..e9233ea298 --- /dev/null +++ b/test/Microsoft.ML.Functional.Tests/IntrospectiveTraining.cs @@ -0,0 +1,116 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using Microsoft.ML.Data; +using Microsoft.ML.Functional.Tests.Datasets; +using Microsoft.ML.RunTests; +using Microsoft.ML.TestFramework; +using Microsoft.ML.Transforms; +using Xunit; +using Xunit.Abstractions; + +namespace Microsoft.ML.Functional.Tests +{ + public class IntrospectiveTraining : BaseTestClass + { + public IntrospectiveTraining(ITestOutputHelper output): base(output) + { + } + + /// + /// Introspective Training: Map hashed values back to the original value. + /// + [Fact] + public void InspectSlotNamesForReversibleHash() + { + var mlContext = new MLContext(seed: 1, conc: 1); + + // Load the Adult dataset. + var data = mlContext.Data.LoadFromTextFile(GetDataPath(TestDatasets.adult.trainFilename), + hasHeader: TestDatasets.adult.fileHasHeader, + separatorChar: TestDatasets.adult.fileSeparator); + + // Create the learning pipeline. + var pipeline = mlContext.Transforms.Concatenate("NumericalFeatures", Adult.NumericalFeatures) + .Append(mlContext.Transforms.Concatenate("CategoricalFeatures", Adult.CategoricalFeatures)) + .Append(mlContext.Transforms.Categorical.OneHotHashEncoding("CategoricalFeatures", hashBits: 8, // get collisions! + invertHash: -1, outputKind: OneHotEncodingTransformer.OutputKind.Bag)); + + // Train the model. + var model = pipeline.Fit(data); + + // Transform the data. + var transformedData = model.Transform(data); + + // Verify that the slotnames cane be used to backtrack by confirming that + // all unique values in the input data are in the output data slot names. + // First get a list of the unique values. + VBuffer> categoricalSlotNames = new VBuffer>(); + transformedData.Schema["CategoricalFeatures"].GetSlotNames(ref categoricalSlotNames); + var uniqueValues = new HashSet(); + foreach (var slotName in categoricalSlotNames.GetValues()) + { + var slotNameString = slotName.ToString(); + if (slotNameString.StartsWith('{')) + { + // Values look like this: {3:Exec-managerial,2:Widowed}. + slotNameString = slotNameString.Substring(1, slotNameString.Length - 2); + foreach (var name in slotNameString.Split(',')) + uniqueValues.Add(name); + } + else + uniqueValues.Add(slotNameString); + } + + // Now validate that all values in the dataset are there + var transformedRows = mlContext.Data.CreateEnumerable(data, false); + foreach (var row in transformedRows) + { + for (int i = 0; i < Adult.CategoricalFeatures.Length; i++) + { + // Fetch the categorical value. + string value = (string) row.GetType().GetProperty(Adult.CategoricalFeatures[i]).GetValue(row, null); + Assert.Contains($"{i}:{value}", uniqueValues); + } + } + + float x = (float)double.MinValue; + Output.WriteLine($"{x}"); + } + + //private void BooYa() + //{ + // // Create the learning pipeline + // var nestedPipeline = mlContext.Transforms.Concatenate("NumericalFeatures", Adult.NumericalFeatures) + // .Append(mlContext.Transforms.Concatenate("CategoricalFeatures", Adult.CategoricalFeatures)) + // .Append(mlContext.Transforms.Categorical.OneHotHashEncoding("CategoricalFeatures", + // invertHash: 2, outputKind: OneHotEncodingTransformer.OutputKind.Bag) + // .Append(mlContext.Transforms.Concatenate("Features", "NumericalFeatures", "CategoricalFeatures")) + // .Append(mlContext.BinaryClassification.Trainers.LogisticRegression())); + + // // Train the model. + // var nestedModel = nestedPipeline.Fit(data); + // var nestedPredictor = nestedModel.LastTransformer.LastTransformer; + // var nestedTransformedData = nestedModel.Transform(data); + + // Assert.Equal(predictor.Model.SubModel.Bias, nestedPredictor.Model.SubModel.Bias); + // int nFeatures = predictor.Model.SubModel.Weights.Count; + // for (int i = 0; i(transformedData, false).ToArray(); + // var nestedTransformedRows = mlContext.Data.CreateEnumerable(nestedTransformedData, false).ToArray(); + // for (int i = 0; i Date: Tue, 5 Mar 2019 12:52:08 -0800 Subject: [PATCH 2/9] Adding introspective training scenario tests. --- test/Microsoft.ML.Functional.Tests/Common.cs | 15 + .../Evaluation.cs | 16 +- .../IntrospectiveTraining.cs | 387 ++++++++++++++++-- .../Validation.cs | 28 +- test/Microsoft.ML.TestFramework/Datasets.cs | 2 + .../Api/Estimators/IntrospectiveTraining.cs | 153 ------- 6 files changed, 387 insertions(+), 214 deletions(-) delete mode 100644 test/Microsoft.ML.Tests/Scenarios/Api/Estimators/IntrospectiveTraining.cs diff --git a/test/Microsoft.ML.Functional.Tests/Common.cs b/test/Microsoft.ML.Functional.Tests/Common.cs index b7971d881a..40eef88951 100644 --- a/test/Microsoft.ML.Functional.Tests/Common.cs +++ b/test/Microsoft.ML.Functional.Tests/Common.cs @@ -267,5 +267,20 @@ public static void AssertMetricsStatistics(RegressionMetricsStatistics metrics) AssertMetricStatistics(metrics.RSquared); AssertMetricStatistics(metrics.LossFn); } + + /// + /// Verify that a numerical array has no NaNs or infinities. + /// + /// An array of doubles. + public static void AssertFiniteNumbers(double[] array, int ignoreElementAt = -1) + { + for (int i = 0; i < array.Length; i++) + { + if (i == ignoreElementAt) + continue; + Assert.False(double.IsNaN(array[i])); + Assert.True(double.IsFinite(array[i])); + } + } } } diff --git a/test/Microsoft.ML.Functional.Tests/Evaluation.cs b/test/Microsoft.ML.Functional.Tests/Evaluation.cs index 404e17a44d..e3e280f1f2 100644 --- a/test/Microsoft.ML.Functional.Tests/Evaluation.cs +++ b/test/Microsoft.ML.Functional.Tests/Evaluation.cs @@ -237,17 +237,11 @@ public void TrainAndEvaluateRegression() { var mlContext = new MLContext(seed: 1, conc: 1); - // Get the dataset. - var data = mlContext.Data.CreateTextLoader(TestDatasets.housing.GetLoaderColumns(), - hasHeader: TestDatasets.housing.fileHasHeader, separatorChar: TestDatasets.housing.fileSeparator) - .Load(GetDataPath(TestDatasets.housing.trainFilename)); - - // Create a pipeline to train on the sentiment data. - var pipeline = mlContext.Transforms.Concatenate("Features", new string[] { - "CrimesPerCapita", "PercentResidental", "PercentNonRetail", "CharlesRiver", "NitricOxides", "RoomsPerDwelling", - "PercentPre40s", "EmploymentDistance", "HighwayDistance", "TaxRate", "TeacherRatio"}) - .Append(mlContext.Transforms.CopyColumns("Label", "MedianHomeValue")) - .Append(mlContext.Regression.Trainers.FastTree(new FastTreeRegressionTrainer.Options { NumberOfThreads = 1 })); + // Get the dataset + var data = mlContext.Data.LoadFromTextFile(GetDataPath(TestDatasets.housing.trainFilename), hasHeader: true); + // Create a pipeline to train on the housing data. + var pipeline = mlContext.Transforms.Concatenate("Features", HousingRegression.Features) + .Append(mlContext.Regression.Trainers.FastForest(new FastForestRegression.Options { NumberOfThreads = 1 })); // Train the model. var model = pipeline.Fit(data); diff --git a/test/Microsoft.ML.Functional.Tests/IntrospectiveTraining.cs b/test/Microsoft.ML.Functional.Tests/IntrospectiveTraining.cs index e9233ea298..6043da927d 100644 --- a/test/Microsoft.ML.Functional.Tests/IntrospectiveTraining.cs +++ b/test/Microsoft.ML.Functional.Tests/IntrospectiveTraining.cs @@ -4,10 +4,13 @@ using System; using System.Collections.Generic; +using Microsoft.ML.Calibrators; using Microsoft.ML.Data; using Microsoft.ML.Functional.Tests.Datasets; using Microsoft.ML.RunTests; using Microsoft.ML.TestFramework; +using Microsoft.ML.Trainers; +using Microsoft.ML.Trainers.FastTree; using Microsoft.ML.Transforms; using Xunit; using Xunit.Abstractions; @@ -21,7 +24,296 @@ public IntrospectiveTraining(ITestOutputHelper output): base(output) } /// - /// Introspective Training: Map hashed values back to the original value. + /// Introspective Training: Tree ensembles learned from FastForest can be inspected. + /// + [Fact] + public void InspectFastForestRegresionTrees() + { + var mlContext = new MLContext(seed: 1, conc: 1); + + // Get the dataset. + var data = mlContext.Data.LoadFromTextFile(GetDataPath(TestDatasets.housing.trainFilename), hasHeader: true); + + // Create a pipeline to train on the housing data. + var pipeline = mlContext.Transforms.Concatenate("Features", HousingRegression.Features) + .Append(mlContext.Regression.Trainers.FastForest(numLeaves: 5, numTrees: 3)); + + // Fit the pipeline. + var model = pipeline.Fit(data); + + // Extract the boosted tree model. + var fastForestModel = model.LastTransformer.Model; + + // Extract the learned Random Forests model. + var treeCollection = fastForestModel.TrainedTreeEnsemble; + + // Inspect properties in the extracted model. + Assert.Equal(3, treeCollection.Trees.Count); + Assert.Equal(3, treeCollection.TreeWeights.Count); + Assert.All(treeCollection.TreeWeights, weight => Assert.Equal(1.0, weight)); + Assert.All(treeCollection.Trees, tree => + { + Assert.Equal(5, tree.NumberOfLeaves); + Assert.Equal(4, tree.NumberOfNodes); + Assert.Equal(tree.SplitGains.Count, tree.NumberOfNodes); + Assert.Equal(tree.NumericalSplitThresholds.Count, tree.NumberOfNodes); + Assert.All(tree.CategoricalSplitFlags, flag => Assert.False(flag)); + Assert.Equal(0, tree.GetCategoricalSplitFeaturesAt(0).Count); + Assert.Equal(0, tree.GetCategoricalCategoricalSplitFeatureRangeAt(0).Count); + }); + } + + /// + /// Introspective Training: Tree ensembles learned from FastTree can be inspected. + /// + [Fact] + public void InspectFastTreeModelParameters() + { + var mlContext = new MLContext(seed: 1, conc: 1); + + var data = mlContext.Data.LoadFromTextFile(GetDataPath(TestDatasets.Sentiment.trainFilename), + hasHeader: TestDatasets.Sentiment.fileHasHeader, + separatorChar: TestDatasets.Sentiment.fileSeparator, + allowQuoting: TestDatasets.Sentiment.allowQuoting); + + // Create a training pipeline. + var pipeline = mlContext.Transforms.Text.FeaturizeText("Features", "SentimentText") + .AppendCacheCheckpoint(mlContext) + .Append(mlContext.BinaryClassification.Trainers.FastTree(numberOfLeaves: 5, numberOfTrees: 3)); + + // Fit the pipeline. + var model = pipeline.Fit(data); + + // Extract the boosted tree model. + var fastTreeModel = model.LastTransformer.Model.SubModel; + + // Extract the learned GBDT model. + var treeCollection = fastTreeModel.TrainedTreeEnsemble; + + // Make sure the tree models were formed as expected. + Assert.Equal(3, treeCollection.Trees.Count); + Assert.Equal(3, treeCollection.TreeWeights.Count); + Assert.All(treeCollection.TreeWeights, weight => Assert.Equal(1.0, weight)); + Assert.All(treeCollection.Trees, tree => + { + Assert.Equal(5, tree.NumberOfLeaves); + Assert.Equal(4, tree.NumberOfNodes); + Assert.Equal(tree.SplitGains.Count, tree.NumberOfNodes); + Assert.Equal(tree.NumericalSplitThresholds.Count, tree.NumberOfNodes); + Assert.All(tree.CategoricalSplitFlags, flag => Assert.False(flag)); + Assert.Equal(0, tree.GetCategoricalSplitFeaturesAt(0).Count); + Assert.Equal(0, tree.GetCategoricalCategoricalSplitFeatureRangeAt(0).Count); + }); + + // Add baselines for the model. + // Verify that there is no bias. + Assert.Equal(0, treeCollection.Bias); + // Check the parameters of the final tree. + var finalTree = treeCollection.Trees[2]; + Assert.Equal(finalTree.LeftChild, new int[] { 2, -2, -1, -3 }); + Assert.Equal(finalTree.RightChild, new int[] { 1, 3, -4, -5 }); + Assert.Equal(finalTree.NumericalSplitFeatureIndexes, new int[] { 14, 294, 633, 266 }); + var expectedSplitGains = new double[] { 0.52634223978445616, 0.45899249367725858, 0.44142707650267105, 0.38348634823264854 }; + var expectedThresholds = new float[] { 0.0911167f, 0.06509889f, 0.019873254f, 0.0361835f }; + for (int i = 0; i < finalTree.NumberOfNodes; ++i) + { + Assert.Equal(expectedSplitGains[i], finalTree.SplitGains[i], 6); + Assert.Equal(expectedThresholds[i], finalTree.NumericalSplitThresholds[i], 6); + } + } + + /// + /// Introspective Training: GAM Shape Functions are easily accessed. + /// + [Fact] + void IntrospectGamShapeFunctions() + { + // Concurrency must be 1 to assure that the mapping is done sequentially. + var mlContext = new MLContext(seed: 1, conc: 1); + + // Load the Iris dataset. + var data = mlContext.Data.LoadFromTextFile( + GetDataPath(TestDatasets.iris.trainFilename), + hasHeader: TestDatasets.iris.fileHasHeader, + separatorChar: TestDatasets.iris.fileSeparator); + + // Compose the transformation. + var pipeline = mlContext.Transforms.Concatenate("Features", Iris.Features) + .Append(mlContext.Regression.Trainers.GeneralizedAdditiveModels( + new RegressionGamTrainer.Options { NumIterations = 100 })); + + // Fit the pipeline. + var model = pipeline.Fit(data); + + // Extract the normalizer from the trained pipeline. + var gamModel = model.LastTransformer.Model; + + // Take look at the shape functions. + var shapeFunctionsBins = gamModel.GetBinUpperBounds(); + var shapeFunctionsValues = gamModel.GetBinEffects(); + + // Validate that the shape functions lengths match. + Assert.Equal(shapeFunctionsBins.Length, shapeFunctionsValues.Length); + for (int i = 0; i < shapeFunctionsBins.Length; i++) + { + Assert.Equal(shapeFunctionsBins[i].Length, shapeFunctionsValues[i].Length); + Common.AssertFiniteNumbers(shapeFunctionsBins[i], shapeFunctionsBins[i].Length - 1); + Common.AssertFiniteNumbers(shapeFunctionsValues[i]); + } + } + + /// + /// Introspective Training: LDA models can be easily inspected. + /// + [Fact] + public void InspectLdaModelParameters() + { + // Test Parameters + int numTopics = 10; + + var mlContext = new MLContext(seed: 1, conc: 1); + + // Load the dataset. + var data = mlContext.Data.LoadFromTextFile(GetDataPath(TestDatasets.Sentiment.trainFilename), + hasHeader: TestDatasets.Sentiment.fileHasHeader, + separatorChar: TestDatasets.Sentiment.fileSeparator, + allowQuoting: TestDatasets.Sentiment.allowQuoting); + + // Define the pipeline. + var pipeline = mlContext.Transforms.Text.ProduceWordBags("SentimentBag", "SentimentText") + .Append(mlContext.Transforms.Text.LatentDirichletAllocation("Features", "SentimentBag", numTopic: numTopics, numIterations: 10)); + + // Fit the pipeline. + var model = pipeline.Fit(data); + + // Get the trained LDA model. + // TODO #2197: Get the topics and summaries from the model. + var ldaTransform = model.LastTransformer; + + // Transform the data. + var transformedData = model.Transform(data); + + // Make sure the model weights array is the same length as the features array. + var numFeatures = (transformedData.Schema["Features"].Type as VectorType).Size; + Assert.Equal(numFeatures, numTopics); + } + + /// + /// Introspective Training: Linear model parameters may be inspected. + /// + [Fact] + public void InpsectLinearModelParameters() + { + var mlContext = new MLContext(seed: 1, conc: 1); + + var data = mlContext.Data.LoadFromTextFile(GetDataPath(TestDatasets.Sentiment.trainFilename), + hasHeader: TestDatasets.Sentiment.fileHasHeader, + separatorChar: TestDatasets.Sentiment.fileSeparator, + allowQuoting: TestDatasets.Sentiment.allowQuoting); + + // Create a training pipeline. + var pipeline = mlContext.Transforms.Text.FeaturizeText("Features", "SentimentText") + .AppendCacheCheckpoint(mlContext) + .Append(mlContext.BinaryClassification.Trainers.StochasticDualCoordinateAscentNonCalibrated( + new SdcaNonCalibratedBinaryTrainer.Options { NumThreads = 1 })); + + // Fit the pipeline. + var model = pipeline.Fit(data); + + // Transform the data. + var transformedData = model.Transform(data); + + // Extract the linear model from the pipeline. + var linearModel = model.LastTransformer.Model; + + // Get the model bias and weights. + var bias = linearModel.Bias; + var weights = linearModel.Weights; + + // Make sure the model weights array is the same length as the features array. + var numFeatures = (transformedData.Schema["Features"].Type as VectorType).Size; + Assert.Equal(numFeatures, weights.Count); + } + + /// + /// Introspectable Training: Parameters of a trained Normalizer are easily accessed. + /// + [Fact] + void IntrospectNormalization() + { + // Concurrency must be 1 to assure that the mapping is done sequentially. + var mlContext = new MLContext(seed: 1, conc: 1); + + // Load the Iris dataset. + var data = mlContext.Data.LoadFromTextFile( + GetDataPath(TestDatasets.iris.trainFilename), + hasHeader: TestDatasets.iris.fileHasHeader, + separatorChar: TestDatasets.iris.fileSeparator); + + // Compose the transformation. + var pipeline = mlContext.Transforms.Concatenate("Features", Iris.Features) + .Append(mlContext.Transforms.Normalize("Features", mode: NormalizingEstimator.NormalizerMode.MinMax)); + + // Fit the pipeline. + var model = pipeline.Fit(data); + + // Extract the normalizer from the trained pipeline. + // TODO #2854: Extract the normalizer parameters. + var normalizer = model.LastTransformer; + } + + /// + /// I can take an existing model file and inspect what transformers were included in the pipeline. + /// + [Fact] + public void InspectPipelineContents() + { + var mlContext = new MLContext(seed: 1, conc: 1); + + // Get the dataset. + var data = mlContext.Data.LoadFromTextFile(GetDataPath(TestDatasets.housing.trainFilename), hasHeader: true); + + // Create a pipeline to train on the housing data. + var pipeline = mlContext.Transforms.Concatenate("Features", HousingRegression.Features) + .Append(mlContext.Regression.Trainers.FastForest(numLeaves: 5, numTrees: 3)); + + // Fit the pipeline. + var model = pipeline.Fit(data); + + // Inspect the transforms in the trained pipeline. + var expectedTypes = new Type[] {typeof(ColumnConcatenatingTransformer), + typeof(RegressionPredictionTransformer)}; + var expectedColumns = new string[][] { + new string[] { "Features" }, + new string[] { "Score" }, + }; + int i = 0; + var currentSchema = data.Schema; + foreach (var transformer in model) + { + // It is possible to get the type at runtime. + Assert.IsType(expectedTypes[i], transformer); + + // It's also possible to inspect the schema output from the transform. + currentSchema = transformer.GetOutputSchema(currentSchema); + foreach (var expectedColumn in expectedColumns[i]) + { + var column = currentSchema.GetColumnOrNull(expectedColumn); + Assert.NotNull(column); + } + // And we can see that future columns do not yet exist. + if (i < expectedColumns.Length - 1) + foreach (var expectedColumn in expectedColumns[i+1]) + { + var column = currentSchema.GetColumnOrNull(expectedColumn); + Assert.Null(column); + } + i++; + } + } + + /// + /// Introspective Training: Hashed values can be mapped back to the original column and value. /// [Fact] public void InspectSlotNamesForReversibleHash() @@ -39,7 +331,7 @@ public void InspectSlotNamesForReversibleHash() .Append(mlContext.Transforms.Categorical.OneHotHashEncoding("CategoricalFeatures", hashBits: 8, // get collisions! invertHash: -1, outputKind: OneHotEncodingTransformer.OutputKind.Bag)); - // Train the model. + // Fit the pipeline. var model = pipeline.Fit(data); // Transform the data. @@ -65,7 +357,7 @@ public void InspectSlotNamesForReversibleHash() uniqueValues.Add(slotNameString); } - // Now validate that all values in the dataset are there + // Now validate that all values in the dataset are there. var transformedRows = mlContext.Data.CreateEnumerable(data, false); foreach (var row in transformedRows) { @@ -76,41 +368,64 @@ public void InspectSlotNamesForReversibleHash() Assert.Contains($"{i}:{value}", uniqueValues); } } + } + + [Fact] + public void InspectNestedPipeline() + { + var mlContext = new MLContext(seed: 1, conc: 1); + + var data = mlContext.Data.LoadFromTextFile(GetDataPath(TestDatasets.iris.trainFilename), + hasHeader: TestDatasets.iris.fileHasHeader, + separatorChar: TestDatasets.iris.fileSeparator); + + // Create a training pipeline. + var pipeline = mlContext.Transforms.Concatenate("Features", Iris.Features) + .Append(StepOne(mlContext)) + .Append(StepTwo(mlContext)); + + // Fit the pipeline. + var model = pipeline.Fit(data); - float x = (float)double.MinValue; - Output.WriteLine($"{x}"); + // Extract the trained models. + var modelEnumerator = model.GetEnumerator(); + modelEnumerator.MoveNext(); // The Concat Transform + modelEnumerator.MoveNext(); + var kMeansModel = (modelEnumerator.Current as TransformerChain>).LastTransformer; + modelEnumerator.MoveNext(); + var mcLrModel = (modelEnumerator.Current as TransformerChain>).LastTransformer; + + // Validate the k-means model. + VBuffer[] centroids = default; + kMeansModel.Model.GetClusterCentroids(ref centroids, out int nCentroids); + Assert.Equal(4, centroids.Length); + + // Validate the MulticlassLogisticRegressionModel. + VBuffer[] weights = default; + mcLrModel.Model.GetWeights(ref weights, out int classes); + Assert.Equal(3, weights.Length); } - //private void BooYa() - //{ - // // Create the learning pipeline - // var nestedPipeline = mlContext.Transforms.Concatenate("NumericalFeatures", Adult.NumericalFeatures) - // .Append(mlContext.Transforms.Concatenate("CategoricalFeatures", Adult.CategoricalFeatures)) - // .Append(mlContext.Transforms.Categorical.OneHotHashEncoding("CategoricalFeatures", - // invertHash: 2, outputKind: OneHotEncodingTransformer.OutputKind.Bag) - // .Append(mlContext.Transforms.Concatenate("Features", "NumericalFeatures", "CategoricalFeatures")) - // .Append(mlContext.BinaryClassification.Trainers.LogisticRegression())); - - // // Train the model. - // var nestedModel = nestedPipeline.Fit(data); - // var nestedPredictor = nestedModel.LastTransformer.LastTransformer; - // var nestedTransformedData = nestedModel.Transform(data); - - // Assert.Equal(predictor.Model.SubModel.Bias, nestedPredictor.Model.SubModel.Bias); - // int nFeatures = predictor.Model.SubModel.Weights.Count; - // for (int i = 0; i(transformedData, false).ToArray(); - // var nestedTransformedRows = mlContext.Data.CreateEnumerable(nestedTransformedData, false).ToArray(); - // for (int i = 0; i>> StepOne(MLContext mlContext) + { + return mlContext.Transforms.Concatenate("LabelAndFeatures", "Label", "Features") + .Append(mlContext.Clustering.Trainers.KMeans( + new KMeansPlusPlusTrainer.Options + { + InitAlgorithm = KMeansPlusPlusTrainer.InitAlgorithm.Random, + ClustersCount = 4, + MaxIterations = 10, + NumThreads = 1 + })); + } + + private IEstimator>> StepTwo(MLContext mlContext) + { + return mlContext.Transforms.Conversion.MapValueToKey("Label") + .Append(mlContext.MulticlassClassification.Trainers.StochasticDualCoordinateAscent( + new SdcaMultiClassTrainer.Options { + MaxIterations = 10, + NumThreads = 1 })); + } } } \ No newline at end of file diff --git a/test/Microsoft.ML.Functional.Tests/Validation.cs b/test/Microsoft.ML.Functional.Tests/Validation.cs index cb0cacfc7e..0328f435ad 100644 --- a/test/Microsoft.ML.Functional.Tests/Validation.cs +++ b/test/Microsoft.ML.Functional.Tests/Validation.cs @@ -9,11 +9,17 @@ using Microsoft.ML.Trainers.FastTree; using Microsoft.ML.Trainers; using Xunit; +using Microsoft.ML.Functional.Tests.Datasets; +using Xunit.Abstractions; namespace Microsoft.ML.Functional.Tests { - public class ValidationScenarios + public class Validation : BaseTestClass { + public Validation(ITestOutputHelper output) : base(output) + { + } + /// /// Cross-validation: Have a mechanism to do cross validation, that is, you come up with /// a data source (optionally with stratification column), come up with an instantiable transform @@ -26,16 +32,11 @@ void CrossValidation() { var mlContext = new MLContext(seed: 1, conc: 1); - // Get the dataset. - var data = mlContext.Data.CreateTextLoader(TestDatasets.housing.GetLoaderColumns(), - hasHeader: TestDatasets.housing.fileHasHeader, separatorChar: TestDatasets.housing.fileSeparator) - .Load(BaseTestClass.GetDataPath(TestDatasets.housing.trainFilename)); + // Get the dataset + var data = mlContext.Data.LoadFromTextFile(GetDataPath(TestDatasets.housing.trainFilename), hasHeader: true); - // Create a pipeline to train on the sentiment data. - var pipeline = mlContext.Transforms.Concatenate("Features", new string[] { - "CrimesPerCapita", "PercentResidental", "PercentNonRetail", "CharlesRiver", "NitricOxides", "RoomsPerDwelling", - "PercentPre40s", "EmploymentDistance", "HighwayDistance", "TaxRate", "TeacherRatio"}) - .Append(mlContext.Transforms.CopyColumns("Label", "MedianHomeValue")) + // Create a pipeline to train on the housing data. + var pipeline = mlContext.Transforms.Concatenate("Features", HousingRegression.Features) .Append(mlContext.Regression.Trainers.OrdinaryLeastSquares()); // Compute the CV result. @@ -64,15 +65,14 @@ public void TrainWithValidationSet() var data = mlContext.Data.CreateTextLoader(TestDatasets.housing.GetLoaderColumns(), hasHeader: TestDatasets.housing.fileHasHeader, separatorChar: TestDatasets.housing.fileSeparator) .Load(BaseTestClass.GetDataPath(TestDatasets.housing.trainFilename)); + + // Create the train and validation set. var dataSplit = mlContext.Regression.TrainTestSplit(data, testFraction: 0.2); var trainData = dataSplit.TrainSet; var validData = dataSplit.TestSet; // Create a pipeline to featurize the dataset. - var pipeline = mlContext.Transforms.Concatenate("Features", new string[] { - "CrimesPerCapita", "PercentResidental", "PercentNonRetail", "CharlesRiver", "NitricOxides", "RoomsPerDwelling", - "PercentPre40s", "EmploymentDistance", "HighwayDistance", "TaxRate", "TeacherRatio"}) - .Append(mlContext.Transforms.CopyColumns("Label", "MedianHomeValue")) + var pipeline = mlContext.Transforms.Concatenate("Features", HousingRegression.Features) .AppendCacheCheckpoint(mlContext) as IEstimator; // Preprocess the datasets. diff --git a/test/Microsoft.ML.TestFramework/Datasets.cs b/test/Microsoft.ML.TestFramework/Datasets.cs index 58f901bb7c..23cb24dc62 100644 --- a/test/Microsoft.ML.TestFramework/Datasets.cs +++ b/test/Microsoft.ML.TestFramework/Datasets.cs @@ -16,6 +16,7 @@ public class TestDataset public string labelFilename; public char fileSeparator; public bool fileHasHeader; + public bool allowQuoting; // REVIEW: Replace these with appropriate SubComponents! public string settings; @@ -212,6 +213,7 @@ public static class TestDatasets testFilename = "wikipedia-detox-250-line-test.tsv", fileHasHeader = true, fileSeparator = '\t', + allowQuoting = true, GetLoaderColumns = () => { return new[] diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/IntrospectiveTraining.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/IntrospectiveTraining.cs deleted file mode 100644 index 040efac74b..0000000000 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/IntrospectiveTraining.cs +++ /dev/null @@ -1,153 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using Microsoft.ML.Calibrators; -using Microsoft.ML.Data; -using Microsoft.ML.RunTests; -using Microsoft.ML.Trainers; -using Microsoft.ML.Trainers.FastTree; -using Xunit; - -namespace Microsoft.ML.Tests.Scenarios.Api -{ - - public partial class ApiScenariosTests - { - /// - /// Introspective training: Models that produce outputs and are otherwise black boxes are of limited use; - /// it is also necessary often to understand at least to some degree what was learnt. To outline critical - /// scenarios that have come up multiple times: - /// *) When I train a linear model, I should be able to inspect coefficients. - /// *) The tree ensemble learners, I should be able to inspect the trees. - /// *) The LDA transform, I should be able to inspect the topics. - /// I view it as essential from a usability perspective that this be discoverable to someone without - /// having to read documentation. For example, if I have var lda = new LdaTransform().Fit(data)(I don't insist on that - /// exact signature, just giving the idea), then if I were to type lda. - /// In Visual Studio, one of the auto-complete targets should be something like GetTopics. - /// - - [Fact] - public void IntrospectiveTraining() - { - var ml = new MLContext(seed: 1, conc: 1); - var data = ml.Data.LoadFromTextFile(GetDataPath(TestDatasets.Sentiment.trainFilename), hasHeader: true, allowQuoting: true); - - var pipeline = ml.Transforms.Text.FeaturizeText("Features", "SentimentText") - .AppendCacheCheckpoint(ml) - .Append(ml.BinaryClassification.Trainers.StochasticDualCoordinateAscentNonCalibrated( - new SdcaNonCalibratedBinaryTrainer.Options { NumberOfThreads = 1 })); - - // Train. - var model = pipeline.Fit(data); - - // Get feature weights. - var weights = model.LastTransformer.Model.Weights; - } - - [Fact] - public void FastTreeClassificationIntrospectiveTraining() - { - var ml = new MLContext(seed: 1, conc: 1); - var data = ml.Data.LoadFromTextFile(GetDataPath(TestDatasets.Sentiment.trainFilename), hasHeader: true, allowQuoting: true); - - var trainer = ml.BinaryClassification.Trainers.FastTree(numberOfLeaves: 5, numberOfTrees: 3); - - BinaryPredictionTransformer> pred = null; - - var pipeline = ml.Transforms.Text.FeaturizeText("Features", "SentimentText") - .AppendCacheCheckpoint(ml) - .Append(trainer.WithOnFitDelegate(p => pred = p)); - - // Train. - var model = pipeline.Fit(data); - - // Extract the learned GBDT model. - var treeCollection = pred.Model.SubModel.TrainedTreeEnsemble; - - // Inspect properties in the extracted model. - Assert.Equal(3, treeCollection.Trees.Count); - Assert.Equal(3, treeCollection.TreeWeights.Count); - Assert.Equal(0, treeCollection.Bias); - Assert.All(treeCollection.TreeWeights, weight => Assert.Equal(1.0, weight)); - - // Inspect the last tree. - var tree = treeCollection.Trees[2]; - - Assert.Equal(5, tree.NumberOfLeaves); - Assert.Equal(4, tree.NumberOfNodes); - Assert.Equal(tree.LeftChild, new int[] { 2, -2, -1, -3 }); - Assert.Equal(tree.RightChild, new int[] { 1, 3, -4, -5 }); - Assert.Equal(tree.NumericalSplitFeatureIndexes, new int[] { 14, 294, 633, 266 }); - Assert.Equal(tree.SplitGains.Count, tree.NumberOfNodes); - Assert.Equal(tree.NumericalSplitThresholds.Count, tree.NumberOfNodes); - var expectedSplitGains = new double[] { 0.52634223978445616, 0.45899249367725858, 0.44142707650267105, 0.38348634823264854 }; - var expectedThresholds = new float[] { 0.0911167f, 0.06509889f, 0.019873254f, 0.0361835f }; - for (int i = 0; i < tree.NumberOfNodes; ++i) - { - Assert.Equal(expectedSplitGains[i], tree.SplitGains[i], 6); - Assert.Equal(expectedThresholds[i], tree.NumericalSplitThresholds[i], 6); - } - Assert.All(tree.CategoricalSplitFlags, flag => Assert.False(flag)); - - Assert.Equal(0, tree.GetCategoricalSplitFeaturesAt(0).Count); - Assert.Equal(0, tree.GetCategoricalCategoricalSplitFeatureRangeAt(0).Count); - } - - [Fact] - public void FastForestRegressionIntrospectiveTraining() - { - var ml = new MLContext(seed: 1, conc: 1); - var data = SamplesUtils.DatasetUtils.GenerateFloatLabelFloatFeatureVectorSamples(1000); - var dataView = ml.Data.LoadFromEnumerable(data); - - RegressionPredictionTransformer pred = null; - var trainer = ml.Regression.Trainers.FastForest(numLeaves: 5, numTrees: 3).WithOnFitDelegate(p => pred = p); - - // Train. - var model = trainer.Fit(dataView); - - // Extract the learned RF model. - var treeCollection = pred.Model.TrainedTreeEnsemble; - - // Inspect properties in the extracted model. - Assert.Equal(3, treeCollection.Trees.Count); - Assert.Equal(3, treeCollection.TreeWeights.Count); - Assert.Equal(0, treeCollection.Bias); - Assert.All(treeCollection.TreeWeights, weight => Assert.Equal(1.0, weight)); - - // Inspect the last tree. - var tree = treeCollection.Trees[2]; - - Assert.Equal(5, tree.NumberOfLeaves); - Assert.Equal(4, tree.NumberOfNodes); - Assert.Equal(tree.LeftChild, new int[] { -1, -2, -3, -4 }); - Assert.Equal(tree.RightChild, new int[] { 1, 2, 3, -5 }); - Assert.Equal(tree.NumericalSplitFeatureIndexes, new int[] { 9, 0, 1, 8 }); - Assert.Equal(tree.SplitGains.Count, tree.NumberOfNodes); - Assert.Equal(tree.NumericalSplitThresholds.Count, tree.NumberOfNodes); - var expectedSplitGains = new double[] { 21.279269008093962, 19.376698810984138, 17.830020749728774, 17.366801337893413 }; - var expectedThresholds = new float[] { 0.208134219f, 0.198336035f, 0.202952743f, 0.205061346f }; - for (int i = 0; i < tree.NumberOfNodes; ++i) - { - Assert.Equal(expectedSplitGains[i], tree.SplitGains[i], 6); - Assert.Equal(expectedThresholds[i], tree.NumericalSplitThresholds[i], 6); - } - Assert.All(tree.CategoricalSplitFlags, flag => Assert.False(flag)); - - Assert.Equal(0, tree.GetCategoricalSplitFeaturesAt(0).Count); - Assert.Equal(0, tree.GetCategoricalCategoricalSplitFeatureRangeAt(0).Count); - - var samples = new double[] { 0.97468354430379744, 1.0, 0.97727272727272729, 0.972972972972973, 0.26124197002141325 }; - for (int i = 0; i < tree.NumberOfLeaves; ++i) - { - var sample = tree.GetLeafSamplesAt(i); - Assert.Single(sample); - Assert.Equal(samples[i], sample[0], 6); - var weight = tree.GetLeafSampleWeightsAt(i); - Assert.Single(weight); - Assert.Equal(1, weight[0]); - } - } - } -} From c80f14f6f27792d1dd988363897e4213a7da6c37 Mon Sep 17 00:00:00 2001 From: Rogan Carr Date: Tue, 5 Mar 2019 12:52:28 -0800 Subject: [PATCH 3/9] Adding an adult dataset. --- .../Datasets/Adult.cs | 72 +++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 test/Microsoft.ML.Functional.Tests/Datasets/Adult.cs diff --git a/test/Microsoft.ML.Functional.Tests/Datasets/Adult.cs b/test/Microsoft.ML.Functional.Tests/Datasets/Adult.cs new file mode 100644 index 0000000000..94bdc99d15 --- /dev/null +++ b/test/Microsoft.ML.Functional.Tests/Datasets/Adult.cs @@ -0,0 +1,72 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + + +using System; +using Microsoft.Data.DataView; +using Microsoft.ML.Data; + +namespace Microsoft.ML.Functional.Tests.Datasets +{ + /// + /// A class for the Iris test dataset. + /// + internal sealed class Adult + { + [LoadColumn(0)] + public bool Label { get; set; } + + [LoadColumn(1)] + public string WorkClass { get; set; } + + [LoadColumn(2)] + public string Education { get; set; } + + [LoadColumn(3)] + public string MaritalStatus { get; set; } + + [LoadColumn(4)] + public string Occupation { get; set; } + + [LoadColumn(5)] + public string Relationship { get; set; } + + [LoadColumn(6)] + public string Ethnicity { get; set; } + + [LoadColumn(7)] + public string Sex { get; set; } + + [LoadColumn(8)] + public string NativeCountryRegion { get; set; } + + [LoadColumn(9)] + public float Age { get; set; } + + [LoadColumn(10)] + public float FinalWeight { get; set; } + + [LoadColumn(11)] + public float EducationNum { get; set; } + + [LoadColumn(12)] + public float CapitalGain { get; set; } + + [LoadColumn(13)] + public float CapitalLoss { get; set; } + + [LoadColumn(14)] + public float HoursPerWeek { get; set; } + + /// + /// The list of columns commonly used as numerical features. + /// + public static readonly string[] CategoricalFeatures = new string[] { "WorkClass", "Education", "MaritalStatus", "Occupation", "Relationship", "Ethnicity", "Sex", "NativeCountryRegion" }; + + /// + /// The list of columns commonly used as numerical features. + /// + public static readonly string[] NumericalFeatures = new string[] { "Age", "FinalWeight", "EducationNum", "CapitalGain", "CapitalLoss", "HoursPerWeek" }; + } +} From ec11f35ae9ab6e7dc79e13edb159e68fa8a4fc10 Mon Sep 17 00:00:00 2001 From: Rogan Carr Date: Tue, 5 Mar 2019 13:17:36 -0800 Subject: [PATCH 4/9] Fixing merge issues. --- .../IntrospectiveTraining.cs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/Microsoft.ML.Functional.Tests/IntrospectiveTraining.cs b/test/Microsoft.ML.Functional.Tests/IntrospectiveTraining.cs index 6043da927d..21af8bf17e 100644 --- a/test/Microsoft.ML.Functional.Tests/IntrospectiveTraining.cs +++ b/test/Microsoft.ML.Functional.Tests/IntrospectiveTraining.cs @@ -412,10 +412,10 @@ private IEstimator Date: Wed, 6 Mar 2019 11:12:20 -0800 Subject: [PATCH 5/9] Addressing PR comments. --- .../Datasets/Adult.cs | 7 ++--- .../IntrospectiveTraining.cs | 26 +++++++------------ .../Validation.cs | 4 +-- 3 files changed, 14 insertions(+), 23 deletions(-) diff --git a/test/Microsoft.ML.Functional.Tests/Datasets/Adult.cs b/test/Microsoft.ML.Functional.Tests/Datasets/Adult.cs index 94bdc99d15..440515ac06 100644 --- a/test/Microsoft.ML.Functional.Tests/Datasets/Adult.cs +++ b/test/Microsoft.ML.Functional.Tests/Datasets/Adult.cs @@ -2,15 +2,12 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. - -using System; -using Microsoft.Data.DataView; using Microsoft.ML.Data; namespace Microsoft.ML.Functional.Tests.Datasets { /// - /// A class for the Iris test dataset. + /// A class for the Adult test dataset. /// internal sealed class Adult { @@ -60,7 +57,7 @@ internal sealed class Adult public float HoursPerWeek { get; set; } /// - /// The list of columns commonly used as numerical features. + /// The list of columns commonly used as categorical features. /// public static readonly string[] CategoricalFeatures = new string[] { "WorkClass", "Education", "MaritalStatus", "Occupation", "Relationship", "Ethnicity", "Sex", "NativeCountryRegion" }; diff --git a/test/Microsoft.ML.Functional.Tests/IntrospectiveTraining.cs b/test/Microsoft.ML.Functional.Tests/IntrospectiveTraining.cs index 21af8bf17e..547cfde0d0 100644 --- a/test/Microsoft.ML.Functional.Tests/IntrospectiveTraining.cs +++ b/test/Microsoft.ML.Functional.Tests/IntrospectiveTraining.cs @@ -4,6 +4,7 @@ using System; using System.Collections.Generic; +using System.Linq; using Microsoft.ML.Calibrators; using Microsoft.ML.Data; using Microsoft.ML.Functional.Tests.Datasets; @@ -263,7 +264,7 @@ void IntrospectNormalization() } /// - /// I can take an existing model file and inspect what transformers were included in the pipeline. + /// Introspective Training: I can inspect a pipeline to determine which transformers were included. /// [Fact] public void InspectPipelineContents() @@ -301,13 +302,6 @@ public void InspectPipelineContents() var column = currentSchema.GetColumnOrNull(expectedColumn); Assert.NotNull(column); } - // And we can see that future columns do not yet exist. - if (i < expectedColumns.Length - 1) - foreach (var expectedColumn in expectedColumns[i+1]) - { - var column = currentSchema.GetColumnOrNull(expectedColumn); - Assert.Null(column); - } i++; } } @@ -337,8 +331,8 @@ public void InspectSlotNamesForReversibleHash() // Transform the data. var transformedData = model.Transform(data); - // Verify that the slotnames cane be used to backtrack by confirming that - // all unique values in the input data are in the output data slot names. + // Verify that the slotnames can be used to backtrack to the original values by confirming that + // all unique values in the input data are in the output data slot names. // First get a list of the unique values. VBuffer> categoricalSlotNames = new VBuffer>(); transformedData.Schema["CategoricalFeatures"].GetSlotNames(ref categoricalSlotNames); @@ -370,6 +364,9 @@ public void InspectSlotNamesForReversibleHash() } } + /// + /// Introspective Training: I can create nested pipelines, and extract individual components. + /// [Fact] public void InspectNestedPipeline() { @@ -388,12 +385,9 @@ public void InspectNestedPipeline() var model = pipeline.Fit(data); // Extract the trained models. - var modelEnumerator = model.GetEnumerator(); - modelEnumerator.MoveNext(); // The Concat Transform - modelEnumerator.MoveNext(); - var kMeansModel = (modelEnumerator.Current as TransformerChain>).LastTransformer; - modelEnumerator.MoveNext(); - var mcLrModel = (modelEnumerator.Current as TransformerChain>).LastTransformer; + var modelComponents = model.ToList(); + var kMeansModel = (modelComponents[1] as TransformerChain>).LastTransformer; + var mcLrModel = (modelComponents[2] as TransformerChain>).LastTransformer; // Validate the k-means model. VBuffer[] centroids = default; diff --git a/test/Microsoft.ML.Functional.Tests/Validation.cs b/test/Microsoft.ML.Functional.Tests/Validation.cs index 0328f435ad..70eb64d36c 100644 --- a/test/Microsoft.ML.Functional.Tests/Validation.cs +++ b/test/Microsoft.ML.Functional.Tests/Validation.cs @@ -4,12 +4,12 @@ using Microsoft.Data.DataView; using Microsoft.ML.Data; +using Microsoft.ML.Functional.Tests.Datasets; using Microsoft.ML.RunTests; using Microsoft.ML.TestFramework; -using Microsoft.ML.Trainers.FastTree; using Microsoft.ML.Trainers; +using Microsoft.ML.Trainers.FastTree; using Xunit; -using Microsoft.ML.Functional.Tests.Datasets; using Xunit.Abstractions; namespace Microsoft.ML.Functional.Tests From b29f4a3d2acddf3e852c391b35027268d9df1121 Mon Sep 17 00:00:00 2001 From: Rogan Carr Date: Wed, 6 Mar 2019 12:23:19 -0800 Subject: [PATCH 6/9] Addressing PR comments. --- test/Microsoft.ML.Functional.Tests/Common.cs | 21 +++++++++++++--- .../IntrospectiveTraining.cs | 25 ++++++++++++++++--- 2 files changed, 39 insertions(+), 7 deletions(-) diff --git a/test/Microsoft.ML.Functional.Tests/Common.cs b/test/Microsoft.ML.Functional.Tests/Common.cs index 40eef88951..2d0accfe4c 100644 --- a/test/Microsoft.ML.Functional.Tests/Common.cs +++ b/test/Microsoft.ML.Functional.Tests/Common.cs @@ -269,12 +269,27 @@ public static void AssertMetricsStatistics(RegressionMetricsStatistics metrics) } /// - /// Verify that a numerical array has no NaNs or infinities. + /// Verify that a float array has no NaNs or infinities. /// /// An array of doubles. - public static void AssertFiniteNumbers(double[] array, int ignoreElementAt = -1) + public static void AssertFiniteNumbers(IList array, int ignoreElementAt = -1) { - for (int i = 0; i < array.Length; i++) + for (int i = 0; i < array.Count; i++) + { + if (i == ignoreElementAt) + continue; + Assert.False(float.IsNaN(array[i])); + Assert.True(float.IsFinite(array[i])); + } + } + + /// + /// Verify that a double array has no NaNs or infinities. + /// + /// An array of doubles. + public static void AssertFiniteNumbers(IList array, int ignoreElementAt = -1) + { + for (int i = 0; i < array.Count; i++) { if (i == ignoreElementAt) continue; diff --git a/test/Microsoft.ML.Functional.Tests/IntrospectiveTraining.cs b/test/Microsoft.ML.Functional.Tests/IntrospectiveTraining.cs index 547cfde0d0..22a0a89218 100644 --- a/test/Microsoft.ML.Functional.Tests/IntrospectiveTraining.cs +++ b/test/Microsoft.ML.Functional.Tests/IntrospectiveTraining.cs @@ -4,8 +4,9 @@ using System; using System.Collections.Generic; +using System.Collections.Immutable; using System.Linq; -using Microsoft.ML.Calibrators; +//using System.Linq; using Microsoft.ML.Data; using Microsoft.ML.Functional.Tests.Datasets; using Microsoft.ML.RunTests; @@ -20,7 +21,7 @@ namespace Microsoft.ML.Functional.Tests { public class IntrospectiveTraining : BaseTestClass { - public IntrospectiveTraining(ITestOutputHelper output): base(output) + public IntrospectiveTraining(ITestOutputHelper output) : base(output) { } @@ -259,10 +260,26 @@ void IntrospectNormalization() var model = pipeline.Fit(data); // Extract the normalizer from the trained pipeline. - // TODO #2854: Extract the normalizer parameters. var normalizer = model.LastTransformer; - } + // Extract the normalizer parameters. + // TODO #2854: Normalizer parameters are easy to find via intellisense. + int i = 0; + bool found = false; + foreach (var column in normalizer.Columns) + { + if (column.Name == "Features") + { + found = true; + var featuresNormalizer = normalizer.Columns[i].ModelParameters as NormalizingTransformer.AffineNormalizerModelParameters>; + Assert.NotNull(featuresNormalizer); + Common.AssertFiniteNumbers(featuresNormalizer.Offset); + Common.AssertFiniteNumbers(featuresNormalizer.Scale); + } + i++; + } + Assert.True(found); + } /// /// Introspective Training: I can inspect a pipeline to determine which transformers were included. /// From 4f7d8f57bb79f2d2a6626c140307acd14ce99214 Mon Sep 17 00:00:00 2001 From: Rogan Carr Date: Wed, 6 Mar 2019 12:50:19 -0800 Subject: [PATCH 7/9] Address merge issues --- .../IntrospectiveTraining.cs | 26 +++++++++---------- .../Validation.cs | 6 ++--- 2 files changed, 14 insertions(+), 18 deletions(-) diff --git a/test/Microsoft.ML.Functional.Tests/IntrospectiveTraining.cs b/test/Microsoft.ML.Functional.Tests/IntrospectiveTraining.cs index 22a0a89218..7d54931bf1 100644 --- a/test/Microsoft.ML.Functional.Tests/IntrospectiveTraining.cs +++ b/test/Microsoft.ML.Functional.Tests/IntrospectiveTraining.cs @@ -6,7 +6,6 @@ using System.Collections.Generic; using System.Collections.Immutable; using System.Linq; -//using System.Linq; using Microsoft.ML.Data; using Microsoft.ML.Functional.Tests.Datasets; using Microsoft.ML.RunTests; @@ -142,7 +141,7 @@ void IntrospectGamShapeFunctions() // Compose the transformation. var pipeline = mlContext.Transforms.Concatenate("Features", Iris.Features) .Append(mlContext.Regression.Trainers.GeneralizedAdditiveModels( - new RegressionGamTrainer.Options { NumIterations = 100 })); + new RegressionGamTrainer.Options { NumberOfIterations = 100 })); // Fit the pipeline. var model = pipeline.Fit(data); @@ -151,16 +150,15 @@ void IntrospectGamShapeFunctions() var gamModel = model.LastTransformer.Model; // Take look at the shape functions. - var shapeFunctionsBins = gamModel.GetBinUpperBounds(); - var shapeFunctionsValues = gamModel.GetBinEffects(); - - // Validate that the shape functions lengths match. - Assert.Equal(shapeFunctionsBins.Length, shapeFunctionsValues.Length); - for (int i = 0; i < shapeFunctionsBins.Length; i++) + for (int i = 0; i < gamModel.NumberOfShapeFunctions; i++) { - Assert.Equal(shapeFunctionsBins[i].Length, shapeFunctionsValues[i].Length); - Common.AssertFiniteNumbers(shapeFunctionsBins[i], shapeFunctionsBins[i].Length - 1); - Common.AssertFiniteNumbers(shapeFunctionsValues[i]); + var shapeFunctionBins = gamModel.GetBinUpperBounds(i); + var shapeFunctionValues = gamModel.GetBinEffects(i); + + // Validate that the shape functions lengths match. + Assert.Equal(shapeFunctionBins.Count, shapeFunctionValues.Count); + Common.AssertFiniteNumbers(shapeFunctionBins as IList, shapeFunctionBins.Count - 1); + Common.AssertFiniteNumbers(shapeFunctionValues as IList); } } @@ -217,7 +215,7 @@ public void InpsectLinearModelParameters() var pipeline = mlContext.Transforms.Text.FeaturizeText("Features", "SentimentText") .AppendCacheCheckpoint(mlContext) .Append(mlContext.BinaryClassification.Trainers.StochasticDualCoordinateAscentNonCalibrated( - new SdcaNonCalibratedBinaryTrainer.Options { NumThreads = 1 })); + new SdcaNonCalibratedBinaryTrainer.Options { NumberOfThreads = 1 })); // Fit the pipeline. var model = pipeline.Fit(data); @@ -435,8 +433,8 @@ private IEstimator(GetDataPath(TestDatasets.housing.trainFilename), hasHeader: true); // Create the train and validation set. var dataSplit = mlContext.Regression.TrainTestSplit(data, testFraction: 0.2); @@ -81,7 +79,7 @@ public void TrainWithValidationSet() var preprocessedValidData = preprocessor.Transform(validData); // Train the model with a validation set. - var trainedModel = mlContext.Regression.Trainers.FastTree(new Trainers.FastTree.FastTreeRegressionTrainer.Options { + var trainedModel = mlContext.Regression.Trainers.FastTree(new FastTreeRegressionTrainer.Options { NumberOfTrees = 2, EarlyStoppingMetric = EarlyStoppingMetric.L2Norm, EarlyStoppingRule = new GLEarlyStoppingCriterion.Options() From 25aded67a9e4d3bd179ff5f098cd5d9817bcd76e Mon Sep 17 00:00:00 2001 From: Rogan Carr Date: Wed, 6 Mar 2019 23:43:41 -0800 Subject: [PATCH 8/9] Fixing cross-platform build errors. --- test/Microsoft.ML.Functional.Tests/Common.cs | 4 ++-- test/Microsoft.ML.Functional.Tests/IntrospectiveTraining.cs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/Microsoft.ML.Functional.Tests/Common.cs b/test/Microsoft.ML.Functional.Tests/Common.cs index 2d0accfe4c..754c4575ea 100644 --- a/test/Microsoft.ML.Functional.Tests/Common.cs +++ b/test/Microsoft.ML.Functional.Tests/Common.cs @@ -279,7 +279,7 @@ public static void AssertFiniteNumbers(IList array, int ignoreElementAt = if (i == ignoreElementAt) continue; Assert.False(float.IsNaN(array[i])); - Assert.True(float.IsFinite(array[i])); + Assert.False(float.IsInfinity(array[i])); } } @@ -294,7 +294,7 @@ public static void AssertFiniteNumbers(IList array, int ignoreElementAt if (i == ignoreElementAt) continue; Assert.False(double.IsNaN(array[i])); - Assert.True(double.IsFinite(array[i])); + Assert.False(double.IsInfinity(array[i])); } } } diff --git a/test/Microsoft.ML.Functional.Tests/IntrospectiveTraining.cs b/test/Microsoft.ML.Functional.Tests/IntrospectiveTraining.cs index 7d54931bf1..6d68c45806 100644 --- a/test/Microsoft.ML.Functional.Tests/IntrospectiveTraining.cs +++ b/test/Microsoft.ML.Functional.Tests/IntrospectiveTraining.cs @@ -355,7 +355,7 @@ public void InspectSlotNamesForReversibleHash() foreach (var slotName in categoricalSlotNames.GetValues()) { var slotNameString = slotName.ToString(); - if (slotNameString.StartsWith('{')) + if (slotNameString.StartsWith("{")) { // Values look like this: {3:Exec-managerial,2:Widowed}. slotNameString = slotNameString.Substring(1, slotNameString.Length - 2); From 83747ba8ad379328566cc05aba83fa5f8f7f2e23 Mon Sep 17 00:00:00 2001 From: Rogan Carr Date: Thu, 7 Mar 2019 09:54:50 -0800 Subject: [PATCH 9/9] Fix merge issues. --- .../IntrospectiveTraining.cs | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/test/Microsoft.ML.Functional.Tests/IntrospectiveTraining.cs b/test/Microsoft.ML.Functional.Tests/IntrospectiveTraining.cs index 6d68c45806..69d8773fc9 100644 --- a/test/Microsoft.ML.Functional.Tests/IntrospectiveTraining.cs +++ b/test/Microsoft.ML.Functional.Tests/IntrospectiveTraining.cs @@ -30,14 +30,15 @@ public IntrospectiveTraining(ITestOutputHelper output) : base(output) [Fact] public void InspectFastForestRegresionTrees() { - var mlContext = new MLContext(seed: 1, conc: 1); + var mlContext = new MLContext(seed: 1); // Get the dataset. var data = mlContext.Data.LoadFromTextFile(GetDataPath(TestDatasets.housing.trainFilename), hasHeader: true); // Create a pipeline to train on the housing data. var pipeline = mlContext.Transforms.Concatenate("Features", HousingRegression.Features) - .Append(mlContext.Regression.Trainers.FastForest(numLeaves: 5, numTrees: 3)); + .Append(mlContext.Regression.Trainers.FastForest( + new FastForestRegression.Options { NumberOfLeaves = 5, NumberOfTrees = 3, NumberOfThreads = 1 })); // Fit the pipeline. var model = pipeline.Fit(data); @@ -70,7 +71,7 @@ public void InspectFastForestRegresionTrees() [Fact] public void InspectFastTreeModelParameters() { - var mlContext = new MLContext(seed: 1, conc: 1); + var mlContext = new MLContext(seed: 1); var data = mlContext.Data.LoadFromTextFile(GetDataPath(TestDatasets.Sentiment.trainFilename), hasHeader: TestDatasets.Sentiment.fileHasHeader, @@ -80,7 +81,8 @@ public void InspectFastTreeModelParameters() // Create a training pipeline. var pipeline = mlContext.Transforms.Text.FeaturizeText("Features", "SentimentText") .AppendCacheCheckpoint(mlContext) - .Append(mlContext.BinaryClassification.Trainers.FastTree(numberOfLeaves: 5, numberOfTrees: 3)); + .Append(mlContext.BinaryClassification.Trainers.FastTree( + new FastTreeBinaryClassificationTrainer.Options{ NumberOfLeaves = 5, NumberOfTrees= 3, NumberOfThreads = 1 })); // Fit the pipeline. var model = pipeline.Fit(data); @@ -130,7 +132,7 @@ public void InspectFastTreeModelParameters() void IntrospectGamShapeFunctions() { // Concurrency must be 1 to assure that the mapping is done sequentially. - var mlContext = new MLContext(seed: 1, conc: 1); + var mlContext = new MLContext(seed: 1); // Load the Iris dataset. var data = mlContext.Data.LoadFromTextFile( @@ -141,7 +143,7 @@ void IntrospectGamShapeFunctions() // Compose the transformation. var pipeline = mlContext.Transforms.Concatenate("Features", Iris.Features) .Append(mlContext.Regression.Trainers.GeneralizedAdditiveModels( - new RegressionGamTrainer.Options { NumberOfIterations = 100 })); + new RegressionGamTrainer.Options { NumberOfIterations = 100, NumberOfThreads = 1 })); // Fit the pipeline. var model = pipeline.Fit(data); @@ -171,7 +173,7 @@ public void InspectLdaModelParameters() // Test Parameters int numTopics = 10; - var mlContext = new MLContext(seed: 1, conc: 1); + var mlContext = new MLContext(seed: 1); // Load the dataset. var data = mlContext.Data.LoadFromTextFile(GetDataPath(TestDatasets.Sentiment.trainFilename), @@ -204,7 +206,7 @@ public void InspectLdaModelParameters() [Fact] public void InpsectLinearModelParameters() { - var mlContext = new MLContext(seed: 1, conc: 1); + var mlContext = new MLContext(seed: 1); var data = mlContext.Data.LoadFromTextFile(GetDataPath(TestDatasets.Sentiment.trainFilename), hasHeader: TestDatasets.Sentiment.fileHasHeader, @@ -242,7 +244,7 @@ public void InpsectLinearModelParameters() void IntrospectNormalization() { // Concurrency must be 1 to assure that the mapping is done sequentially. - var mlContext = new MLContext(seed: 1, conc: 1); + var mlContext = new MLContext(seed: 1); // Load the Iris dataset. var data = mlContext.Data.LoadFromTextFile( @@ -284,7 +286,7 @@ void IntrospectNormalization() [Fact] public void InspectPipelineContents() { - var mlContext = new MLContext(seed: 1, conc: 1); + var mlContext = new MLContext(seed: 1); // Get the dataset. var data = mlContext.Data.LoadFromTextFile(GetDataPath(TestDatasets.housing.trainFilename), hasHeader: true); @@ -327,7 +329,7 @@ public void InspectPipelineContents() [Fact] public void InspectSlotNamesForReversibleHash() { - var mlContext = new MLContext(seed: 1, conc: 1); + var mlContext = new MLContext(seed: 1); // Load the Adult dataset. var data = mlContext.Data.LoadFromTextFile(GetDataPath(TestDatasets.adult.trainFilename), @@ -385,7 +387,7 @@ public void InspectSlotNamesForReversibleHash() [Fact] public void InspectNestedPipeline() { - var mlContext = new MLContext(seed: 1, conc: 1); + var mlContext = new MLContext(seed: 1); var data = mlContext.Data.LoadFromTextFile(GetDataPath(TestDatasets.iris.trainFilename), hasHeader: TestDatasets.iris.fileHasHeader,