diff --git a/test/Microsoft.ML.Functional.Tests/Common.cs b/test/Microsoft.ML.Functional.Tests/Common.cs index a2f47e66b4..5756893966 100644 --- a/test/Microsoft.ML.Functional.Tests/Common.cs +++ b/test/Microsoft.ML.Functional.Tests/Common.cs @@ -267,5 +267,35 @@ public static void AssertMetricsStatistics(RegressionMetricsStatistics metrics) AssertMetricStatistics(metrics.RSquared); AssertMetricStatistics(metrics.LossFunction); } + + /// + /// Verify that a float array has no NaNs or infinities. + /// + /// An array of floats. + public static void AssertFiniteNumbers(IList array, int ignoreElementAt = -1) + { + for (int i = 0; i < array.Count; i++) + { + if (i == ignoreElementAt) + continue; + Assert.False(float.IsNaN(array[i])); + Assert.False(float.IsInfinity(array[i])); + } + } + + /// + /// Verify that a double array has no NaNs or infinities. + /// + /// An array of doubles. + public static void AssertFiniteNumbers(IList array, int ignoreElementAt = -1) + { + for (int i = 0; i < array.Count; i++) + { + if (i == ignoreElementAt) + continue; + Assert.False(double.IsNaN(array[i])); + Assert.False(double.IsInfinity(array[i])); + } + } } } diff --git a/test/Microsoft.ML.Functional.Tests/Datasets/Adult.cs b/test/Microsoft.ML.Functional.Tests/Datasets/Adult.cs new file mode 100644 index 0000000000..440515ac06 --- /dev/null +++ b/test/Microsoft.ML.Functional.Tests/Datasets/Adult.cs @@ -0,0 +1,69 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using Microsoft.ML.Data; + +namespace Microsoft.ML.Functional.Tests.Datasets +{ + /// + /// A class for the Adult test dataset. 
+ /// + internal sealed class Adult + { + [LoadColumn(0)] + public bool Label { get; set; } + + [LoadColumn(1)] + public string WorkClass { get; set; } + + [LoadColumn(2)] + public string Education { get; set; } + + [LoadColumn(3)] + public string MaritalStatus { get; set; } + + [LoadColumn(4)] + public string Occupation { get; set; } + + [LoadColumn(5)] + public string Relationship { get; set; } + + [LoadColumn(6)] + public string Ethnicity { get; set; } + + [LoadColumn(7)] + public string Sex { get; set; } + + [LoadColumn(8)] + public string NativeCountryRegion { get; set; } + + [LoadColumn(9)] + public float Age { get; set; } + + [LoadColumn(10)] + public float FinalWeight { get; set; } + + [LoadColumn(11)] + public float EducationNum { get; set; } + + [LoadColumn(12)] + public float CapitalGain { get; set; } + + [LoadColumn(13)] + public float CapitalLoss { get; set; } + + [LoadColumn(14)] + public float HoursPerWeek { get; set; } + + /// + /// The list of columns commonly used as categorical features. + /// + public static readonly string[] CategoricalFeatures = new string[] { "WorkClass", "Education", "MaritalStatus", "Occupation", "Relationship", "Ethnicity", "Sex", "NativeCountryRegion" }; + + /// + /// The list of columns commonly used as numerical features. + /// + public static readonly string[] NumericalFeatures = new string[] { "Age", "FinalWeight", "EducationNum", "CapitalGain", "CapitalLoss", "HoursPerWeek" }; + } +} diff --git a/test/Microsoft.ML.Functional.Tests/Evaluation.cs b/test/Microsoft.ML.Functional.Tests/Evaluation.cs index 437bb7fab5..36f000fc8f 100644 --- a/test/Microsoft.ML.Functional.Tests/Evaluation.cs +++ b/test/Microsoft.ML.Functional.Tests/Evaluation.cs @@ -237,17 +237,11 @@ public void TrainAndEvaluateRegression() { var mlContext = new MLContext(seed: 1); - // Get the dataset. 
- var data = mlContext.Data.CreateTextLoader(TestDatasets.housing.GetLoaderColumns(), - hasHeader: TestDatasets.housing.fileHasHeader, separatorChar: TestDatasets.housing.fileSeparator) - .Load(GetDataPath(TestDatasets.housing.trainFilename)); - - // Create a pipeline to train on the sentiment data. - var pipeline = mlContext.Transforms.Concatenate("Features", new string[] { - "CrimesPerCapita", "PercentResidental", "PercentNonRetail", "CharlesRiver", "NitricOxides", "RoomsPerDwelling", - "PercentPre40s", "EmploymentDistance", "HighwayDistance", "TaxRate", "TeacherRatio"}) - .Append(mlContext.Transforms.CopyColumns("Label", "MedianHomeValue")) - .Append(mlContext.Regression.Trainers.FastTree(new FastTreeRegressionTrainer.Options { NumberOfThreads = 1 })); + // Get the dataset + var data = mlContext.Data.LoadFromTextFile(GetDataPath(TestDatasets.housing.trainFilename), hasHeader: true); + // Create a pipeline to train on the housing data. + var pipeline = mlContext.Transforms.Concatenate("Features", HousingRegression.Features) + .Append(mlContext.Regression.Trainers.FastForest(new FastForestRegression.Options { NumberOfThreads = 1 })); // Train the model. var model = pipeline.Fit(data); diff --git a/test/Microsoft.ML.Functional.Tests/IntrospectiveTraining.cs b/test/Microsoft.ML.Functional.Tests/IntrospectiveTraining.cs new file mode 100644 index 0000000000..69d8773fc9 --- /dev/null +++ b/test/Microsoft.ML.Functional.Tests/IntrospectiveTraining.cs @@ -0,0 +1,442 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System; +using System.Collections.Generic; +using System.Collections.Immutable; +using System.Linq; +using Microsoft.ML.Data; +using Microsoft.ML.Functional.Tests.Datasets; +using Microsoft.ML.RunTests; +using Microsoft.ML.TestFramework; +using Microsoft.ML.Trainers; +using Microsoft.ML.Trainers.FastTree; +using Microsoft.ML.Transforms; +using Xunit; +using Xunit.Abstractions; + +namespace Microsoft.ML.Functional.Tests +{ + public class IntrospectiveTraining : BaseTestClass + { + public IntrospectiveTraining(ITestOutputHelper output) : base(output) + { + } + + /// + /// Introspective Training: Tree ensembles learned from FastForest can be inspected. + /// + [Fact] + public void InspectFastForestRegresionTrees() + { + var mlContext = new MLContext(seed: 1); + + // Get the dataset. + var data = mlContext.Data.LoadFromTextFile(GetDataPath(TestDatasets.housing.trainFilename), hasHeader: true); + + // Create a pipeline to train on the housing data. + var pipeline = mlContext.Transforms.Concatenate("Features", HousingRegression.Features) + .Append(mlContext.Regression.Trainers.FastForest( + new FastForestRegression.Options { NumberOfLeaves = 5, NumberOfTrees = 3, NumberOfThreads = 1 })); + + // Fit the pipeline. + var model = pipeline.Fit(data); + + // Extract the FastForest model. + var fastForestModel = model.LastTransformer.Model; + + // Extract the learned Random Forests model. + var treeCollection = fastForestModel.TrainedTreeEnsemble; + + // Inspect properties in the extracted model. 
+ Assert.Equal(3, treeCollection.Trees.Count); + Assert.Equal(3, treeCollection.TreeWeights.Count); + Assert.All(treeCollection.TreeWeights, weight => Assert.Equal(1.0, weight)); + Assert.All(treeCollection.Trees, tree => + { + Assert.Equal(5, tree.NumberOfLeaves); + Assert.Equal(4, tree.NumberOfNodes); + Assert.Equal(tree.SplitGains.Count, tree.NumberOfNodes); + Assert.Equal(tree.NumericalSplitThresholds.Count, tree.NumberOfNodes); + Assert.All(tree.CategoricalSplitFlags, flag => Assert.False(flag)); + Assert.Equal(0, tree.GetCategoricalSplitFeaturesAt(0).Count); + Assert.Equal(0, tree.GetCategoricalCategoricalSplitFeatureRangeAt(0).Count); + }); + } + + /// + /// Introspective Training: Tree ensembles learned from FastTree can be inspected. + /// + [Fact] + public void InspectFastTreeModelParameters() + { + var mlContext = new MLContext(seed: 1); + + var data = mlContext.Data.LoadFromTextFile(GetDataPath(TestDatasets.Sentiment.trainFilename), + hasHeader: TestDatasets.Sentiment.fileHasHeader, + separatorChar: TestDatasets.Sentiment.fileSeparator, + allowQuoting: TestDatasets.Sentiment.allowQuoting); + + // Create a training pipeline. + var pipeline = mlContext.Transforms.Text.FeaturizeText("Features", "SentimentText") + .AppendCacheCheckpoint(mlContext) + .Append(mlContext.BinaryClassification.Trainers.FastTree( + new FastTreeBinaryClassificationTrainer.Options{ NumberOfLeaves = 5, NumberOfTrees= 3, NumberOfThreads = 1 })); + + // Fit the pipeline. + var model = pipeline.Fit(data); + + // Extract the boosted tree model. + var fastTreeModel = model.LastTransformer.Model.SubModel; + + // Extract the learned GBDT model. + var treeCollection = fastTreeModel.TrainedTreeEnsemble; + + // Make sure the tree models were formed as expected. 
+ Assert.Equal(3, treeCollection.Trees.Count); + Assert.Equal(3, treeCollection.TreeWeights.Count); + Assert.All(treeCollection.TreeWeights, weight => Assert.Equal(1.0, weight)); + Assert.All(treeCollection.Trees, tree => + { + Assert.Equal(5, tree.NumberOfLeaves); + Assert.Equal(4, tree.NumberOfNodes); + Assert.Equal(tree.SplitGains.Count, tree.NumberOfNodes); + Assert.Equal(tree.NumericalSplitThresholds.Count, tree.NumberOfNodes); + Assert.All(tree.CategoricalSplitFlags, flag => Assert.False(flag)); + Assert.Equal(0, tree.GetCategoricalSplitFeaturesAt(0).Count); + Assert.Equal(0, tree.GetCategoricalCategoricalSplitFeatureRangeAt(0).Count); + }); + + // Add baselines for the model. + // Verify that there is no bias. + Assert.Equal(0, treeCollection.Bias); + // Check the parameters of the final tree. + var finalTree = treeCollection.Trees[2]; + Assert.Equal(finalTree.LeftChild, new int[] { 2, -2, -1, -3 }); + Assert.Equal(finalTree.RightChild, new int[] { 1, 3, -4, -5 }); + Assert.Equal(finalTree.NumericalSplitFeatureIndexes, new int[] { 14, 294, 633, 266 }); + var expectedSplitGains = new double[] { 0.52634223978445616, 0.45899249367725858, 0.44142707650267105, 0.38348634823264854 }; + var expectedThresholds = new float[] { 0.0911167f, 0.06509889f, 0.019873254f, 0.0361835f }; + for (int i = 0; i < finalTree.NumberOfNodes; ++i) + { + Assert.Equal(expectedSplitGains[i], finalTree.SplitGains[i], 6); + Assert.Equal(expectedThresholds[i], finalTree.NumericalSplitThresholds[i], 6); + } + } + + /// + /// Introspective Training: GAM Shape Functions are easily accessed. + /// + [Fact] + void IntrospectGamShapeFunctions() + { + // Concurrency must be 1 to assure that the mapping is done sequentially. + var mlContext = new MLContext(seed: 1); + + // Load the Iris dataset. 
+ var data = mlContext.Data.LoadFromTextFile( + GetDataPath(TestDatasets.iris.trainFilename), + hasHeader: TestDatasets.iris.fileHasHeader, + separatorChar: TestDatasets.iris.fileSeparator); + + // Compose the transformation. + var pipeline = mlContext.Transforms.Concatenate("Features", Iris.Features) + .Append(mlContext.Regression.Trainers.GeneralizedAdditiveModels( + new RegressionGamTrainer.Options { NumberOfIterations = 100, NumberOfThreads = 1 })); + + // Fit the pipeline. + var model = pipeline.Fit(data); + + // Extract the GAM model from the trained pipeline. + var gamModel = model.LastTransformer.Model; + + // Take a look at the shape functions. + for (int i = 0; i < gamModel.NumberOfShapeFunctions; i++) + { + var shapeFunctionBins = gamModel.GetBinUpperBounds(i); + var shapeFunctionValues = gamModel.GetBinEffects(i); + + // Validate that the shape function lengths match. + Assert.Equal(shapeFunctionBins.Count, shapeFunctionValues.Count); + Common.AssertFiniteNumbers(shapeFunctionBins as IList, shapeFunctionBins.Count - 1); + Common.AssertFiniteNumbers(shapeFunctionValues as IList); + } + } + + /// + /// Introspective Training: LDA models can be easily inspected. + /// + [Fact] + public void InspectLdaModelParameters() + { + // Test Parameters + int numTopics = 10; + + var mlContext = new MLContext(seed: 1); + + // Load the dataset. + var data = mlContext.Data.LoadFromTextFile(GetDataPath(TestDatasets.Sentiment.trainFilename), + hasHeader: TestDatasets.Sentiment.fileHasHeader, + separatorChar: TestDatasets.Sentiment.fileSeparator, + allowQuoting: TestDatasets.Sentiment.allowQuoting); + + // Define the pipeline. + var pipeline = mlContext.Transforms.Text.ProduceWordBags("SentimentBag", "SentimentText") + .Append(mlContext.Transforms.Text.LatentDirichletAllocation("Features", "SentimentBag", numTopic: numTopics, numIterations: 10)); + + // Fit the pipeline. + var model = pipeline.Fit(data); + + // Get the trained LDA model. 
+ // TODO #2197: Get the topics and summaries from the model. + var ldaTransform = model.LastTransformer; + + // Transform the data. + var transformedData = model.Transform(data); + + // Make sure the size of the output feature vector equals the number of topics. + var numFeatures = (transformedData.Schema["Features"].Type as VectorType).Size; + Assert.Equal(numFeatures, numTopics); + } + + /// + /// Introspective Training: Linear model parameters may be inspected. + /// + [Fact] + public void InpsectLinearModelParameters() + { + var mlContext = new MLContext(seed: 1); + + var data = mlContext.Data.LoadFromTextFile(GetDataPath(TestDatasets.Sentiment.trainFilename), + hasHeader: TestDatasets.Sentiment.fileHasHeader, + separatorChar: TestDatasets.Sentiment.fileSeparator, + allowQuoting: TestDatasets.Sentiment.allowQuoting); + + // Create a training pipeline. + var pipeline = mlContext.Transforms.Text.FeaturizeText("Features", "SentimentText") + .AppendCacheCheckpoint(mlContext) + .Append(mlContext.BinaryClassification.Trainers.StochasticDualCoordinateAscentNonCalibrated( + new SdcaNonCalibratedBinaryTrainer.Options { NumberOfThreads = 1 })); + + // Fit the pipeline. + var model = pipeline.Fit(data); + + // Transform the data. + var transformedData = model.Transform(data); + + // Extract the linear model from the pipeline. + var linearModel = model.LastTransformer.Model; + + // Get the model bias and weights. + var bias = linearModel.Bias; + var weights = linearModel.Weights; + + // Make sure the model weights array is the same length as the features array. + var numFeatures = (transformedData.Schema["Features"].Type as VectorType).Size; + Assert.Equal(numFeatures, weights.Count); + } + + /// + /// Introspective Training: Parameters of a trained Normalizer are easily accessed. + /// + [Fact] + void IntrospectNormalization() + { + // Concurrency must be 1 to assure that the mapping is done sequentially. 
+ var mlContext = new MLContext(seed: 1); + + // Load the Iris dataset. + var data = mlContext.Data.LoadFromTextFile( + GetDataPath(TestDatasets.iris.trainFilename), + hasHeader: TestDatasets.iris.fileHasHeader, + separatorChar: TestDatasets.iris.fileSeparator); + + // Compose the transformation. + var pipeline = mlContext.Transforms.Concatenate("Features", Iris.Features) + .Append(mlContext.Transforms.Normalize("Features", mode: NormalizingEstimator.NormalizerMode.MinMax)); + + // Fit the pipeline. + var model = pipeline.Fit(data); + + // Extract the normalizer from the trained pipeline. + var normalizer = model.LastTransformer; + + // Extract the normalizer parameters. + // TODO #2854: Normalizer parameters are easy to find via intellisense. + int i = 0; + bool found = false; + foreach (var column in normalizer.Columns) + { + if (column.Name == "Features") + { + found = true; + var featuresNormalizer = normalizer.Columns[i].ModelParameters as NormalizingTransformer.AffineNormalizerModelParameters>; + Assert.NotNull(featuresNormalizer); + Common.AssertFiniteNumbers(featuresNormalizer.Offset); + Common.AssertFiniteNumbers(featuresNormalizer.Scale); + } + i++; + } + Assert.True(found); + } + /// + /// Introspective Training: I can inspect a pipeline to determine which transformers were included. + /// + [Fact] + public void InspectPipelineContents() + { + var mlContext = new MLContext(seed: 1); + + // Get the dataset. + var data = mlContext.Data.LoadFromTextFile(GetDataPath(TestDatasets.housing.trainFilename), hasHeader: true); + + // Create a pipeline to train on the housing data. + var pipeline = mlContext.Transforms.Concatenate("Features", HousingRegression.Features) + .Append(mlContext.Regression.Trainers.FastForest(numLeaves: 5, numTrees: 3)); + + // Fit the pipeline. + var model = pipeline.Fit(data); + + // Inspect the transforms in the trained pipeline. 
+ var expectedTypes = new Type[] {typeof(ColumnConcatenatingTransformer), + typeof(RegressionPredictionTransformer)}; + var expectedColumns = new string[][] { + new string[] { "Features" }, + new string[] { "Score" }, + }; + int i = 0; + var currentSchema = data.Schema; + foreach (var transformer in model) + { + // It is possible to get the type at runtime. + Assert.IsType(expectedTypes[i], transformer); + + // It's also possible to inspect the schema output from the transform. + currentSchema = transformer.GetOutputSchema(currentSchema); + foreach (var expectedColumn in expectedColumns[i]) + { + var column = currentSchema.GetColumnOrNull(expectedColumn); + Assert.NotNull(column); + } + i++; + } + } + + /// + /// Introspective Training: Hashed values can be mapped back to the original column and value. + /// + [Fact] + public void InspectSlotNamesForReversibleHash() + { + var mlContext = new MLContext(seed: 1); + + // Load the Adult dataset. + var data = mlContext.Data.LoadFromTextFile(GetDataPath(TestDatasets.adult.trainFilename), + hasHeader: TestDatasets.adult.fileHasHeader, + separatorChar: TestDatasets.adult.fileSeparator); + + // Create the learning pipeline. + var pipeline = mlContext.Transforms.Concatenate("NumericalFeatures", Adult.NumericalFeatures) + .Append(mlContext.Transforms.Concatenate("CategoricalFeatures", Adult.CategoricalFeatures)) + .Append(mlContext.Transforms.Categorical.OneHotHashEncoding("CategoricalFeatures", hashBits: 8, // get collisions! + invertHash: -1, outputKind: OneHotEncodingTransformer.OutputKind.Bag)); + + // Fit the pipeline. + var model = pipeline.Fit(data); + + // Transform the data. + var transformedData = model.Transform(data); + + // Verify that the slotnames can be used to backtrack to the original values by confirming that + // all unique values in the input data are in the output data slot names. + // First get a list of the unique values. 
+ VBuffer> categoricalSlotNames = new VBuffer>(); + transformedData.Schema["CategoricalFeatures"].GetSlotNames(ref categoricalSlotNames); + var uniqueValues = new HashSet(); + foreach (var slotName in categoricalSlotNames.GetValues()) + { + var slotNameString = slotName.ToString(); + if (slotNameString.StartsWith("{")) + { + // Values look like this: {3:Exec-managerial,2:Widowed}. + slotNameString = slotNameString.Substring(1, slotNameString.Length - 2); + foreach (var name in slotNameString.Split(',')) + uniqueValues.Add(name); + } + else + uniqueValues.Add(slotNameString); + } + + // Now validate that all values in the dataset are there. + var transformedRows = mlContext.Data.CreateEnumerable(data, false); + foreach (var row in transformedRows) + { + for (int i = 0; i < Adult.CategoricalFeatures.Length; i++) + { + // Fetch the categorical value. + string value = (string) row.GetType().GetProperty(Adult.CategoricalFeatures[i]).GetValue(row, null); + Assert.Contains($"{i}:{value}", uniqueValues); + } + } + } + + /// + /// Introspective Training: I can create nested pipelines, and extract individual components. + /// + [Fact] + public void InspectNestedPipeline() + { + var mlContext = new MLContext(seed: 1); + + var data = mlContext.Data.LoadFromTextFile(GetDataPath(TestDatasets.iris.trainFilename), + hasHeader: TestDatasets.iris.fileHasHeader, + separatorChar: TestDatasets.iris.fileSeparator); + + // Create a training pipeline. + var pipeline = mlContext.Transforms.Concatenate("Features", Iris.Features) + .Append(StepOne(mlContext)) + .Append(StepTwo(mlContext)); + + // Fit the pipeline. + var model = pipeline.Fit(data); + + // Extract the trained models. + var modelComponents = model.ToList(); + var kMeansModel = (modelComponents[1] as TransformerChain>).LastTransformer; + var mcLrModel = (modelComponents[2] as TransformerChain>).LastTransformer; + + // Validate the k-means model. 
+ VBuffer[] centroids = default; + kMeansModel.Model.GetClusterCentroids(ref centroids, out int nCentroids); + Assert.Equal(4, centroids.Length); + + // Validate the MulticlassLogisticRegressionModel. + VBuffer[] weights = default; + mcLrModel.Model.GetWeights(ref weights, out int classes); + Assert.Equal(3, weights.Length); + } + + private IEstimator>> StepOne(MLContext mlContext) + { + return mlContext.Transforms.Concatenate("LabelAndFeatures", "Label", "Features") + .Append(mlContext.Clustering.Trainers.KMeans( + new KMeansPlusPlusTrainer.Options + { + InitializationAlgorithm = KMeansPlusPlusTrainer.InitializationAlgorithm.Random, + NumberOfClusters = 4, + NumberOfIterations = 10, + NumberOfThreads = 1 + })); + } + + private IEstimator>> StepTwo(MLContext mlContext) + { + return mlContext.Transforms.Conversion.MapValueToKey("Label") + .Append(mlContext.MulticlassClassification.Trainers.StochasticDualCoordinateAscent( + new SdcaMultiClassTrainer.Options { + NumberOfIterations = 10, + NumberOfThreads = 1 })); + } + } +} \ No newline at end of file diff --git a/test/Microsoft.ML.Functional.Tests/Validation.cs b/test/Microsoft.ML.Functional.Tests/Validation.cs index 686e1a0ec6..49a5db6693 100644 --- a/test/Microsoft.ML.Functional.Tests/Validation.cs +++ b/test/Microsoft.ML.Functional.Tests/Validation.cs @@ -4,16 +4,22 @@ using Microsoft.Data.DataView; using Microsoft.ML.Data; +using Microsoft.ML.Functional.Tests.Datasets; using Microsoft.ML.RunTests; using Microsoft.ML.TestFramework; -using Microsoft.ML.Trainers.FastTree; using Microsoft.ML.Trainers; +using Microsoft.ML.Trainers.FastTree; using Xunit; +using Xunit.Abstractions; namespace Microsoft.ML.Functional.Tests { - public class ValidationScenarios + public class Validation : BaseTestClass { + public Validation(ITestOutputHelper output) : base(output) + { + } + /// /// Cross-validation: Have a mechanism to do cross validation, that is, you come up with /// a data source (optionally with stratification column), 
come up with an instantiable transform @@ -26,16 +32,11 @@ void CrossValidation() { var mlContext = new MLContext(seed: 1); - // Get the dataset. - var data = mlContext.Data.CreateTextLoader(TestDatasets.housing.GetLoaderColumns(), - hasHeader: TestDatasets.housing.fileHasHeader, separatorChar: TestDatasets.housing.fileSeparator) - .Load(BaseTestClass.GetDataPath(TestDatasets.housing.trainFilename)); - - // Create a pipeline to train on the sentiment data. - var pipeline = mlContext.Transforms.Concatenate("Features", new string[] { - "CrimesPerCapita", "PercentResidental", "PercentNonRetail", "CharlesRiver", "NitricOxides", "RoomsPerDwelling", - "PercentPre40s", "EmploymentDistance", "HighwayDistance", "TaxRate", "TeacherRatio"}) - .Append(mlContext.Transforms.CopyColumns("Label", "MedianHomeValue")) + // Get the dataset + var data = mlContext.Data.LoadFromTextFile(GetDataPath(TestDatasets.housing.trainFilename), hasHeader: true); + + // Create a pipeline to train on the housing data. + var pipeline = mlContext.Transforms.Concatenate("Features", HousingRegression.Features) .Append(mlContext.Regression.Trainers.OrdinaryLeastSquares()); // Compute the CV result. @@ -61,18 +62,15 @@ public void TrainWithValidationSet() var mlContext = new MLContext(seed: 1); // Get the dataset. - var data = mlContext.Data.CreateTextLoader(TestDatasets.housing.GetLoaderColumns(), - hasHeader: TestDatasets.housing.fileHasHeader, separatorChar: TestDatasets.housing.fileSeparator) - .Load(BaseTestClass.GetDataPath(TestDatasets.housing.trainFilename)); + var data = mlContext.Data.LoadFromTextFile(GetDataPath(TestDatasets.housing.trainFilename), hasHeader: true); + + // Create the train and validation set. var dataSplit = mlContext.Regression.TrainTestSplit(data, testFraction: 0.2); var trainData = dataSplit.TrainSet; var validData = dataSplit.TestSet; // Create a pipeline to featurize the dataset. 
- var pipeline = mlContext.Transforms.Concatenate("Features", new string[] { - "CrimesPerCapita", "PercentResidental", "PercentNonRetail", "CharlesRiver", "NitricOxides", "RoomsPerDwelling", - "PercentPre40s", "EmploymentDistance", "HighwayDistance", "TaxRate", "TeacherRatio"}) - .Append(mlContext.Transforms.CopyColumns("Label", "MedianHomeValue")) + var pipeline = mlContext.Transforms.Concatenate("Features", HousingRegression.Features) .AppendCacheCheckpoint(mlContext) as IEstimator; // Preprocess the datasets. @@ -81,7 +79,7 @@ public void TrainWithValidationSet() var preprocessedValidData = preprocessor.Transform(validData); // Train the model with a validation set. - var trainedModel = mlContext.Regression.Trainers.FastTree(new Trainers.FastTree.FastTreeRegressionTrainer.Options { + var trainedModel = mlContext.Regression.Trainers.FastTree(new FastTreeRegressionTrainer.Options { NumberOfTrees = 2, EarlyStoppingMetric = EarlyStoppingMetric.L2Norm, EarlyStoppingRule = new GeneralityLossRule() diff --git a/test/Microsoft.ML.TestFramework/Datasets.cs b/test/Microsoft.ML.TestFramework/Datasets.cs index d279332cad..23cb24dc62 100644 --- a/test/Microsoft.ML.TestFramework/Datasets.cs +++ b/test/Microsoft.ML.TestFramework/Datasets.cs @@ -16,6 +16,7 @@ public class TestDataset public string labelFilename; public char fileSeparator; public bool fileHasHeader; + public bool allowQuoting; // REVIEW: Replace these with appropriate SubComponents! 
public string settings; @@ -212,6 +213,7 @@ public static class TestDatasets testFilename = "wikipedia-detox-250-line-test.tsv", fileHasHeader = true, fileSeparator = '\t', + allowQuoting = true, GetLoaderColumns = () => { return new[] @@ -276,6 +278,8 @@ public static class TestDatasets name = "Census", trainFilename = "adult.tiny.with-schema.txt", testFilename = "adult.tiny.with-schema.txt", + fileHasHeader = true, + fileSeparator = '\t', loaderSettings = "loader=Text{header+ col=Label:0 col=Num:9-14 col=Cat:TX:1-8}", mamlExtraSettings = new[] { "xf=Cat{col=Cat}", "xf=Concat{col=Features:Num,Cat}" }, extraSettings = @"/inst Text{header+ sep=, label=14 handler=Categorical{cols=5-9,1,13,3}}", diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/IntrospectiveTraining.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/IntrospectiveTraining.cs deleted file mode 100644 index 1c9b76c813..0000000000 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/IntrospectiveTraining.cs +++ /dev/null @@ -1,153 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using Microsoft.ML.Calibrators; -using Microsoft.ML.Data; -using Microsoft.ML.RunTests; -using Microsoft.ML.Trainers; -using Microsoft.ML.Trainers.FastTree; -using Xunit; - -namespace Microsoft.ML.Tests.Scenarios.Api -{ - - public partial class ApiScenariosTests - { - /// - /// Introspective training: Models that produce outputs and are otherwise black boxes are of limited use; - /// it is also necessary often to understand at least to some degree what was learnt. To outline critical - /// scenarios that have come up multiple times: - /// *) When I train a linear model, I should be able to inspect coefficients. - /// *) The tree ensemble learners, I should be able to inspect the trees. 
- /// *) The LDA transform, I should be able to inspect the topics. - /// I view it as essential from a usability perspective that this be discoverable to someone without - /// having to read documentation. For example, if I have var lda = new LdaTransform().Fit(data)(I don't insist on that - /// exact signature, just giving the idea), then if I were to type lda. - /// In Visual Studio, one of the auto-complete targets should be something like GetTopics. - /// - - [Fact] - public void IntrospectiveTraining() - { - var ml = new MLContext(seed: 1); - var data = ml.Data.LoadFromTextFile(GetDataPath(TestDatasets.Sentiment.trainFilename), hasHeader: true, allowQuoting: true); - - var pipeline = ml.Transforms.Text.FeaturizeText("Features", "SentimentText") - .AppendCacheCheckpoint(ml) - .Append(ml.BinaryClassification.Trainers.StochasticDualCoordinateAscentNonCalibrated( - new SdcaNonCalibratedBinaryTrainer.Options { NumberOfThreads = 1 })); - - // Train. - var model = pipeline.Fit(data); - - // Get feature weights. - var weights = model.LastTransformer.Model.Weights; - } - - [Fact] - public void FastTreeClassificationIntrospectiveTraining() - { - var ml = new MLContext(seed: 1); - var data = ml.Data.LoadFromTextFile(GetDataPath(TestDatasets.Sentiment.trainFilename), hasHeader: true, allowQuoting: true); - - var trainer = ml.BinaryClassification.Trainers.FastTree(numberOfLeaves: 5, numberOfTrees: 3); - - BinaryPredictionTransformer> pred = null; - - var pipeline = ml.Transforms.Text.FeaturizeText("Features", "SentimentText") - .AppendCacheCheckpoint(ml) - .Append(trainer.WithOnFitDelegate(p => pred = p)); - - // Train. - var model = pipeline.Fit(data); - - // Extract the learned GBDT model. - var treeCollection = pred.Model.SubModel.TrainedTreeEnsemble; - - // Inspect properties in the extracted model. 
- Assert.Equal(3, treeCollection.Trees.Count); - Assert.Equal(3, treeCollection.TreeWeights.Count); - Assert.Equal(0, treeCollection.Bias); - Assert.All(treeCollection.TreeWeights, weight => Assert.Equal(1.0, weight)); - - // Inspect the last tree. - var tree = treeCollection.Trees[2]; - - Assert.Equal(5, tree.NumberOfLeaves); - Assert.Equal(4, tree.NumberOfNodes); - Assert.Equal(tree.LeftChild, new int[] { 2, -2, -1, -3 }); - Assert.Equal(tree.RightChild, new int[] { 1, 3, -4, -5 }); - Assert.Equal(tree.NumericalSplitFeatureIndexes, new int[] { 14, 294, 633, 266 }); - Assert.Equal(tree.SplitGains.Count, tree.NumberOfNodes); - Assert.Equal(tree.NumericalSplitThresholds.Count, tree.NumberOfNodes); - var expectedSplitGains = new double[] { 0.52634223978445616, 0.45899249367725858, 0.44142707650267105, 0.38348634823264854 }; - var expectedThresholds = new float[] { 0.0911167f, 0.06509889f, 0.019873254f, 0.0361835f }; - for (int i = 0; i < tree.NumberOfNodes; ++i) - { - Assert.Equal(expectedSplitGains[i], tree.SplitGains[i], 6); - Assert.Equal(expectedThresholds[i], tree.NumericalSplitThresholds[i], 6); - } - Assert.All(tree.CategoricalSplitFlags, flag => Assert.False(flag)); - - Assert.Equal(0, tree.GetCategoricalSplitFeaturesAt(0).Count); - Assert.Equal(0, tree.GetCategoricalCategoricalSplitFeatureRangeAt(0).Count); - } - - [Fact] - public void FastForestRegressionIntrospectiveTraining() - { - var ml = new MLContext(seed: 1); - var data = SamplesUtils.DatasetUtils.GenerateFloatLabelFloatFeatureVectorSamples(1000); - var dataView = ml.Data.LoadFromEnumerable(data); - - RegressionPredictionTransformer pred = null; - var trainer = ml.Regression.Trainers.FastForest(numLeaves: 5, numTrees: 3).WithOnFitDelegate(p => pred = p); - - // Train. - var model = trainer.Fit(dataView); - - // Extract the learned RF model. - var treeCollection = pred.Model.TrainedTreeEnsemble; - - // Inspect properties in the extracted model. 
- Assert.Equal(3, treeCollection.Trees.Count); - Assert.Equal(3, treeCollection.TreeWeights.Count); - Assert.Equal(0, treeCollection.Bias); - Assert.All(treeCollection.TreeWeights, weight => Assert.Equal(1.0, weight)); - - // Inspect the last tree. - var tree = treeCollection.Trees[2]; - - Assert.Equal(5, tree.NumberOfLeaves); - Assert.Equal(4, tree.NumberOfNodes); - Assert.Equal(tree.LeftChild, new int[] { -1, -2, -3, -4 }); - Assert.Equal(tree.RightChild, new int[] { 1, 2, 3, -5 }); - Assert.Equal(tree.NumericalSplitFeatureIndexes, new int[] { 9, 0, 1, 8 }); - Assert.Equal(tree.SplitGains.Count, tree.NumberOfNodes); - Assert.Equal(tree.NumericalSplitThresholds.Count, tree.NumberOfNodes); - var expectedSplitGains = new double[] { 21.279269008093962, 19.376698810984138, 17.830020749728774, 17.366801337893413 }; - var expectedThresholds = new float[] { 0.208134219f, 0.198336035f, 0.202952743f, 0.205061346f }; - for (int i = 0; i < tree.NumberOfNodes; ++i) - { - Assert.Equal(expectedSplitGains[i], tree.SplitGains[i], 6); - Assert.Equal(expectedThresholds[i], tree.NumericalSplitThresholds[i], 6); - } - Assert.All(tree.CategoricalSplitFlags, flag => Assert.False(flag)); - - Assert.Equal(0, tree.GetCategoricalSplitFeaturesAt(0).Count); - Assert.Equal(0, tree.GetCategoricalCategoricalSplitFeatureRangeAt(0).Count); - - var samples = new double[] { 0.97468354430379744, 1.0, 0.97727272727272729, 0.972972972972973, 0.26124197002141325 }; - for (int i = 0; i < tree.NumberOfLeaves; ++i) - { - var sample = tree.GetLeafSamplesAt(i); - Assert.Single(sample); - Assert.Equal(samples[i], sample[0], 6); - var weight = tree.GetLeafSampleWeightsAt(i); - Assert.Single(weight); - Assert.Equal(1, weight[0]); - } - } - } -}