diff --git a/test/Microsoft.ML.Functional.Tests/Common.cs b/test/Microsoft.ML.Functional.Tests/Common.cs
index a2f47e66b4..5756893966 100644
--- a/test/Microsoft.ML.Functional.Tests/Common.cs
+++ b/test/Microsoft.ML.Functional.Tests/Common.cs
@@ -267,5 +267,35 @@ public static void AssertMetricsStatistics(RegressionMetricsStatistics metrics)
AssertMetricStatistics(metrics.RSquared);
AssertMetricStatistics(metrics.LossFunction);
}
+
+ /// <summary>
+ /// Verify that a float array has no NaNs or infinities.
+ /// </summary>
+ /// <param name="array">An array of floats.</param>
+ public static void AssertFiniteNumbers(IList<float> array, int ignoreElementAt = -1)
+ {
+ for (int i = 0; i < array.Count; i++)
+ {
+ if (i == ignoreElementAt)
+ continue;
+ Assert.False(float.IsNaN(array[i]));
+ Assert.False(float.IsInfinity(array[i]));
+ }
+ }
+
+ /// <summary>
+ /// Verify that a double array has no NaNs or infinities.
+ /// </summary>
+ /// <param name="array">An array of doubles.</param>
+ public static void AssertFiniteNumbers(IList<double> array, int ignoreElementAt = -1)
+ {
+ for (int i = 0; i < array.Count; i++)
+ {
+ if (i == ignoreElementAt)
+ continue;
+ Assert.False(double.IsNaN(array[i]));
+ Assert.False(double.IsInfinity(array[i]));
+ }
+ }
}
}
diff --git a/test/Microsoft.ML.Functional.Tests/Datasets/Adult.cs b/test/Microsoft.ML.Functional.Tests/Datasets/Adult.cs
new file mode 100644
index 0000000000..440515ac06
--- /dev/null
+++ b/test/Microsoft.ML.Functional.Tests/Datasets/Adult.cs
@@ -0,0 +1,69 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using Microsoft.ML.Data;
+
+namespace Microsoft.ML.Functional.Tests.Datasets
+{
+ /// <summary>
+ /// A class for the Adult test dataset.
+ /// </summary>
+ internal sealed class Adult
+ {
+ [LoadColumn(0)]
+ public bool Label { get; set; }
+
+ [LoadColumn(1)]
+ public string WorkClass { get; set; }
+
+ [LoadColumn(2)]
+ public string Education { get; set; }
+
+ [LoadColumn(3)]
+ public string MaritalStatus { get; set; }
+
+ [LoadColumn(4)]
+ public string Occupation { get; set; }
+
+ [LoadColumn(5)]
+ public string Relationship { get; set; }
+
+ [LoadColumn(6)]
+ public string Ethnicity { get; set; }
+
+ [LoadColumn(7)]
+ public string Sex { get; set; }
+
+ [LoadColumn(8)]
+ public string NativeCountryRegion { get; set; }
+
+ [LoadColumn(9)]
+ public float Age { get; set; }
+
+ [LoadColumn(10)]
+ public float FinalWeight { get; set; }
+
+ [LoadColumn(11)]
+ public float EducationNum { get; set; }
+
+ [LoadColumn(12)]
+ public float CapitalGain { get; set; }
+
+ [LoadColumn(13)]
+ public float CapitalLoss { get; set; }
+
+ [LoadColumn(14)]
+ public float HoursPerWeek { get; set; }
+
+ /// <summary>
+ /// The list of columns commonly used as categorical features.
+ /// </summary>
+ public static readonly string[] CategoricalFeatures = new string[] { "WorkClass", "Education", "MaritalStatus", "Occupation", "Relationship", "Ethnicity", "Sex", "NativeCountryRegion" };
+
+ /// <summary>
+ /// The list of columns commonly used as numerical features.
+ /// </summary>
+ public static readonly string[] NumericalFeatures = new string[] { "Age", "FinalWeight", "EducationNum", "CapitalGain", "CapitalLoss", "HoursPerWeek" };
+ }
+}
diff --git a/test/Microsoft.ML.Functional.Tests/Evaluation.cs b/test/Microsoft.ML.Functional.Tests/Evaluation.cs
index 437bb7fab5..36f000fc8f 100644
--- a/test/Microsoft.ML.Functional.Tests/Evaluation.cs
+++ b/test/Microsoft.ML.Functional.Tests/Evaluation.cs
@@ -237,17 +237,11 @@ public void TrainAndEvaluateRegression()
{
var mlContext = new MLContext(seed: 1);
- // Get the dataset.
- var data = mlContext.Data.CreateTextLoader(TestDatasets.housing.GetLoaderColumns(),
- hasHeader: TestDatasets.housing.fileHasHeader, separatorChar: TestDatasets.housing.fileSeparator)
- .Load(GetDataPath(TestDatasets.housing.trainFilename));
-
- // Create a pipeline to train on the sentiment data.
- var pipeline = mlContext.Transforms.Concatenate("Features", new string[] {
- "CrimesPerCapita", "PercentResidental", "PercentNonRetail", "CharlesRiver", "NitricOxides", "RoomsPerDwelling",
- "PercentPre40s", "EmploymentDistance", "HighwayDistance", "TaxRate", "TeacherRatio"})
- .Append(mlContext.Transforms.CopyColumns("Label", "MedianHomeValue"))
- .Append(mlContext.Regression.Trainers.FastTree(new FastTreeRegressionTrainer.Options { NumberOfThreads = 1 }));
+ // Get the dataset
+ var data = mlContext.Data.LoadFromTextFile<HousingRegression>(GetDataPath(TestDatasets.housing.trainFilename), hasHeader: true);
+ // Create a pipeline to train on the housing data.
+ var pipeline = mlContext.Transforms.Concatenate("Features", HousingRegression.Features)
+ .Append(mlContext.Regression.Trainers.FastForest(new FastForestRegression.Options { NumberOfThreads = 1 }));
// Train the model.
var model = pipeline.Fit(data);
diff --git a/test/Microsoft.ML.Functional.Tests/IntrospectiveTraining.cs b/test/Microsoft.ML.Functional.Tests/IntrospectiveTraining.cs
new file mode 100644
index 0000000000..69d8773fc9
--- /dev/null
+++ b/test/Microsoft.ML.Functional.Tests/IntrospectiveTraining.cs
@@ -0,0 +1,442 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.Collections.Generic;
+using System.Collections.Immutable;
+using System.Linq;
+using Microsoft.ML.Data;
+using Microsoft.ML.Functional.Tests.Datasets;
+using Microsoft.ML.RunTests;
+using Microsoft.ML.TestFramework;
+using Microsoft.ML.Trainers;
+using Microsoft.ML.Trainers.FastTree;
+using Microsoft.ML.Transforms;
+using Xunit;
+using Xunit.Abstractions;
+
+namespace Microsoft.ML.Functional.Tests
+{
+ public class IntrospectiveTraining : BaseTestClass
+ {
+ public IntrospectiveTraining(ITestOutputHelper output) : base(output)
+ {
+ }
+
+ /// <summary>
+ /// Introspective Training: Tree ensembles learned from FastForest can be inspected.
+ /// </summary>
+ [Fact]
+ public void InspectFastForestRegresionTrees()
+ {
+ var mlContext = new MLContext(seed: 1);
+
+ // Get the dataset.
+ var data = mlContext.Data.LoadFromTextFile<HousingRegression>(GetDataPath(TestDatasets.housing.trainFilename), hasHeader: true);
+
+ // Create a pipeline to train on the housing data.
+ var pipeline = mlContext.Transforms.Concatenate("Features", HousingRegression.Features)
+ .Append(mlContext.Regression.Trainers.FastForest(
+ new FastForestRegression.Options { NumberOfLeaves = 5, NumberOfTrees = 3, NumberOfThreads = 1 }));
+
+ // Fit the pipeline.
+ var model = pipeline.Fit(data);
+
+ // Extract the boosted tree model.
+ var fastForestModel = model.LastTransformer.Model;
+
+ // Extract the learned Random Forests model.
+ var treeCollection = fastForestModel.TrainedTreeEnsemble;
+
+ // Inspect properties in the extracted model.
+ Assert.Equal(3, treeCollection.Trees.Count);
+ Assert.Equal(3, treeCollection.TreeWeights.Count);
+ Assert.All(treeCollection.TreeWeights, weight => Assert.Equal(1.0, weight));
+ Assert.All(treeCollection.Trees, tree =>
+ {
+ Assert.Equal(5, tree.NumberOfLeaves);
+ Assert.Equal(4, tree.NumberOfNodes);
+ Assert.Equal(tree.SplitGains.Count, tree.NumberOfNodes);
+ Assert.Equal(tree.NumericalSplitThresholds.Count, tree.NumberOfNodes);
+ Assert.All(tree.CategoricalSplitFlags, flag => Assert.False(flag));
+ Assert.Equal(0, tree.GetCategoricalSplitFeaturesAt(0).Count);
+ Assert.Equal(0, tree.GetCategoricalCategoricalSplitFeatureRangeAt(0).Count);
+ });
+ }
+
+ /// <summary>
+ /// Introspective Training: Tree ensembles learned from FastTree can be inspected.
+ /// </summary>
+ [Fact]
+ public void InspectFastTreeModelParameters()
+ {
+ var mlContext = new MLContext(seed: 1);
+
+ var data = mlContext.Data.LoadFromTextFile<TweetSentiment>(GetDataPath(TestDatasets.Sentiment.trainFilename),
+ hasHeader: TestDatasets.Sentiment.fileHasHeader,
+ separatorChar: TestDatasets.Sentiment.fileSeparator,
+ allowQuoting: TestDatasets.Sentiment.allowQuoting);
+
+ // Create a training pipeline.
+ var pipeline = mlContext.Transforms.Text.FeaturizeText("Features", "SentimentText")
+ .AppendCacheCheckpoint(mlContext)
+ .Append(mlContext.BinaryClassification.Trainers.FastTree(
+ new FastTreeBinaryClassificationTrainer.Options{ NumberOfLeaves = 5, NumberOfTrees= 3, NumberOfThreads = 1 }));
+
+ // Fit the pipeline.
+ var model = pipeline.Fit(data);
+
+ // Extract the boosted tree model.
+ var fastTreeModel = model.LastTransformer.Model.SubModel;
+
+ // Extract the learned GBDT model.
+ var treeCollection = fastTreeModel.TrainedTreeEnsemble;
+
+ // Make sure the tree models were formed as expected.
+ Assert.Equal(3, treeCollection.Trees.Count);
+ Assert.Equal(3, treeCollection.TreeWeights.Count);
+ Assert.All(treeCollection.TreeWeights, weight => Assert.Equal(1.0, weight));
+ Assert.All(treeCollection.Trees, tree =>
+ {
+ Assert.Equal(5, tree.NumberOfLeaves);
+ Assert.Equal(4, tree.NumberOfNodes);
+ Assert.Equal(tree.SplitGains.Count, tree.NumberOfNodes);
+ Assert.Equal(tree.NumericalSplitThresholds.Count, tree.NumberOfNodes);
+ Assert.All(tree.CategoricalSplitFlags, flag => Assert.False(flag));
+ Assert.Equal(0, tree.GetCategoricalSplitFeaturesAt(0).Count);
+ Assert.Equal(0, tree.GetCategoricalCategoricalSplitFeatureRangeAt(0).Count);
+ });
+
+ // Add baselines for the model.
+ // Verify that there is no bias.
+ Assert.Equal(0, treeCollection.Bias);
+ // Check the parameters of the final tree.
+ var finalTree = treeCollection.Trees[2];
+ Assert.Equal(finalTree.LeftChild, new int[] { 2, -2, -1, -3 });
+ Assert.Equal(finalTree.RightChild, new int[] { 1, 3, -4, -5 });
+ Assert.Equal(finalTree.NumericalSplitFeatureIndexes, new int[] { 14, 294, 633, 266 });
+ var expectedSplitGains = new double[] { 0.52634223978445616, 0.45899249367725858, 0.44142707650267105, 0.38348634823264854 };
+ var expectedThresholds = new float[] { 0.0911167f, 0.06509889f, 0.019873254f, 0.0361835f };
+ for (int i = 0; i < finalTree.NumberOfNodes; ++i)
+ {
+ Assert.Equal(expectedSplitGains[i], finalTree.SplitGains[i], 6);
+ Assert.Equal(expectedThresholds[i], finalTree.NumericalSplitThresholds[i], 6);
+ }
+ }
+
+ /// <summary>
+ /// Introspective Training: GAM Shape Functions are easily accessed.
+ /// </summary>
+ [Fact]
+ void IntrospectGamShapeFunctions()
+ {
+ // Concurrency must be 1 to assure that the mapping is done sequentially.
+ var mlContext = new MLContext(seed: 1);
+
+ // Load the Iris dataset.
+ var data = mlContext.Data.LoadFromTextFile<Iris>(
+ GetDataPath(TestDatasets.iris.trainFilename),
+ hasHeader: TestDatasets.iris.fileHasHeader,
+ separatorChar: TestDatasets.iris.fileSeparator);
+
+ // Compose the transformation.
+ var pipeline = mlContext.Transforms.Concatenate("Features", Iris.Features)
+ .Append(mlContext.Regression.Trainers.GeneralizedAdditiveModels(
+ new RegressionGamTrainer.Options { NumberOfIterations = 100, NumberOfThreads = 1 }));
+
+ // Fit the pipeline.
+ var model = pipeline.Fit(data);
+
+ // Extract the normalizer from the trained pipeline.
+ var gamModel = model.LastTransformer.Model;
+
+ // Take look at the shape functions.
+ for (int i = 0; i < gamModel.NumberOfShapeFunctions; i++)
+ {
+ var shapeFunctionBins = gamModel.GetBinUpperBounds(i);
+ var shapeFunctionValues = gamModel.GetBinEffects(i);
+
+ // Validate that the shape functions lengths match.
+ Assert.Equal(shapeFunctionBins.Count, shapeFunctionValues.Count);
+ Common.AssertFiniteNumbers(shapeFunctionBins as IList<double>, shapeFunctionBins.Count - 1);
+ Common.AssertFiniteNumbers(shapeFunctionValues as IList<double>);
+ }
+ }
+
+ /// <summary>
+ /// Introspective Training: LDA models can be easily inspected.
+ /// </summary>
+ [Fact]
+ public void InspectLdaModelParameters()
+ {
+ // Test Parameters
+ int numTopics = 10;
+
+ var mlContext = new MLContext(seed: 1);
+
+ // Load the dataset.
+ var data = mlContext.Data.LoadFromTextFile<TweetSentiment>(GetDataPath(TestDatasets.Sentiment.trainFilename),
+ hasHeader: TestDatasets.Sentiment.fileHasHeader,
+ separatorChar: TestDatasets.Sentiment.fileSeparator,
+ allowQuoting: TestDatasets.Sentiment.allowQuoting);
+
+ // Define the pipeline.
+ var pipeline = mlContext.Transforms.Text.ProduceWordBags("SentimentBag", "SentimentText")
+ .Append(mlContext.Transforms.Text.LatentDirichletAllocation("Features", "SentimentBag", numTopic: numTopics, numIterations: 10));
+
+ // Fit the pipeline.
+ var model = pipeline.Fit(data);
+
+ // Get the trained LDA model.
+ // TODO #2197: Get the topics and summaries from the model.
+ var ldaTransform = model.LastTransformer;
+
+ // Transform the data.
+ var transformedData = model.Transform(data);
+
+ // Make sure the model weights array is the same length as the features array.
+ var numFeatures = (transformedData.Schema["Features"].Type as VectorType).Size;
+ Assert.Equal(numFeatures, numTopics);
+ }
+
+ /// <summary>
+ /// Introspective Training: Linear model parameters may be inspected.
+ /// </summary>
+ [Fact]
+ public void InpsectLinearModelParameters()
+ {
+ var mlContext = new MLContext(seed: 1);
+
+ var data = mlContext.Data.LoadFromTextFile<TweetSentiment>(GetDataPath(TestDatasets.Sentiment.trainFilename),
+ hasHeader: TestDatasets.Sentiment.fileHasHeader,
+ separatorChar: TestDatasets.Sentiment.fileSeparator,
+ allowQuoting: TestDatasets.Sentiment.allowQuoting);
+
+ // Create a training pipeline.
+ var pipeline = mlContext.Transforms.Text.FeaturizeText("Features", "SentimentText")
+ .AppendCacheCheckpoint(mlContext)
+ .Append(mlContext.BinaryClassification.Trainers.StochasticDualCoordinateAscentNonCalibrated(
+ new SdcaNonCalibratedBinaryTrainer.Options { NumberOfThreads = 1 }));
+
+ // Fit the pipeline.
+ var model = pipeline.Fit(data);
+
+ // Transform the data.
+ var transformedData = model.Transform(data);
+
+ // Extract the linear model from the pipeline.
+ var linearModel = model.LastTransformer.Model;
+
+ // Get the model bias and weights.
+ var bias = linearModel.Bias;
+ var weights = linearModel.Weights;
+
+ // Make sure the model weights array is the same length as the features array.
+ var numFeatures = (transformedData.Schema["Features"].Type as VectorType).Size;
+ Assert.Equal(numFeatures, weights.Count);
+ }
+
+ /// <summary>
+ /// Introspectable Training: Parameters of a trained Normalizer are easily accessed.
+ /// </summary>
+ [Fact]
+ void IntrospectNormalization()
+ {
+ // Concurrency must be 1 to assure that the mapping is done sequentially.
+ var mlContext = new MLContext(seed: 1);
+
+ // Load the Iris dataset.
+ var data = mlContext.Data.LoadFromTextFile<Iris>(
+ GetDataPath(TestDatasets.iris.trainFilename),
+ hasHeader: TestDatasets.iris.fileHasHeader,
+ separatorChar: TestDatasets.iris.fileSeparator);
+
+ // Compose the transformation.
+ var pipeline = mlContext.Transforms.Concatenate("Features", Iris.Features)
+ .Append(mlContext.Transforms.Normalize("Features", mode: NormalizingEstimator.NormalizerMode.MinMax));
+
+ // Fit the pipeline.
+ var model = pipeline.Fit(data);
+
+ // Extract the normalizer from the trained pipeline.
+ var normalizer = model.LastTransformer;
+
+ // Extract the normalizer parameters.
+ // TODO #2854: Normalizer parameters are easy to find via intellisense.
+ int i = 0;
+ bool found = false;
+ foreach (var column in normalizer.Columns)
+ {
+ if (column.Name == "Features")
+ {
+ found = true;
+ var featuresNormalizer = normalizer.Columns[i].ModelParameters as NormalizingTransformer.AffineNormalizerModelParameters<ImmutableArray<float>>;
+ Assert.NotNull(featuresNormalizer);
+ Common.AssertFiniteNumbers(featuresNormalizer.Offset);
+ Common.AssertFiniteNumbers(featuresNormalizer.Scale);
+ }
+ i++;
+ }
+ Assert.True(found);
+ }
+ /// <summary>
+ /// Introspective Training: I can inspect a pipeline to determine which transformers were included.
+ /// </summary>
+ [Fact]
+ public void InspectPipelineContents()
+ {
+ var mlContext = new MLContext(seed: 1);
+
+ // Get the dataset.
+ var data = mlContext.Data.LoadFromTextFile<HousingRegression>(GetDataPath(TestDatasets.housing.trainFilename), hasHeader: true);
+
+ // Create a pipeline to train on the housing data.
+ var pipeline = mlContext.Transforms.Concatenate("Features", HousingRegression.Features)
+ .Append(mlContext.Regression.Trainers.FastForest(numLeaves: 5, numTrees: 3));
+
+ // Fit the pipeline.
+ var model = pipeline.Fit(data);
+
+ // Inspect the transforms in the trained pipeline.
+ var expectedTypes = new Type[] {typeof(ColumnConcatenatingTransformer),
+ typeof(RegressionPredictionTransformer<FastForestRegressionModelParameters>)};
+ var expectedColumns = new string[][] {
+ new string[] { "Features" },
+ new string[] { "Score" },
+ };
+ int i = 0;
+ var currentSchema = data.Schema;
+ foreach (var transformer in model)
+ {
+ // It is possible to get the type at runtime.
+ Assert.IsType(expectedTypes[i], transformer);
+
+ // It's also possible to inspect the schema output from the transform.
+ currentSchema = transformer.GetOutputSchema(currentSchema);
+ foreach (var expectedColumn in expectedColumns[i])
+ {
+ var column = currentSchema.GetColumnOrNull(expectedColumn);
+ Assert.NotNull(column);
+ }
+ i++;
+ }
+ }
+
+ /// <summary>
+ /// Introspective Training: Hashed values can be mapped back to the original column and value.
+ /// </summary>
+ [Fact]
+ public void InspectSlotNamesForReversibleHash()
+ {
+ var mlContext = new MLContext(seed: 1);
+
+ // Load the Adult dataset.
+ var data = mlContext.Data.LoadFromTextFile<Adult>(GetDataPath(TestDatasets.adult.trainFilename),
+ hasHeader: TestDatasets.adult.fileHasHeader,
+ separatorChar: TestDatasets.adult.fileSeparator);
+
+ // Create the learning pipeline.
+ var pipeline = mlContext.Transforms.Concatenate("NumericalFeatures", Adult.NumericalFeatures)
+ .Append(mlContext.Transforms.Concatenate("CategoricalFeatures", Adult.CategoricalFeatures))
+ .Append(mlContext.Transforms.Categorical.OneHotHashEncoding("CategoricalFeatures", hashBits: 8, // get collisions!
+ invertHash: -1, outputKind: OneHotEncodingTransformer.OutputKind.Bag));
+
+ // Fit the pipeline.
+ var model = pipeline.Fit(data);
+
+ // Transform the data.
+ var transformedData = model.Transform(data);
+
+ // Verify that the slotnames can be used to backtrack to the original values by confirming that
+ // all unique values in the input data are in the output data slot names.
+ // First get a list of the unique values.
+ VBuffer<ReadOnlyMemory<char>> categoricalSlotNames = new VBuffer<ReadOnlyMemory<char>>();
+ transformedData.Schema["CategoricalFeatures"].GetSlotNames(ref categoricalSlotNames);
+ var uniqueValues = new HashSet<string>();
+ foreach (var slotName in categoricalSlotNames.GetValues())
+ {
+ var slotNameString = slotName.ToString();
+ if (slotNameString.StartsWith("{"))
+ {
+ // Values look like this: {3:Exec-managerial,2:Widowed}.
+ slotNameString = slotNameString.Substring(1, slotNameString.Length - 2);
+ foreach (var name in slotNameString.Split(','))
+ uniqueValues.Add(name);
+ }
+ else
+ uniqueValues.Add(slotNameString);
+ }
+
+ // Now validate that all values in the dataset are there.
+ var transformedRows = mlContext.Data.CreateEnumerable<Adult>(data, false);
+ foreach (var row in transformedRows)
+ {
+ for (int i = 0; i < Adult.CategoricalFeatures.Length; i++)
+ {
+ // Fetch the categorical value.
+ string value = (string) row.GetType().GetProperty(Adult.CategoricalFeatures[i]).GetValue(row, null);
+ Assert.Contains($"{i}:{value}", uniqueValues);
+ }
+ }
+ }
+
+ /// <summary>
+ /// Introspective Training: I can create nested pipelines, and extract individual components.
+ /// </summary>
+ [Fact]
+ public void InspectNestedPipeline()
+ {
+ var mlContext = new MLContext(seed: 1);
+
+ var data = mlContext.Data.LoadFromTextFile<Iris>(GetDataPath(TestDatasets.iris.trainFilename),
+ hasHeader: TestDatasets.iris.fileHasHeader,
+ separatorChar: TestDatasets.iris.fileSeparator);
+
+ // Create a training pipeline.
+ var pipeline = mlContext.Transforms.Concatenate("Features", Iris.Features)
+ .Append(StepOne(mlContext))
+ .Append(StepTwo(mlContext));
+
+ // Fit the pipeline.
+ var model = pipeline.Fit(data);
+
+ // Extract the trained models.
+ var modelComponents = model.ToList();
+ var kMeansModel = (modelComponents[1] as TransformerChain<ClusteringPredictionTransformer<KMeansModelParameters>>).LastTransformer;
+ var mcLrModel = (modelComponents[2] as TransformerChain<MulticlassPredictionTransformer<MulticlassLogisticRegressionModelParameters>>).LastTransformer;
+
+ // Validate the k-means model.
+ VBuffer<float>[] centroids = default;
+ kMeansModel.Model.GetClusterCentroids(ref centroids, out int nCentroids);
+ Assert.Equal(4, centroids.Length);
+
+ // Validate the MulticlassLogisticRegressionModel.
+ VBuffer<float>[] weights = default;
+ mcLrModel.Model.GetWeights(ref weights, out int classes);
+ Assert.Equal(3, weights.Length);
+ }
+
+ private IEstimator<TransformerChain<ClusteringPredictionTransformer<KMeansModelParameters>>> StepOne(MLContext mlContext)
+ {
+ return mlContext.Transforms.Concatenate("LabelAndFeatures", "Label", "Features")
+ .Append(mlContext.Clustering.Trainers.KMeans(
+ new KMeansPlusPlusTrainer.Options
+ {
+ InitializationAlgorithm = KMeansPlusPlusTrainer.InitializationAlgorithm.Random,
+ NumberOfClusters = 4,
+ NumberOfIterations = 10,
+ NumberOfThreads = 1
+ }));
+ }
+
+ private IEstimator<TransformerChain<MulticlassPredictionTransformer<MulticlassLogisticRegressionModelParameters>>> StepTwo(MLContext mlContext)
+ {
+ return mlContext.Transforms.Conversion.MapValueToKey("Label")
+ .Append(mlContext.MulticlassClassification.Trainers.StochasticDualCoordinateAscent(
+ new SdcaMultiClassTrainer.Options {
+ NumberOfIterations = 10,
+ NumberOfThreads = 1 }));
+ }
+ }
+}
\ No newline at end of file
diff --git a/test/Microsoft.ML.Functional.Tests/Validation.cs b/test/Microsoft.ML.Functional.Tests/Validation.cs
index 686e1a0ec6..49a5db6693 100644
--- a/test/Microsoft.ML.Functional.Tests/Validation.cs
+++ b/test/Microsoft.ML.Functional.Tests/Validation.cs
@@ -4,16 +4,22 @@
using Microsoft.Data.DataView;
using Microsoft.ML.Data;
+using Microsoft.ML.Functional.Tests.Datasets;
using Microsoft.ML.RunTests;
using Microsoft.ML.TestFramework;
-using Microsoft.ML.Trainers.FastTree;
using Microsoft.ML.Trainers;
+using Microsoft.ML.Trainers.FastTree;
using Xunit;
+using Xunit.Abstractions;
namespace Microsoft.ML.Functional.Tests
{
- public class ValidationScenarios
+ public class Validation : BaseTestClass
{
+ public Validation(ITestOutputHelper output) : base(output)
+ {
+ }
+
///
/// Cross-validation: Have a mechanism to do cross validation, that is, you come up with
/// a data source (optionally with stratification column), come up with an instantiable transform
@@ -26,16 +32,11 @@ void CrossValidation()
{
var mlContext = new MLContext(seed: 1);
- // Get the dataset.
- var data = mlContext.Data.CreateTextLoader(TestDatasets.housing.GetLoaderColumns(),
- hasHeader: TestDatasets.housing.fileHasHeader, separatorChar: TestDatasets.housing.fileSeparator)
- .Load(BaseTestClass.GetDataPath(TestDatasets.housing.trainFilename));
-
- // Create a pipeline to train on the sentiment data.
- var pipeline = mlContext.Transforms.Concatenate("Features", new string[] {
- "CrimesPerCapita", "PercentResidental", "PercentNonRetail", "CharlesRiver", "NitricOxides", "RoomsPerDwelling",
- "PercentPre40s", "EmploymentDistance", "HighwayDistance", "TaxRate", "TeacherRatio"})
- .Append(mlContext.Transforms.CopyColumns("Label", "MedianHomeValue"))
+ // Get the dataset
+ var data = mlContext.Data.LoadFromTextFile(GetDataPath(TestDatasets.housing.trainFilename), hasHeader: true);
+
+ // Create a pipeline to train on the housing data.
+ var pipeline = mlContext.Transforms.Concatenate("Features", HousingRegression.Features)
.Append(mlContext.Regression.Trainers.OrdinaryLeastSquares());
// Compute the CV result.
@@ -61,18 +62,15 @@ public void TrainWithValidationSet()
var mlContext = new MLContext(seed: 1);
// Get the dataset.
- var data = mlContext.Data.CreateTextLoader(TestDatasets.housing.GetLoaderColumns(),
- hasHeader: TestDatasets.housing.fileHasHeader, separatorChar: TestDatasets.housing.fileSeparator)
- .Load(BaseTestClass.GetDataPath(TestDatasets.housing.trainFilename));
+ var data = mlContext.Data.LoadFromTextFile(GetDataPath(TestDatasets.housing.trainFilename), hasHeader: true);
+
+ // Create the train and validation set.
var dataSplit = mlContext.Regression.TrainTestSplit(data, testFraction: 0.2);
var trainData = dataSplit.TrainSet;
var validData = dataSplit.TestSet;
// Create a pipeline to featurize the dataset.
- var pipeline = mlContext.Transforms.Concatenate("Features", new string[] {
- "CrimesPerCapita", "PercentResidental", "PercentNonRetail", "CharlesRiver", "NitricOxides", "RoomsPerDwelling",
- "PercentPre40s", "EmploymentDistance", "HighwayDistance", "TaxRate", "TeacherRatio"})
- .Append(mlContext.Transforms.CopyColumns("Label", "MedianHomeValue"))
+ var pipeline = mlContext.Transforms.Concatenate("Features", HousingRegression.Features)
.AppendCacheCheckpoint(mlContext) as IEstimator<ITransformer>;
// Preprocess the datasets.
@@ -81,7 +79,7 @@ public void TrainWithValidationSet()
var preprocessedValidData = preprocessor.Transform(validData);
// Train the model with a validation set.
- var trainedModel = mlContext.Regression.Trainers.FastTree(new Trainers.FastTree.FastTreeRegressionTrainer.Options {
+ var trainedModel = mlContext.Regression.Trainers.FastTree(new FastTreeRegressionTrainer.Options {
NumberOfTrees = 2,
EarlyStoppingMetric = EarlyStoppingMetric.L2Norm,
EarlyStoppingRule = new GeneralityLossRule()
diff --git a/test/Microsoft.ML.TestFramework/Datasets.cs b/test/Microsoft.ML.TestFramework/Datasets.cs
index d279332cad..23cb24dc62 100644
--- a/test/Microsoft.ML.TestFramework/Datasets.cs
+++ b/test/Microsoft.ML.TestFramework/Datasets.cs
@@ -16,6 +16,7 @@ public class TestDataset
public string labelFilename;
public char fileSeparator;
public bool fileHasHeader;
+ public bool allowQuoting;
// REVIEW: Replace these with appropriate SubComponents!
public string settings;
@@ -212,6 +213,7 @@ public static class TestDatasets
testFilename = "wikipedia-detox-250-line-test.tsv",
fileHasHeader = true,
fileSeparator = '\t',
+ allowQuoting = true,
GetLoaderColumns = () =>
{
return new[]
@@ -276,6 +278,8 @@ public static class TestDatasets
name = "Census",
trainFilename = "adult.tiny.with-schema.txt",
testFilename = "adult.tiny.with-schema.txt",
+ fileHasHeader = true,
+ fileSeparator = '\t',
loaderSettings = "loader=Text{header+ col=Label:0 col=Num:9-14 col=Cat:TX:1-8}",
mamlExtraSettings = new[] { "xf=Cat{col=Cat}", "xf=Concat{col=Features:Num,Cat}" },
extraSettings = @"/inst Text{header+ sep=, label=14 handler=Categorical{cols=5-9,1,13,3}}",
diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/IntrospectiveTraining.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/IntrospectiveTraining.cs
deleted file mode 100644
index 1c9b76c813..0000000000
--- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/IntrospectiveTraining.cs
+++ /dev/null
@@ -1,153 +0,0 @@
-// Licensed to the .NET Foundation under one or more agreements.
-// The .NET Foundation licenses this file to you under the MIT license.
-// See the LICENSE file in the project root for more information.
-
-using Microsoft.ML.Calibrators;
-using Microsoft.ML.Data;
-using Microsoft.ML.RunTests;
-using Microsoft.ML.Trainers;
-using Microsoft.ML.Trainers.FastTree;
-using Xunit;
-
-namespace Microsoft.ML.Tests.Scenarios.Api
-{
-
- public partial class ApiScenariosTests
- {
- ///
- /// Introspective training: Models that produce outputs and are otherwise black boxes are of limited use;
- /// it is also necessary often to understand at least to some degree what was learnt. To outline critical
- /// scenarios that have come up multiple times:
- /// *) When I train a linear model, I should be able to inspect coefficients.
- /// *) The tree ensemble learners, I should be able to inspect the trees.
- /// *) The LDA transform, I should be able to inspect the topics.
- /// I view it as essential from a usability perspective that this be discoverable to someone without
- /// having to read documentation. For example, if I have var lda = new LdaTransform().Fit(data)(I don't insist on that
- /// exact signature, just giving the idea), then if I were to type lda.
- /// In Visual Studio, one of the auto-complete targets should be something like GetTopics.
- ///
-
- [Fact]
- public void IntrospectiveTraining()
- {
- var ml = new MLContext(seed: 1);
- var data = ml.Data.LoadFromTextFile<SentimentData>(GetDataPath(TestDatasets.Sentiment.trainFilename), hasHeader: true, allowQuoting: true);
-
- var pipeline = ml.Transforms.Text.FeaturizeText("Features", "SentimentText")
- .AppendCacheCheckpoint(ml)
- .Append(ml.BinaryClassification.Trainers.StochasticDualCoordinateAscentNonCalibrated(
- new SdcaNonCalibratedBinaryTrainer.Options { NumberOfThreads = 1 }));
-
- // Train.
- var model = pipeline.Fit(data);
-
- // Get feature weights.
- var weights = model.LastTransformer.Model.Weights;
- }
-
- [Fact]
- public void FastTreeClassificationIntrospectiveTraining()
- {
- var ml = new MLContext(seed: 1);
- var data = ml.Data.LoadFromTextFile<SentimentData>(GetDataPath(TestDatasets.Sentiment.trainFilename), hasHeader: true, allowQuoting: true);
-
- var trainer = ml.BinaryClassification.Trainers.FastTree(numberOfLeaves: 5, numberOfTrees: 3);
-
- BinaryPredictionTransformer> pred = null;
-
- var pipeline = ml.Transforms.Text.FeaturizeText("Features", "SentimentText")
- .AppendCacheCheckpoint(ml)
- .Append(trainer.WithOnFitDelegate(p => pred = p));
-
- // Train.
- var model = pipeline.Fit(data);
-
- // Extract the learned GBDT model.
- var treeCollection = pred.Model.SubModel.TrainedTreeEnsemble;
-
- // Inspect properties in the extracted model.
- Assert.Equal(3, treeCollection.Trees.Count);
- Assert.Equal(3, treeCollection.TreeWeights.Count);
- Assert.Equal(0, treeCollection.Bias);
- Assert.All(treeCollection.TreeWeights, weight => Assert.Equal(1.0, weight));
-
- // Inspect the last tree.
- var tree = treeCollection.Trees[2];
-
- Assert.Equal(5, tree.NumberOfLeaves);
- Assert.Equal(4, tree.NumberOfNodes);
- Assert.Equal(tree.LeftChild, new int[] { 2, -2, -1, -3 });
- Assert.Equal(tree.RightChild, new int[] { 1, 3, -4, -5 });
- Assert.Equal(tree.NumericalSplitFeatureIndexes, new int[] { 14, 294, 633, 266 });
- Assert.Equal(tree.SplitGains.Count, tree.NumberOfNodes);
- Assert.Equal(tree.NumericalSplitThresholds.Count, tree.NumberOfNodes);
- var expectedSplitGains = new double[] { 0.52634223978445616, 0.45899249367725858, 0.44142707650267105, 0.38348634823264854 };
- var expectedThresholds = new float[] { 0.0911167f, 0.06509889f, 0.019873254f, 0.0361835f };
- for (int i = 0; i < tree.NumberOfNodes; ++i)
- {
- Assert.Equal(expectedSplitGains[i], tree.SplitGains[i], 6);
- Assert.Equal(expectedThresholds[i], tree.NumericalSplitThresholds[i], 6);
- }
- Assert.All(tree.CategoricalSplitFlags, flag => Assert.False(flag));
-
- Assert.Equal(0, tree.GetCategoricalSplitFeaturesAt(0).Count);
- Assert.Equal(0, tree.GetCategoricalCategoricalSplitFeatureRangeAt(0).Count);
- }
-
- [Fact]
- public void FastForestRegressionIntrospectiveTraining()
- {
- var ml = new MLContext(seed: 1);
- var data = SamplesUtils.DatasetUtils.GenerateFloatLabelFloatFeatureVectorSamples(1000);
- var dataView = ml.Data.LoadFromEnumerable(data);
-
- RegressionPredictionTransformer<FastForestRegressionModelParameters> pred = null;
- var trainer = ml.Regression.Trainers.FastForest(numLeaves: 5, numTrees: 3).WithOnFitDelegate(p => pred = p);
-
- // Train.
- var model = trainer.Fit(dataView);
-
- // Extract the learned RF model.
- var treeCollection = pred.Model.TrainedTreeEnsemble;
-
- // Inspect properties in the extracted model.
- Assert.Equal(3, treeCollection.Trees.Count);
- Assert.Equal(3, treeCollection.TreeWeights.Count);
- Assert.Equal(0, treeCollection.Bias);
- Assert.All(treeCollection.TreeWeights, weight => Assert.Equal(1.0, weight));
-
- // Inspect the last tree.
- var tree = treeCollection.Trees[2];
-
- Assert.Equal(5, tree.NumberOfLeaves);
- Assert.Equal(4, tree.NumberOfNodes);
- Assert.Equal(tree.LeftChild, new int[] { -1, -2, -3, -4 });
- Assert.Equal(tree.RightChild, new int[] { 1, 2, 3, -5 });
- Assert.Equal(tree.NumericalSplitFeatureIndexes, new int[] { 9, 0, 1, 8 });
- Assert.Equal(tree.SplitGains.Count, tree.NumberOfNodes);
- Assert.Equal(tree.NumericalSplitThresholds.Count, tree.NumberOfNodes);
- var expectedSplitGains = new double[] { 21.279269008093962, 19.376698810984138, 17.830020749728774, 17.366801337893413 };
- var expectedThresholds = new float[] { 0.208134219f, 0.198336035f, 0.202952743f, 0.205061346f };
- for (int i = 0; i < tree.NumberOfNodes; ++i)
- {
- Assert.Equal(expectedSplitGains[i], tree.SplitGains[i], 6);
- Assert.Equal(expectedThresholds[i], tree.NumericalSplitThresholds[i], 6);
- }
- Assert.All(tree.CategoricalSplitFlags, flag => Assert.False(flag));
-
- Assert.Equal(0, tree.GetCategoricalSplitFeaturesAt(0).Count);
- Assert.Equal(0, tree.GetCategoricalCategoricalSplitFeatureRangeAt(0).Count);
-
- var samples = new double[] { 0.97468354430379744, 1.0, 0.97727272727272729, 0.972972972972973, 0.26124197002141325 };
- for (int i = 0; i < tree.NumberOfLeaves; ++i)
- {
- var sample = tree.GetLeafSamplesAt(i);
- Assert.Single(sample);
- Assert.Equal(samples[i], sample[0], 6);
- var weight = tree.GetLeafSampleWeightsAt(i);
- Assert.Single(weight);
- Assert.Equal(1, weight[0]);
- }
- }
- }
-}