From 17248d39f11f5ef9413af1906e7d7a161f0cc70b Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Fri, 31 May 2019 16:55:57 -0700 Subject: [PATCH 01/25] Implement transformer --- .../TreeEnsembleFeaturizationEstimator.cs | 14 ++ .../TreeEnsembleFeaturizationTransformer.cs | 149 ++++++++++++++++++ .../TreeEnsembleFeaturizerTest.cs | 96 +++++++++++ 3 files changed, 259 insertions(+) create mode 100644 src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationEstimator.cs create mode 100644 src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationTransformer.cs diff --git a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationEstimator.cs b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationEstimator.cs new file mode 100644 index 0000000000..781e73ba21 --- /dev/null +++ b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationEstimator.cs @@ -0,0 +1,14 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using System.Text; + +namespace Microsoft.ML.FastTree +{ + public class TreeEnsembleFeaturizationEstimator + { + } +} diff --git a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationTransformer.cs b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationTransformer.cs new file mode 100644 index 0000000000..72e43420a8 --- /dev/null +++ b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationTransformer.cs @@ -0,0 +1,149 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using Microsoft.ML; +using Microsoft.ML.Calibrators; +using Microsoft.ML.CommandLine; +using Microsoft.ML.Data; +using Microsoft.ML.Data.Conversion; +using Microsoft.ML.Data.IO; +using Microsoft.ML.EntryPoints; +using Microsoft.ML.Internal.Utilities; +using Microsoft.ML.Model; +using Microsoft.ML.Runtime; +using Microsoft.ML.Trainers.FastTree; +using Microsoft.ML.Transforms; +using Microsoft.ML.TreePredictor; + +[assembly: LoadableClass(typeof(TreeEnsembleFeaturizationTransformer), typeof(TreeEnsembleFeaturizationTransformer), + null, typeof(SignatureLoadModel), "", TreeEnsembleFeaturizationTransformer.LoaderSignature)] + +namespace Microsoft.ML.Trainers.FastTree +{ + public sealed class TreeEnsembleFeaturizationTransformer : PredictionTransformerBase + { + internal const string LoaderSignature = "TreeEnseFeat"; + private readonly TreeEnsembleFeaturizerBindableMapper.Arguments _scorerArgs; + private readonly DataViewSchema.DetachedColumn _featureDetachedColumn; + private readonly string _outputColumnSuffix; + + /// <summary> + /// Check if <paramref name="inspectedFeatureColumn"/> is compatible with <see cref="_featureDetachedColumn"/>. + /// </summary> + /// <param name="inspectedFeatureColumn">A column checked against <see cref="_featureDetachedColumn"/>.</param> + private void CheckFeatureColumnCompatibility(DataViewSchema.Column inspectedFeatureColumn) + { + string nameErrorMessage = $"The column called {inspectedFeatureColumn.Name} does not match the expected " + + $"feature column with name {_featureDetachedColumn.Name} and type {_featureDetachedColumn.Type}. " + + $"Please rename your column by calling CopyColumns defined in TransformExtensionsCatalog"; + // Check if column names are the same. + Host.Check(_featureDetachedColumn.Name == inspectedFeatureColumn.Name, nameErrorMessage); + + string typeErrorMessage = $"The column called {inspectedFeatureColumn.Name} has a type {inspectedFeatureColumn.Type}, " + + $"which does not match the expected feature column with name {_featureDetachedColumn.Name} and type {_featureDetachedColumn.Type}. 
" + + $"Please make sure your feature column type is {_featureDetachedColumn.Type}."; + // Check if column types are identical. + Host.Check(_featureDetachedColumn.Type.Equals(inspectedFeatureColumn.Type), typeErrorMessage); + } + + /// <summary> + /// Create <see cref="RoleMappedSchema"/> from <paramref name="schema"/> by using <see cref="_featureDetachedColumn"/> as the feature role. + /// </summary> + /// <param name="schema">The original schema to be mapped.</param> + private RoleMappedSchema MakeFeatureRoleMappedSchema(DataViewSchema schema) + { + var roles = new List>(); + roles.Add(new KeyValuePair(RoleMappedSchema.ColumnRole.Feature, _featureDetachedColumn.Name)); + return new RoleMappedSchema(schema, roles); + } + + public TreeEnsembleFeaturizationTransformer(IHostEnvironment env, DataViewSchema inputSchema, + DataViewSchema.Column featureColumn, TreeEnsembleModelParameters modelParameters, string outputColumnNameSuffix=null) : + base(Contracts.CheckRef(env, nameof(env)).Register(nameof(TreeEnsembleFeaturizationTransformer)), modelParameters, inputSchema) + { + // Store featureColumn as a detached column because a fitted transformer can be applied to different IDataViews and different + // IDataView may have different schemas. + _featureDetachedColumn = new DataViewSchema.DetachedColumn(featureColumn); + // Check if featureColumn matches a column in inputSchema. The answer is yes if they have the same name and type. + // The indexed column, inputSchema[featureColumn.Index], should match the detached column, _featureDetachedColumn. + CheckFeatureColumnCompatibility(inputSchema[featureColumn.Index]); + // Store outputColumnNameSuffix so that this transformer can be saved into a file later. + _outputColumnSuffix = outputColumnNameSuffix; + // Create an argument, _scorerArgs, to pass the suffix of output column names to the underlying scorer. + _scorerArgs = new TreeEnsembleFeaturizerBindableMapper.Arguments { Suffix = _outputColumnSuffix }; + // Create a bindable mapper. It provides the core computation and can be attached to any IDataView and produce + // a transformed IDataView. 
+ BindableMapper = new TreeEnsembleFeaturizerBindableMapper(env, _scorerArgs, modelParameters); + // Create a scorer. + var roleMappedSchema = MakeFeatureRoleMappedSchema(inputSchema); + Scorer = new GenericScorer(Host, _scorerArgs, new EmptyDataView(Host, inputSchema), BindableMapper.Bind(Host, roleMappedSchema), roleMappedSchema); + } + + private TreeEnsembleFeaturizationTransformer(IHostEnvironment host, ModelLoadContext ctx) + : base(Contracts.CheckRef(host, nameof(host)).Register(nameof(TreeEnsembleFeaturizationTransformer)), ctx) + { + // *** Binary format *** + // + // string: feature column's name. + // string: output columns' suffix. + + string featureColumnName = ctx.LoadString(); + _featureDetachedColumn = new DataViewSchema.DetachedColumn(TrainSchema[featureColumnName]); + _outputColumnSuffix = ctx.LoadStringOrNull(); + + BindableMapper = ScoreUtils.GetSchemaBindableMapper(Host, Model); + + var args = new GenericScorer.Arguments { Suffix = "" }; + var schema = MakeFeatureRoleMappedSchema(TrainSchema); + Scorer = new GenericScorer(Host, args, new EmptyDataView(Host, TrainSchema), BindableMapper.Bind(Host, schema), schema); + } + + public override DataViewSchema GetOutputSchema(DataViewSchema inputSchema) => Transform(new EmptyDataView(Host, inputSchema)).Schema; + + private protected override void SaveModel(ModelSaveContext ctx) + { + Host.CheckValue(ctx, nameof(ctx)); + ctx.CheckAtModel(); + ctx.SetVersionInfo(GetVersionInfo()); + + // *** Binary format *** + // model: prediction model. + // stream: empty data view that contains train schema. + // string: feature column's name. 
+ // string: output columns' suffix, written with SaveStringOrNull + // because the suffix may be null. + + ctx.SaveModel(Model, DirModel); + ctx.SaveBinaryStream(DirTransSchema, writer => + { + using (var ch = Host.Start("Saving train schema")) + { + var saver = new BinarySaver(Host, new BinarySaver.Arguments { Silent = true }); + DataSaverUtils.SaveDataView(ch, saver, new EmptyDataView(Host, TrainSchema), writer.BaseStream); + } + }); + + ctx.SaveString(_featureDetachedColumn.Name); + ctx.SaveStringOrNull(_outputColumnSuffix); + } + + private static VersionInfo GetVersionInfo() + { + return new VersionInfo( + modelSignature: "TREEFEAT", // "TREE" ensemble "FEAT"urizer. + verWrittenCur: 0x00010001, // Initial + verReadableCur: 0x00010001, + verWeCanReadBack: 0x00010001, + loaderSignature: LoaderSignature, + loaderAssemblyName: typeof(TreeEnsembleFeaturizationTransformer).Assembly.FullName); + } + + private static TreeEnsembleFeaturizationTransformer Create(IHostEnvironment env, ModelLoadContext ctx) + => new TreeEnsembleFeaturizationTransformer(env, ctx); + } +} \ No newline at end of file diff --git a/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs b/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs index 188a9ced46..e25a4faaa4 100644 --- a/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs +++ b/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs @@ -3,6 +3,7 @@ // See the LICENSE file in the project root for more information. 
using System; +using System.Collections.Generic; using System.Linq; using Microsoft.ML.Data; using Microsoft.ML.Trainers.FastTree; @@ -108,5 +109,100 @@ public void TreeEnsembleFeaturizerOutputSchemaTest() } } + + [Fact] + public void TreeEnsembleFeaturizerTransformerFastTreeBinary() + { + // Create data set + int dataPointCount = 20; + var data = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(dataPointCount).ToList(); + var dataView = ML.Data.LoadFromEnumerable(data); + + // Define a tree model whose trees will be extracted to construct a tree featurizer. + var trainer = ML.BinaryClassification.Trainers.FastTree( + new FastTreeBinaryTrainer.Options + { + NumberOfThreads = 1, + NumberOfTrees = 1, + NumberOfLeaves = 4, + MinimumExampleCountPerLeaf = 1 + }); + + // Train the defined tree model. + var model = trainer.Fit(dataView); + var predicted = model.Transform(dataView); + + // From the trained tree model, a mapper of tree featurizer is created. + var treeFeaturizer = new TreeEnsembleFeaturizationTransformer(ML, dataView.Schema, dataView.Schema["Features"], model.Model.SubModel); + + // Apply TreeEnsembleFeaturizer to the input data. + var transformed = treeFeaturizer.Transform(dataView); + + // Extract the outputs of TreeEnsembleFeaturizer. + var features = transformed.GetColumn("Features").ToArray(); + var leafValues = transformed.GetColumn("Trees").ToArray(); + var leafIds = transformed.GetColumn("Leaves").ToArray(); + var paths = transformed.GetColumn("Paths").ToArray(); + + // Check if the TreeEnsembleFeaturizer produces the expected values. 
+ List path = null; + for (int dataPointIndex = 0; dataPointIndex < dataPointCount; ++dataPointIndex) + { + int treeIndex = 0; + var leafId = model.Model.SubModel.GetLeaf(treeIndex, new VBuffer(10, features[dataPointIndex]), ref path); + var leafValue = model.Model.SubModel.GetLeafValue(0, leafId); + Assert.Equal(leafValues[dataPointIndex][treeIndex], leafValue); + Assert.Equal(1.0, leafIds[dataPointIndex][leafId]); + foreach (var nodeId in path) + Assert.Equal(1.0, paths[dataPointIndex][nodeId]); + } + } + + [Fact] + public void TreeEnsembleFeaturizerTransformerFastForestBinary() + { + // Create data set + int dataPointCount = 20; + var data = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(dataPointCount).ToList(); + var dataView = ML.Data.LoadFromEnumerable(data); + + // Define a tree model whose trees will be extracted to construct a tree featurizer. + var trainer = ML.BinaryClassification.Trainers.FastForest( + new FastForestBinaryTrainer.Options + { + NumberOfThreads = 1, + NumberOfTrees = 1, + NumberOfLeaves = 4, + MinimumExampleCountPerLeaf = 1 + }); + + // Train the defined tree model. + var model = trainer.Fit(dataView); + + // From the trained tree model, a mapper of tree featurizer is created. + var treeFeaturizer = new TreeEnsembleFeaturizationTransformer(ML, dataView.Schema, dataView.Schema["Features"], model.Model); + + // Apply TreeEnsembleFeaturizer to the input data. + var transformed = treeFeaturizer.Transform(dataView); + + // Extract the outputs of TreeEnsembleFeaturizer. + var features = transformed.GetColumn("Features").ToArray(); + var leafValues = transformed.GetColumn("Trees").ToArray(); + var leafIds = transformed.GetColumn("Leaves").ToArray(); + var paths = transformed.GetColumn("Paths").ToArray(); + + // Check if the TreeEnsembleFeaturizer produces the expected values. 
+ List path = null; + for (int dataPointIndex = 0; dataPointIndex < dataPointCount; ++dataPointIndex) + { + int treeIndex = 0; + var leafId = model.Model.GetLeaf(treeIndex, new VBuffer(10, features[dataPointIndex]), ref path); + var leafValue = model.Model.GetLeafValue(0, leafId); + Assert.Equal(leafValues[dataPointIndex][treeIndex], leafValue); + Assert.Equal(1.0, leafIds[dataPointIndex][leafId]); + foreach (var nodeId in path) + Assert.Equal(1.0, paths[dataPointIndex][nodeId]); + } + } } } From a2f1d6c88f308b13d509a4914bf0b3ca05cf3d2c Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Mon, 3 Jun 2019 15:11:08 -0700 Subject: [PATCH 02/25] Initial draft of porting tree-based featurization --- .../TreeEnsembleFeaturizationEstimator.cs | 229 ++++++++++++++++- .../TreeEnsembleFeaturizationTransformer.cs | 11 - .../TreeTrainersCatalog.cs | 41 ++- .../TreeEnsembleFeaturizerTest.cs | 241 ++++++++++++++++++ 4 files changed, 505 insertions(+), 17 deletions(-) diff --git a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationEstimator.cs b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationEstimator.cs index 781e73ba21..f4143eef70 100644 --- a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationEstimator.cs +++ b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationEstimator.cs @@ -2,13 +2,232 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. -using System; -using System.Collections.Generic; -using System.Text; +using System.Linq; +using Microsoft.ML.Data; +using Microsoft.ML.Runtime; -namespace Microsoft.ML.FastTree +namespace Microsoft.ML.Trainers.FastTree { - public class TreeEnsembleFeaturizationEstimator + /// <summary> + /// This class encapsulates the common behavior of all tree-based featurizers such as <see cref="PretrainedTreeFeaturizationEstimator"/>, + /// <see cref="FastTreeBinaryFeaturizationEstimator"/>, <see cref="FastTreeRegressionFeaturizationEstimator"/>, + /// <see cref="FastForestBinaryFeaturizationEstimator"/>, and <see cref="FastForestRegressionFeaturizationEstimator"/>. + /// All tree-based featurizers share the same output schema computed by <see cref="GetOutputSchema(SchemaShape)"/>.
All tree-based featurizers + /// require an input feature column name and a suffix for all output columns. The <see cref="TreeEnsembleFeaturizationTransformer"/> returned by + /// <see cref="Fit(IDataView)"/> produces three columns: (1) the prediction values of all trees, (2) the IDs of leaves the input feature vector falls into, and (3) + /// the binary vector which encodes the paths to those destination leaves. + /// </summary> + public abstract class FeaturizationEstimatorBase : IEstimator { + /// <summary> + /// The common options of tree-based featurizations such as <see cref="PretrainedTreeFeaturizationEstimator"/>, <see cref="FastTreeBinaryFeaturizationEstimator"/>, + /// <see cref="FastTreeRegressionFeaturizationEstimator"/>, <see cref="FastForestBinaryFeaturizationEstimator"/>, and <see cref="FastForestRegressionFeaturizationEstimator"/>. + /// </summary> + public class CommonOptions + { + /// <summary> + /// The name of feature column in the <see cref="IDataView"/> when calling <see cref="Fit(IDataView)"/>. + /// The column type must be a vector of <see cref="System.Single"/>. + /// </summary> + public string InputColumnName; + + /// <summary> + /// The estimator has three output columns. Their names would be "Trees" + <see cref="OutputColumnsSuffix"/>, + /// "Leaves" + <see cref="OutputColumnsSuffix"/>, and "Paths" + <see cref="OutputColumnsSuffix"/>. If <see cref="OutputColumnsSuffix"/> + /// is <see langword="null"/>, the output names would be "Trees", "Leaves", and "Paths". + /// </summary> + public string OutputColumnsSuffix; + }; + + /// <summary> + /// Feature column to apply tree-based featurization. Note that <see cref="FeatureColumnName"/> does not need to be the same as + /// the feature column used to train the tree model. + /// </summary> + private protected readonly string FeatureColumnName; + + /// <summary> + /// See <see cref="CommonOptions.OutputColumnsSuffix"/>. + /// </summary> + private protected readonly string OutputColumnSuffix; + + /// <summary> + /// Environment of this instance. It controls error throwing and other environment settings. + /// </summary> + private protected readonly IHostEnvironment Env; + + private protected FeaturizationEstimatorBase(IHostEnvironment env, CommonOptions options) + { + Env = env; + FeatureColumnName = options.InputColumnName; + OutputColumnSuffix = options.OutputColumnsSuffix; + } + + /// <summary> + /// All derived classes should implement <see cref="PrepareModel(IDataView)"/> to tell how to get a <see cref="TreeEnsembleModelParameters"/> + /// out from <paramref name="input"/> and parameters inside this or derived classes. + /// </summary> + /// <param name="input">Data used to train a tree model.</param> + /// <returns>The trees used in <see cref="TreeEnsembleFeaturizationTransformer"/>.</returns> + private protected abstract TreeEnsembleModelParameters PrepareModel(IDataView input); + + /// <summary> + /// Produce a <see cref="TreeEnsembleFeaturizationTransformer"/> which maps the column called <see cref="FeatureColumnName"/> in <paramref name="input"/> + /// to three output columns. 
+ /// </summary> + public TreeEnsembleFeaturizationTransformer Fit(IDataView input) + { + var model = PrepareModel(input); + return new TreeEnsembleFeaturizationTransformer(Env, input.Schema, + input.Schema[FeatureColumnName], model, OutputColumnSuffix); + } + + /// <summary> + /// <see cref="TreeEnsembleFeaturizationTransformer"/> adds three float-vector columns into <paramref name="inputSchema"/>. + /// Given a feature vector column, the added columns are the prediction values of all trees, the leaf IDs the feature + /// vector falls into, and the paths to those leaves. + /// </summary> + /// <param name="inputSchema">A schema which contains a feature column. Note that the feature column name can be specified + /// by <see cref="CommonOptions.InputColumnName"/>.</param> + /// <returns>Output <see cref="SchemaShape"/> produced by <see cref="TreeEnsembleFeaturizationTransformer"/>.</returns> + public SchemaShape GetOutputSchema(SchemaShape inputSchema) + { + Env.CheckValue(inputSchema, nameof(inputSchema)); + + if (!inputSchema.TryFindColumn(FeatureColumnName, out var col)) + throw Env.ExceptSchemaMismatch(nameof(inputSchema), "input", FeatureColumnName); + + var result = inputSchema.ToDictionary(x => x.Name); + + var treeColumnName = OutputColumnSuffix != null ? OutputColumnSuffix + "Trees" : "Trees"; + result[treeColumnName] = new SchemaShape.Column(treeColumnName, + SchemaShape.Column.VectorKind.Vector, NumberDataViewType.Single, false); + + var leafColumnName = OutputColumnSuffix != null ? OutputColumnSuffix + "Leaves" : "Leaves"; + result[leafColumnName] = new SchemaShape.Column(leafColumnName, + SchemaShape.Column.VectorKind.Vector, NumberDataViewType.Single, false); + + var pathColumnName = OutputColumnSuffix != null ? OutputColumnSuffix + "Paths" : "Paths"; + result[pathColumnName] = new SchemaShape.Column(pathColumnName, + SchemaShape.Column.VectorKind.Vector, NumberDataViewType.Single, false); + + return new SchemaShape(result.Values); + } + } + + /// <summary> + /// A <see cref="IEstimator{TTransformer}"/> which takes a trained <see cref="TreeEnsembleModelParameters"/> and calling its + /// <see cref="IEstimator{TTransformer}.Fit(IDataView)"/> produces a featurizer based on the trained model. 
+ /// </summary> + public sealed class PretrainedTreeFeaturizationEstimator : FeaturizationEstimatorBase + { + public sealed class Options : FeaturizationEstimatorBase.CommonOptions + { + public TreeEnsembleModelParameters ModelParameters; + }; + + private TreeEnsembleModelParameters _modelParameters; + + public PretrainedTreeFeaturizationEstimator(IHostEnvironment env, Options options) : base(env, options) + { + _modelParameters = options.ModelParameters; + } + + /// <summary> + /// Produce the <see cref="TreeEnsembleModelParameters"/> for tree-based feature engineering. This function does not + /// invoke training procedure and just returns the pre-trained model passed in via <see cref="Options.ModelParameters"/>. + /// </summary> + private protected override TreeEnsembleModelParameters PrepareModel(IDataView input) => _modelParameters; + } + + public sealed class FastTreeBinaryFeaturizationEstimator : FeaturizationEstimatorBase + { + private readonly FastTreeBinaryTrainer.Options _trainerOptions; + + public sealed class Options : CommonOptions + { + public FastTreeBinaryTrainer.Options TrainerOptions; + } + + public FastTreeBinaryFeaturizationEstimator(IHostEnvironment env, Options options) + : base(env, options) + { + _trainerOptions = options.TrainerOptions; + } + + private protected override TreeEnsembleModelParameters PrepareModel(IDataView input) + { + var trainer = new FastTreeBinaryTrainer(Env, _trainerOptions); + var trained = trainer.Fit(input); + return trained.Model.SubModel; + } + } + + public sealed class FastTreeRegressionFeaturizationEstimator : FeaturizationEstimatorBase + { + private readonly FastTreeRegressionTrainer.Options _trainerOptions; + + public sealed class Options : CommonOptions + { + public FastTreeRegressionTrainer.Options TrainerOptions; + } + + public FastTreeRegressionFeaturizationEstimator(IHostEnvironment env, Options options) + : base(env, options) + { + _trainerOptions = options.TrainerOptions; + } + + private protected override TreeEnsembleModelParameters PrepareModel(IDataView input) + { + var trainer = new 
FastTreeRegressionTrainer(Env, _trainerOptions); + var trained = trainer.Fit(input); + return trained.Model; + } + } + + public sealed class FastForestBinaryFeaturizationEstimator : FeaturizationEstimatorBase + { + private readonly FastForestBinaryTrainer.Options _trainerOptions; + + public sealed class Options : CommonOptions + { + public FastForestBinaryTrainer.Options TrainerOptions; + } + + public FastForestBinaryFeaturizationEstimator(IHostEnvironment env, Options options) + : base(env, options) + { + _trainerOptions = options.TrainerOptions; + } + + private protected override TreeEnsembleModelParameters PrepareModel(IDataView input) + { + var trainer = new FastForestBinaryTrainer(Env, _trainerOptions); + var trained = trainer.Fit(input); + return trained.Model; + } + } + + public sealed class FastForestRegressionFeaturizationEstimator : FeaturizationEstimatorBase + { + private readonly FastForestRegressionTrainer.Options _trainerOptions; + + public sealed class Options : CommonOptions + { + public FastForestRegressionTrainer.Options TrainerOptions; + } + + public FastForestRegressionFeaturizationEstimator(IHostEnvironment env, Options options) + : base(env, options) + { + _trainerOptions = options.TrainerOptions; + } + + private protected override TreeEnsembleModelParameters PrepareModel(IDataView input) + { + var trainer = new FastForestRegressionTrainer(Env, _trainerOptions); + var trained = trainer.Fit(input); + return trained.Model; + } } } diff --git a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationTransformer.cs b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationTransformer.cs index 72e43420a8..08d2342d1b 100644 --- a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationTransformer.cs +++ b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationTransformer.cs @@ -2,23 +2,12 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. 
-using System; using System.Collections.Generic; -using System.IO; -using System.Linq; using Microsoft.ML; -using Microsoft.ML.Calibrators; -using Microsoft.ML.CommandLine; using Microsoft.ML.Data; -using Microsoft.ML.Data.Conversion; using Microsoft.ML.Data.IO; -using Microsoft.ML.EntryPoints; -using Microsoft.ML.Internal.Utilities; -using Microsoft.ML.Model; using Microsoft.ML.Runtime; using Microsoft.ML.Trainers.FastTree; -using Microsoft.ML.Transforms; -using Microsoft.ML.TreePredictor; [assembly: LoadableClass(typeof(TreeEnsembleFeaturizationTransformer), typeof(TreeEnsembleFeaturizationTransformer), null, typeof(SignatureLoadModel), "", TreeEnsembleFeaturizationTransformer.LoaderSignature)] diff --git a/src/Microsoft.ML.FastTree/TreeTrainersCatalog.cs b/src/Microsoft.ML.FastTree/TreeTrainersCatalog.cs index 3909bca2ff..ba67a33a34 100644 --- a/src/Microsoft.ML.FastTree/TreeTrainersCatalog.cs +++ b/src/Microsoft.ML.FastTree/TreeTrainersCatalog.cs @@ -427,7 +427,6 @@ public static FastForestBinaryTrainer FastForest(this BinaryClassificationCatalo /// ]]> /// /// - public static FastForestBinaryTrainer FastForest(this BinaryClassificationCatalog.BinaryClassificationTrainers catalog, FastForestBinaryTrainer.Options options) { @@ -437,5 +436,45 @@ public static FastForestBinaryTrainer FastForest(this BinaryClassificationCatalo var env = CatalogUtils.GetEnvironment(catalog); return new FastForestBinaryTrainer(env, options); } + + public static PretrainedTreeFeaturizationEstimator PretrainTreeEnsembleFeaturizing(this TransformsCatalog catalog, + PretrainedTreeFeaturizationEstimator.Options options) + { + Contracts.CheckValue(catalog, nameof(catalog)); + var env = CatalogUtils.GetEnvironment(catalog); + return new PretrainedTreeFeaturizationEstimator(env, options); + } + + public static FastForestRegressionFeaturizationEstimator FastForestRegressionFeaturizing(this TransformsCatalog catalog, + FastForestRegressionFeaturizationEstimator.Options options) + { + 
Contracts.CheckValue(catalog, nameof(catalog)); + var env = CatalogUtils.GetEnvironment(catalog); + return new FastForestRegressionFeaturizationEstimator(env, options); + } + + public static FastTreeRegressionFeaturizationEstimator FastTreeRegressionFeaturizing(this TransformsCatalog catalog, + FastTreeRegressionFeaturizationEstimator.Options options) + { + Contracts.CheckValue(catalog, nameof(catalog)); + var env = CatalogUtils.GetEnvironment(catalog); + return new FastTreeRegressionFeaturizationEstimator(env, options); + } + + public static FastForestBinaryFeaturizationEstimator FastForestBinaryFeaturizing(this TransformsCatalog catalog, + FastForestBinaryFeaturizationEstimator.Options options) + { + Contracts.CheckValue(catalog, nameof(catalog)); + var env = CatalogUtils.GetEnvironment(catalog); + return new FastForestBinaryFeaturizationEstimator(env, options); + } + + public static FastTreeBinaryFeaturizationEstimator FastTreeBinaryFeaturizing(this TransformsCatalog catalog, + FastTreeBinaryFeaturizationEstimator.Options options) + { + Contracts.CheckValue(catalog, nameof(catalog)); + var env = CatalogUtils.GetEnvironment(catalog); + return new FastTreeBinaryFeaturizationEstimator(env, options); + } } } diff --git a/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs b/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs index e25a4faaa4..8182bb49c9 100644 --- a/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs +++ b/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs @@ -204,5 +204,246 @@ public void TreeEnsembleFeaturizerTransformerFastForestBinary() Assert.Equal(1.0, paths[dataPointIndex][nodeId]); } } + + /// <summary> + /// A test of <see cref="PretrainedTreeFeaturizationEstimator"/>. 
+ /// </summary> + [Fact] + public void TestPretrainedTreeFeaturizationEstimator() + { + // Create data set + int dataPointCount = 20; + var data = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(dataPointCount).ToList(); + var dataView = ML.Data.LoadFromEnumerable(data); + + // Define a tree model whose trees will be extracted to construct a tree featurizer. + var trainer = ML.BinaryClassification.Trainers.FastTree( + new FastTreeBinaryTrainer.Options + { + NumberOfThreads = 1, + NumberOfTrees = 1, + NumberOfLeaves = 4, + MinimumExampleCountPerLeaf = 1 + }); + + // Train the defined tree model. + var model = trainer.Fit(dataView); + var predicted = model.Transform(dataView); + + // From the trained tree model, a mapper of tree featurizer is created. + var options = new PretrainedTreeFeaturizationEstimator.Options() { InputColumnName = "Features", ModelParameters = model.Model.SubModel }; + var treeFeaturizer = ML.Transforms.PretrainTreeEnsembleFeaturizing(options).Fit(dataView); + + // Apply TreeEnsembleFeaturizer to the input data. + var transformed = treeFeaturizer.Transform(dataView); + + // Extract the outputs of TreeEnsembleFeaturizer. + var features = transformed.GetColumn("Features").ToArray(); + var leafValues = transformed.GetColumn("Trees").ToArray(); + var leafIds = transformed.GetColumn("Leaves").ToArray(); + var paths = transformed.GetColumn("Paths").ToArray(); + + // Check if the TreeEnsembleFeaturizer produces the expected values. 
+ List path = null; + for (int dataPointIndex = 0; dataPointIndex < dataPointCount; ++dataPointIndex) + { + int treeIndex = 0; + var leafId = model.Model.SubModel.GetLeaf(treeIndex, new VBuffer(10, features[dataPointIndex]), ref path); + var leafValue = model.Model.SubModel.GetLeafValue(0, leafId); + Assert.Equal(leafValues[dataPointIndex][treeIndex], leafValue); + Assert.Equal(1.0, leafIds[dataPointIndex][leafId]); + foreach (var nodeId in path) + Assert.Equal(1.0, paths[dataPointIndex][nodeId]); + } + } + + /// <summary> + /// This test contains several steps. + /// 1. It first trains a <see cref="TreeEnsembleModelParameters"/> using <see cref="FastTreeBinaryTrainer"/>. + /// 2. Then, it creates a <see cref="PretrainedTreeFeaturizationEstimator"/> from the trained <see cref="TreeEnsembleModelParameters"/>. + /// 3. The feature produced in step 2 would be fed into <see cref="SdcaLogisticRegressionBinaryTrainer"/> to enhance the training accuracy of that linear model. + /// 4. We train another <see cref="SdcaLogisticRegressionBinaryTrainer"/> without features from trees and finally compare their scores. + /// </summary> + [Fact] + public void TreeEnsembleFeaturizingPipeline() + { + // Create data set + int dataPointCount = 200; + var data = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(dataPointCount).ToList(); + var dataView = ML.Data.LoadFromEnumerable(data); + + // Define a tree model whose trees will be extracted to construct a tree featurizer. + var trainer = ML.BinaryClassification.Trainers.FastTree( + new FastTreeBinaryTrainer.Options + { + NumberOfThreads = 1, + NumberOfTrees = 10, + NumberOfLeaves = 4, + MinimumExampleCountPerLeaf = 10 + }); + + // Train the defined tree model. This trained model will be used to construct TreeEnsembleFeaturizationEstimator. + var treeModel = trainer.Fit(dataView); + var predicted = treeModel.Transform(dataView); + + // Combine the output of TreeEnsembleFeaturizationTransformer and the original features as the final training features. + // Then train a linear model. 
+ var options = new PretrainedTreeFeaturizationEstimator.Options() { InputColumnName = "Features", ModelParameters = treeModel.Model.SubModel }; + var pipeline = ML.Transforms.PretrainTreeEnsembleFeaturizing(options). + Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths")). + Append(ML.BinaryClassification.Trainers.SdcaLogisticRegression("Label", "CombinedFeatures")); + var model = pipeline.Fit(dataView); + var prediction = model.Transform(dataView); + var metrics = ML.BinaryClassification.Evaluate(prediction); + + // Then train the same linear model without tree features. + var naivePipeline = ML.BinaryClassification.Trainers.SdcaLogisticRegression("Label", "Features"); + var naiveModel = naivePipeline.Fit(dataView); + var naivePrediction = naiveModel.Transform(dataView); + var naiveMetrics = ML.BinaryClassification.Evaluate(naivePrediction); + + // The linear model trained with tree features should perform better than that without tree features. + Assert.True(metrics.Accuracy > naiveMetrics.Accuracy); + Assert.True(metrics.LogLoss < naiveMetrics.LogLoss); + Assert.True(metrics.AreaUnderPrecisionRecallCurve > naiveMetrics.AreaUnderPrecisionRecallCurve); + } + + [Fact] + public void TestFastTreeBinaryFeaturizationInPipeline() + { + int dataPointCount = 200; + var data = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(dataPointCount).ToList(); + var dataView = ML.Data.LoadFromEnumerable(data); + + var trainerOptions = new FastTreeBinaryTrainer.Options + { + NumberOfThreads = 1, + NumberOfTrees = 10, + NumberOfLeaves = 4, + MinimumExampleCountPerLeaf = 10, + FeatureColumnName = "Features", + LabelColumnName = "Label" + }; + + var options = new FastTreeBinaryFeaturizationEstimator.Options() + { + InputColumnName = "Features", + TrainerOptions = trainerOptions + }; + + var pipeline = ML.Transforms.FastTreeBinaryFeaturizing(options). 
+ Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths")). + Append(ML.BinaryClassification.Trainers.SdcaLogisticRegression("Label", "CombinedFeatures")); + var model = pipeline.Fit(dataView); + var prediction = model.Transform(dataView); + var metrics = ML.BinaryClassification.Evaluate(prediction); + + Assert.True(metrics.Accuracy > 0.98); + Assert.True(metrics.LogLoss < 0.05); + Assert.True(metrics.AreaUnderPrecisionRecallCurve > 0.98); + } + + [Fact] + public void TestFastForestBinaryFeaturizationInPipeline() + { + int dataPointCount = 200; + var data = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(dataPointCount).ToList(); + var dataView = ML.Data.LoadFromEnumerable(data); + + var trainerOptions = new FastForestBinaryTrainer.Options + { + NumberOfThreads = 1, + NumberOfTrees = 10, + NumberOfLeaves = 4, + MinimumExampleCountPerLeaf = 10, + FeatureColumnName = "Features", + LabelColumnName = "Label" + }; + + var options = new FastForestBinaryFeaturizationEstimator.Options() + { + InputColumnName = "Features", + TrainerOptions = trainerOptions + }; + + var pipeline = ML.Transforms.FastForestBinaryFeaturizing(options). + Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths")). 
+ Append(ML.BinaryClassification.Trainers.SdcaLogisticRegression("Label", "CombinedFeatures")); + var model = pipeline.Fit(dataView); + var prediction = model.Transform(dataView); + var metrics = ML.BinaryClassification.Evaluate(prediction); + + Assert.True(metrics.Accuracy > 0.97); + Assert.True(metrics.LogLoss < 0.07); + Assert.True(metrics.AreaUnderPrecisionRecallCurve > 0.98); + } + + [Fact] + public void TestFastTreeRegressionFeaturizationInPipeline() + { + int dataPointCount = 200; + var data = SamplesUtils.DatasetUtils.GenerateFloatLabelFloatFeatureVectorSamples(dataPointCount).ToList(); + var dataView = ML.Data.LoadFromEnumerable(data); + + var trainerOptions = new FastTreeRegressionTrainer.Options + { + NumberOfThreads = 1, + NumberOfTrees = 10, + NumberOfLeaves = 4, + MinimumExampleCountPerLeaf = 10, + FeatureColumnName = "Features", + LabelColumnName = "Label" + }; + + var options = new FastTreeRegressionFeaturizationEstimator.Options() + { + InputColumnName = "Features", + TrainerOptions = trainerOptions + }; + + var pipeline = ML.Transforms.FastTreeRegressionFeaturizing(options). + Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths")). 
+ Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures")); + var model = pipeline.Fit(dataView); + var prediction = model.Transform(dataView); + var metrics = ML.Regression.Evaluate(prediction); + + Assert.True(metrics.MeanAbsoluteError < 0.2); + Assert.True(metrics.MeanSquaredError < 0.05); + } + + [Fact] + public void TestFastForestRegressionFeaturizationInPipeline() + { + int dataPointCount = 200; + var data = SamplesUtils.DatasetUtils.GenerateFloatLabelFloatFeatureVectorSamples(dataPointCount).ToList(); + var dataView = ML.Data.LoadFromEnumerable(data); + + var trainerOptions = new FastForestRegressionTrainer.Options + { + NumberOfThreads = 1, + NumberOfTrees = 10, + NumberOfLeaves = 4, + MinimumExampleCountPerLeaf = 10, + FeatureColumnName = "Features", + LabelColumnName = "Label" + }; + + var options = new FastForestRegressionFeaturizationEstimator.Options() + { + InputColumnName = "Features", + TrainerOptions = trainerOptions + }; + + var pipeline = ML.Transforms.FastForestRegressionFeaturizing(options). + Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths")). 
+ Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures")); + var model = pipeline.Fit(dataView); + var prediction = model.Transform(dataView); + var metrics = ML.Regression.Evaluate(prediction); + + Assert.True(metrics.MeanAbsoluteError < 0.25); + Assert.True(metrics.MeanSquaredError < 0.1); + } } } From 33d0ee08a8d3e3e01f1bfd90cb8c8a788c5cb6bd Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Mon, 3 Jun 2019 15:20:57 -0700 Subject: [PATCH 03/25] Internalize something --- .../TreeEnsembleFeaturizationEstimator.cs | 10 +++++----- .../TreeEnsembleFeaturizationTransformer.cs | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationEstimator.cs b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationEstimator.cs index f4143eef70..c5fd8837c8 100644 --- a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationEstimator.cs +++ b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationEstimator.cs @@ -127,7 +127,7 @@ public sealed class Options : FeaturizationEstimatorBase.CommonOptions private TreeEnsembleModelParameters _modelParameters; - public PretrainedTreeFeaturizationEstimator(IHostEnvironment env, Options options) : base(env, options) + internal PretrainedTreeFeaturizationEstimator(IHostEnvironment env, Options options) : base(env, options) { _modelParameters = options.ModelParameters; } @@ -148,7 +148,7 @@ public sealed class Options : CommonOptions public FastTreeBinaryTrainer.Options TrainerOptions; } - public FastTreeBinaryFeaturizationEstimator(IHostEnvironment env, Options options) + internal FastTreeBinaryFeaturizationEstimator(IHostEnvironment env, Options options) : base(env, options) { _trainerOptions = options.TrainerOptions; @@ -171,7 +171,7 @@ public sealed class Options : CommonOptions public FastTreeRegressionTrainer.Options TrainerOptions; } - public FastTreeRegressionFeaturizationEstimator(IHostEnvironment env, Options options) + internal 
FastTreeRegressionFeaturizationEstimator(IHostEnvironment env, Options options) : base(env, options) { _trainerOptions = options.TrainerOptions; @@ -194,7 +194,7 @@ public sealed class Options : CommonOptions public FastForestBinaryTrainer.Options TrainerOptions; } - public FastForestBinaryFeaturizationEstimator(IHostEnvironment env, Options options) + internal FastForestBinaryFeaturizationEstimator(IHostEnvironment env, Options options) : base(env, options) { _trainerOptions = options.TrainerOptions; @@ -217,7 +217,7 @@ public sealed class Options : CommonOptions public FastForestRegressionTrainer.Options TrainerOptions; } - public FastForestRegressionFeaturizationEstimator(IHostEnvironment env, Options options) + internal FastForestRegressionFeaturizationEstimator(IHostEnvironment env, Options options) : base(env, options) { _trainerOptions = options.TrainerOptions; diff --git a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationTransformer.cs b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationTransformer.cs index 08d2342d1b..b1970940a8 100644 --- a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationTransformer.cs +++ b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationTransformer.cs @@ -51,7 +51,7 @@ private RoleMappedSchema MakeFeatureRoleMappedSchema(DataViewSchema schema) return new RoleMappedSchema(schema, roles); } - public TreeEnsembleFeaturizationTransformer(IHostEnvironment env, DataViewSchema inputSchema, + internal TreeEnsembleFeaturizationTransformer(IHostEnvironment env, DataViewSchema inputSchema, DataViewSchema.Column featureColumn, TreeEnsembleModelParameters modelParameters, string outputColumnNameSuffix=null) : base(Contracts.CheckRef(env, nameof(env)).Register(nameof(TreeEnsembleFeaturizationTransformer)), modelParameters, inputSchema) { From 965899139b863a3c93d9ab269140e8296f8cf389 Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Mon, 3 Jun 2019 16:18:54 -0700 Subject: [PATCH 04/25] Add Tweedie and Ranking cases --- 
.../TreeEnsembleFeaturizationEstimator.cs | 46 +++++++++++++ .../TreeTrainersCatalog.cs | 22 +++++- .../TreeEnsembleFeaturizerTest.cs | 68 +++++++++++++++++++ 3 files changed, 133 insertions(+), 3 deletions(-) diff --git a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationEstimator.cs b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationEstimator.cs index c5fd8837c8..75f3ce4450 100644 --- a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationEstimator.cs +++ b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationEstimator.cs @@ -230,4 +230,50 @@ private protected override TreeEnsembleModelParameters PrepareModel(IDataView in return trained.Model; } } + + public sealed class FastTreeRankingFeaturizationEstimator : FeaturizationEstimatorBase + { + private readonly FastTreeRankingTrainer.Options _trainerOptions; + + public sealed class Options : CommonOptions + { + public FastTreeRankingTrainer.Options TrainerOptions; + } + + internal FastTreeRankingFeaturizationEstimator(IHostEnvironment env, Options options) + : base(env, options) + { + _trainerOptions = options.TrainerOptions; + } + + private protected override TreeEnsembleModelParameters PrepareModel(IDataView input) + { + var trainer = new FastTreeRankingTrainer(Env, _trainerOptions); + var trained = trainer.Fit(input); + return trained.Model; + } + } + + public sealed class FastTreeTweedieFeaturizationEstimator : FeaturizationEstimatorBase + { + private readonly FastTreeTweedieTrainer.Options _trainerOptions; + + public sealed class Options : CommonOptions + { + public FastTreeTweedieTrainer.Options TrainerOptions; + } + + internal FastTreeTweedieFeaturizationEstimator(IHostEnvironment env, Options options) + : base(env, options) + { + _trainerOptions = options.TrainerOptions; + } + + private protected override TreeEnsembleModelParameters PrepareModel(IDataView input) + { + var trainer = new FastTreeTweedieTrainer(Env, _trainerOptions); + var trained = trainer.Fit(input); + return trained.Model; + } + } } diff 
--git a/src/Microsoft.ML.FastTree/TreeTrainersCatalog.cs b/src/Microsoft.ML.FastTree/TreeTrainersCatalog.cs index ba67a33a34..38a1611195 100644 --- a/src/Microsoft.ML.FastTree/TreeTrainersCatalog.cs +++ b/src/Microsoft.ML.FastTree/TreeTrainersCatalog.cs @@ -9,9 +9,9 @@ namespace Microsoft.ML { /// - /// Collection of extension methods used by , - /// , , - /// and to create instances of decision tree trainers. + /// Collection of extension methods used by , , + /// , , and + /// to create instances of decision tree trainers and featurizers. /// public static class TreeExtensions { @@ -476,5 +476,21 @@ public static FastTreeBinaryFeaturizationEstimator FastTreeBinaryFeaturizing(thi var env = CatalogUtils.GetEnvironment(catalog); return new FastTreeBinaryFeaturizationEstimator(env, options); } + + public static FastTreeRankingFeaturizationEstimator FastTreeRankingFeaturizing(this TransformsCatalog catalog, + FastTreeRankingFeaturizationEstimator.Options options) + { + Contracts.CheckValue(catalog, nameof(catalog)); + var env = CatalogUtils.GetEnvironment(catalog); + return new FastTreeRankingFeaturizationEstimator(env, options); + } + + public static FastTreeTweedieFeaturizationEstimator FastTreeTweedieFeaturizing(this TransformsCatalog catalog, + FastTreeTweedieFeaturizationEstimator.Options options) + { + Contracts.CheckValue(catalog, nameof(catalog)); + var env = CatalogUtils.GetEnvironment(catalog); + return new FastTreeTweedieFeaturizationEstimator(env, options); + } } } diff --git a/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs b/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs index 8182bb49c9..5e70d7e36f 100644 --- a/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs +++ b/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs @@ -445,5 +445,73 @@ public void TestFastForestRegressionFeaturizationInPipeline() Assert.True(metrics.MeanAbsoluteError < 0.25); 
Assert.True(metrics.MeanSquaredError < 0.1); } + + [Fact] + public void TestFastTreeTweedieFeaturizationInPipeline() + { + int dataPointCount = 200; + var data = SamplesUtils.DatasetUtils.GenerateFloatLabelFloatFeatureVectorSamples(dataPointCount).ToList(); + var dataView = ML.Data.LoadFromEnumerable(data); + + var trainerOptions = new FastTreeTweedieTrainer.Options + { + NumberOfThreads = 1, + NumberOfTrees = 10, + NumberOfLeaves = 4, + MinimumExampleCountPerLeaf = 10, + FeatureColumnName = "Features", + LabelColumnName = "Label" + }; + + var options = new FastTreeTweedieFeaturizationEstimator.Options() + { + InputColumnName = "Features", + TrainerOptions = trainerOptions + }; + + var pipeline = ML.Transforms.FastTreeTweedieFeaturizing(options). + Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths")). + Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures")); + var model = pipeline.Fit(dataView); + var prediction = model.Transform(dataView); + var metrics = ML.Regression.Evaluate(prediction); + + Assert.True(metrics.MeanAbsoluteError < 0.25); + Assert.True(metrics.MeanSquaredError < 0.1); + } + + [Fact] + public void TestFastTreeRankingFeaturizationInPipeline() + { + int dataPointCount = 200; + var data = SamplesUtils.DatasetUtils.GenerateFloatLabelFloatFeatureVectorSamples(dataPointCount).ToList(); + var dataView = ML.Data.LoadFromEnumerable(data); + + var trainerOptions = new FastTreeRankingTrainer.Options + { + NumberOfThreads = 1, + NumberOfTrees = 10, + NumberOfLeaves = 4, + MinimumExampleCountPerLeaf = 10, + FeatureColumnName = "Features", + LabelColumnName = "Label" + }; + + var options = new FastTreeRankingFeaturizationEstimator.Options() + { + InputColumnName = "Features", + TrainerOptions = trainerOptions + }; + + var pipeline = ML.Transforms.FastTreeRankingFeaturizing(options). + Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths")). 
+ Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures")); + var model = pipeline.Fit(dataView); + var prediction = model.Transform(dataView); + var metrics = ML.Regression.Evaluate(prediction); + + Assert.True(metrics.MeanAbsoluteError < 0.25); + Assert.True(metrics.MeanSquaredError < 0.1); + } } } From f529f1dbd95bd4975b66446524245bd771b67d33 Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Mon, 3 Jun 2019 17:55:40 -0700 Subject: [PATCH 05/25] Some small docs --- .../TreeEnsembleFeaturizationEstimator.cs | 24 ++++++++++++++++--- .../TreeEnsembleFeaturizationTransformer.cs | 13 ++++++++++ 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationEstimator.cs b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationEstimator.cs index 75f3ce4450..655c43a732 100644 --- a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationEstimator.cs +++ b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationEstimator.cs @@ -113,11 +113,29 @@ public SchemaShape GetOutputSchema(SchemaShape inputSchema) return new SchemaShape(result.Values); } } - /// - /// A which takes a trained and calling its - /// produces a featurizer based on the trained model. + /// A which contains a pre-trained and calling its + /// produces a featurizer based on the pre-trained model. /// + /// + /// . + /// + /// This estimator outputs the following columns: + /// + /// | Output Column Name | Column Type | Description| + /// | -- | -- | -- | + /// | `Trees` | Vector of | The output values of all trees. | + /// | `Leaves` | Vector of | The IDs of all leaves where the input feature vector falls into. | + /// | `Paths` | Vector of | The paths the input feature vector passed through to reach the leaves. | + /// + /// Check the See Also section for links to usage examples. 
+ /// ]]> + /// + /// + /// public sealed class PretrainedTreeFeaturizationEstimator : FeaturizationEstimatorBase { public sealed class Options : FeaturizationEstimatorBase.CommonOptions diff --git a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationTransformer.cs b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationTransformer.cs index b1970940a8..5ea5f27508 100644 --- a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationTransformer.cs +++ b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationTransformer.cs @@ -14,6 +14,11 @@ namespace Microsoft.ML.Trainers.FastTree { + /// + /// resulting from fitting any derived class of . + /// The derived classes include, for example, and + /// . + /// public sealed class TreeEnsembleFeaturizationTransformer : PredictionTransformerBase { internal const string LoaderSignature = "TreeEnseFeat"; @@ -92,6 +97,14 @@ private TreeEnsembleFeaturizationTransformer(IHostEnvironment host, ModelLoadCon Scorer = new GenericScorer(Host, args, new EmptyDataView(Host, TrainSchema), BindableMapper.Bind(Host, schema), schema); } + /// + /// appends three columns to the . + /// The three columns are all vectors. The fist column stores the prediction values of all trees and + /// its default name is "Trees". The second column (default name: "Leaves") contains leaf IDs where the given feature vector falls into. + /// The third column (default name: "Paths") encodes the paths to those leaves via a 0-1 vector. + /// + /// of the data to be transformed. + /// of the transformed data if the input schema is . 
public override DataViewSchema GetOutputSchema(DataViewSchema inputSchema) => Transform(new EmptyDataView(Host, inputSchema)).Schema; private protected override void SaveModel(ModelSaveContext ctx) From 9c4d8011f3637201d49bb7364238d3deb18537f7 Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Tue, 4 Jun 2019 15:11:14 -0700 Subject: [PATCH 06/25] Customize output column names --- .../TreeEnsembleFeaturizationEstimator.cs | 61 ++++++++++++++----- .../TreeEnsembleFeaturizationTransformer.cs | 36 ++++++++--- .../TreeEnsembleFeaturizer.cs | 43 ++++++++++--- .../TreeEnsembleFeaturizerTest.cs | 37 +++++++---- 4 files changed, 131 insertions(+), 46 deletions(-) diff --git a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationEstimator.cs b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationEstimator.cs index 655c43a732..e95a68fa9d 100644 --- a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationEstimator.cs +++ b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationEstimator.cs @@ -19,6 +19,16 @@ namespace Microsoft.ML.Trainers.FastTree /// public abstract class FeaturizationEstimatorBase : IEstimator { + /// + /// Default values of . + /// + private static class DefaultCommonOptions + { + public static string TreesColumnName = "Trees"; + public static string LeavesColumnName = "Leaves"; + public static string PathsColumnName = "Paths"; + } + /// /// The common options of tree-based featurizations such as , , /// , , and . @@ -32,11 +42,23 @@ public class CommonOptions public string InputColumnName; /// - /// The estimator has three output columns. Their names would be "Trees" + , - /// "Leaves" + , and "Paths" + . If - /// is , the output names would be "Trees", "Leaves", and "Paths". + /// The name of the column that stores the prediction values of all trees. Its type is a vector of + /// and the i-th vector element is the prediction value predicted by the i-th tree. 
/// - public string OutputColumnsSuffix; + public string TreesColumnName; + + /// + /// The 0-1 encoding of all leaf nodes' IDs. Its type is a vector of . If the given feature + /// vector falls into the first leaf of the first tree, the first element in the 0-1 encoding would be 1. + /// + public string LeavesColumnName; + + /// + /// The 0-1 encoding of the paths to the leaves. If the path to the first tree's leaf is node 1 (2nd node in the first tree), + /// node 3 (4th node in the first tree), and node 5 (6th node in the first tree), the 2nd, 4th, and 6th element in that encoding + /// would be 1. + /// + public string PathsColumnName; }; /// @@ -46,9 +68,19 @@ public class CommonOptions private protected readonly string FeatureColumnName; /// - /// See . + /// See . + /// + private protected readonly string TreesColumnName; + + /// + /// See . + /// + private protected readonly string LeavesColumnName; + + /// + /// See . /// - private protected readonly string OutputColumnSuffix; + private protected readonly string PathsColumnName; /// /// Environment of this instance. It controls error throwing and other enviroment settings. @@ -59,7 +91,9 @@ private protected FeaturizationEstimatorBase(IHostEnvironment env, CommonOptions { Env = env; FeatureColumnName = options.InputColumnName; - OutputColumnSuffix = options.OutputColumnsSuffix; + TreesColumnName = options.TreesColumnName ?? DefaultCommonOptions.TreesColumnName; + LeavesColumnName = options.LeavesColumnName ?? DefaultCommonOptions.LeavesColumnName; + PathsColumnName = options.PathsColumnName ?? 
DefaultCommonOptions.PathsColumnName; } /// @@ -77,8 +111,8 @@ private protected FeaturizationEstimatorBase(IHostEnvironment env, CommonOptions public TreeEnsembleFeaturizationTransformer Fit(IDataView input) { var model = PrepareModel(input); - return new TreeEnsembleFeaturizationTransformer(Env, input.Schema, - input.Schema[FeatureColumnName], model, OutputColumnSuffix); + return new TreeEnsembleFeaturizationTransformer(Env, input.Schema, input.Schema[FeatureColumnName], model, + TreesColumnName, LeavesColumnName, PathsColumnName); } /// @@ -98,16 +132,13 @@ public SchemaShape GetOutputSchema(SchemaShape inputSchema) var result = inputSchema.ToDictionary(x => x.Name); - var treeColumnName = OutputColumnSuffix != null ? OutputColumnSuffix + "Trees" : "Trees"; - result[treeColumnName] = new SchemaShape.Column(treeColumnName, + result[TreesColumnName] = new SchemaShape.Column(TreesColumnName, SchemaShape.Column.VectorKind.Vector, NumberDataViewType.Single, false); - var leafColumnName = OutputColumnSuffix != null ? OutputColumnSuffix + "Leaves" : "Leaves"; - result[leafColumnName] = new SchemaShape.Column(leafColumnName, + result[LeavesColumnName] = new SchemaShape.Column(LeavesColumnName, SchemaShape.Column.VectorKind.Vector, NumberDataViewType.Single, false); - var pathColumnName = OutputColumnSuffix != null ? 
OutputColumnSuffix + "Paths" : "Paths"; - result[pathColumnName] = new SchemaShape.Column(pathColumnName, + result[PathsColumnName] = new SchemaShape.Column(PathsColumnName, SchemaShape.Column.VectorKind.Vector, NumberDataViewType.Single, false); return new SchemaShape(result.Values); diff --git a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationTransformer.cs b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationTransformer.cs index 5ea5f27508..44e6b3468f 100644 --- a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationTransformer.cs +++ b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationTransformer.cs @@ -24,8 +24,18 @@ public sealed class TreeEnsembleFeaturizationTransformer : PredictionTransformer internal const string LoaderSignature = "TreeEnseFeat"; private readonly TreeEnsembleFeaturizerBindableMapper.Arguments _scorerArgs; private readonly DataViewSchema.DetachedColumn _featureDetachedColumn; - private readonly string _outputColumnSuffix; - + /// + /// See . + /// + private readonly string _treesColumnName; + /// + /// See . + /// + private readonly string _leavesColumnName; + /// + /// See . + /// + private readonly string _pathsColumnName; /// /// Check if is compatible with . 
/// @@ -57,7 +67,8 @@ private RoleMappedSchema MakeFeatureRoleMappedSchema(DataViewSchema schema) } internal TreeEnsembleFeaturizationTransformer(IHostEnvironment env, DataViewSchema inputSchema, - DataViewSchema.Column featureColumn, TreeEnsembleModelParameters modelParameters, string outputColumnNameSuffix=null) : + DataViewSchema.Column featureColumn, TreeEnsembleModelParameters modelParameters, + string treesColumnName, string leavesColumnName, string pathsColumnName) : base(Contracts.CheckRef(env, nameof(env)).Register(nameof(TreeEnsembleFeaturizationTransformer)), modelParameters, inputSchema) { // Store featureColumn as a detached column because a fitted transformer can be applied to different IDataViews and different @@ -66,10 +77,13 @@ internal TreeEnsembleFeaturizationTransformer(IHostEnvironment env, DataViewSche // Check if featureColumn matches a column in inputSchema. The answer is yes if they have the same name and type. // The indexed column, inputSchema[featureColumn.Index], should match the detached column, _featureDetachedColumn. CheckFeatureColumnCompatibility(inputSchema[featureColumn.Index]); - // Store outputColumnNameSuffix so that this transformer can be saved into a file later. - _outputColumnSuffix = outputColumnNameSuffix; - // Create an argument, _scorerArgs, to pass the suffix of output column names to the underlying scorer. - _scorerArgs = new TreeEnsembleFeaturizerBindableMapper.Arguments { Suffix = _outputColumnSuffix }; + // Store output column names so that this transformer can be saved into a file later. + _treesColumnName = treesColumnName; + _leavesColumnName = leavesColumnName; + _pathsColumnName = pathsColumnName; + // Create an argument, _scorerArgs, to pass the output column names to the underlying scorer. + _scorerArgs = new TreeEnsembleFeaturizerBindableMapper.Arguments { + TreesColumnName = _treesColumnName, LeavesColumnName = _leavesColumnName, PathsColumnName = _pathsColumnName }; // Create a bindable mapper. 
It provides the core computation and can be attached to any IDataView and produce // a transformed IDataView. BindableMapper = new TreeEnsembleFeaturizerBindableMapper(env, _scorerArgs, modelParameters); @@ -88,7 +102,9 @@ private TreeEnsembleFeaturizationTransformer(IHostEnvironment host, ModelLoadCon string featureColumnName = ctx.LoadString(); _featureDetachedColumn = new DataViewSchema.DetachedColumn(TrainSchema[featureColumnName]); - _outputColumnSuffix = ctx.LoadStringOrNull(); + _treesColumnName = ctx.LoadString(); + _leavesColumnName = ctx.LoadString(); + _pathsColumnName = ctx.LoadString(); BindableMapper = ScoreUtils.GetSchemaBindableMapper(Host, Model); @@ -131,7 +147,9 @@ private protected override void SaveModel(ModelSaveContext ctx) }); ctx.SaveString(_featureDetachedColumn.Name); - ctx.SaveStringOrNull(_outputColumnSuffix); + ctx.SaveString(_treesColumnName); + ctx.SaveString(_leavesColumnName); + ctx.SaveString(_pathsColumnName); } private static VersionInfo GetVersionInfo() diff --git a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizer.cs b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizer.cs index 38cbeda4f2..bc4d121268 100644 --- a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizer.cs +++ b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizer.cs @@ -44,7 +44,7 @@ namespace Microsoft.ML.Data /// internal sealed class TreeEnsembleFeaturizerBindableMapper : ISchemaBindableMapper, ICanSaveModel { - public static class OutputColumnNames + private static class OutputColumnNames { public const string Trees = "Trees"; public const string Paths = "Paths"; @@ -53,6 +53,9 @@ public static class OutputColumnNames public sealed class Arguments : ScorerArgumentsBase { + public string TreesColumnName; + public string LeavesColumnName; + public string PathsColumnName; } private sealed class BoundMapper : ISchemaBoundRowMapper @@ -81,8 +84,8 @@ private sealed class BoundMapper : ISchemaBoundRowMapper public ISchemaBindableMapper Bindable => _owner; - public 
BoundMapper(IExceptionContext ectx, TreeEnsembleFeaturizerBindableMapper owner, - RoleMappedSchema schema) + public BoundMapper(IExceptionContext ectx, TreeEnsembleFeaturizerBindableMapper owner, RoleMappedSchema schema, + string treesColumnName=OutputColumnNames.Trees, string leavesColumnName=OutputColumnNames.Leaves, string pathsColumnName=OutputColumnNames.Paths) { Contracts.AssertValue(ectx); ectx.AssertValue(owner); @@ -116,7 +119,7 @@ public BoundMapper(IExceptionContext ectx, TreeEnsembleFeaturizerBindableMapper treeIdMetadataBuilder.Add(AnnotationUtils.Kinds.SlotNames, AnnotationUtils.GetNamesType(treeValueType.Size), (ValueGetter>>)owner.GetTreeSlotNames); // Add the column of trees' output values - schemaBuilder.AddColumn(OutputColumnNames.Trees, treeValueType, treeIdMetadataBuilder.ToAnnotations()); + schemaBuilder.AddColumn(treesColumnName, treeValueType, treeIdMetadataBuilder.ToAnnotations()); // Metadata of leaf IDs. var leafIdMetadataBuilder = new DataViewSchema.Annotations.Builder(); @@ -124,7 +127,7 @@ public BoundMapper(IExceptionContext ectx, TreeEnsembleFeaturizerBindableMapper (ValueGetter>>)owner.GetLeafSlotNames); leafIdMetadataBuilder.Add(AnnotationUtils.Kinds.IsNormalized, BooleanDataViewType.Instance, (ref bool value) => value = true); // Add the column of leaves' IDs where the input example reaches. - schemaBuilder.AddColumn(OutputColumnNames.Leaves, leafIdType, leafIdMetadataBuilder.ToAnnotations()); + schemaBuilder.AddColumn(leavesColumnName, leafIdType, leafIdMetadataBuilder.ToAnnotations()); // Metadata of path IDs. var pathIdMetadataBuilder = new DataViewSchema.Annotations.Builder(); @@ -132,16 +135,16 @@ public BoundMapper(IExceptionContext ectx, TreeEnsembleFeaturizerBindableMapper (ValueGetter>>)owner.GetPathSlotNames); pathIdMetadataBuilder.Add(AnnotationUtils.Kinds.IsNormalized, BooleanDataViewType.Instance, (ref bool value) => value = true); // Add the column of encoded paths which the input example passes. 
- schemaBuilder.AddColumn(OutputColumnNames.Paths, pathIdType, pathIdMetadataBuilder.ToAnnotations()); + schemaBuilder.AddColumn(pathsColumnName, pathIdType, pathIdMetadataBuilder.ToAnnotations()); OutputSchema = schemaBuilder.ToSchema(); // Tree values must be the first output column. - Contracts.Assert(OutputSchema[OutputColumnNames.Trees].Index == TreeValuesColumnId); + Contracts.Assert(OutputSchema[treesColumnName].Index == TreeValuesColumnId); // leaf IDs must be the second output column. - Contracts.Assert(OutputSchema[OutputColumnNames.Leaves].Index == LeafIdsColumnId); + Contracts.Assert(OutputSchema[leavesColumnName].Index == LeafIdsColumnId); // Path IDs must be the third output column. - Contracts.Assert(OutputSchema[OutputColumnNames.Paths].Index == PathIdsColumnId); + Contracts.Assert(OutputSchema[pathsColumnName].Index == PathIdsColumnId); } DataViewRow ISchemaBoundRowMapper.GetRow(DataViewRow input, IEnumerable activeColumns) @@ -360,6 +363,9 @@ private static VersionInfo GetVersionInfo() private readonly IHost _host; private readonly TreeEnsembleModelParameters _ensemble; private readonly int _totalLeafCount; + private readonly string _treesColumnName; + private readonly string _leavesColumnName; + private readonly string _pathsColumnName; public TreeEnsembleFeaturizerBindableMapper(IHostEnvironment env, Arguments args, IPredictor predictor) { @@ -368,6 +374,11 @@ public TreeEnsembleFeaturizerBindableMapper(IHostEnvironment env, Arguments args _host.CheckValue(args, nameof(args)); _host.CheckValue(predictor, nameof(predictor)); + // Store output columns specified by the user. + _treesColumnName = args.TreesColumnName; + _leavesColumnName = args.LeavesColumnName; + _pathsColumnName = args.PathsColumnName; + // This function accepts models trained by FastTreeTrainer family. There are four types that "predictor" can be. // 1. CalibratedPredictorBase // 2. 
FastTreeRankingModelParameters @@ -390,9 +401,15 @@ public TreeEnsembleFeaturizerBindableMapper(IHostEnvironment env, ModelLoadConte // *** Binary format *** // ensemble + // string: treesColumnName + // string: leavesColumnName + // string: pathsColumnName ctx.LoadModel(env, out _ensemble, "Ensemble"); _totalLeafCount = CountLeaves(_ensemble); + _treesColumnName = ctx.LoadString(); + _leavesColumnName = ctx.LoadString(); + _pathsColumnName = ctx.LoadString(); } void ICanSaveModel.Save(ModelSaveContext ctx) @@ -403,9 +420,15 @@ void ICanSaveModel.Save(ModelSaveContext ctx) // *** Binary format *** // ensemble + // string: treesColumnName + // string: leavesColumnName + // string: pathsColumnName _host.AssertValue(_ensemble); ctx.SaveModel(_ensemble, "Ensemble"); + ctx.SaveString(_treesColumnName); + ctx.SaveString(_leavesColumnName); + ctx.SaveString(_pathsColumnName); } private static int CountLeaves(TreeEnsembleModelParameters ensemble) @@ -474,7 +497,7 @@ ISchemaBoundMapper ISchemaBindableMapper.Bind(IHostEnvironment env, RoleMappedSc env.AssertValue(schema); env.CheckParam(schema.Feature != null, nameof(schema), "Need a feature column"); - return new BoundMapper(env, this, schema); + return new BoundMapper(env, this, schema, _treesColumnName, _leavesColumnName, _pathsColumnName); } } diff --git a/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs b/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs index 5e70d7e36f..44286e7ea8 100644 --- a/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs +++ b/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs @@ -33,7 +33,12 @@ public void TreeEnsembleFeaturizerOutputSchemaTest() var model = trainer.Fit(dataView); // From the trained tree model, a mapper of tree featurizer is created. 
- var treeFeaturizer = new TreeEnsembleFeaturizerBindableMapper(Env, new TreeEnsembleFeaturizerBindableMapper.Arguments(), model.Model); + const string treesColumnName = "MyTrees"; + const string leavesColumnName = "MyLeaves"; + const string pathsColumnName = "MyPaths"; + var args = new TreeEnsembleFeaturizerBindableMapper.Arguments() { + TreesColumnName = treesColumnName, LeavesColumnName = leavesColumnName, PathsColumnName = pathsColumnName }; + var treeFeaturizer = new TreeEnsembleFeaturizerBindableMapper(Env, args, model.Model); // To get output schema, we need to create RoleMappedSchema for calling Bind(...). var roleMappedSchema = new RoleMappedSchema(dataView.Schema, @@ -47,7 +52,7 @@ public void TreeEnsembleFeaturizerOutputSchemaTest() { // Check if output schema is correct. var treeValuesColumn = outputSchema[0]; - Assert.Equal("Trees", treeValuesColumn.Name); + Assert.Equal(treesColumnName, treeValuesColumn.Name); VectorDataViewType treeValuesType = treeValuesColumn.Type as VectorDataViewType; Assert.NotNull(treeValuesType); Assert.Equal(NumberDataViewType.Single, treeValuesType.ItemType); @@ -65,7 +70,7 @@ public void TreeEnsembleFeaturizerOutputSchemaTest() { var treeLeafIdsColumn = outputSchema[1]; // Check column of tree leaf IDs. - Assert.Equal("Leaves", treeLeafIdsColumn.Name); + Assert.Equal(leavesColumnName, treeLeafIdsColumn.Name); VectorDataViewType treeLeafIdsType = treeLeafIdsColumn.Type as VectorDataViewType; Assert.NotNull(treeLeafIdsType); Assert.Equal(NumberDataViewType.Single, treeLeafIdsType.ItemType); @@ -88,7 +93,7 @@ public void TreeEnsembleFeaturizerOutputSchemaTest() { var treePathIdsColumn = outputSchema[2]; // Check column of path IDs. 
- Assert.Equal("Paths", treePathIdsColumn.Name); + Assert.Equal(pathsColumnName, treePathIdsColumn.Name); VectorDataViewType treePathIdsType = treePathIdsColumn.Type as VectorDataViewType; Assert.NotNull(treePathIdsType); Assert.Equal(NumberDataViewType.Single, treePathIdsType.ItemType); @@ -133,16 +138,20 @@ public void TreeEnsembleFeaturizerTransformerFastTreeBinary() var predicted = model.Transform(dataView); // From the trained tree model, a mapper of tree featurizer is created. - var treeFeaturizer = new TreeEnsembleFeaturizationTransformer(ML, dataView.Schema, dataView.Schema["Features"], model.Model.SubModel); + const string treesColumnName = "MyTrees"; + const string leavesColumnName = "MyLeaves"; + const string pathsColumnName = "MyPaths"; + var treeFeaturizer = new TreeEnsembleFeaturizationTransformer(ML, dataView.Schema, dataView.Schema["Features"], model.Model.SubModel, + treesColumnName: treesColumnName, leavesColumnName: leavesColumnName, pathsColumnName: pathsColumnName); // Apply TreeEnsembleFeaturizer to the input data. var transformed = treeFeaturizer.Transform(dataView); // Extract the outputs of TreeEnsembleFeaturizer. var features = transformed.GetColumn("Features").ToArray(); - var leafValues = transformed.GetColumn("Trees").ToArray(); - var leafIds = transformed.GetColumn("Leaves").ToArray(); - var paths = transformed.GetColumn("Paths").ToArray(); + var leafValues = transformed.GetColumn(treesColumnName).ToArray(); + var leafIds = transformed.GetColumn(leavesColumnName).ToArray(); + var paths = transformed.GetColumn(pathsColumnName).ToArray(); // Check if the TreeEnsembleFeaturizer produce expected values. List path = null; @@ -180,16 +189,20 @@ public void TreeEnsembleFeaturizerTransformerFastForestBinary() var model = trainer.Fit(dataView); // From the trained tree model, a mapper of tree featurizer is created. 
- var treeFeaturizer = new TreeEnsembleFeaturizationTransformer(ML, dataView.Schema, dataView.Schema["Features"], model.Model); + const string treesColumnName = "MyTrees"; + const string leavesColumnName = "MyLeaves"; + const string pathsColumnName = "MyPaths"; + var treeFeaturizer = new TreeEnsembleFeaturizationTransformer(ML, dataView.Schema, dataView.Schema["Features"], model.Model, + treesColumnName: treesColumnName, leavesColumnName: leavesColumnName, pathsColumnName: pathsColumnName); // Apply TreeEnsembleFeaturizer to the input data. var transformed = treeFeaturizer.Transform(dataView); // Extract the outputs of TreeEnsembleFeaturizer. var features = transformed.GetColumn("Features").ToArray(); - var leafValues = transformed.GetColumn("Trees").ToArray(); - var leafIds = transformed.GetColumn("Leaves").ToArray(); - var paths = transformed.GetColumn("Paths").ToArray(); + var leafValues = transformed.GetColumn(treesColumnName).ToArray(); + var leafIds = transformed.GetColumn(leavesColumnName).ToArray(); + var paths = transformed.GetColumn(pathsColumnName).ToArray(); // Check if the TreeEnsembleFeaturizer produce expected values. 
List path = null; From e7b84dd53e246b3d42abc34317877a1728fa5e86 Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Tue, 4 Jun 2019 16:10:29 -0700 Subject: [PATCH 07/25] Fix save and load --- .../TreeEnsembleFeaturizationTransformer.cs | 31 +++-- .../TreeEnsembleFeaturizerTest.cs | 125 ++++++++++++++++++ 2 files changed, 148 insertions(+), 8 deletions(-) diff --git a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationTransformer.cs b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationTransformer.cs index 44e6b3468f..3d95048653 100644 --- a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationTransformer.cs +++ b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationTransformer.cs @@ -74,19 +74,24 @@ internal TreeEnsembleFeaturizationTransformer(IHostEnvironment env, DataViewSche // Store featureColumn as a detached column because a fitted transformer can be applied to different IDataViews and different // IDataView may have different schemas. _featureDetachedColumn = new DataViewSchema.DetachedColumn(featureColumn); + // Check if featureColumn matches a column in inputSchema. The answer is yes if they have the same name and type. // The indexed column, inputSchema[featureColumn.Index], should match the detached column, _featureDetachedColumn. CheckFeatureColumnCompatibility(inputSchema[featureColumn.Index]); + // Store output column names so that this transformer can be saved into a file later. _treesColumnName = treesColumnName; _leavesColumnName = leavesColumnName; _pathsColumnName = pathsColumnName; + // Create an argument, _scorerArgs, to pass the output column names to the underlying scorer. _scorerArgs = new TreeEnsembleFeaturizerBindableMapper.Arguments { TreesColumnName = _treesColumnName, LeavesColumnName = _leavesColumnName, PathsColumnName = _pathsColumnName }; + // Create a bindable mapper. It provides the core computation and can be attached to any IDataView and produce // a transformed IDataView. 
BindableMapper = new TreeEnsembleFeaturizerBindableMapper(env, _scorerArgs, modelParameters); + // Create a scorer. var roleMappedSchema = MakeFeatureRoleMappedSchema(inputSchema); Scorer = new GenericScorer(Host, _scorerArgs, new EmptyDataView(Host, inputSchema), BindableMapper.Bind(Host, roleMappedSchema), roleMappedSchema); @@ -98,19 +103,28 @@ private TreeEnsembleFeaturizationTransformer(IHostEnvironment host, ModelLoadCon // *** Binary format *** // // string: feature column's name. - // string: output columns' suffix. + // string: the name of the columns where tree prediction values are stored. + // string: the name of the columns where trees' leave are stored. + // string: the name of the columns where trees' paths are stored. + // Load stored fields. string featureColumnName = ctx.LoadString(); _featureDetachedColumn = new DataViewSchema.DetachedColumn(TrainSchema[featureColumnName]); _treesColumnName = ctx.LoadString(); _leavesColumnName = ctx.LoadString(); _pathsColumnName = ctx.LoadString(); - BindableMapper = ScoreUtils.GetSchemaBindableMapper(Host, Model); + // Create an argument to specify output columns' names of this transformer. + _scorerArgs = new TreeEnsembleFeaturizerBindableMapper.Arguments { + TreesColumnName = _treesColumnName, LeavesColumnName = _leavesColumnName, PathsColumnName = _pathsColumnName }; - var args = new GenericScorer.Arguments { Suffix = "" }; - var schema = MakeFeatureRoleMappedSchema(TrainSchema); - Scorer = new GenericScorer(Host, args, new EmptyDataView(Host, TrainSchema), BindableMapper.Bind(Host, schema), schema); + // Create a bindable mapper. It provides the core computation and can be attached to any IDataView and produce + // a transformed IDataView. + BindableMapper = new TreeEnsembleFeaturizerBindableMapper(host, _scorerArgs, Model); + + // Create a scorer. 
+ var roleMappedSchema = MakeFeatureRoleMappedSchema(TrainSchema); + Scorer = new GenericScorer(Host, _scorerArgs, new EmptyDataView(Host, TrainSchema), BindableMapper.Bind(Host, roleMappedSchema), roleMappedSchema); } /// @@ -132,9 +146,10 @@ private protected override void SaveModel(ModelSaveContext ctx) // *** Binary format *** // model: prediction model. // stream: empty data view that contains train schema. - // ids of strings: feature columns. - // float: scorer threshold - // id of string: scorer threshold column + // string: feature column name. + // string: the name of the columns where tree prediction values are stored. + // string: the name of the columns where trees' leave are stored. + // string: the name of the columns where trees' paths are stored. ctx.SaveModel(Model, DirModel); ctx.SaveBinaryStream(DirTransSchema, writer => diff --git a/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs b/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs index 44286e7ea8..689c45f41c 100644 --- a/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs +++ b/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs @@ -4,6 +4,7 @@ using System; using System.Collections.Generic; +using System.IO; using System.Linq; using Microsoft.ML.Data; using Microsoft.ML.Trainers.FastTree; @@ -526,5 +527,129 @@ public void TestFastTreeRankingFeaturizationInPipeline() Assert.True(metrics.MeanAbsoluteError < 0.25); Assert.True(metrics.MeanSquaredError < 0.1); } + + [Fact] + public void TestSaveAndLoadTreeFeaturizer() + { + int dataPointCount = 200; + var data = SamplesUtils.DatasetUtils.GenerateFloatLabelFloatFeatureVectorSamples(dataPointCount).ToList(); + var dataView = ML.Data.LoadFromEnumerable(data); + + var trainerOptions = new FastForestRegressionTrainer.Options + { + NumberOfThreads = 1, + NumberOfTrees = 10, + NumberOfLeaves = 4, + MinimumExampleCountPerLeaf = 10, + FeatureColumnName = "Features", + 
LabelColumnName = "Label" + }; + + var options = new FastForestRegressionFeaturizationEstimator.Options() + { + InputColumnName = "Features", + TrainerOptions = trainerOptions + }; + + var pipeline = ML.Transforms.FastForestRegressionFeaturizing(options). + Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths")). + Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures")); + var model = pipeline.Fit(dataView); + var prediction = model.Transform(dataView); + var metrics = ML.Regression.Evaluate(prediction); + + Assert.True(metrics.MeanAbsoluteError < 0.25); + Assert.True(metrics.MeanSquaredError < 0.1); + + // Save the trained model into file. + ITransformer loadedModel = null; + var tempPath = Path.GetTempFileName(); + using (var file = new SimpleFileHandle(Env, tempPath, true, true)) + { + using (var fs = file.CreateWriteStream()) + ML.Model.Save(model, null, fs); + + using (var fs = file.OpenReadStream()) + loadedModel = ML.Model.Load(fs, out var schema); + } + var loadedPrediction = loadedModel.Transform(dataView); + var loadedMetrics = ML.Regression.Evaluate(loadedPrediction); + + Assert.Equal(metrics.MeanAbsoluteError, loadedMetrics.MeanAbsoluteError); + Assert.Equal(metrics.MeanSquaredError, loadedMetrics.MeanSquaredError); + } + + [Fact] + public void TestSaveAndLoadDoubleTreeFeaturizer() + { + int dataPointCount = 200; + var data = SamplesUtils.DatasetUtils.GenerateFloatLabelFloatFeatureVectorSamples(dataPointCount).ToList(); + var dataView = ML.Data.LoadFromEnumerable(data); + + var trainerOptions = new FastForestRegressionTrainer.Options + { + NumberOfThreads = 1, + NumberOfTrees = 10, + NumberOfLeaves = 4, + MinimumExampleCountPerLeaf = 10, + FeatureColumnName = "Features", + LabelColumnName = "Label" + }; + + // Trains tree featurization on "Features" and applies on "CopiedFeatures". 
+ var options = new FastForestRegressionFeaturizationEstimator.Options() + { + InputColumnName = "CopiedFeatures", + TrainerOptions = trainerOptions, + TreesColumnName = "OhMyTrees", + LeavesColumnName = "OhMyLeaves", + PathsColumnName = "OhMyPaths" + }; + + var pipeline = ML.Transforms.CopyColumns("CopiedFeatures", "Features"). + Append(ML.Transforms.FastForestRegressionFeaturizing(options)). + Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "OhMyTrees", "OhMyLeaves", "OhMyPaths")). + Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures")); + var model = pipeline.Fit(dataView); + var prediction = model.Transform(dataView); + var metrics = ML.Regression.Evaluate(prediction); + + Assert.True(metrics.MeanAbsoluteError < 0.25); + Assert.True(metrics.MeanSquaredError < 0.1); + + // Save the trained model into file and then load it back. + ITransformer loadedModel = null; + var tempPath = Path.GetTempFileName(); + using (var file = new SimpleFileHandle(Env, tempPath, true, true)) + { + using (var fs = file.CreateWriteStream()) + ML.Model.Save(model, null, fs); + + using (var fs = file.OpenReadStream()) + loadedModel = ML.Model.Load(fs, out var schema); + } + + // Compute prediction using the loaded model. + var loadedPrediction = loadedModel.Transform(dataView); + var loadedMetrics = ML.Regression.Evaluate(loadedPrediction); + + // Check if the loaded model produces the same result as the trained model. + Assert.Equal(metrics.MeanAbsoluteError, loadedMetrics.MeanAbsoluteError); + Assert.Equal(metrics.MeanSquaredError, loadedMetrics.MeanSquaredError); + + var secondPipeline = ML.Transforms.CopyColumns("CopiedFeatures", "Features"). + Append(ML.Transforms.NormalizeBinning("CopiedFeatures")). + Append(ML.Transforms.FastForestRegressionFeaturizing(options)). + Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "OhMyTrees", "OhMyLeaves", "OhMyPaths")). 
+ Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures")); + var secondModel = secondPipeline.Fit(dataView); + var secondPrediction = secondModel.Transform(dataView); + var secondMetrics = ML.Regression.Evaluate(secondPrediction); + + // The second pipeline trains a tree featurizer on a bin-based normalized feature, so the second pipeline + // is different from the first pipeline. + Assert.NotEqual(metrics.MeanAbsoluteError, secondMetrics.MeanAbsoluteError); + Assert.NotEqual(metrics.MeanSquaredError, secondMetrics.MeanSquaredError); + } } } From 5d8215af710c634b485d89bdaaea48ac9ee9aa15 Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Tue, 4 Jun 2019 19:46:31 -0700 Subject: [PATCH 08/25] Optional output columns --- .../TreeEnsembleFeaturizationEstimator.cs | 32 ++-- .../TreeEnsembleFeaturizationTransformer.cs | 12 +- .../TreeEnsembleFeaturizer.cs | 169 ++++++++++-------- .../TreeEnsembleFeaturizerTest.cs | 83 ++++++++- 4 files changed, 202 insertions(+), 94 deletions(-) diff --git a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationEstimator.cs b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationEstimator.cs index e95a68fa9d..d802961bb1 100644 --- a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationEstimator.cs +++ b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationEstimator.cs @@ -44,12 +44,14 @@ public class CommonOptions /// /// The name of the column that stores the prediction values of all trees. Its type is a vector of /// and the i-th vector element is the prediction value predicted by the i-th tree. + /// If is , this output column may not be generated. /// public string TreesColumnName; /// /// The 0-1 encoding of all leaf nodes' IDs. Its type is a vector of . If the given feature /// vector falls into the first leaf of the first tree, the first element in the 0-1 encoding would be 1. + /// If is , this output column may not be generated. 
/// public string LeavesColumnName; @@ -57,6 +59,7 @@ public class CommonOptions /// The 0-1 encoding of the paths to the leaves. If the path to the first tree's leaf is node 1 (2nd node in the first tree), /// node 3 (4th node in the first tree), and node 5 (6th node in the first tree), the 2nd, 4th, and 6th element in that encoding /// would be 1. + /// If is , this output column may not be generated. /// public string PathsColumnName; }; @@ -83,17 +86,23 @@ public class CommonOptions private protected readonly string PathsColumnName; /// - /// Environment of this instance. It controls error throwing and other enviroment settings. + /// Environment of this instance. It controls error throwing and other environment settings. /// private protected readonly IHostEnvironment Env; private protected FeaturizationEstimatorBase(IHostEnvironment env, CommonOptions options) { Env = env; + if (options.InputColumnName == null) + throw Env.Except(nameof(options), "The " + nameof(options.InputColumnName) + " cannot be null."); + if (options.TreesColumnName == null && options.LeavesColumnName == null && options.PathsColumnName == null) + throw Env.Except($"{nameof(CommonOptions.TreesColumnName)}, {nameof(CommonOptions.LeavesColumnName)}, and {nameof(CommonOptions.PathsColumnName)} cannot be all null at the same time. " + + $"At least one output column name should be provided so that at least one output column may be generated."); + FeatureColumnName = options.InputColumnName; - TreesColumnName = options.TreesColumnName ?? DefaultCommonOptions.TreesColumnName; - LeavesColumnName = options.LeavesColumnName ?? DefaultCommonOptions.LeavesColumnName; - PathsColumnName = options.PathsColumnName ?? 
DefaultCommonOptions.PathsColumnName; + TreesColumnName = options.TreesColumnName; + LeavesColumnName = options.LeavesColumnName; + PathsColumnName = options.PathsColumnName; } /// @@ -132,14 +141,17 @@ public SchemaShape GetOutputSchema(SchemaShape inputSchema) var result = inputSchema.ToDictionary(x => x.Name); - result[TreesColumnName] = new SchemaShape.Column(TreesColumnName, - SchemaShape.Column.VectorKind.Vector, NumberDataViewType.Single, false); + if (TreesColumnName != null) + result[TreesColumnName] = new SchemaShape.Column(TreesColumnName, + SchemaShape.Column.VectorKind.Vector, NumberDataViewType.Single, false); - result[LeavesColumnName] = new SchemaShape.Column(LeavesColumnName, - SchemaShape.Column.VectorKind.Vector, NumberDataViewType.Single, false); + if (LeavesColumnName != null) + result[LeavesColumnName] = new SchemaShape.Column(LeavesColumnName, + SchemaShape.Column.VectorKind.Vector, NumberDataViewType.Single, false); - result[PathsColumnName] = new SchemaShape.Column(PathsColumnName, - SchemaShape.Column.VectorKind.Vector, NumberDataViewType.Single, false); + if (PathsColumnName != null) + result[PathsColumnName] = new SchemaShape.Column(PathsColumnName, + SchemaShape.Column.VectorKind.Vector, NumberDataViewType.Single, false); return new SchemaShape(result.Values); } diff --git a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationTransformer.cs b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationTransformer.cs index 3d95048653..77900f7488 100644 --- a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationTransformer.cs +++ b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationTransformer.cs @@ -110,9 +110,9 @@ private TreeEnsembleFeaturizationTransformer(IHostEnvironment host, ModelLoadCon // Load stored fields. 
string featureColumnName = ctx.LoadString(); _featureDetachedColumn = new DataViewSchema.DetachedColumn(TrainSchema[featureColumnName]); - _treesColumnName = ctx.LoadString(); - _leavesColumnName = ctx.LoadString(); - _pathsColumnName = ctx.LoadString(); + _treesColumnName = ctx.LoadStringOrNull(); + _leavesColumnName = ctx.LoadStringOrNull(); + _pathsColumnName = ctx.LoadStringOrNull(); // Create an argument to specify output columns' names of this transformer. _scorerArgs = new TreeEnsembleFeaturizerBindableMapper.Arguments { @@ -162,9 +162,9 @@ private protected override void SaveModel(ModelSaveContext ctx) }); ctx.SaveString(_featureDetachedColumn.Name); - ctx.SaveString(_treesColumnName); - ctx.SaveString(_leavesColumnName); - ctx.SaveString(_pathsColumnName); + ctx.SaveStringOrNull(_treesColumnName); + ctx.SaveStringOrNull(_leavesColumnName); + ctx.SaveStringOrNull(_pathsColumnName); } private static VersionInfo GetVersionInfo() diff --git a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizer.cs b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizer.cs index bc4d121268..0be2dadfcf 100644 --- a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizer.cs +++ b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizer.cs @@ -44,13 +44,6 @@ namespace Microsoft.ML.Data /// internal sealed class TreeEnsembleFeaturizerBindableMapper : ISchemaBindableMapper, ICanSaveModel { - private static class OutputColumnNames - { - public const string Trees = "Trees"; - public const string Paths = "Paths"; - public const string Leaves = "Leaves"; - } - public sealed class Arguments : ScorerArgumentsBase { public string TreesColumnName; @@ -60,32 +53,43 @@ public sealed class Arguments : ScorerArgumentsBase private sealed class BoundMapper : ISchemaBoundRowMapper { + public RoleMappedSchema InputRoleMappedSchema { get; } + public DataViewSchema InputSchema => InputRoleMappedSchema.Schema; + public DataViewSchema OutputSchema { get; } + public ISchemaBindableMapper Bindable => _owner; + + private 
readonly TreeEnsembleFeaturizerBindableMapper _owner; + private readonly IExceptionContext _ectx; + /// - /// Column index of values predicted by all trees in an ensemble in . + /// Feature vector to be mapped to tree-based features. /// - private const int TreeValuesColumnId = 0; + private DataViewSchema.Column FeatureColumn => InputRoleMappedSchema.Feature.Value; + /// - /// Column index of leaf IDs containing the considered example in . + /// The name of the column that stores the prediction values of all trees. Its type is a vector of + /// and the i-th vector element is the prediction value predicted by the i-th tree. + /// If is , this output column may not be generated. /// - private const int LeafIdsColumnId = 1; + private string _treesColumnName; + /// - /// Column index of path IDs which specify the paths the considered example passing through per tree in . + /// The 0-1 encoding of all leaf nodes' IDs. Its type is a vector of . If the given feature + /// vector falls into the first leaf of the first tree, the first element in the 0-1 encoding would be 1. + /// If is , this output column may not be generated. /// - private const int PathIdsColumnId = 2; - - private readonly TreeEnsembleFeaturizerBindableMapper _owner; - private readonly IExceptionContext _ectx; + private string _leavesColumnName; - public RoleMappedSchema InputRoleMappedSchema { get; } - - public DataViewSchema InputSchema => InputRoleMappedSchema.Schema; - public DataViewSchema OutputSchema { get; } - private DataViewSchema.Column FeatureColumn => InputRoleMappedSchema.Feature.Value; - - public ISchemaBindableMapper Bindable => _owner; + /// + /// The 0-1 encoding of the paths to the leaves. If the path to the first tree's leaf is node 1 (2nd node in the first tree), + /// node 3 (4th node in the first tree), and node 5 (6th node in the first tree), the 2nd, 4th, and 6th element in that encoding + /// would be 1. + /// If is , this output column may not be generated. 
+ /// + private string _pathsColumnName; public BoundMapper(IExceptionContext ectx, TreeEnsembleFeaturizerBindableMapper owner, RoleMappedSchema schema, - string treesColumnName=OutputColumnNames.Trees, string leavesColumnName=OutputColumnNames.Leaves, string pathsColumnName=OutputColumnNames.Paths) + string treesColumnName, string leavesColumnName, string pathsColumnName) { Contracts.AssertValue(ectx); ectx.AssertValue(owner); @@ -114,37 +118,45 @@ public BoundMapper(IExceptionContext ectx, TreeEnsembleFeaturizerBindableMapper // Start creating output schema with types derived above. var schemaBuilder = new DataViewSchema.Builder(); - // Metadata of tree values. - var treeIdMetadataBuilder = new DataViewSchema.Annotations.Builder(); - treeIdMetadataBuilder.Add(AnnotationUtils.Kinds.SlotNames, AnnotationUtils.GetNamesType(treeValueType.Size), - (ValueGetter>>)owner.GetTreeSlotNames); - // Add the column of trees' output values - schemaBuilder.AddColumn(treesColumnName, treeValueType, treeIdMetadataBuilder.ToAnnotations()); - - // Metadata of leaf IDs. - var leafIdMetadataBuilder = new DataViewSchema.Annotations.Builder(); - leafIdMetadataBuilder.Add(AnnotationUtils.Kinds.SlotNames, AnnotationUtils.GetNamesType(leafIdType.Size), - (ValueGetter>>)owner.GetLeafSlotNames); - leafIdMetadataBuilder.Add(AnnotationUtils.Kinds.IsNormalized, BooleanDataViewType.Instance, (ref bool value) => value = true); - // Add the column of leaves' IDs where the input example reaches. - schemaBuilder.AddColumn(leavesColumnName, leafIdType, leafIdMetadataBuilder.ToAnnotations()); - - // Metadata of path IDs. 
- var pathIdMetadataBuilder = new DataViewSchema.Annotations.Builder(); - pathIdMetadataBuilder.Add(AnnotationUtils.Kinds.SlotNames, AnnotationUtils.GetNamesType(pathIdType.Size), - (ValueGetter>>)owner.GetPathSlotNames); - pathIdMetadataBuilder.Add(AnnotationUtils.Kinds.IsNormalized, BooleanDataViewType.Instance, (ref bool value) => value = true); - // Add the column of encoded paths which the input example passes. - schemaBuilder.AddColumn(pathsColumnName, pathIdType, pathIdMetadataBuilder.ToAnnotations()); + _treesColumnName = treesColumnName; + if (treesColumnName != null) + { + // Metadata of tree values. + var treeIdMetadataBuilder = new DataViewSchema.Annotations.Builder(); + treeIdMetadataBuilder.Add(AnnotationUtils.Kinds.SlotNames, AnnotationUtils.GetNamesType(treeValueType.Size), + (ValueGetter>>)owner.GetTreeSlotNames); - OutputSchema = schemaBuilder.ToSchema(); + // Add the column of trees' output values + schemaBuilder.AddColumn(treesColumnName, treeValueType, treeIdMetadataBuilder.ToAnnotations()); + } - // Tree values must be the first output column. - Contracts.Assert(OutputSchema[treesColumnName].Index == TreeValuesColumnId); - // leaf IDs must be the second output column. - Contracts.Assert(OutputSchema[leavesColumnName].Index == LeafIdsColumnId); - // Path IDs must be the third output column. - Contracts.Assert(OutputSchema[pathsColumnName].Index == PathIdsColumnId); + _leavesColumnName = leavesColumnName; + if (leavesColumnName != null) + { + // Metadata of leaf IDs. + var leafIdMetadataBuilder = new DataViewSchema.Annotations.Builder(); + leafIdMetadataBuilder.Add(AnnotationUtils.Kinds.SlotNames, AnnotationUtils.GetNamesType(leafIdType.Size), + (ValueGetter>>)owner.GetLeafSlotNames); + leafIdMetadataBuilder.Add(AnnotationUtils.Kinds.IsNormalized, BooleanDataViewType.Instance, (ref bool value) => value = true); + + // Add the column of leaves' IDs where the input example reaches. 
+ schemaBuilder.AddColumn(leavesColumnName, leafIdType, leafIdMetadataBuilder.ToAnnotations()); + } + + _pathsColumnName = pathsColumnName; + if (pathsColumnName != null) + { + // Metadata of path IDs. + var pathIdMetadataBuilder = new DataViewSchema.Annotations.Builder(); + pathIdMetadataBuilder.Add(AnnotationUtils.Kinds.SlotNames, AnnotationUtils.GetNamesType(pathIdType.Size), + (ValueGetter>>)owner.GetPathSlotNames); + pathIdMetadataBuilder.Add(AnnotationUtils.Kinds.IsNormalized, BooleanDataViewType.Instance, (ref bool value) => value = true); + + // Add the column of encoded paths which the input example passes. + schemaBuilder.AddColumn(pathsColumnName, pathIdType, pathIdMetadataBuilder.ToAnnotations()); + } + + OutputSchema = schemaBuilder.ToSchema(); } DataViewRow ISchemaBoundRowMapper.GetRow(DataViewRow input, IEnumerable activeColumns) @@ -159,40 +171,41 @@ private Delegate[] CreateGetters(DataViewRow input, IEnumerable(); var activeIndices = activeColumns.Select(c => c.Index); - var treeValueActive = activeIndices.Contains(TreeValuesColumnId); - var leafIdActive = activeIndices.Contains(LeafIdsColumnId); - var pathIdActive = activeIndices.Contains(PathIdsColumnId); - - if (!treeValueActive && !leafIdActive && !pathIdActive) - return delegates; - var state = new State(_ectx, input, _owner._ensemble, _owner._totalLeafCount, FeatureColumn.Index); // Get the tree value getter. - if (treeValueActive) + if (_treesColumnName != null) { ValueGetter> fn = state.GetTreeValues; - delegates[TreeValuesColumnId] = fn; + if(activeIndices.Contains(OutputSchema[_treesColumnName].Index)) + delegates.Add(fn); + else + delegates.Add(null); } // Get the leaf indicator getter. - if (leafIdActive) + if (_leavesColumnName != null ) { ValueGetter> fn = state.GetLeafIds; - delegates[LeafIdsColumnId] = fn; + if (activeIndices.Contains(OutputSchema[_leavesColumnName].Index)) + delegates.Add(fn); + else + delegates.Add(null); } // Get the path indicators getter. 
- if (pathIdActive) + if (_pathsColumnName != null) { ValueGetter> fn = state.GetPathIds; - delegates[PathIdsColumnId] = fn; + if (activeIndices.Contains(OutputSchema[_pathsColumnName].Index)) + delegates.Add(fn); + else + delegates.Add(null); } - return delegates; + return delegates.ToArray(); } private sealed class State @@ -407,9 +420,9 @@ public TreeEnsembleFeaturizerBindableMapper(IHostEnvironment env, ModelLoadConte ctx.LoadModel(env, out _ensemble, "Ensemble"); _totalLeafCount = CountLeaves(_ensemble); - _treesColumnName = ctx.LoadString(); - _leavesColumnName = ctx.LoadString(); - _pathsColumnName = ctx.LoadString(); + _treesColumnName = ctx.LoadStringOrNull(); + _leavesColumnName = ctx.LoadStringOrNull(); + _pathsColumnName = ctx.LoadStringOrNull(); } void ICanSaveModel.Save(ModelSaveContext ctx) @@ -426,9 +439,9 @@ void ICanSaveModel.Save(ModelSaveContext ctx) _host.AssertValue(_ensemble); ctx.SaveModel(_ensemble, "Ensemble"); - ctx.SaveString(_treesColumnName); - ctx.SaveString(_leavesColumnName); - ctx.SaveString(_pathsColumnName); + ctx.SaveStringOrNull(_treesColumnName); + ctx.SaveStringOrNull(_leavesColumnName); + ctx.SaveStringOrNull(_pathsColumnName); } private static int CountLeaves(TreeEnsembleModelParameters ensemble) @@ -595,7 +608,8 @@ private static IDataTransform Create(IHostEnvironment env, Arguments args, IData IDataTransform xf; using (var ch = host.Start("Create Tree Ensemble Scorer")) { - var scorerArgs = new TreeEnsembleFeaturizerBindableMapper.Arguments() { Suffix = args.Suffix }; + var scorerArgs = new TreeEnsembleFeaturizerBindableMapper.Arguments() { + Suffix = args.Suffix, TreesColumnName = "Trees", LeavesColumnName = "Leaves", PathsColumnName = "Paths" }; if (!string.IsNullOrWhiteSpace(args.TrainedModelFile)) { if (args.Trainer != null) @@ -667,7 +681,8 @@ public static IDataTransform CreateForEntryPoint(IHostEnvironment env, Arguments using (var ch = host.Start("Create Tree Ensemble Scorer")) { - var scorerArgs = new 
TreeEnsembleFeaturizerBindableMapper.Arguments() { Suffix = args.Suffix }; + var scorerArgs = new TreeEnsembleFeaturizerBindableMapper.Arguments() { + Suffix = args.Suffix, TreesColumnName = "Trees", LeavesColumnName = "Leaves", PathsColumnName = "Paths" }; var predictor = args.PredictorModel.Predictor; ch.Trace("Prepare data"); RoleMappedData data = null; diff --git a/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs b/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs index 689c45f41c..8d0f936855 100644 --- a/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs +++ b/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs @@ -302,7 +302,14 @@ public void TreeEnsembleFeaturizingPipeline() // Combine the output of TreeEnsembleFeaturizationTransformer and the original features as the final training features. // Then train a linear model. - var options = new PretrainedTreeFeaturizationEstimator.Options() { InputColumnName = "Features", ModelParameters = treeModel.Model.SubModel }; + var options = new PretrainedTreeFeaturizationEstimator.Options() + { + InputColumnName = "Features", + TreesColumnName = "Trees", + LeavesColumnName = "Leaves", + PathsColumnName = "Paths", + ModelParameters = treeModel.Model.SubModel + }; var pipeline = ML.Transforms.PretrainTreeEnsembleFeaturizing(options). Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths")). 
Append(ML.BinaryClassification.Trainers.SdcaLogisticRegression("Label", "CombinedFeatures")); @@ -342,6 +349,9 @@ public void TestFastTreeBinaryFeaturizationInPipeline() var options = new FastTreeBinaryFeaturizationEstimator.Options() { InputColumnName = "Features", + TreesColumnName = "Trees", + LeavesColumnName = "Leaves", + PathsColumnName = "Paths", TrainerOptions = trainerOptions }; @@ -377,6 +387,9 @@ public void TestFastForestBinaryFeaturizationInPipeline() var options = new FastForestBinaryFeaturizationEstimator.Options() { InputColumnName = "Features", + TreesColumnName = "Trees", + LeavesColumnName = "Leaves", + PathsColumnName = "Paths", TrainerOptions = trainerOptions }; @@ -412,6 +425,9 @@ public void TestFastTreeRegressionFeaturizationInPipeline() var options = new FastTreeRegressionFeaturizationEstimator.Options() { InputColumnName = "Features", + TreesColumnName = "Trees", + LeavesColumnName = "Leaves", + PathsColumnName = "Paths", TrainerOptions = trainerOptions }; @@ -446,6 +462,9 @@ public void TestFastForestRegressionFeaturizationInPipeline() var options = new FastForestRegressionFeaturizationEstimator.Options() { InputColumnName = "Features", + TreesColumnName = "Trees", + LeavesColumnName = "Leaves", + PathsColumnName = "Paths", TrainerOptions = trainerOptions }; @@ -480,6 +499,9 @@ public void TestFastTreeTweedieFeaturizationInPipeline() var options = new FastTreeTweedieFeaturizationEstimator.Options() { InputColumnName = "Features", + TreesColumnName = "Trees", + LeavesColumnName = "Leaves", + PathsColumnName = "Paths", TrainerOptions = trainerOptions }; @@ -514,6 +536,9 @@ public void TestFastTreeRankingFeaturizationInPipeline() var options = new FastTreeRankingFeaturizationEstimator.Options() { InputColumnName = "Features", + TreesColumnName = "Trees", + LeavesColumnName = "Leaves", + PathsColumnName = "Paths", TrainerOptions = trainerOptions }; @@ -548,6 +573,9 @@ public void TestSaveAndLoadTreeFeaturizer() var options = new 
FastForestRegressionFeaturizationEstimator.Options() { InputColumnName = "Features", + TreesColumnName = "Trees", + LeavesColumnName = "Leaves", + PathsColumnName = "Paths", TrainerOptions = trainerOptions }; @@ -651,5 +679,58 @@ public void TestSaveAndLoadDoubleTreeFeaturizer() Assert.NotEqual(metrics.MeanAbsoluteError, secondMetrics.MeanAbsoluteError); Assert.NotEqual(metrics.MeanSquaredError, secondMetrics.MeanSquaredError); } + + [Fact] + public void TestFastTreeBinaryFeaturizationInPipelineWithOptionalOutputs() + { + int dataPointCount = 200; + var data = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(dataPointCount).ToList(); + var dataView = ML.Data.LoadFromEnumerable(data); + + var trainerOptions = new FastTreeBinaryTrainer.Options + { + NumberOfThreads = 1, + NumberOfTrees = 10, + NumberOfLeaves = 4, + MinimumExampleCountPerLeaf = 10, + FeatureColumnName = "Features", + LabelColumnName = "Label" + }; + + var options = new FastTreeBinaryFeaturizationEstimator.Options() + { + InputColumnName = "Features", + TrainerOptions = trainerOptions, + TreesColumnName = null, + PathsColumnName = null, + LeavesColumnName = "Leaves" + }; + + + bool isWrong = false; + try + { + var wrongPipeline = ML.Transforms.FastTreeBinaryFeaturizing(options). + Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths")). + Append(ML.BinaryClassification.Trainers.SdcaLogisticRegression("Label", "CombinedFeatures")); + var wrongModel = wrongPipeline.Fit(dataView); + } + catch + { + isWrong = true; // Only "Leaves" is produced by tree featurizer, so accessing "Trees" and "Paths" may lead to an error. + } + Assert.True(isWrong); + + var pipeline = ML.Transforms.FastTreeBinaryFeaturizing(options). + Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Leaves")). 
+ Append(ML.BinaryClassification.Trainers.SdcaLogisticRegression("Label", "CombinedFeatures")); + var model = pipeline.Fit(dataView); + var prediction = model.Transform(dataView); + var metrics = ML.BinaryClassification.Evaluate(prediction); + + Assert.True(metrics.Accuracy > 0.98); + Assert.True(metrics.LogLoss < 0.05); + Assert.True(metrics.AreaUnderPrecisionRecallCurve > 0.98); + } } } From ce4378ffd6d9b270831a7794fab9d708ee26cd60 Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Wed, 5 Jun 2019 11:15:29 -0700 Subject: [PATCH 09/25] Fix a test and add some XML docs --- .../TreeEnsembleFeaturizationEstimator.cs | 85 +++++++++++++------ .../TreeEnsembleFeaturizationTransformer.cs | 8 +- .../TreeTrainersCatalog.cs | 56 ++++++++++-- .../TreeEnsembleFeaturizerTest.cs | 47 ++++++---- 4 files changed, 143 insertions(+), 53 deletions(-) diff --git a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationEstimator.cs b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationEstimator.cs index d802961bb1..7cf549e5ec 100644 --- a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationEstimator.cs +++ b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationEstimator.cs @@ -17,18 +17,8 @@ namespace Microsoft.ML.Trainers.FastTree /// produces three columns: (1) the prediction values of all trees, (2) the IDs of leaves the input feature vector falling into, and (3) /// the binary vector which encodes the paths to those destination leaves. /// - public abstract class FeaturizationEstimatorBase : IEstimator + public abstract class TreeEnsembleFeaturizationEstimatorBase : IEstimator { - /// - /// Default values of . - /// - private static class DefaultCommonOptions - { - public static string TreesColumnName = "Trees"; - public static string LeavesColumnName = "Leaves"; - public static string PathsColumnName = "Paths"; - } - /// /// The common options of tree-based featurizations such as , , /// , , and . 
@@ -37,7 +27,10 @@ public class CommonOptions { /// /// The name of feature column in the when calling . - /// The column type must be a vector of . + /// The column type must be a vector of . The column called would be mapped + /// to columns called , , and in the output + /// of and its derived classes. Note that is not + /// necessary to be the same as the feature column used to train the underlying tree model. /// public string InputColumnName; @@ -65,8 +58,7 @@ public class CommonOptions }; /// - /// Feature column to apply tree-based featurization. Note that is not necessary to be the same as - /// the feature column used to train the tree model. + /// See . /// private protected readonly string FeatureColumnName; @@ -90,7 +82,7 @@ public class CommonOptions /// private protected readonly IHostEnvironment Env; - private protected FeaturizationEstimatorBase(IHostEnvironment env, CommonOptions options) + private protected TreeEnsembleFeaturizationEstimatorBase(IHostEnvironment env, CommonOptions options) { Env = env; if (options.InputColumnName == null) @@ -174,15 +166,24 @@ public SchemaShape GetOutputSchema(SchemaShape inputSchema) /// | `Leaves` | Vector of | The IDs of all leaves where the input feature vector falls into. | /// | `Paths` | Vector of | The paths the input feature vector passed through to reach the leaves. | /// + /// Those output columns are all optional. Please set the names of skipped columns to null so that they would not be produced. + /// /// Check the See Also section for links to usage examples. /// ]]> /// /// - /// - public sealed class PretrainedTreeFeaturizationEstimator : FeaturizationEstimatorBase + /// + public sealed class PretrainedTreeFeaturizationEstimator : TreeEnsembleFeaturizationEstimatorBase { - public sealed class Options : FeaturizationEstimatorBase.CommonOptions + /// + /// of as + /// used when calling . 
+ /// + public sealed class Options : TreeEnsembleFeaturizationEstimatorBase.CommonOptions { + /// + /// The pretrained tree model used to do tree-based featurization. Note that contains a collection of decision trees. + /// public TreeEnsembleModelParameters ModelParameters; }; @@ -200,12 +201,18 @@ internal PretrainedTreeFeaturizationEstimator(IHostEnvironment env, Options opti private protected override TreeEnsembleModelParameters PrepareModel(IDataView input) => _modelParameters; } - public sealed class FastTreeBinaryFeaturizationEstimator : FeaturizationEstimatorBase + public sealed class FastTreeBinaryFeaturizationEstimator : TreeEnsembleFeaturizationEstimatorBase { private readonly FastTreeBinaryTrainer.Options _trainerOptions; + /// + /// Options for the . + /// public sealed class Options : CommonOptions { + /// + /// The configuration of used to train the underlying . + /// public FastTreeBinaryTrainer.Options TrainerOptions; } @@ -223,12 +230,18 @@ private protected override TreeEnsembleModelParameters PrepareModel(IDataView in } } - public sealed class FastTreeRegressionFeaturizationEstimator : FeaturizationEstimatorBase + public sealed class FastTreeRegressionFeaturizationEstimator : TreeEnsembleFeaturizationEstimatorBase { private readonly FastTreeRegressionTrainer.Options _trainerOptions; + /// + /// Options for the . + /// public sealed class Options : CommonOptions { + /// + /// The configuration of used to train the underlying . + /// public FastTreeRegressionTrainer.Options TrainerOptions; } @@ -246,12 +259,18 @@ private protected override TreeEnsembleModelParameters PrepareModel(IDataView in } } - public sealed class FastForestBinaryFeaturizationEstimator : FeaturizationEstimatorBase + public sealed class FastForestBinaryFeaturizationEstimator : TreeEnsembleFeaturizationEstimatorBase { private readonly FastForestBinaryTrainer.Options _trainerOptions; + /// + /// Options for the . 
+ /// public sealed class Options : CommonOptions { + /// + /// The configuration of used to train the underlying . + /// public FastForestBinaryTrainer.Options TrainerOptions; } @@ -269,12 +288,18 @@ private protected override TreeEnsembleModelParameters PrepareModel(IDataView in } } - public sealed class FastForestRegressionFeaturizationEstimator : FeaturizationEstimatorBase + public sealed class FastForestRegressionFeaturizationEstimator : TreeEnsembleFeaturizationEstimatorBase { private readonly FastForestRegressionTrainer.Options _trainerOptions; + /// + /// Options for the . + /// public sealed class Options : CommonOptions { + /// + /// The configuration of used to train the underlying . + /// public FastForestRegressionTrainer.Options TrainerOptions; } @@ -292,12 +317,18 @@ private protected override TreeEnsembleModelParameters PrepareModel(IDataView in } } - public sealed class FastTreeRankingFeaturizationEstimator : FeaturizationEstimatorBase + public sealed class FastTreeRankingFeaturizationEstimator : TreeEnsembleFeaturizationEstimatorBase { private readonly FastTreeRankingTrainer.Options _trainerOptions; + /// + /// Options for the . + /// public sealed class Options : CommonOptions { + /// + /// The configuration of used to train the underlying . + /// public FastTreeRankingTrainer.Options TrainerOptions; } @@ -315,12 +346,18 @@ private protected override TreeEnsembleModelParameters PrepareModel(IDataView in } } - public sealed class FastTreeTweedieFeaturizationEstimator : FeaturizationEstimatorBase + public sealed class FastTreeTweedieFeaturizationEstimator : TreeEnsembleFeaturizationEstimatorBase { private readonly FastTreeTweedieTrainer.Options _trainerOptions; + /// + /// Options for the . + /// public sealed class Options : CommonOptions { + /// + /// The configuration of used to train the underlying . 
+ /// public FastTreeTweedieTrainer.Options TrainerOptions; } diff --git a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationTransformer.cs b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationTransformer.cs index 77900f7488..551ca2791a 100644 --- a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationTransformer.cs +++ b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationTransformer.cs @@ -15,7 +15,7 @@ namespace Microsoft.ML.Trainers.FastTree { /// - /// resulting from fitting any derived class of . + /// resulting from fitting any derived class of . /// The derived classes include, for example, and /// . /// @@ -25,15 +25,15 @@ public sealed class TreeEnsembleFeaturizationTransformer : PredictionTransformer private readonly TreeEnsembleFeaturizerBindableMapper.Arguments _scorerArgs; private readonly DataViewSchema.DetachedColumn _featureDetachedColumn; /// - /// See . + /// See . /// private readonly string _treesColumnName; /// - /// See . + /// See . /// private readonly string _leavesColumnName; /// - /// See . + /// See . /// private readonly string _pathsColumnName; /// diff --git a/src/Microsoft.ML.FastTree/TreeTrainersCatalog.cs b/src/Microsoft.ML.FastTree/TreeTrainersCatalog.cs index 38a1611195..73ece9adc5 100644 --- a/src/Microsoft.ML.FastTree/TreeTrainersCatalog.cs +++ b/src/Microsoft.ML.FastTree/TreeTrainersCatalog.cs @@ -437,7 +437,13 @@ public static FastForestBinaryTrainer FastForest(this BinaryClassificationCatalo return new FastForestBinaryTrainer(env, options); } - public static PretrainedTreeFeaturizationEstimator PretrainTreeEnsembleFeaturizing(this TransformsCatalog catalog, + /// + /// Create , which produces tree-based features given a . + /// + /// The context to create . + /// The options to configure . See and + /// for available settings. 
+ public static PretrainedTreeFeaturizationEstimator FeaturizeByPretrainTreeEnsemble(this TransformsCatalog catalog, PretrainedTreeFeaturizationEstimator.Options options) { Contracts.CheckValue(catalog, nameof(catalog)); @@ -445,7 +451,13 @@ public static PretrainedTreeFeaturizationEstimator PretrainTreeEnsembleFeaturizi return new PretrainedTreeFeaturizationEstimator(env, options); } - public static FastForestRegressionFeaturizationEstimator FastForestRegressionFeaturizing(this TransformsCatalog catalog, + /// + /// Create , which uses to train to create tree-based features. + /// + /// The context to create . + /// The options to configure . See and + /// for available settings. + public static FastForestRegressionFeaturizationEstimator FeaturizeByFastForestRegression(this TransformsCatalog catalog, FastForestRegressionFeaturizationEstimator.Options options) { Contracts.CheckValue(catalog, nameof(catalog)); @@ -453,7 +465,13 @@ public static FastForestRegressionFeaturizationEstimator FastForestRegressionFea return new FastForestRegressionFeaturizationEstimator(env, options); } - public static FastTreeRegressionFeaturizationEstimator FastTreeRegressionFeaturizing(this TransformsCatalog catalog, + /// + /// Create , which uses to train to create tree-based features. + /// + /// The context to create . + /// The options to configure . See and + /// for available settings. + public static FastTreeRegressionFeaturizationEstimator FeaturizeByFastTreeRegression(this TransformsCatalog catalog, FastTreeRegressionFeaturizationEstimator.Options options) { Contracts.CheckValue(catalog, nameof(catalog)); @@ -461,7 +479,13 @@ public static FastTreeRegressionFeaturizationEstimator FastTreeRegressionFeaturi return new FastTreeRegressionFeaturizationEstimator(env, options); } - public static FastForestBinaryFeaturizationEstimator FastForestBinaryFeaturizing(this TransformsCatalog catalog, + /// + /// Create , which uses to train to create tree-based features. 
+ /// + /// The context to create . + /// The options to configure . See and + /// for available settings. + public static FastForestBinaryFeaturizationEstimator FeaturizeByFastForestBinary(this TransformsCatalog catalog, FastForestBinaryFeaturizationEstimator.Options options) { Contracts.CheckValue(catalog, nameof(catalog)); @@ -469,7 +493,13 @@ public static FastForestBinaryFeaturizationEstimator FastForestBinaryFeaturizing return new FastForestBinaryFeaturizationEstimator(env, options); } - public static FastTreeBinaryFeaturizationEstimator FastTreeBinaryFeaturizing(this TransformsCatalog catalog, + /// + /// Create , which uses to train to create tree-based features. + /// + /// The context to create . + /// The options to configure . See and + /// for available settings. + public static FastTreeBinaryFeaturizationEstimator FeaturizeByFastTreeBinary(this TransformsCatalog catalog, FastTreeBinaryFeaturizationEstimator.Options options) { Contracts.CheckValue(catalog, nameof(catalog)); @@ -477,7 +507,13 @@ public static FastTreeBinaryFeaturizationEstimator FastTreeBinaryFeaturizing(thi return new FastTreeBinaryFeaturizationEstimator(env, options); } - public static FastTreeRankingFeaturizationEstimator FastTreeRankingFeaturizing(this TransformsCatalog catalog, + /// + /// Create , which uses to train to create tree-based features. + /// + /// The context to create . + /// The options to configure . See and + /// for available settings. 
+ public static FastTreeRankingFeaturizationEstimator FeaturizeByFastTreeRanking(this TransformsCatalog catalog, FastTreeRankingFeaturizationEstimator.Options options) { Contracts.CheckValue(catalog, nameof(catalog)); @@ -485,7 +521,13 @@ public static FastTreeRankingFeaturizationEstimator FastTreeRankingFeaturizing(t return new FastTreeRankingFeaturizationEstimator(env, options); } - public static FastTreeTweedieFeaturizationEstimator FastTreeTweedieFeaturizing(this TransformsCatalog catalog, + /// + /// Create , which uses to train to create tree-based features. + /// + /// The context to create . + /// The options to configure . See and + /// for available settings. + public static FastTreeTweedieFeaturizationEstimator FeaturizeByFastTreeTweedie(this TransformsCatalog catalog, FastTreeTweedieFeaturizationEstimator.Options options) { Contracts.CheckValue(catalog, nameof(catalog)); diff --git a/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs b/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs index 8d0f936855..3c08d35307 100644 --- a/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs +++ b/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs @@ -245,17 +245,28 @@ public void TestPretrainedTreeFeaturizationEstimator() var predicted = model.Transform(dataView); // From the trained tree model, a mapper of tree featurizer is created. - var options = new PretrainedTreeFeaturizationEstimator.Options() { InputColumnName = "Features", ModelParameters = model.Model.SubModel }; - var treeFeaturizer = ML.Transforms.PretrainTreeEnsembleFeaturizing(options).Fit(dataView); + string featureColumnName = "Features"; + string treesColumnName = "MyTrees"; // a tree-based feature column. + string leavesColumnName = "MyLeaves"; // a tree-based feature column. + string pathsColumnName = "MyPaths"; // a tree-based feature column. 
+ var options = new PretrainedTreeFeaturizationEstimator.Options() + { + InputColumnName = featureColumnName, + ModelParameters = model.Model.SubModel, + TreesColumnName = treesColumnName, + LeavesColumnName = leavesColumnName, + PathsColumnName = pathsColumnName + }; + var treeFeaturizer = ML.Transforms.FeaturizeByPretrainTreeEnsemble(options).Fit(dataView); // Apply TreeEnsembleFeaturizer to the input data. var transformed = treeFeaturizer.Transform(dataView); // Extract the outputs of TreeEnsembleFeaturizer. - var features = transformed.GetColumn("Features").ToArray(); - var leafValues = transformed.GetColumn("Trees").ToArray(); - var leafIds = transformed.GetColumn("Leaves").ToArray(); - var paths = transformed.GetColumn("Paths").ToArray(); + var features = transformed.GetColumn(featureColumnName).ToArray(); + var leafValues = transformed.GetColumn(treesColumnName).ToArray(); + var leafIds = transformed.GetColumn(leavesColumnName).ToArray(); + var paths = transformed.GetColumn(pathsColumnName).ToArray(); // Check if the TreeEnsembleFeaturizer produce expected values. List path = null; @@ -310,7 +321,7 @@ public void TreeEnsembleFeaturizingPipeline() PathsColumnName = "Paths", ModelParameters = treeModel.Model.SubModel }; - var pipeline = ML.Transforms.PretrainTreeEnsembleFeaturizing(options). + var pipeline = ML.Transforms.FeaturizeByPretrainTreeEnsemble(options). Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths")). Append(ML.BinaryClassification.Trainers.SdcaLogisticRegression("Label", "CombinedFeatures")); var model = pipeline.Fit(dataView); @@ -355,7 +366,7 @@ public void TestFastTreeBinaryFeaturizationInPipeline() TrainerOptions = trainerOptions }; - var pipeline = ML.Transforms.FastTreeBinaryFeaturizing(options). + var pipeline = ML.Transforms.FeaturizeByFastTreeBinary(options). Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths")). 
Append(ML.BinaryClassification.Trainers.SdcaLogisticRegression("Label", "CombinedFeatures")); var model = pipeline.Fit(dataView); @@ -393,7 +404,7 @@ public void TestFastForestBinaryFeaturizationInPipeline() TrainerOptions = trainerOptions }; - var pipeline = ML.Transforms.FastForestBinaryFeaturizing(options). + var pipeline = ML.Transforms.FeaturizeByFastForestBinary(options). Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths")). Append(ML.BinaryClassification.Trainers.SdcaLogisticRegression("Label", "CombinedFeatures")); var model = pipeline.Fit(dataView); @@ -431,7 +442,7 @@ public void TestFastTreeRegressionFeaturizationInPipeline() TrainerOptions = trainerOptions }; - var pipeline = ML.Transforms.FastTreeRegressionFeaturizing(options). + var pipeline = ML.Transforms.FeaturizeByFastTreeRegression(options). Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths")). Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures")); var model = pipeline.Fit(dataView); @@ -468,7 +479,7 @@ public void TestFastForestRegressionFeaturizationInPipeline() TrainerOptions = trainerOptions }; - var pipeline = ML.Transforms.FastForestRegressionFeaturizing(options). + var pipeline = ML.Transforms.FeaturizeByFastForestRegression(options). Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths")). Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures")); var model = pipeline.Fit(dataView); @@ -505,7 +516,7 @@ public void TestFastTreeTweedieFeaturizationInPipeline() TrainerOptions = trainerOptions }; - var pipeline = ML.Transforms.FastTreeTweedieFeaturizing(options). + var pipeline = ML.Transforms.FeaturizeByFastTreeTweedie(options). Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths")). 
Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures")); var model = pipeline.Fit(dataView); @@ -542,7 +553,7 @@ public void TestFastTreeRankingFeaturizationInPipeline() TrainerOptions = trainerOptions }; - var pipeline = ML.Transforms.FastTreeRankingFeaturizing(options). + var pipeline = ML.Transforms.FeaturizeByFastTreeRanking(options). Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths")). Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures")); var model = pipeline.Fit(dataView); @@ -579,7 +590,7 @@ public void TestSaveAndLoadTreeFeaturizer() TrainerOptions = trainerOptions }; - var pipeline = ML.Transforms.FastForestRegressionFeaturizing(options). + var pipeline = ML.Transforms.FeaturizeByFastForestRegression(options). Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths")). Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures")); var model = pipeline.Fit(dataView); @@ -635,7 +646,7 @@ public void TestSaveAndLoadDoubleTreeFeaturizer() }; var pipeline = ML.Transforms.CopyColumns("CopiedFeatures", "Features"). - Append(ML.Transforms.FastForestRegressionFeaturizing(options)). + Append(ML.Transforms.FeaturizeByFastForestRegression(options)). Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "OhMyTrees", "OhMyLeaves", "OhMyPaths")). Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures")); var model = pipeline.Fit(dataView); @@ -667,7 +678,7 @@ public void TestSaveAndLoadDoubleTreeFeaturizer() var secondPipeline = ML.Transforms.CopyColumns("CopiedFeatures", "Features"). Append(ML.Transforms.NormalizeBinning("CopiedFeatures")). - Append(ML.Transforms.FastForestRegressionFeaturizing(options)). + Append(ML.Transforms.FeaturizeByFastForestRegression(options)). Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "OhMyTrees", "OhMyLeaves", "OhMyPaths")). 
Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures")); var secondModel = secondPipeline.Fit(dataView); @@ -710,7 +721,7 @@ public void TestFastTreeBinaryFeaturizationInPipelineWithOptionalOutputs() bool isWrong = false; try { - var wrongPipeline = ML.Transforms.FastTreeBinaryFeaturizing(options). + var wrongPipeline = ML.Transforms.FeaturizeByFastTreeBinary(options). Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths")). Append(ML.BinaryClassification.Trainers.SdcaLogisticRegression("Label", "CombinedFeatures")); var wrongModel = wrongPipeline.Fit(dataView); @@ -721,7 +732,7 @@ public void TestFastTreeBinaryFeaturizationInPipelineWithOptionalOutputs() } Assert.True(isWrong); - var pipeline = ML.Transforms.FastTreeBinaryFeaturizing(options). + var pipeline = ML.Transforms.FeaturizeByFastTreeBinary(options). Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Leaves")). Append(ML.BinaryClassification.Trainers.SdcaLogisticRegression("Label", "CombinedFeatures")); var model = pipeline.Fit(dataView); From 618179db2a682cf7753a42597f9dc5fc4e1b2d6a Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Wed, 5 Jun 2019 17:53:16 -0700 Subject: [PATCH 10/25] Add samples --- ...inaryClassificationFeaturization.ttinclude | 110 ++++++++++++++ ...astForestBinaryFeaturizationWithOptions.cs | 139 +++++++++++++++++ ...astForestBinaryFeaturizationWithOptions.tt | 49 ++++++ ...orestRegressionFeaturizationWithOptions.cs | 138 +++++++++++++++++ ...orestRegressionFeaturizationWithOptions.tt | 47 ++++++ .../FastTreeBinaryFeaturizationWithOptions.cs | 141 ++++++++++++++++++ .../FastTreeBinaryFeaturizationWithOptions.tt | 57 +++++++ ...FastTreeRankingFeaturizationWithOptions.cs | 141 ++++++++++++++++++ ...FastTreeRankingFeaturizationWithOptions.tt | 47 ++++++ ...tTreeRegressionFeaturizationWithOptions.cs | 138 +++++++++++++++++ ...tTreeRegressionFeaturizationWithOptions.tt | 47 ++++++ ...FastTreeTweedieFeaturizationWithOptions.cs 
| 138 +++++++++++++++++ ...FastTreeTweedieFeaturizationWithOptions.tt | 54 +++++++ .../RankingFeaturization.ttinclude | 114 ++++++++++++++ .../RegressionFeaturization.ttinclude | 111 ++++++++++++++ .../Microsoft.ML.Samples.csproj | 54 +++++++ .../TreeEnsembleFeaturizationEstimator.cs | 2 +- .../TreeEnsembleFeaturizerTest.cs | 108 +++++++------- 18 files changed, 1580 insertions(+), 55 deletions(-) create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/BinaryClassificationFeaturization.ttinclude create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestBinaryFeaturizationWithOptions.cs create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestBinaryFeaturizationWithOptions.tt create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestRegressionFeaturizationWithOptions.cs create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestRegressionFeaturizationWithOptions.tt create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeBinaryFeaturizationWithOptions.cs create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeBinaryFeaturizationWithOptions.tt create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRankingFeaturizationWithOptions.cs create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRankingFeaturizationWithOptions.tt create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRegressionFeaturizationWithOptions.cs create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRegressionFeaturizationWithOptions.tt create mode 100644 
docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeTweedieFeaturizationWithOptions.cs create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeTweedieFeaturizationWithOptions.tt create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/RankingFeaturization.ttinclude create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/RegressionFeaturization.ttinclude diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/BinaryClassificationFeaturization.ttinclude b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/BinaryClassificationFeaturization.ttinclude new file mode 100644 index 0000000000..f88c683540 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/BinaryClassificationFeaturization.ttinclude @@ -0,0 +1,110 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML; +using Microsoft.ML.Data; +<# if (TrainerOptions != null) { #> +<#=OptionsInclude#> +<# } #> + +namespace Samples.Dynamic.Transforms.TreeFeaturization +{ + public static class <#=ClassName#> + {<#=Comments#> + public static void Example() + { + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, + // as a catalog of available operations and as the source of randomness. + // Setting the seed to a fixed number in this example to make outputs deterministic. + var mlContext = new MLContext(seed: 0); + + // Create a list of data points to be transformed. + var dataPoints = GenerateRandomDataPoints(1000).ToList(); + + // Convert the list of data points to an IDataView object, which is consumable by ML.NET API. + var dataView = mlContext.Data.LoadFromEnumerable(dataPoints); +<# if (CacheData) { #> + + // ML.NET doesn't cache data set by default. 
Therefore, if one reads a data set from a file and accesses it many times, + // it can be slow due to expensive featurization and disk operations. When the considered data can fit into memory, + // a solution is to cache the data in memory. Caching is especially helpful when working with iterative algorithms + // which needs many data passes. + dataView = mlContext.Data.Cache(dataView); +<# } #> + + // Define input and output columns of tree-based featurizer. + string labelColumnName = nameof(DataPoint.Label); + string featureColumnName = nameof(DataPoint.Features); + string treesColumnName = nameof(TransformedDataPoint.Trees); + string leavesColumnName = nameof(TransformedDataPoint.Leaves); + string pathsColumnName = nameof(TransformedDataPoint.Paths); + + // Define the configuration of the trainer used to train a tree-based model. + var trainerOptions = new <#=TrainerOptions#>; + + // Define the tree-based featurizer's configuration. + var options = new <#=Options#>; + + // Define the featurizer. + var pipeline = mlContext.Transforms.<#=Trainer#>(options); + + // Train the model. + var model = pipeline.Fit(dataView); + + // Apply the trained transformer to the considered data set. + var transformed = model.Transform(dataView); + + // Convert IDataView object to a list. Each element in the resulted list corresponds to a row in the IDataView. + var transformedDataPoints = mlContext.Data.CreateEnumerable(transformed, false).ToList(); + + // Print out the transformation of the first 3 data points. 
+ for (int i = 0; i < 3; ++i) + { + var dataPoint = dataPoints[i]; + var transformedDataPoint = transformedDataPoints[i]; + Console.WriteLine($"The original feature vector [{String.Join(",", dataPoint.Features)}] is transformed to three different tree-based feature vectors:"); + Console.WriteLine($" Trees' output values: [{String.Join(",", transformedDataPoint.Trees)}]."); + Console.WriteLine($" Leave IDs' 0-1 representation: [{String.Join(",", transformedDataPoint.Leaves)}]."); + Console.WriteLine($" Paths IDs' 0-1 representation: [{String.Join(",", transformedDataPoint.Paths)}]."); + } + + <#=ExpectedOutput#> + } + + private static IEnumerable GenerateRandomDataPoints(int count, int seed=0) + { + var random = new Random(seed); + float randomFloat() => (float)random.NextDouble(); + for (int i = 0; i < count; i++) + { + var label = randomFloat() > <#=LabelThreshold#>; + yield return new DataPoint + { + Label = label, + // Create random features that are correlated with the label. + // For data points with false label, the feature values are slightly increased by adding a constant. + Features = Enumerable.Repeat(label, 3).Select(x => x ? randomFloat() : randomFloat() + <#=DataSepValue#>).ToArray() + }; + } + } + + // Example with label and 3 feature values. A data set is a collection of such examples. + private class DataPoint + { + public bool Label { get; set; } + [VectorType(3)] + public float[] Features { get; set; } + } + + // Class used to capture the output of tree-based featurization. + private class TransformedDataPoint : DataPoint + { + // The i-th value is the output value of the i-th decision tree. + public float[] Trees { get; set; } + // The 0-1 encoding of leaves the input feature vector falls into. + public float[] Leaves { get; set; } + // The 0-1 encoding of paths the input feature vector reaches the leaves. 
+ public float[] Paths { get; set; } + } + } +} \ No newline at end of file diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestBinaryFeaturizationWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestBinaryFeaturizationWithOptions.cs new file mode 100644 index 0000000000..4b58ff44d4 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestBinaryFeaturizationWithOptions.cs @@ -0,0 +1,139 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML; +using Microsoft.ML.Data; +using Microsoft.ML.Trainers.FastTree; + +namespace Samples.Dynamic.Transforms.TreeFeaturization +{ + public static class FastForestBinaryFeaturizationWithOptions + { + // This example requires installation of additional NuGet package + // Microsoft.ML.FastTree. + public static void Example() + { + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, + // as a catalog of available operations and as the source of randomness. + // Setting the seed to a fixed number in this example to make outputs deterministic. + var mlContext = new MLContext(seed: 0); + + // Create a list of data points to be transformed. + var dataPoints = GenerateRandomDataPoints(1000).ToList(); + + // Convert the list of data points to an IDataView object, which is consumable by ML.NET API. + var dataView = mlContext.Data.LoadFromEnumerable(dataPoints); + + // ML.NET doesn't cache data set by default. Therefore, if one reads a data set from a file and accesses it many times, + // it can be slow due to expensive featurization and disk operations. When the considered data can fit into memory, + // a solution is to cache the data in memory. Caching is especially helpful when working with iterative algorithms + // which needs many data passes. 
+ dataView = mlContext.Data.Cache(dataView); + + // Define input and output columns of tree-based featurizer. + string labelColumnName = nameof(DataPoint.Label); + string featureColumnName = nameof(DataPoint.Features); + string treesColumnName = nameof(TransformedDataPoint.Trees); + string leavesColumnName = nameof(TransformedDataPoint.Leaves); + string pathsColumnName = nameof(TransformedDataPoint.Paths); + + // Define the configuration of the trainer used to train a tree-based model. + var trainerOptions = new FastForestBinaryTrainer.Options + { + // Create a simpler model by penalizing usage of new features. + FeatureFirstUsePenalty = 0.1, + // Reduce the number of trees to 3. + NumberOfTrees = 3, + // Number of leaves per tree. + NumberOfLeaves = 6, + // Feature column name. + FeatureColumnName = featureColumnName, + // Label column name. + LabelColumnName = labelColumnName + }; + + // Define the tree-based featurizer's configuration. + var options = new FastForestBinaryFeaturizationEstimator.Options + { + InputColumnName = featureColumnName, + TreesColumnName = treesColumnName, + LeavesColumnName = leavesColumnName, + PathsColumnName = pathsColumnName, + TrainerOptions = trainerOptions + }; + + // Define the featurizer. + var pipeline = mlContext.Transforms.FeaturizeByFastForestBinary(options); + + // Train the model. + var model = pipeline.Fit(dataView); + + // Apply the trained transformer to the considered data set. + var transformed = model.Transform(dataView); + + // Convert IDataView object to a list. Each element in the resulted list corresponds to a row in the IDataView. + var transformedDataPoints = mlContext.Data.CreateEnumerable(transformed, false).ToList(); + + // Print out the transformation of the first 3 data points. 
+ for (int i = 0; i < 3; ++i) + { + var dataPoint = dataPoints[i]; + var transformedDataPoint = transformedDataPoints[i]; + Console.WriteLine($"The original feature vector [{String.Join(",", dataPoint.Features)}] is transformed to three different tree-based feature vectors:"); + Console.WriteLine($" Trees' output values: [{String.Join(",", transformedDataPoint.Trees)}]."); + Console.WriteLine($" Leave IDs' 0-1 representation: [{String.Join(",", transformedDataPoint.Leaves)}]."); + Console.WriteLine($" Paths IDs' 0-1 representation: [{String.Join(",", transformedDataPoint.Paths)}]."); + } + + // Expected output: + // The original feature vector [0.8173254,0.7680227,0.5581612] is transformed to three different tree-based feature vectors: + // Trees' output values: [-0.3333333,0.005309734]. + // Leave IDs' 0-1 representation: [0,0,0,0,0,1,0,0,0,1,0,0]. + // Paths IDs' 0-1 representation: [1,1,1,1,1,1,1,1,1,1]. + // The original feature vector [0.5888848,0.9360271,0.4721779] is transformed to three different tree-based feature vectors: + // Trees' output values: [-0.02077151,0.005309734]. + // Leave IDs' 0-1 representation: [0,1,0,0,0,0,0,0,0,1,0,0]. + // Paths IDs' 0-1 representation: [1,1,1,1,1,1,1,1,1,1]. + // The original feature vector [0.2737045,0.2919063,0.4673147] is transformed to three different tree-based feature vectors: + // Trees' output values: [-0.02077151,0.005309734]. + // Leave IDs' 0-1 representation: [0,1,0,0,0,0,0,0,0,1,0,0]. + // Paths IDs' 0-1 representation: [1,1,1,1,1,1,1,1,1,1]. + } + + private static IEnumerable GenerateRandomDataPoints(int count, int seed=0) + { + var random = new Random(seed); + float randomFloat() => (float)random.NextDouble(); + for (int i = 0; i < count; i++) + { + var label = randomFloat() > 0.5f; + yield return new DataPoint + { + Label = label, + // Create random features that are correlated with the label. + // For data points with false label, the feature values are slightly increased by adding a constant. 
+ Features = Enumerable.Repeat(label, 3).Select(x => x ? randomFloat() : randomFloat() + 0.03f).ToArray() + }; + } + } + + // Example with label and 3 feature values. A data set is a collection of such examples. + private class DataPoint + { + public bool Label { get; set; } + [VectorType(3)] + public float[] Features { get; set; } + } + + // Class used to capture the output of tree-based featurization. + private class TransformedDataPoint : DataPoint + { + // The i-th value is the output value of the i-th decision tree. + public float[] Trees { get; set; } + // The 0-1 encoding of leaves the input feature vector falls into. + public float[] Leaves { get; set; } + // The 0-1 encoding of paths the input feature vector reaches the leaves. + public float[] Paths { get; set; } + } + } +} diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestBinaryFeaturizationWithOptions.tt b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestBinaryFeaturizationWithOptions.tt new file mode 100644 index 0000000000..11e49ad5df --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestBinaryFeaturizationWithOptions.tt @@ -0,0 +1,49 @@ +<#@ include file="BinaryClassificationFeaturization.ttinclude"#> +<#+ +string ClassName="FastForestBinaryFeaturizationWithOptions"; +string Trainer = "FeaturizeByFastForestBinary"; +bool CacheData = true; +string LabelThreshold = "0.5f"; +string DataSepValue = "0.03f"; +string OptionsInclude = "using Microsoft.ML.Trainers.FastTree;"; +string Comments= @" + // This example requires installation of additional NuGet package + // Microsoft.ML.FastTree."; + +string TrainerOptions = @"FastForestBinaryTrainer.Options + { + // Create a simpler model by penalizing usage of new features. + FeatureFirstUsePenalty = 0.1, + // Reduce the number of trees to 3. + NumberOfTrees = 3, + // Number of leaves per tree. + NumberOfLeaves = 6, + // Feature column name.
+ FeatureColumnName = featureColumnName, + // Label column name. + LabelColumnName = labelColumnName + }"; + +string Options = @"FastForestBinaryFeaturizationEstimator.Options + { + InputColumnName = featureColumnName, + TreesColumnName = treesColumnName, + LeavesColumnName = leavesColumnName, + PathsColumnName = pathsColumnName, + TrainerOptions = trainerOptions + }"; + +string ExpectedOutput = @"// Expected output: + // The original feature vector [0.8173254,0.7680227,0.5581612] is transformed to three different tree-based feature vectors: + // Trees' output values: [-0.3333333,0.005309734]. + // Leave IDs' 0-1 representation: [0,0,0,0,0,1,0,0,0,1,0,0]. + // Paths IDs' 0-1 representation: [1,1,1,1,1,1,1,1,1,1]. + // The original feature vector [0.5888848,0.9360271,0.4721779] is transformed to three different tree-based feature vectors: + // Trees' output values: [-0.02077151,0.005309734]. + // Leave IDs' 0-1 representation: [0,1,0,0,0,0,0,0,0,1,0,0]. + // Paths IDs' 0-1 representation: [1,1,1,1,1,1,1,1,1,1]. + // The original feature vector [0.2737045,0.2919063,0.4673147] is transformed to three different tree-based feature vectors: + // Trees' output values: [-0.02077151,0.005309734]. + // Leave IDs' 0-1 representation: [0,1,0,0,0,0,0,0,0,1,0,0]. 
+ // Paths IDs' 0-1 representation: [1,1,1,1,1,1,1,1,1,1]."; +#> \ No newline at end of file diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestRegressionFeaturizationWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestRegressionFeaturizationWithOptions.cs new file mode 100644 index 0000000000..359f6c1d6f --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestRegressionFeaturizationWithOptions.cs @@ -0,0 +1,138 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML; +using Microsoft.ML.Data; +using Microsoft.ML.Trainers.FastTree; + +namespace Samples.Dynamic.Transforms.TreeFeaturization +{ + public static class FastForestRegressionFeaturizationWithOptions + { + // This example requires installation of additional NuGet package + // Microsoft.ML.FastTree. + public static void Example() + { + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, + // as a catalog of available operations and as the source of randomness. + // Setting the seed to a fixed number in this example to make outputs deterministic. + var mlContext = new MLContext(seed: 0); + + // Create a list of training data points. + var dataPoints = GenerateRandomDataPoints(1000).ToList(); + + // Convert the list of data points to an IDataView object, which is consumable by ML.NET API. + var dataView = mlContext.Data.LoadFromEnumerable(dataPoints); + + // ML.NET doesn't cache data set by default. Therefore, if one reads a data set from a file and accesses it many times, + // it can be slow due to expensive featurization and disk operations. When the considered data can fit into memory, + // a solution is to cache the data in memory. Caching is especially helpful when working with iterative algorithms + // which needs many data passes. 
+ dataView = mlContext.Data.Cache(dataView); + + // Define input and output columns of tree-based featurizer. + string labelColumnName = nameof(DataPoint.Label); + string featureColumnName = nameof(DataPoint.Features); + string treesColumnName = nameof(TransformedDataPoint.Trees); + string leavesColumnName = nameof(TransformedDataPoint.Leaves); + string pathsColumnName = nameof(TransformedDataPoint.Paths); + + // Define the configuration of the trainer used to train a tree-based model. + var trainerOptions = new FastForestRegressionTrainer.Options + { + // Only use 80% of features to reduce over-fitting. + FeatureFraction = 0.8, + // Create a simpler model by penalizing usage of new features. + FeatureFirstUsePenalty = 0.1, + // Reduce the number of trees to 3. + NumberOfTrees = 3, + // Number of leaves per tree. + NumberOfLeaves = 6, + LabelColumnName = labelColumnName, + FeatureColumnName = featureColumnName + }; + + // Define the tree-based featurizer's configuration. + var options = new FastForestRegressionFeaturizationEstimator.Options + { + InputColumnName = featureColumnName, + TreesColumnName = treesColumnName, + LeavesColumnName = leavesColumnName, + PathsColumnName = pathsColumnName, + TrainerOptions = trainerOptions + }; + + // Define the featurizer. + var pipeline = mlContext.Transforms.FeaturizeByFastForestRegression(options); + + // Train the model. + var model = pipeline.Fit(dataView); + + // Apply the trained transformer to the considered data set. + var transformed = model.Transform(dataView); + + // Convert IDataView object to a list. Each element in the resulting list corresponds to a row in the IDataView. + var transformedDataPoints = mlContext.Data.CreateEnumerable(transformed, false).ToList(); + + // Print out the transformation of the first 3 data points.
+ for (int i = 0; i < 3; ++i) + { + var dataPoint = dataPoints[i]; + var transformedDataPoint = transformedDataPoints[i]; + Console.WriteLine($"The original feature vector [{String.Join(",", dataPoint.Features)}] is transformed to three different tree-based feature vectors:"); + Console.WriteLine($" Trees' output values: [{String.Join(",", transformedDataPoint.Trees)}]."); + Console.WriteLine($" Leave IDs' 0-1 representation: [{String.Join(",", transformedDataPoint.Leaves)}]."); + Console.WriteLine($" Paths IDs' 0-1 representation: [{String.Join(",", transformedDataPoint.Paths)}]."); + } + + // Expected output: + // The original feature vector [1.543569,1.494266,1.284405] is transformed to three different tree-based feature vectors: + // Trees' output values: [0.6001529,0.8102381,0.7916333]. + // Leave IDs' 0-1 representation: [0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0]. + // Paths IDs' 0-1 representation: [1,1,0,0,1,1,1,0,1,1,1,1,0,1,0]. + // The original feature vector [0.764918,1.11206,0.648211] is transformed to three different tree-based feature vectors: + // Trees' output values: [0.329645,0.4225699,0.4536894]. + // Leave IDs' 0-1 representation: [0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0]. + // Paths IDs' 0-1 representation: [1,0,1,0,0,1,1,0,0,0,1,1,0,0,0]. + // The original feature vector [1.251254,1.269456,1.444864] is transformed to three different tree-based feature vectors: + // Trees' output values: [0.7266843,0.6299202,0.7916333]. + // Leave IDs' 0-1 representation: [0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0]. + // Paths IDs' 0-1 representation: [1,1,0,1,0,1,1,0,1,1,1,1,0,1,0]. + } + + private static IEnumerable GenerateRandomDataPoints(int count, int seed=0) + { + var random = new Random(seed); + for (int i = 0; i < count; i++) + { + float label = (float)random.NextDouble(); + yield return new DataPoint + { + Label = label, + // Create random features that are correlated with the label. 
+ Features = Enumerable.Repeat(label, 3).Select(x => x + (float)random.NextDouble()).ToArray() + }; + } + } + + // Example with label and 3 feature values. A data set is a collection of such examples. + private class DataPoint + { + public float Label { get; set; } + [VectorType(3)] + public float[] Features { get; set; } + } + + // Class used to capture the output of tree-based featurization. + private class TransformedDataPoint : DataPoint + { + // The i-th value is the output value of the i-th decision tree. + public float[] Trees { get; set; } + // The 0-1 encoding of leaves the input feature vector falls into. + public float[] Leaves { get; set; } + // The 0-1 encoding of paths the input feature vector reaches the leaves. + public float[] Paths { get; set; } + } + } +} + diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestRegressionFeaturizationWithOptions.tt b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestRegressionFeaturizationWithOptions.tt new file mode 100644 index 0000000000..493d46cfae --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestRegressionFeaturizationWithOptions.tt @@ -0,0 +1,47 @@ +<#@ include file="RegressionFeaturization.ttinclude"#> + +<#+ +string ClassHeader = @"// This example requires installation of additional NuGet package + // Microsoft.ML.FastTree. "; + +string ClassName="FastForestRegressionFeaturizationWithOptions"; +bool CacheData = true; +string ExtraUsing = "using Microsoft.ML.Trainers.FastTree;"; +string Trainer = @"FeaturizeByFastForestRegression"; +string TrainerOptions = @"FastForestRegressionTrainer.Options + { + // Only use 80% of features to reduce over-fitting. + FeatureFraction = 0.8, + // Create a simpler model by penalizing usage of new features. + FeatureFirstUsePenalty = 0.1, + // Reduce the number of trees to 3. + NumberOfTrees = 3, + // Number of leaves per tree.
+ NumberOfLeaves = 6, + LabelColumnName = labelColumnName, + FeatureColumnName = featureColumnName + }"; + +string Options = @"FastForestRegressionFeaturizationEstimator.Options + { + InputColumnName = featureColumnName, + TreesColumnName = treesColumnName, + LeavesColumnName = leavesColumnName, + PathsColumnName = pathsColumnName, + TrainerOptions = trainerOptions + }"; + +string ExpectedOutput = @"// Expected output: + // The original feature vector [1.543569,1.494266,1.284405] is transformed to three different tree-based feature vectors: + // Trees' output values: [0.6001529,0.8102381,0.7916333]. + // Leave IDs' 0-1 representation: [0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0]. + // Paths IDs' 0-1 representation: [1,1,0,0,1,1,1,0,1,1,1,1,0,1,0]. + // The original feature vector [0.764918,1.11206,0.648211] is transformed to three different tree-based feature vectors: + // Trees' output values: [0.329645,0.4225699,0.4536894]. + // Leave IDs' 0-1 representation: [0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0]. + // Paths IDs' 0-1 representation: [1,0,1,0,0,1,1,0,0,0,1,1,0,0,0]. + // The original feature vector [1.251254,1.269456,1.444864] is transformed to three different tree-based feature vectors: + // Trees' output values: [0.7266843,0.6299202,0.7916333]. + // Leave IDs' 0-1 representation: [0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0]. 
+ // Paths IDs' 0-1 representation: [1,1,0,1,0,1,1,0,1,1,1,1,0,1,0]."; +#> \ No newline at end of file diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeBinaryFeaturizationWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeBinaryFeaturizationWithOptions.cs new file mode 100644 index 0000000000..fa34c64c90 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeBinaryFeaturizationWithOptions.cs @@ -0,0 +1,141 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML; +using Microsoft.ML.Data; +using Microsoft.ML.Trainers.FastTree; + +namespace Samples.Dynamic.Transforms.TreeFeaturization +{ + public static class FastTreeBinaryFeaturizationWithOptions + { + // This example requires installation of additional NuGet package + // Microsoft.ML.FastTree. + public static void Example() + { + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, + // as a catalog of available operations and as the source of randomness. + // Setting the seed to a fixed number in this example to make outputs deterministic. + var mlContext = new MLContext(seed: 0); + + // Create a list of data points to be transformed. + var dataPoints = GenerateRandomDataPoints(1000).ToList(); + + // Convert the list of data points to an IDataView object, which is consumable by ML.NET API. + var dataView = mlContext.Data.LoadFromEnumerable(dataPoints); + + // ML.NET doesn't cache data set by default. Therefore, if one reads a data set from a file and accesses it many times, + // it can be slow due to expensive featurization and disk operations. When the considered data can fit into memory, + // a solution is to cache the data in memory. Caching is especially helpful when working with iterative algorithms + // which needs many data passes. 
+ dataView = mlContext.Data.Cache(dataView); + + // Define input and output columns of tree-based featurizer. + string labelColumnName = nameof(DataPoint.Label); + string featureColumnName = nameof(DataPoint.Features); + string treesColumnName = nameof(TransformedDataPoint.Trees); + string leavesColumnName = nameof(TransformedDataPoint.Leaves); + string pathsColumnName = nameof(TransformedDataPoint.Paths); + + // Define the configuration of the trainer used to train a tree-based model. + var trainerOptions = new FastTreeBinaryTrainer.Options + { + // Use L2Norm for early stopping. + EarlyStoppingMetric = EarlyStoppingMetric.L2Norm, + // Create a simpler model by penalizing usage of new features. + FeatureFirstUsePenalty = 0.1, + // Reduce the number of trees to 3. + NumberOfTrees = 3, + // Number of leaves per tree. + NumberOfLeaves = 6, + // Feature column name. + FeatureColumnName = featureColumnName, + // Label column name. + LabelColumnName = labelColumnName + }; + + // Define the tree-based featurizer's configuration. + var options = new FastTreeBinaryFeaturizationEstimator.Options + { + InputColumnName = featureColumnName, + TreesColumnName = treesColumnName, + LeavesColumnName = leavesColumnName, + PathsColumnName = pathsColumnName, + TrainerOptions = trainerOptions + }; + + // Define the featurizer. + var pipeline = mlContext.Transforms.FeaturizeByFastTreeBinary(options); + + // Train the model. + var model = pipeline.Fit(dataView); + + // Apply the trained transformer to the considered data set. + var transformed = model.Transform(dataView); + + // Convert IDataView object to a list. Each element in the resulted list corresponds to a row in the IDataView. + var transformedDataPoints = mlContext.Data.CreateEnumerable(transformed, false).ToList(); + + // Print out the transformation of the first 3 data points. 
+ for (int i = 0; i < 3; ++i) + { + var dataPoint = dataPoints[i]; + var transformedDataPoint = transformedDataPoints[i]; + Console.WriteLine($"The original feature vector [{String.Join(",", dataPoint.Features)}] is transformed to three different tree-based feature vectors:"); + Console.WriteLine($" Trees' output values: [{String.Join(",", transformedDataPoint.Trees)}]."); + Console.WriteLine($" Leave IDs' 0-1 representation: [{String.Join(",", transformedDataPoint.Leaves)}]."); + Console.WriteLine($" Paths IDs' 0-1 representation: [{String.Join(",", transformedDataPoint.Paths)}]."); + } + + // Expected output: + // The original feature vector [0.8173254,0.7680227,0.5581612] is transformed to three different tree-based feature vectors: + // Trees' output values: [-0.02519168,0.0004318157,-0.0002457525]. + // Leave IDs' 0-1 representation: [0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1]. + // Paths IDs' 0-1 representation: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]. + // The original feature vector [0.5888848,0.9360271,0.4721779] is transformed to three different tree-based feature vectors: + // Trees' output values: [-0.02519168,0.0004318157,-0.0002457525]. + // Leave IDs' 0-1 representation: [0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1]. + // Paths IDs' 0-1 representation: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]. + // The original feature vector [0.2737045,0.2919063,0.4673147] is transformed to three different tree-based feature vectors: + // Trees' output values: [-0.02519168,0.0004318157,-0.0002457525]. + // Leave IDs' 0-1 representation: [0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1]. + // Paths IDs' 0-1 representation: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]. + } + + private static IEnumerable GenerateRandomDataPoints(int count, int seed=0) + { + var random = new Random(seed); + float randomFloat() => (float)random.NextDouble(); + for (int i = 0; i < count; i++) + { + var label = randomFloat() > 0.5f; + yield return new DataPoint + { + Label = label, + // Create random features that are correlated with the label. 
+ // For data points with false label, the feature values are slightly increased by adding a constant. + Features = Enumerable.Repeat(label, 3).Select(x => x ? randomFloat() : randomFloat() + 0.03f).ToArray() + }; + } + } + + // Example with label and 3 feature values. A data set is a collection of such examples. + private class DataPoint + { + public bool Label { get; set; } + [VectorType(3)] + public float[] Features { get; set; } + } + + // Class used to capture the output of tree-based featurization. + private class TransformedDataPoint : DataPoint + { + // The i-th value is the output value of the i-th decision tree. + public float[] Trees { get; set; } + // The 0-1 encoding of leaves the input feature vector falls into. + public float[] Leaves { get; set; } + // The 0-1 encoding of paths the input feature vector reaches the leaves. + public float[] Paths { get; set; } + } + } +} diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeBinaryFeaturizationWithOptions.tt b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeBinaryFeaturizationWithOptions.tt new file mode 100644 index 0000000000..9f22863e1a --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeBinaryFeaturizationWithOptions.tt @@ -0,0 +1,57 @@ +<#@ include file="BinaryClassificationFeaturization.ttinclude"#> +<#+ +string ClassName="FastTreeBinaryFeaturizationWithOptions"; +string Trainer = "FeaturizeByFastTreeBinary"; +bool CacheData = true; +string LabelThreshold = "0.5f"; +string DataSepValue = "0.03f"; +string OptionsInclude = "using Microsoft.ML.Trainers.FastTree;"; +string Comments= @" + // This example requires installation of additional NuGet package + // Microsoft.ML.FastTree."; +string TrainerOptions = @"FastTreeBinaryTrainer.Options + { + // Use L2Norm for early stopping.
+ EarlyStoppingMetric = EarlyStoppingMetric.L2Norm, + // Create a simpler model by penalizing usage of new features. + FeatureFirstUsePenalty = 0.1, + // Reduce the number of trees to 3. + NumberOfTrees = 3, + // Number of leaves per tree. + NumberOfLeaves = 6, + // Feature column name. + FeatureColumnName = featureColumnName, + // Label column name. + LabelColumnName = labelColumnName + }"; + +string Options = @"FastTreeBinaryFeaturizationEstimator.Options + { + InputColumnName = featureColumnName, + TreesColumnName = treesColumnName, + LeavesColumnName = leavesColumnName, + PathsColumnName = pathsColumnName, + TrainerOptions = trainerOptions + }"; + +string ExpectedOutputPerInstance= @"// Expected output: + // Label: True, Prediction: True + // Label: False, Prediction: False + // Label: True, Prediction: True + // Label: True, Prediction: True + // Label: False, Prediction: False"; + +string ExpectedOutput = @"// Expected output: + // The original feature vector [0.8173254,0.7680227,0.5581612] is transformed to three different tree-based feature vectors: + // Trees' output values: [-0.02519168,0.0004318157,-0.0002457525]. + // Leave IDs' 0-1 representation: [0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1]. + // Paths IDs' 0-1 representation: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]. + // The original feature vector [0.5888848,0.9360271,0.4721779] is transformed to three different tree-based feature vectors: + // Trees' output values: [-0.02519168,0.0004318157,-0.0002457525]. + // Leave IDs' 0-1 representation: [0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1]. + // Paths IDs' 0-1 representation: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]. + // The original feature vector [0.2737045,0.2919063,0.4673147] is transformed to three different tree-based feature vectors: + // Trees' output values: [-0.02519168,0.0004318157,-0.0002457525]. + // Leave IDs' 0-1 representation: [0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1]. 
+ // Paths IDs' 0-1 representation: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]."; +#> \ No newline at end of file diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRankingFeaturizationWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRankingFeaturizationWithOptions.cs new file mode 100644 index 0000000000..bc7fbb3f89 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRankingFeaturizationWithOptions.cs @@ -0,0 +1,141 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML; +using Microsoft.ML.Data; +using Microsoft.ML.Trainers.FastTree; + +namespace Samples.Dynamic.Transforms.TreeFeaturization +{ + public static class FastTreeRankingFeaturizationWithOptions + { + // This example requires installation of additional NuGet package + // Microsoft.ML.FastTree. + public static void Example() + { + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, + // as a catalog of available operations and as the source of randomness. + // Setting the seed to a fixed number in this example to make outputs deterministic. + var mlContext = new MLContext(seed: 0); + + // Create a list of training data points. + var dataPoints = GenerateRandomDataPoints(1000).ToList(); + + // Convert the list of data points to an IDataView object, which is consumable by ML.NET API. + var dataView = mlContext.Data.LoadFromEnumerable(dataPoints); + + // ML.NET doesn't cache data set by default. Therefore, if one reads a data set from a file and accesses it many times, + // it can be slow due to expensive featurization and disk operations. When the considered data can fit into memory, + // a solution is to cache the data in memory. Caching is especially helpful when working with iterative algorithms + // which needs many data passes. 
+ dataView = mlContext.Data.Cache(dataView); + + // Define input and output columns of tree-based featurizer. + string labelColumnName = nameof(DataPoint.Label); + string featureColumnName = nameof(DataPoint.Features); + string treesColumnName = nameof(TransformedDataPoint.Trees); + string leavesColumnName = nameof(TransformedDataPoint.Leaves); + string pathsColumnName = nameof(TransformedDataPoint.Paths); + + // Define the configuration of the trainer used to train a tree-based model. + var trainerOptions = new FastTreeRankingTrainer.Options + { + // Reduce the number of trees to 3. + NumberOfTrees = 3, + // Number of leaves per tree. + NumberOfLeaves = 6, + // Feature column name. + FeatureColumnName = featureColumnName, + // Label column name. + LabelColumnName = labelColumnName + }; + + // Define the tree-based featurizer's configuration. + var options = new FastTreeRankingFeaturizationEstimator.Options + { + InputColumnName = featureColumnName, + TreesColumnName = treesColumnName, + LeavesColumnName = leavesColumnName, + PathsColumnName = pathsColumnName, + TrainerOptions = trainerOptions + }; + + // Define the featurizer. + var pipeline = mlContext.Transforms.FeaturizeByFastTreeRanking(options); + + // Train the model. + var model = pipeline.Fit(dataView); + + // Apply the trained transformer to the considered data set. + var transformed = model.Transform(dataView); + + // Convert IDataView object to a list. Each element in the resulted list corresponds to a row in the IDataView. + var transformedDataPoints = mlContext.Data.CreateEnumerable(transformed, false).ToList(); + + // Print out the transformation of the first 3 data points. 
+ for (int i = 0; i < 3; ++i) + { + var dataPoint = dataPoints[i]; + var transformedDataPoint = transformedDataPoints[i]; + Console.WriteLine($"The original feature vector [{String.Join(",", dataPoint.Features)}] is transformed to three different tree-based feature vectors:"); + Console.WriteLine($" Trees' output values: [{String.Join(",", transformedDataPoint.Trees)}]."); + Console.WriteLine($" Leave IDs' 0-1 representation: [{String.Join(",", transformedDataPoint.Leaves)}]."); + Console.WriteLine($" Paths IDs' 0-1 representation: [{String.Join(",", transformedDataPoint.Paths)}]."); + } + + // Expected output: + // The original feature vector [1.117325,1.068023,0.8581612] is transformed to three different tree-based feature vectors: + // Trees' output values: [0.226105,0.006197017,0.06219412]. + // Leave IDs' 0-1 representation: [0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1]. + // Paths IDs' 0-1 representation: [1,1,1,1,1,1,1,1,0,0,1,1,1,0,1]. + // The original feature vector [0.6588848,1.006027,0.5421779] is transformed to three different tree-based feature vectors: + // Trees' output values: [0.226105,-0.3102316,-0.3081155]. + // Leave IDs' 0-1 representation: [0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0]. + // Paths IDs' 0-1 representation: [1,1,1,1,1,1,1,1,0,0,1,1,1,0,1]. + // The original feature vector [0.6737045,0.6919063,0.8673147] is transformed to three different tree-based feature vectors: + // Trees' output values: [0.226105,-0.3102316,-0.3081155]. + // Leave IDs' 0-1 representation: [0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0]. + // Paths IDs' 0-1 representation: [1,1,1,1,1,1,1,1,0,0,1,1,1,0,1]. 
+ } + + private static IEnumerable GenerateRandomDataPoints(int count, int seed = 0, int groupSize = 10) + { + var random = new Random(seed); + float randomFloat() => (float)random.NextDouble(); + for (int i = 0; i < count; i++) + { + var label = random.Next(0, 5); + yield return new DataPoint + { + Label = (uint)label, + GroupId = (uint)(i / groupSize), + // Create random features that are correlated with the label. + // For data points with larger labels, the feature values are slightly increased by adding a constant. + Features = Enumerable.Repeat(label, 3).Select(x => randomFloat() + x * 0.1f).ToArray() + }; + } + } + + // Example with label, groupId, and 3 feature values. A data set is a collection of such examples. + private class DataPoint + { + [KeyType(5)] + public uint Label { get; set; } + [KeyType(100)] + public uint GroupId { get; set; } + [VectorType(3)] + public float[] Features { get; set; } + } + + // Class used to capture the output of tree-based featurization. + private class TransformedDataPoint : DataPoint + { + // The i-th value is the output value of the i-th decision tree. + public float[] Trees { get; set; } + // The 0-1 encoding of leaves the input feature vector falls into. + public float[] Leaves { get; set; } + // The 0-1 encoding of paths the input feature vector reaches the leaves.
+ public float[] Paths { get; set; } + } + } +} diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRankingFeaturizationWithOptions.tt b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRankingFeaturizationWithOptions.tt new file mode 100644 index 0000000000..3a885e0783 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRankingFeaturizationWithOptions.tt @@ -0,0 +1,47 @@ +<#@ include file="RankingFeaturization.ttinclude"#> +<#+ +string ClassName = "FastTreeRankingFeaturizationWithOptions"; +string Trainer = "FeaturizeByFastTreeRanking"; +bool CacheData = true; + +string TrainerOptions = @"FastTreeRankingTrainer.Options + { + // Reduce the number of trees to 3. + NumberOfTrees = 3, + // Number of leaves per tree. + NumberOfLeaves = 6, + // Feature column name. + FeatureColumnName = featureColumnName, + // Label column name. + LabelColumnName = labelColumnName + }"; + +string Options = @"FastTreeRankingFeaturizationEstimator.Options + { + InputColumnName = featureColumnName, + TreesColumnName = treesColumnName, + LeavesColumnName = leavesColumnName, + PathsColumnName = pathsColumnName, + TrainerOptions = trainerOptions + }"; + +string OptionsInclude = "using Microsoft.ML.Trainers.FastTree;"; + +string Comments= @" + // This example requires installation of additional NuGet package + // Microsoft.ML.FastTree."; + +string ExpectedOutput = @"// Expected output: + // The original feature vector [1.117325,1.068023,0.8581612] is transformed to three different tree-based feature vectors: + // Trees' output values: [0.226105,0.006197017,0.06219412]. + // Leave IDs' 0-1 representation: [0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1]. + // Paths IDs' 0-1 representation: [1,1,1,1,1,1,1,1,0,0,1,1,1,0,1]. 
+ // The original feature vector [0.6588848,1.006027,0.5421779] is transformed to three different tree-based feature vectors: + // Trees' output values: [0.226105,-0.3102316,-0.3081155]. + // Leave IDs' 0-1 representation: [0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0]. + // Paths IDs' 0-1 representation: [1,1,1,1,1,1,1,1,0,0,1,1,1,0,1]. + // The original feature vector [0.6737045,0.6919063,0.8673147] is transformed to three different tree-based feature vectors: + // Trees' output values: [0.226105,-0.3102316,-0.3081155]. + // Leave IDs' 0-1 representation: [0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0]. + // Paths IDs' 0-1 representation: [1,1,1,1,1,1,1,1,0,0,1,1,1,0,1]."; +#> \ No newline at end of file diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRegressionFeaturizationWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRegressionFeaturizationWithOptions.cs new file mode 100644 index 0000000000..0187ea8e3e --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRegressionFeaturizationWithOptions.cs @@ -0,0 +1,138 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML; +using Microsoft.ML.Data; +using Microsoft.ML.Trainers.FastTree; + +namespace Samples.Dynamic.Transforms.TreeFeaturization +{ + public static class FastTreeRegressionFeaturizationWithOptions + { + // This example requires installation of additional NuGet package + // Microsoft.ML.FastTree. + public static void Example() + { + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, + // as a catalog of available operations and as the source of randomness. + // Setting the seed to a fixed number in this example to make outputs deterministic. + var mlContext = new MLContext(seed: 0); + + // Create a list of training data points. 
+ var dataPoints = GenerateRandomDataPoints(1000).ToList(); + + // Convert the list of data points to an IDataView object, which is consumable by ML.NET API. + var dataView = mlContext.Data.LoadFromEnumerable(dataPoints); + + // ML.NET doesn't cache data set by default. Therefore, if one reads a data set from a file and accesses it many times, + // it can be slow due to expensive featurization and disk operations. When the considered data can fit into memory, + // a solution is to cache the data in memory. Caching is especially helpful when working with iterative algorithms + // which needs many data passes. + dataView = mlContext.Data.Cache(dataView); + + // Define input and output columns of tree-based featurizer. + string labelColumnName = nameof(DataPoint.Label); + string featureColumnName = nameof(DataPoint.Features); + string treesColumnName = nameof(TransformedDataPoint.Trees); + string leavesColumnName = nameof(TransformedDataPoint.Leaves); + string pathsColumnName = nameof(TransformedDataPoint.Paths); + + // Define the configuration of the trainer used to train a tree-based model. + var trainerOptions = new FastTreeRegressionTrainer.Options + { + // Only use 80% of features to reduce over-fitting. + FeatureFraction = 0.8, + // Create a simpler model by penalizing usage of new features. + FeatureFirstUsePenalty = 0.1, + // Reduce the number of trees to 3. + NumberOfTrees = 3, + // Number of leaves per tree. + NumberOfLeaves = 6, + LabelColumnName = labelColumnName, + FeatureColumnName = featureColumnName + }; + + // Define the tree-based featurizer's configuration. + var options = new FastTreeRegressionFeaturizationEstimator.Options + { + InputColumnName = featureColumnName, + TreesColumnName = treesColumnName, + LeavesColumnName = leavesColumnName, + PathsColumnName = pathsColumnName, + TrainerOptions = trainerOptions + }; + + // Define the featurizer. + var pipeline = mlContext.Transforms.FeaturizeByFastTreeRegression(options); + + // Train the model. 
+ var model = pipeline.Fit(dataView); + + // Apply the trained transformer to the considered data set. + var transformed = model.Transform(dataView); + + // Convert IDataView object to a list. Each element in the resulted list corresponds to a row in the IDataView. + var transformedDataPoints = mlContext.Data.CreateEnumerable(transformed, false).ToList(); + + // Print out the transformation of the first 3 data points. + for (int i = 0; i < 3; ++i) + { + var dataPoint = dataPoints[i]; + var transformedDataPoint = transformedDataPoints[i]; + Console.WriteLine($"The original feature vector [{String.Join(",", dataPoint.Features)}] is transformed to three different tree-based feature vectors:"); + Console.WriteLine($" Trees' output values: [{String.Join(",", transformedDataPoint.Trees)}]."); + Console.WriteLine($" Leave IDs' 0-1 representation: [{String.Join(",", transformedDataPoint.Leaves)}]."); + Console.WriteLine($" Paths IDs' 0-1 representation: [{String.Join(",", transformedDataPoint.Paths)}]."); + } + + // Expected output: + // The original feature vector [1.543569,1.494266,1.284405] is transformed to three different tree-based feature vectors: + // Trees' output values: [0.1263802,0.1333696,0.1057345]. + // Leave IDs' 0-1 representation: [0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1]. + // Paths IDs' 0-1 representation: [1,1,0,1,0,1,1,0,0,1,1,0,1,0,1]. + // The original feature vector [0.764918,1.11206,0.648211] is transformed to three different tree-based feature vectors: + // Trees' output values: [0.06627099,0.06554828,0.04006118]. + // Leave IDs' 0-1 representation: [0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0]. + // Paths IDs' 0-1 representation: [1,0,1,0,0,1,1,0,0,0,1,1,0,1,0]. + // The original feature vector [1.251254,1.269456,1.444864] is transformed to three different tree-based feature vectors: + // Trees' output values: [0.151194,0.1061093,0.1057345]. + // Leave IDs' 0-1 representation: [0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1]. 
+ // Paths IDs' 0-1 representation: [1,1,0,0,1,1,1,0,0,1,1,0,1,0,1]. + } + + private static IEnumerable GenerateRandomDataPoints(int count, int seed=0) + { + var random = new Random(seed); + for (int i = 0; i < count; i++) + { + float label = (float)random.NextDouble(); + yield return new DataPoint + { + Label = label, + // Create random features that are correlated with the label. + Features = Enumerable.Repeat(label, 3).Select(x => x + (float)random.NextDouble()).ToArray() + }; + } + } + + // Example with label and 3 feature values. A data set is a collection of such examples. + private class DataPoint + { + public float Label { get; set; } + [VectorType(3)] + public float[] Features { get; set; } + } + + // Class used to capture the output of tree-based featurization. + private class TransformedDataPoint : DataPoint + { + // The i-th value is the output value of the i-th decision tree. + public float[] Trees { get; set; } + // The 0-1 encoding of leaves the input feature vector falls into. + public float[] Leaves { get; set; } + // The 0-1 encoding of paths the input feature vector reaches the leaves. + public float[] Paths { get; set; } + } + } +} + diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRegressionFeaturizationWithOptions.tt b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRegressionFeaturizationWithOptions.tt new file mode 100644 index 0000000000..298c6b2488 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRegressionFeaturizationWithOptions.tt @@ -0,0 +1,47 @@ +<#@ include file="RegressionFeaturization.ttinclude"#> + +<#+ +string ClassHeader = @"// This example requires installation of additional NuGet package + // Microsoft.ML.FastTree. 
"; + +string ClassName="FastTreeRegressionFeaturizationWithOptions"; +bool CacheData = true; +string ExtraUsing = "using Microsoft.ML.Trainers.FastTree;"; +string Trainer = @"FeaturizeByFastTreeRegression"; +string TrainerOptions = @"FastTreeRegressionTrainer.Options + { + // Only use 80% of features to reduce over-fitting. + FeatureFraction = 0.8, + // Create a simpler model by penalizing usage of new features. + FeatureFirstUsePenalty = 0.1, + // Reduce the number of trees to 3. + NumberOfTrees = 3, + // Number of leaves per tree. + NumberOfLeaves = 6, + LabelColumnName = labelColumnName, + FeatureColumnName = featureColumnName + }"; + +string Options = @"FastTreeRegressionFeaturizationEstimator.Options + { + InputColumnName = featureColumnName, + TreesColumnName = treesColumnName, + LeavesColumnName = leavesColumnName, + PathsColumnName = pathsColumnName, + TrainerOptions = trainerOptions + }"; + +string ExpectedOutput = @"// Expected output: + // The original feature vector [1.543569,1.494266,1.284405] is transformed to three different tree-based feature vectors: + // Trees' output values: [0.1263802,0.1333696,0.1057345]. + // Leave IDs' 0-1 representation: [0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1]. + // Paths IDs' 0-1 representation: [1,1,0,1,0,1,1,0,0,1,1,0,1,0,1]. + // The original feature vector [0.764918,1.11206,0.648211] is transformed to three different tree-based feature vectors: + // Trees' output values: [0.06627099,0.06554828,0.04006118]. + // Leave IDs' 0-1 representation: [0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0]. + // Paths IDs' 0-1 representation: [1,0,1,0,0,1,1,0,0,0,1,1,0,1,0]. + // The original feature vector [1.251254,1.269456,1.444864] is transformed to three different tree-based feature vectors: + // Trees' output values: [0.151194,0.1061093,0.1057345]. + // Leave IDs' 0-1 representation: [0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1]. 
+ // Paths IDs' 0-1 representation: [1,1,0,0,1,1,1,0,0,1,1,0,1,0,1]."; +#> \ No newline at end of file diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeTweedieFeaturizationWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeTweedieFeaturizationWithOptions.cs new file mode 100644 index 0000000000..2aea92e538 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeTweedieFeaturizationWithOptions.cs @@ -0,0 +1,138 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML; +using Microsoft.ML.Data; +using Microsoft.ML.Trainers.FastTree; + +namespace Samples.Dynamic.Transforms.TreeFeaturization +{ + public static class FastTreeTweedieFeaturizationWithOptions + { + // This example requires installation of additional NuGet package + // Microsoft.ML.FastTree. + public static void Example() + { + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, + // as a catalog of available operations and as the source of randomness. + // Setting the seed to a fixed number in this example to make outputs deterministic. + var mlContext = new MLContext(seed: 0); + + // Create a list of training data points. + var dataPoints = GenerateRandomDataPoints(1000).ToList(); + + // Convert the list of data points to an IDataView object, which is consumable by ML.NET API. + var dataView = mlContext.Data.LoadFromEnumerable(dataPoints); + + // ML.NET doesn't cache data set by default. Therefore, if one reads a data set from a file and accesses it many times, + // it can be slow due to expensive featurization and disk operations. When the considered data can fit into memory, + // a solution is to cache the data in memory. Caching is especially helpful when working with iterative algorithms + // which needs many data passes. 
+ dataView = mlContext.Data.Cache(dataView); + + // Define input and output columns of tree-based featurizer. + string labelColumnName = nameof(DataPoint.Label); + string featureColumnName = nameof(DataPoint.Features); + string treesColumnName = nameof(TransformedDataPoint.Trees); + string leavesColumnName = nameof(TransformedDataPoint.Leaves); + string pathsColumnName = nameof(TransformedDataPoint.Paths); + + // Define the configuration of the trainer used to train a tree-based model. + var trainerOptions = new FastTreeTweedieTrainer.Options + { + // Only use 80% of features to reduce over-fitting. + FeatureFraction = 0.8, + // Create a simpler model by penalizing usage of new features. + FeatureFirstUsePenalty = 0.1, + // Reduce the number of trees to 3. + NumberOfTrees = 3, + // Number of leaves per tree. + NumberOfLeaves = 6, + LabelColumnName = labelColumnName, + FeatureColumnName = featureColumnName + }; + + // Define the tree-based featurizer's configuration. + var options = new FastTreeTweedieFeaturizationEstimator.Options + { + InputColumnName = featureColumnName, + TreesColumnName = treesColumnName, + LeavesColumnName = leavesColumnName, + PathsColumnName = pathsColumnName, + TrainerOptions = trainerOptions + }; + + // Define the featurizer. + var pipeline = mlContext.Transforms.FeaturizeByFastTreeTweedie(options); + + // Train the model. + var model = pipeline.Fit(dataView); + + // Apply the trained transformer to the considered data set. + var transformed = model.Transform(dataView); + + // Convert IDataView object to a list. Each element in the resulted list corresponds to a row in the IDataView. + var transformedDataPoints = mlContext.Data.CreateEnumerable(transformed, false).ToList(); + + // Print out the transformation of the first 3 data points. 
+ for (int i = 0; i < 3; ++i) + { + var dataPoint = dataPoints[i]; + var transformedDataPoint = transformedDataPoints[i]; + Console.WriteLine($"The original feature vector [{String.Join(",", dataPoint.Features)}] is transformed to three different tree-based feature vectors:"); + Console.WriteLine($" Trees' output values: [{String.Join(",", transformedDataPoint.Trees)}]."); + Console.WriteLine($" Leave IDs' 0-1 representation: [{String.Join(",", transformedDataPoint.Leaves)}]."); + Console.WriteLine($" Paths IDs' 0-1 representation: [{String.Join(",", transformedDataPoint.Paths)}]."); + } + + // Expected output: + // The original feature vector [1.543569,1.494266,1.284405] is transformed to three different tree-based feature vectors: + // Trees' output values: [-0.09180452,-0.04118096,-0.01008826]. + // Leave IDs' 0-1 representation: [0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0]. + // Paths IDs' 0-1 representation: [1,1,0,1,0,1,1,0,0,0,1,0,1,0,0]. + // The original feature vector [0.764918,1.11206,0.648211] is transformed to three different tree-based feature vectors: + // Trees' output values: [-0.220913,-0.1675234,-0.1447738]. + // Leave IDs' 0-1 representation: [0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0]. + // Paths IDs' 0-1 representation: [1,0,1,0,0,1,1,0,1,0,1,1,0,1,0]. + // The original feature vector [1.251254,1.269456,1.444864] is transformed to three different tree-based feature vectors: + // Trees' output values: [-0.05595072,-0.04118096,-0.04182037]. + // Leave IDs' 0-1 representation: [0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1]. + // Paths IDs' 0-1 representation: [1,1,0,0,1,1,1,0,0,0,1,0,1,0,1]. + } + + private static IEnumerable GenerateRandomDataPoints(int count, int seed=0) + { + var random = new Random(seed); + for (int i = 0; i < count; i++) + { + float label = (float)random.NextDouble(); + yield return new DataPoint + { + Label = label, + // Create random features that are correlated with the label. 
+ Features = Enumerable.Repeat(label, 3).Select(x => x + (float)random.NextDouble()).ToArray() + }; + } + } + + // Example with label and 3 feature values. A data set is a collection of such examples. + private class DataPoint + { + public float Label { get; set; } + [VectorType(3)] + public float[] Features { get; set; } + } + + // Class used to capture the output of tree-based featurization. + private class TransformedDataPoint : DataPoint + { + // The i-th value is the output value of the i-th decision tree. + public float[] Trees { get; set; } + // The 0-1 encoding of leaves the input feature vector falls into. + public float[] Leaves { get; set; } + // The 0-1 encoding of paths the input feature vector reaches the leaves. + public float[] Paths { get; set; } + } + } +} + diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeTweedieFeaturizationWithOptions.tt b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeTweedieFeaturizationWithOptions.tt new file mode 100644 index 0000000000..93ce26e80c --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeTweedieFeaturizationWithOptions.tt @@ -0,0 +1,54 @@ +<#@ include file="RegressionFeaturization.ttinclude"#> + +<#+ +string ClassHeader = @"// This example requires installation of additional NuGet package + // Microsoft.ML.FastTree. "; + +string ClassName="FastTreeTweedieFeaturizationWithOptions"; +bool CacheData = true; +string ExtraUsing = "using Microsoft.ML.Trainers.FastTree;"; +string Trainer = @"FeaturizeByFastTreeTweedie"; +string TrainerOptions = @"FastTreeTweedieTrainer.Options + { + // Only use 80% of features to reduce over-fitting. + FeatureFraction = 0.8, + // Create a simpler model by penalizing usage of new features. + FeatureFirstUsePenalty = 0.1, + // Reduce the number of trees to 3. + NumberOfTrees = 3, + // Number of leaves per tree. 
+ NumberOfLeaves = 6, + LabelColumnName = labelColumnName, + FeatureColumnName = featureColumnName + }"; + +string Options = @"FastTreeTweedieFeaturizationEstimator.Options + { + InputColumnName = featureColumnName, + TreesColumnName = treesColumnName, + LeavesColumnName = leavesColumnName, + PathsColumnName = pathsColumnName, + TrainerOptions = trainerOptions + }"; + +string ExpectedOutputPerInstance= @"// Expected output: + // Label: 0.985, Prediction: 0.866 + // Label: 0.155, Prediction: 0.171 + // Label: 0.515, Prediction: 0.470 + // Label: 0.566, Prediction: 0.476 + // Label: 0.096, Prediction: 0.140"; + +string ExpectedOutput = @"// Expected output: + // The original feature vector [1.543569,1.494266,1.284405] is transformed to three different tree-based feature vectors: + // Trees' output values: [-0.09180452,-0.04118096,-0.01008826]. + // Leave IDs' 0-1 representation: [0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0]. + // Paths IDs' 0-1 representation: [1,1,0,1,0,1,1,0,0,0,1,0,1,0,0]. + // The original feature vector [0.764918,1.11206,0.648211] is transformed to three different tree-based feature vectors: + // Trees' output values: [-0.220913,-0.1675234,-0.1447738]. + // Leave IDs' 0-1 representation: [0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0]. + // Paths IDs' 0-1 representation: [1,0,1,0,0,1,1,0,1,0,1,1,0,1,0]. + // The original feature vector [1.251254,1.269456,1.444864] is transformed to three different tree-based feature vectors: + // Trees' output values: [-0.05595072,-0.04118096,-0.04182037]. + // Leave IDs' 0-1 representation: [0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1]. 
+ // Paths IDs' 0-1 representation: [1,1,0,0,1,1,1,0,0,0,1,0,1,0,1]."; +#> \ No newline at end of file diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/RankingFeaturization.ttinclude b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/RankingFeaturization.ttinclude new file mode 100644 index 0000000000..346614b668 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/RankingFeaturization.ttinclude @@ -0,0 +1,114 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML; +using Microsoft.ML.Data; +<# if (TrainerOptions != null) { #> +<#=OptionsInclude#> +<# } #> + +namespace Samples.Dynamic.Transforms.TreeFeaturization +{ + public static class <#=ClassName#> + {<#=Comments#> + public static void Example() + { + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, + // as a catalog of available operations and as the source of randomness. + // Setting the seed to a fixed number in this example to make outputs deterministic. + var mlContext = new MLContext(seed: 0); + + // Create a list of training data points. + var dataPoints = GenerateRandomDataPoints(1000).ToList(); + + // Convert the list of data points to an IDataView object, which is consumable by ML.NET API. + var dataView = mlContext.Data.LoadFromEnumerable(dataPoints); +<# if (CacheData) { #> + + // ML.NET doesn't cache data set by default. Therefore, if one reads a data set from a file and accesses it many times, + // it can be slow due to expensive featurization and disk operations. When the considered data can fit into memory, + // a solution is to cache the data in memory. Caching is especially helpful when working with iterative algorithms + // which needs many data passes. + dataView = mlContext.Data.Cache(dataView); +<# } #> + + // Define input and output columns of tree-based featurizer. 
+ string labelColumnName = nameof(DataPoint.Label); + string featureColumnName = nameof(DataPoint.Features); + string treesColumnName = nameof(TransformedDataPoint.Trees); + string leavesColumnName = nameof(TransformedDataPoint.Leaves); + string pathsColumnName = nameof(TransformedDataPoint.Paths); + + // Define the configuration of the trainer used to train a tree-based model. + var trainerOptions = new <#=TrainerOptions#>; + + // Define the tree-based featurizer's configuration. + var options = new <#=Options#>; + + // Define the featurizer. + var pipeline = mlContext.Transforms.<#=Trainer#>(options); + + // Train the model. + var model = pipeline.Fit(dataView); + + // Apply the trained transformer to the considered data set. + var transformed = model.Transform(dataView); + + // Convert IDataView object to a list. Each element in the resulted list corresponds to a row in the IDataView. + var transformedDataPoints = mlContext.Data.CreateEnumerable(transformed, false).ToList(); + + // Print out the transformation of the first 3 data points. 
+ for (int i = 0; i < 3; ++i) + { + var dataPoint = dataPoints[i]; + var transformedDataPoint = transformedDataPoints[i]; + Console.WriteLine($"The original feature vector [{String.Join(",", dataPoint.Features)}] is transformed to three different tree-based feature vectors:"); + Console.WriteLine($" Trees' output values: [{String.Join(",", transformedDataPoint.Trees)}]."); + Console.WriteLine($" Leave IDs' 0-1 representation: [{String.Join(",", transformedDataPoint.Leaves)}]."); + Console.WriteLine($" Paths IDs' 0-1 representation: [{String.Join(",", transformedDataPoint.Paths)}]."); + } + + <#=ExpectedOutput#> + } + + private static IEnumerable GenerateRandomDataPoints(int count, int seed = 0, int groupSize = 10) + { + var random = new Random(seed); + float randomFloat() => (float)random.NextDouble(); + for (int i = 0; i < count; i++) + { + var label = random.Next(0, 5); + yield return new DataPoint + { + Label = (uint)label, + GroupId = (uint)(i / groupSize), + // Create random features that are correlated with the label. + // For data points with larger labels, the feature values are slightly increased by adding a constant. + Features = Enumerable.Repeat(label, 3).Select(x => randomFloat() + x * 0.1f).ToArray() + }; + } + } + + // Example with label, groupId, and 3 feature values. A data set is a collection of such examples. + private class DataPoint + { + [KeyType(5)] + public uint Label { get; set; } + [KeyType(100)] + public uint GroupId { get; set; } + [VectorType(3)] + public float[] Features { get; set; } + } + + // Class used to capture the output of tree-based featurization. + private class TransformedDataPoint : DataPoint + { + // The i-th value is the output value of the i-th decision tree. + public float[] Trees { get; set; } + // The 0-1 encoding of leaves the input feature vector falls into. + public float[] Leaves { get; set; } + // The 0-1 encoding of paths the input feature vector reaches the leaves. 
+ public float[] Paths { get; set; } + } + } +} \ No newline at end of file diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/RegressionFeaturization.ttinclude b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/RegressionFeaturization.ttinclude new file mode 100644 index 0000000000..49fef2cc0b --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/RegressionFeaturization.ttinclude @@ -0,0 +1,111 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML; +using Microsoft.ML.Data; +<# if (ExtraUsing != null) { #> +<#=ExtraUsing#> +<# } #> + +namespace Samples.Dynamic.Transforms.TreeFeaturization +{ + public static class <#=ClassName#> + { +<# if (ClassHeader != null) { #> + <#=ClassHeader#> +<# } #> + public static void Example() + { + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, + // as a catalog of available operations and as the source of randomness. + // Setting the seed to a fixed number in this example to make outputs deterministic. + var mlContext = new MLContext(seed: 0); + + // Create a list of training data points. + var dataPoints = GenerateRandomDataPoints(1000).ToList(); + + // Convert the list of data points to an IDataView object, which is consumable by ML.NET API. + var dataView = mlContext.Data.LoadFromEnumerable(dataPoints); +<# if (CacheData) { #> + + // ML.NET doesn't cache data set by default. Therefore, if one reads a data set from a file and accesses it many times, + // it can be slow due to expensive featurization and disk operations. When the considered data can fit into memory, + // a solution is to cache the data in memory. Caching is especially helpful when working with iterative algorithms + // which needs many data passes. + dataView = mlContext.Data.Cache(dataView); +<# } #> + + // Define input and output columns of tree-based featurizer. 
+ string labelColumnName = nameof(DataPoint.Label); + string featureColumnName = nameof(DataPoint.Features); + string treesColumnName = nameof(TransformedDataPoint.Trees); + string leavesColumnName = nameof(TransformedDataPoint.Leaves); + string pathsColumnName = nameof(TransformedDataPoint.Paths); + + // Define the configuration of the trainer used to train a tree-based model. + var trainerOptions = new <#=TrainerOptions#>; + + // Define the tree-based featurizer's configuration. + var options = new <#=Options#>; + + // Define the featurizer. + var pipeline = mlContext.Transforms.<#=Trainer#>(options); + + // Train the model. + var model = pipeline.Fit(dataView); + + // Apply the trained transformer to the considered data set. + var transformed = model.Transform(dataView); + + // Convert IDataView object to a list. Each element in the resulted list corresponds to a row in the IDataView. + var transformedDataPoints = mlContext.Data.CreateEnumerable(transformed, false).ToList(); + + // Print out the transformation of the first 3 data points. 
+ for (int i = 0; i < 3; ++i) + { + var dataPoint = dataPoints[i]; + var transformedDataPoint = transformedDataPoints[i]; + Console.WriteLine($"The original feature vector [{String.Join(",", dataPoint.Features)}] is transformed to three different tree-based feature vectors:"); + Console.WriteLine($" Trees' output values: [{String.Join(",", transformedDataPoint.Trees)}]."); + Console.WriteLine($" Leave IDs' 0-1 representation: [{String.Join(",", transformedDataPoint.Leaves)}]."); + Console.WriteLine($" Paths IDs' 0-1 representation: [{String.Join(",", transformedDataPoint.Paths)}]."); + } + + <#=ExpectedOutput#> + } + + private static IEnumerable GenerateRandomDataPoints(int count, int seed=0) + { + var random = new Random(seed); + for (int i = 0; i < count; i++) + { + float label = (float)random.NextDouble(); + yield return new DataPoint + { + Label = label, + // Create random features that are correlated with the label. + Features = Enumerable.Repeat(label, 3).Select(x => x + (float)random.NextDouble()).ToArray() + }; + } + } + + // Example with label and 3 feature values. A data set is a collection of such examples. + private class DataPoint + { + public float Label { get; set; } + [VectorType(3)] + public float[] Features { get; set; } + } + + // Class used to capture the output of tree-based featurization. + private class TransformedDataPoint : DataPoint + { + // The i-th value is the output value of the i-th decision tree. + public float[] Trees { get; set; } + // The 0-1 encoding of leaves the input feature vector falls into. + public float[] Leaves { get; set; } + // The 0-1 encoding of paths the input feature vector reaches the leaves. 
+ public float[] Paths { get; set; } + } + } +} \ No newline at end of file diff --git a/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj b/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj index b78cf41bfa..8e1002eda2 100644 --- a/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj +++ b/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj @@ -448,6 +448,30 @@ TextTemplatingFileGenerator SdcaWithOptions.cs + + TextTemplatingFileGenerator + FastForestBinaryFeaturizationWithOptions.cs + + + FastForestRegressionFeaturizationWithOptions.cs + TextTemplatingFileGenerator + + + FastTreeBinaryFeaturizationWithOptions.cs + TextTemplatingFileGenerator + + + TextTemplatingFileGenerator + FastTreeRankingFeaturizationWithOptions.cs + + + TextTemplatingFileGenerator + FastTreeRegressionFeaturizationWithOptions.cs + + + TextTemplatingFileGenerator + FastTreeTweedieFeaturizationWithOptions.cs + @@ -895,6 +919,36 @@ True SdcaWithOptions.tt + + True + True + FastForestBinaryFeaturizationWithOptions.tt + + + FastForestRegressionFeaturizationWithOptions.tt + True + True + + + FastTreeBinaryFeaturizationWithOptions.tt + True + True + + + True + True + FastTreeRankingFeaturizationWithOptions.tt + + + True + True + FastTreeRegressionFeaturizationWithOptions.tt + + + True + True + FastTreeTweedieFeaturizationWithOptions.tt + diff --git a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationEstimator.cs b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationEstimator.cs index 7cf549e5ec..5b7fc22f3d 100644 --- a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationEstimator.cs +++ b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationEstimator.cs @@ -179,7 +179,7 @@ public sealed class PretrainedTreeFeaturizationEstimator : TreeEnsembleFeaturiza /// of as /// used when calling . 
/// - public sealed class Options : TreeEnsembleFeaturizationEstimatorBase.CommonOptions + public sealed class Options : CommonOptions { /// /// The pretrained tree model used to do tree-based featurization. Note that contains a collection of decision trees. diff --git a/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs b/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs index 3c08d35307..74df9bfd29 100644 --- a/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs +++ b/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs @@ -349,12 +349,12 @@ public void TestFastTreeBinaryFeaturizationInPipeline() var trainerOptions = new FastTreeBinaryTrainer.Options { - NumberOfThreads = 1, - NumberOfTrees = 10, - NumberOfLeaves = 4, - MinimumExampleCountPerLeaf = 10, - FeatureColumnName = "Features", - LabelColumnName = "Label" + NumberOfThreads = 1, + NumberOfTrees = 10, + NumberOfLeaves = 4, + MinimumExampleCountPerLeaf = 10, + FeatureColumnName = "Features", + LabelColumnName = "Label" }; var options = new FastTreeBinaryFeaturizationEstimator.Options() @@ -387,12 +387,12 @@ public void TestFastForestBinaryFeaturizationInPipeline() var trainerOptions = new FastForestBinaryTrainer.Options { - NumberOfThreads = 1, - NumberOfTrees = 10, - NumberOfLeaves = 4, - MinimumExampleCountPerLeaf = 10, - FeatureColumnName = "Features", - LabelColumnName = "Label" + NumberOfThreads = 1, + NumberOfTrees = 10, + NumberOfLeaves = 4, + MinimumExampleCountPerLeaf = 10, + FeatureColumnName = "Features", + LabelColumnName = "Label" }; var options = new FastForestBinaryFeaturizationEstimator.Options() @@ -425,12 +425,12 @@ public void TestFastTreeRegressionFeaturizationInPipeline() var trainerOptions = new FastTreeRegressionTrainer.Options { - NumberOfThreads = 1, - NumberOfTrees = 10, - NumberOfLeaves = 4, - MinimumExampleCountPerLeaf = 10, - FeatureColumnName = "Features", - LabelColumnName = "Label" + 
NumberOfThreads = 1, + NumberOfTrees = 10, + NumberOfLeaves = 4, + MinimumExampleCountPerLeaf = 10, + FeatureColumnName = "Features", + LabelColumnName = "Label" }; var options = new FastTreeRegressionFeaturizationEstimator.Options() @@ -462,12 +462,12 @@ public void TestFastForestRegressionFeaturizationInPipeline() var trainerOptions = new FastForestRegressionTrainer.Options { - NumberOfThreads = 1, - NumberOfTrees = 10, - NumberOfLeaves = 4, - MinimumExampleCountPerLeaf = 10, - FeatureColumnName = "Features", - LabelColumnName = "Label" + NumberOfThreads = 1, + NumberOfTrees = 10, + NumberOfLeaves = 4, + MinimumExampleCountPerLeaf = 10, + FeatureColumnName = "Features", + LabelColumnName = "Label" }; var options = new FastForestRegressionFeaturizationEstimator.Options() @@ -499,12 +499,12 @@ public void TestFastTreeTweedieFeaturizationInPipeline() var trainerOptions = new FastTreeTweedieTrainer.Options { - NumberOfThreads = 1, - NumberOfTrees = 10, - NumberOfLeaves = 4, - MinimumExampleCountPerLeaf = 10, - FeatureColumnName = "Features", - LabelColumnName = "Label" + NumberOfThreads = 1, + NumberOfTrees = 10, + NumberOfLeaves = 4, + MinimumExampleCountPerLeaf = 10, + FeatureColumnName = "Features", + LabelColumnName = "Label" }; var options = new FastTreeTweedieFeaturizationEstimator.Options() @@ -536,12 +536,12 @@ public void TestFastTreeRankingFeaturizationInPipeline() var trainerOptions = new FastTreeRankingTrainer.Options { - NumberOfThreads = 1, - NumberOfTrees = 10, - NumberOfLeaves = 4, - MinimumExampleCountPerLeaf = 10, - FeatureColumnName = "Features", - LabelColumnName = "Label" + NumberOfThreads = 1, + NumberOfTrees = 10, + NumberOfLeaves = 4, + MinimumExampleCountPerLeaf = 10, + FeatureColumnName = "Features", + LabelColumnName = "Label" }; var options = new FastTreeRankingFeaturizationEstimator.Options() @@ -573,12 +573,12 @@ public void TestSaveAndLoadTreeFeaturizer() var trainerOptions = new FastForestRegressionTrainer.Options { - NumberOfThreads = 
1, - NumberOfTrees = 10, - NumberOfLeaves = 4, - MinimumExampleCountPerLeaf = 10, - FeatureColumnName = "Features", - LabelColumnName = "Label" + NumberOfThreads = 1, + NumberOfTrees = 10, + NumberOfLeaves = 4, + MinimumExampleCountPerLeaf = 10, + FeatureColumnName = "Features", + LabelColumnName = "Label" }; var options = new FastForestRegressionFeaturizationEstimator.Options() @@ -627,12 +627,12 @@ public void TestSaveAndLoadDoubleTreeFeaturizer() var trainerOptions = new FastForestRegressionTrainer.Options { - NumberOfThreads = 1, - NumberOfTrees = 10, - NumberOfLeaves = 4, - MinimumExampleCountPerLeaf = 10, - FeatureColumnName = "Features", - LabelColumnName = "Label" + NumberOfThreads = 1, + NumberOfTrees = 10, + NumberOfLeaves = 4, + MinimumExampleCountPerLeaf = 10, + FeatureColumnName = "Features", + LabelColumnName = "Label" }; // Trains tree featurization on "Features" and applies on "CopiedFeatures". @@ -700,12 +700,12 @@ public void TestFastTreeBinaryFeaturizationInPipelineWithOptionalOutputs() var trainerOptions = new FastTreeBinaryTrainer.Options { - NumberOfThreads = 1, - NumberOfTrees = 10, - NumberOfLeaves = 4, - MinimumExampleCountPerLeaf = 10, - FeatureColumnName = "Features", - LabelColumnName = "Label" + NumberOfThreads = 1, + NumberOfTrees = 10, + NumberOfLeaves = 4, + MinimumExampleCountPerLeaf = 10, + FeatureColumnName = "Features", + LabelColumnName = "Label" }; var options = new FastTreeBinaryFeaturizationEstimator.Options() From 49fe1d713ebbf0d5f55eeab2c696daee1a665c89 Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Wed, 5 Jun 2019 18:18:31 -0700 Subject: [PATCH 11/25] Add a sample --- ...nedTreeEnsembleFeaturizationWithOptions.cs | 131 ++++++++++++++++++ .../TreeTrainersCatalog.cs | 49 +++++++ 2 files changed, 180 insertions(+) create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/PretrainedTreeEnsembleFeaturizationWithOptions.cs diff --git 
a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/PretrainedTreeEnsembleFeaturizationWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/PretrainedTreeEnsembleFeaturizationWithOptions.cs new file mode 100644 index 0000000000..1a464ed620 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/PretrainedTreeEnsembleFeaturizationWithOptions.cs @@ -0,0 +1,131 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML; +using Microsoft.ML.Data; +using Microsoft.ML.Trainers.FastTree; + +namespace Samples.Dynamic.Transforms.TreeFeaturization +{ + public static class PretrainedTreeEnsembleFeaturizationWithOptions + { + public static void Example() + { + // Create data set + int dataPointCount = 200; + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, + // as a catalog of available operations and as the source of randomness. + // Setting the seed to a fixed number in this example to make outputs deterministic. + var mlContext = new MLContext(seed: 0); + + // Create a list of training data points. + var dataPoints = GenerateRandomDataPoints(dataPointCount).ToList(); + + // Convert the list of data points to an IDataView object, which is consumable by ML.NET API. + var dataView = mlContext.Data.LoadFromEnumerable(dataPoints); + + // Define input and output columns of tree-based featurizer. + string labelColumnName = nameof(DataPoint.Label); + string featureColumnName = nameof(DataPoint.Features); + string treesColumnName = nameof(TransformedDataPoint.Trees); + string leavesColumnName = nameof(TransformedDataPoint.Leaves); + string pathsColumnName = nameof(TransformedDataPoint.Paths); + + // Define a tree model whose trees will be extracted to construct a tree featurizer. 
+ var trainer = mlContext.BinaryClassification.Trainers.FastTree( + new FastTreeBinaryTrainer.Options + { + NumberOfThreads = 1, + NumberOfTrees = 1, + NumberOfLeaves = 4, + MinimumExampleCountPerLeaf = 1, + FeatureColumnName = featureColumnName, + LabelColumnName = labelColumnName + }); + + // Train the defined tree model. + var model = trainer.Fit(dataView); + var predicted = model.Transform(dataView); + + // Define the configuration of tree-based featurizer. + var options = new PretrainedTreeFeaturizationEstimator.Options() + { + InputColumnName = featureColumnName, + ModelParameters = model.Model.SubModel, // Pretrained tree model. + TreesColumnName = treesColumnName, + LeavesColumnName = leavesColumnName, + PathsColumnName = pathsColumnName + }; + + // Fit the created featurizer. It doesn't perform actual training because a pretrained model is provided. + var treeFeaturizer = mlContext.Transforms.FeaturizeByPretrainTreeEnsemble(options).Fit(dataView); + + // Apply TreeEnsembleFeaturizer to the input data. + var transformed = treeFeaturizer.Transform(dataView); + + // Convert IDataView object to a list. Each element in the resulted list corresponds to a row in the IDataView. + var transformedDataPoints = mlContext.Data.CreateEnumerable(transformed, false).ToList(); + + // Print out the transformation of the first 3 data points. 
+ for (int i = 0; i < 3; ++i) + { + var dataPoint = dataPoints[i]; + var transformedDataPoint = transformedDataPoints[i]; + Console.WriteLine($"The original feature vector [{String.Join(",", dataPoint.Features)}] is transformed to three different tree-based feature vectors:"); + Console.WriteLine($" Trees' output values: [{String.Join(",", transformedDataPoint.Trees)}]."); + Console.WriteLine($" Leave IDs' 0-1 representation: [{String.Join(",", transformedDataPoint.Leaves)}]."); + Console.WriteLine($" Paths IDs' 0-1 representation: [{String.Join(",", transformedDataPoint.Paths)}]."); + } + + // Expected output: + // The original feature vector[0.8173254, 0.7680227, 0.5581612] is transformed to three different tree - based feature vectors: + // Trees' output values: [0.4172185]. + // Leave IDs' 0-1 representation: [1,0,0,0]. + // Paths IDs' 0-1 representation: [1,1,1]. + // The original feature vector[0.7588848, 1.106027, 0.6421779] is transformed to three different tree - based feature vectors: + // Trees' output values: [-1]. + // Leave IDs' 0-1 representation: [0,0,1,0]. + // Paths IDs' 0-1 representation: [1,1,0]. + // The original feature vector[0.2737045, 0.2919063, 0.4673147] is transformed to three different tree - based feature vectors: + // Trees' output values: [0.4172185]. + // Leave IDs' 0-1 representation: [1,0,0,0]. + // Paths IDs' 0-1 representation: [1,1,1]. + } + + private static IEnumerable GenerateRandomDataPoints(int count, int seed=0) + { + var random = new Random(seed); + float randomFloat() => (float)random.NextDouble(); + for (int i = 0; i < count; i++) + { + var label = randomFloat() > 0.5; + yield return new DataPoint + { + Label = label, + // Create random features that are correlated with the label. + // For data points with false label, the feature values are slightly increased by adding a constant. + Features = Enumerable.Repeat(label, 3).Select(x => x ? 
randomFloat() : randomFloat() + 0.2f).ToArray() + }; + } + } + + // Example with label and 3 feature values. A data set is a collection of such examples. + private class DataPoint + { + public bool Label { get; set; } + [VectorType(3)] + public float[] Features { get; set; } + } + + // Class used to capture the output of tree-base featurization. + private class TransformedDataPoint : DataPoint + { + // The i-th value is the output value of the i-th decision tree. + public float[] Trees { get; set; } + // The 0-1 encoding of leaves the input feature vector falls into. + public float[] Leaves { get; set; } + // The 0-1 encoding of paths the input feature vector reaches the leaves. + public float[] Paths { get; set; } + } + } +} diff --git a/src/Microsoft.ML.FastTree/TreeTrainersCatalog.cs b/src/Microsoft.ML.FastTree/TreeTrainersCatalog.cs index 73ece9adc5..23d22b6804 100644 --- a/src/Microsoft.ML.FastTree/TreeTrainersCatalog.cs +++ b/src/Microsoft.ML.FastTree/TreeTrainersCatalog.cs @@ -443,6 +443,13 @@ public static FastForestBinaryTrainer FastForest(this BinaryClassificationCatalo /// The context to create . /// The options to configure . See and /// for available settings. + /// + /// + /// + /// + /// public static PretrainedTreeFeaturizationEstimator FeaturizeByPretrainTreeEnsemble(this TransformsCatalog catalog, PretrainedTreeFeaturizationEstimator.Options options) { @@ -457,6 +464,13 @@ public static PretrainedTreeFeaturizationEstimator FeaturizeByPretrainTreeEnsemb /// The context to create . /// The options to configure . See and /// for available settings. + /// + /// + /// + /// + /// public static FastForestRegressionFeaturizationEstimator FeaturizeByFastForestRegression(this TransformsCatalog catalog, FastForestRegressionFeaturizationEstimator.Options options) { @@ -471,6 +485,13 @@ public static FastForestRegressionFeaturizationEstimator FeaturizeByFastForestRe /// The context to create . /// The options to configure . See and /// for available settings. 
+ /// + /// + /// + /// + /// public static FastTreeRegressionFeaturizationEstimator FeaturizeByFastTreeRegression(this TransformsCatalog catalog, FastTreeRegressionFeaturizationEstimator.Options options) { @@ -485,6 +506,13 @@ public static FastTreeRegressionFeaturizationEstimator FeaturizeByFastTreeRegres /// The context to create . /// The options to configure . See and /// for available settings. + /// + /// + /// + /// + /// public static FastForestBinaryFeaturizationEstimator FeaturizeByFastForestBinary(this TransformsCatalog catalog, FastForestBinaryFeaturizationEstimator.Options options) { @@ -499,6 +527,13 @@ public static FastForestBinaryFeaturizationEstimator FeaturizeByFastForestBinary /// The context to create . /// The options to configure . See and /// for available settings. + /// + /// + /// + /// + /// public static FastTreeBinaryFeaturizationEstimator FeaturizeByFastTreeBinary(this TransformsCatalog catalog, FastTreeBinaryFeaturizationEstimator.Options options) { @@ -513,6 +548,13 @@ public static FastTreeBinaryFeaturizationEstimator FeaturizeByFastTreeBinary(thi /// The context to create . /// The options to configure . See and /// for available settings. + /// + /// + /// + /// + /// public static FastTreeRankingFeaturizationEstimator FeaturizeByFastTreeRanking(this TransformsCatalog catalog, FastTreeRankingFeaturizationEstimator.Options options) { @@ -527,6 +569,13 @@ public static FastTreeRankingFeaturizationEstimator FeaturizeByFastTreeRanking(t /// The context to create . /// The options to configure . See and /// for available settings. 
+ /// + /// + /// + /// + /// public static FastTreeTweedieFeaturizationEstimator FeaturizeByFastTreeTweedie(this TransformsCatalog catalog, FastTreeTweedieFeaturizationEstimator.Options options) { From 2197391d09b0999411056199ccc339a4c758c8bd Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Thu, 6 Jun 2019 10:57:03 -0700 Subject: [PATCH 12/25] API docs --- ...ree-featurization-binary-classification.md | 13 +++ .../io-columns-tree-featurization-ranking.md | 20 ++++ ...o-columns-tree-featurization-regression.md | 14 +++ .../tree-featurization-prediction.md | 25 +++++ ...nedTreeEnsembleFeaturizationWithOptions.cs | 17 ++++ .../TreeEnsembleFeaturizationEstimator.cs | 97 ++++++++++++++++++- .../TreeEnsembleFeaturizer.cs | 13 +++ 7 files changed, 194 insertions(+), 5 deletions(-) create mode 100644 docs/api-reference/io-columns-tree-featurization-binary-classification.md create mode 100644 docs/api-reference/io-columns-tree-featurization-ranking.md create mode 100644 docs/api-reference/io-columns-tree-featurization-regression.md create mode 100644 docs/api-reference/tree-featurization-prediction.md diff --git a/docs/api-reference/io-columns-tree-featurization-binary-classification.md b/docs/api-reference/io-columns-tree-featurization-binary-classification.md new file mode 100644 index 0000000000..ba9dd8e37a --- /dev/null +++ b/docs/api-reference/io-columns-tree-featurization-binary-classification.md @@ -0,0 +1,13 @@ +### Input and Output Columns +The input column must be a known-sized vector of . + +This estimator outputs the following columns: + +| Output Column Name | Column Type | Description| +| -- | -- | -- | +| `Trees` | Vector of | The output values of all trees. | +| `Leaves` | Vector of | The IDs of all leaves where the input feature vector falls into. | +| `Paths` | Vector of | The paths the input feature vector passed through to reach the leaves. | + +Those output columns are all optional and user can change their names. 
+Please set the names of skipped columns to null so that they would not be produced. \ No newline at end of file diff --git a/docs/api-reference/io-columns-tree-featurization-ranking.md b/docs/api-reference/io-columns-tree-featurization-ranking.md new file mode 100644 index 0000000000..55b153a94a --- /dev/null +++ b/docs/api-reference/io-columns-tree-featurization-ranking.md @@ -0,0 +1,20 @@ +### Input and Output Columns +The input label data type must be [key](xref:Microsoft.ML.Data.KeyDataViewType) +type or . The value of the label determines relevance, where +higher values indicate higher relevance. If the label is a +[key](xref:Microsoft.ML.Data.KeyDataViewType) type, then the key index is the +relevance value, where the smallest index is the least relevant. If the label is a +, larger values indicate higher relevance. The feature +column must be a known-sized vector of and input row group +column must be [key](xref:Microsoft.ML.Data.KeyDataViewType) type. + +This estimator outputs the following columns: + +| Output Column Name | Column Type | Description| +| -- | -- | -- | +| `Trees` | Vector of | The output values of all trees. | +| `Leaves` | Vector of | The IDs of all leaves where the input feature vector falls into. | +| `Paths` | Vector of | The paths the input feature vector passed through to reach the leaves. | + +Those output columns are all optional and user can change their names. +Please set the names of skipped columns to null so that they would not be produced. \ No newline at end of file diff --git a/docs/api-reference/io-columns-tree-featurization-regression.md b/docs/api-reference/io-columns-tree-featurization-regression.md new file mode 100644 index 0000000000..616962559b --- /dev/null +++ b/docs/api-reference/io-columns-tree-featurization-regression.md @@ -0,0 +1,14 @@ +### Input and Output Columns +The input label column data must be . +The input features column data must be a known-sized vector of . 
+ +This estimator outputs the following columns: + +| Output Column Name | Column Type | Description| +| -- | -- | -- | +| `Trees` | Vector of | The output values of all trees. | +| `Leaves` | Vector of | The IDs of all leaves where the input feature vector falls into. | +| `Paths` | Vector of | The paths the input feature vector passed through to reach the leaves. | + +Those output columns are all optional and user can change their names. +Please set the names of skipped columns to null so that they would not be produced. \ No newline at end of file diff --git a/docs/api-reference/tree-featurization-prediction.md b/docs/api-reference/tree-featurization-prediction.md new file mode 100644 index 0000000000..6516dfef8c --- /dev/null +++ b/docs/api-reference/tree-featurization-prediction.md @@ -0,0 +1,25 @@ +### Prediction Details +This estimator produces several output columns from a tree ensemble model. Assume that the model contains only one decision tree: + + Node 0 + / \ + / \ + / \ + / \ + Node 1 Node 2 + / \ / \ + / \ / \ + / \ Leaf -3 Node 3 + Leaf -1 Leaf -2 / \ + / \ + Leaf -4 Leaf -5 + +Assume that the input feature vector falls into `Leaf -1`. The output `Trees` may be a 1-element vector where +the only value is the decision value carried by `Leaf -1`. The output `Leaves` is a 0-1 vector. If the reached +leaf is the $i$-th (indexed by $-(i+1)$ so the first leaf is `Leaf -1`) leaf in the tree, the $i$-th value in `Leaves` +would be 1 and all other values would be 0. The output `Paths` is a 0-1 representation of the nodes passed +through before reaching the leaf. The $i$-th element in `Paths` indicates if the $i$-th node (indexed by $i$) is touched. +For example, reaching `Leaf -1` lead to $[1, 1, 0, 0]$ as the `Paths`. If there are multiple trees, this estimator +just concatenates `Trees`'s, `Leaves`'s, `Paths`'s from all trees (first tree's information comes first in the concatenated vectors). + +Check the See Also section for links to usage examples. 
\ No newline at end of file diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/PretrainedTreeEnsembleFeaturizationWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/PretrainedTreeEnsembleFeaturizationWithOptions.cs index 1a464ed620..f96e595a63 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/PretrainedTreeEnsembleFeaturizationWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/PretrainedTreeEnsembleFeaturizationWithOptions.cs @@ -90,6 +90,23 @@ public static void Example() // Trees' output values: [0.4172185]. // Leave IDs' 0-1 representation: [1,0,0,0]. // Paths IDs' 0-1 representation: [1,1,1]. + // + // Note that the trained model contains only one tree. + // + // Node 0 + // / \ + // / Leaf -2 + // Node 1 + // / \ + // / Leaf -3 + // Node 2 + // / \ + // / Leaf -4 + // Leaf -1 + // + // Thus, if a data point reaches Leaf indexed by -1, its 0-1 path representation may be [1,1,1] because that data point + // went through all Node 0, Node 1, and Node 2. + } private static IEnumerable GenerateRandomDataPoints(int count, int seed=0) diff --git a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationEstimator.cs b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationEstimator.cs index 5b7fc22f3d..84b3d53d86 100644 --- a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationEstimator.cs +++ b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationEstimator.cs @@ -156,19 +156,22 @@ public SchemaShape GetOutputSchema(SchemaShape inputSchema) /// . + /// + /// The input label column data must be. + /// The input features column data must be a known-sized vector of. /// /// This estimator outputs the following columns: /// /// | Output Column Name | Column Type | Description| /// | -- | -- | -- | /// | `Trees` | Vector of | The output values of all trees. 
| - /// | `Leaves` | Vector of | The IDs of all leaves where the input feature vector falls into. | - /// | `Paths` | Vector of | The paths the input feature vector passed through to reach the leaves. | + /// | `Leaves` | Vector of | The IDs of all leaves where the input feature vector falls into. | + /// | `Paths` | Vector of | The paths the input feature vector passed through to reach the leaves. | /// - /// Those output columns are all optional. Please see the names of skipped columns to null so that they would not be produced. + /// Those output columns are all optional and user can change their names. + /// Please set the names of skipped columns to null so that they would not be produced. /// - /// Check the See Also section for links to usage examples. + /// [!include[algorithm](~/../docs/samples/docs/api-reference/tree-featurization-prediction.md)] /// ]]> /// /// @@ -201,6 +204,20 @@ internal PretrainedTreeFeaturizationEstimator(IHostEnvironment env, Options opti private protected override TreeEnsembleModelParameters PrepareModel(IDataView input) => _modelParameters; } + /// + /// A to transform input feature vector to tree-based features. + /// + /// + /// + /// + /// + /// public sealed class FastTreeBinaryFeaturizationEstimator : TreeEnsembleFeaturizationEstimatorBase { private readonly FastTreeBinaryTrainer.Options _trainerOptions; @@ -230,6 +247,20 @@ private protected override TreeEnsembleModelParameters PrepareModel(IDataView in } } + /// + /// A to transform input feature vector to tree-based features. + /// + /// + /// + /// + /// + /// public sealed class FastTreeRegressionFeaturizationEstimator : TreeEnsembleFeaturizationEstimatorBase { private readonly FastTreeRegressionTrainer.Options _trainerOptions; @@ -259,6 +290,20 @@ private protected override TreeEnsembleModelParameters PrepareModel(IDataView in } } + /// + /// A to transform input feature vector to tree-based features. 
+ /// + /// + /// + /// + /// + /// public sealed class FastForestBinaryFeaturizationEstimator : TreeEnsembleFeaturizationEstimatorBase { private readonly FastForestBinaryTrainer.Options _trainerOptions; @@ -288,6 +333,20 @@ private protected override TreeEnsembleModelParameters PrepareModel(IDataView in } } + /// + /// A to transform input feature vector to tree-based features. + /// + /// + /// + /// + /// + /// public sealed class FastForestRegressionFeaturizationEstimator : TreeEnsembleFeaturizationEstimatorBase { private readonly FastForestRegressionTrainer.Options _trainerOptions; @@ -317,6 +376,20 @@ private protected override TreeEnsembleModelParameters PrepareModel(IDataView in } } + /// + /// A to transform input feature vector to tree-based features. + /// + /// + /// + /// + /// + /// public sealed class FastTreeRankingFeaturizationEstimator : TreeEnsembleFeaturizationEstimatorBase { private readonly FastTreeRankingTrainer.Options _trainerOptions; @@ -346,6 +419,20 @@ private protected override TreeEnsembleModelParameters PrepareModel(IDataView in } } + /// + /// A to transform input feature vector to tree-based features. + /// + /// + /// + /// + /// + /// public sealed class FastTreeTweedieFeaturizationEstimator : TreeEnsembleFeaturizationEstimatorBase { private readonly FastTreeTweedieTrainer.Options _trainerOptions; diff --git a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizer.cs b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizer.cs index 0be2dadfcf..62b3c14f72 100644 --- a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizer.cs +++ b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizer.cs @@ -44,10 +44,23 @@ namespace Microsoft.ML.Data /// internal sealed class TreeEnsembleFeaturizerBindableMapper : ISchemaBindableMapper, ICanSaveModel { + /// + /// In addition to options inherited from , + /// adds output columns' names of tree-based featurizer. + /// public sealed class Arguments : ScorerArgumentsBase { + /// + /// See . 
+ /// public string TreesColumnName; + /// + /// See . + /// public string LeavesColumnName; + /// + /// See . + /// public string PathsColumnName; } From b00be932f34d7f241177875c05f18fbf39d57ffa Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Thu, 6 Jun 2019 10:59:59 -0700 Subject: [PATCH 13/25] Fix one line --- .../io-columns-tree-featurization-binary-classification.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/api-reference/io-columns-tree-featurization-binary-classification.md b/docs/api-reference/io-columns-tree-featurization-binary-classification.md index ba9dd8e37a..f822a2d734 100644 --- a/docs/api-reference/io-columns-tree-featurization-binary-classification.md +++ b/docs/api-reference/io-columns-tree-featurization-binary-classification.md @@ -1,5 +1,6 @@ ### Input and Output Columns -The input column must be a known-sized vector of . +The input label column data must be . +The input features column data must be a known-sized vector of . This estimator outputs the following columns: From bbeb17f9fce2ca2cca998a8c0ba7fd05fb835f4c Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Thu, 6 Jun 2019 12:03:05 -0700 Subject: [PATCH 14/25] Add MC test --- .../TreeEnsembleFeaturizerTest.cs | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs b/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs index 74df9bfd29..c156c51ff5 100644 --- a/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs +++ b/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs @@ -743,5 +743,44 @@ public void TestFastTreeBinaryFeaturizationInPipelineWithOptionalOutputs() Assert.True(metrics.LogLoss < 0.05); Assert.True(metrics.AreaUnderPrecisionRecallCurve > 0.98); } + + [Fact] + public void TreeEnsembleFeaturizingPipelineMulticlass() + { + int dataPointCount = 200; + var data = 
SamplesUtils.DatasetUtils.GenerateFloatLabelFloatFeatureVectorSamples(dataPointCount).ToList(); + var dataView = ML.Data.LoadFromEnumerable(data); + + var trainerOptions = new FastForestRegressionTrainer.Options + { + NumberOfThreads = 1, + NumberOfTrees = 10, + NumberOfLeaves = 4, + MinimumExampleCountPerLeaf = 10, + FeatureColumnName = "Features", + LabelColumnName = "Label", + ShuffleLabels = true + }; + + var options = new FastForestRegressionFeaturizationEstimator.Options() + { + InputColumnName = "Features", + TreesColumnName = "Trees", + LeavesColumnName = "Leaves", + PathsColumnName = "Paths", + TrainerOptions = trainerOptions + }; + + var pipeline = ML.Transforms.Conversion.Hash("HashedLabel", "Label", numberOfBits: 3). + Append(ML.Transforms.FeaturizeByFastForestRegression(options)). + Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths")). + Append(ML.MulticlassClassification.Trainers.SdcaMaximumEntropy("HashedLabel", "CombinedFeatures")); + var model = pipeline.Fit(dataView); + var prediction = model.Transform(dataView); + var metrics = ML.MulticlassClassification.Evaluate(prediction, labelColumnName: "HashedLabel"); + + Assert.True(metrics.MacroAccuracy > 0.9); + Assert.True(metrics.MicroAccuracy > 0.9); + } } } From 4906d0bb894711d21398f1de38fa1471461ab668 Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Thu, 6 Jun 2019 13:31:53 -0700 Subject: [PATCH 15/25] Extend a test further --- .../TreeEnsembleFeaturizerTest.cs | 26 +++++++++++++++---- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs b/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs index c156c51ff5..bd50929fb2 100644 --- a/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs +++ b/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs @@ -748,7 +748,7 @@ public void 
TestFastTreeBinaryFeaturizationInPipelineWithOptionalOutputs() public void TreeEnsembleFeaturizingPipelineMulticlass() { int dataPointCount = 200; - var data = SamplesUtils.DatasetUtils.GenerateFloatLabelFloatFeatureVectorSamples(dataPointCount).ToList(); + var data = SamplesUtils.DatasetUtils.GenerateRandomMulticlassClassificationExamples(dataPointCount).ToList(); var dataView = ML.Data.LoadFromEnumerable(data); var trainerOptions = new FastForestRegressionTrainer.Options @@ -758,7 +758,7 @@ public void TreeEnsembleFeaturizingPipelineMulticlass() NumberOfLeaves = 4, MinimumExampleCountPerLeaf = 10, FeatureColumnName = "Features", - LabelColumnName = "Label", + LabelColumnName = "NumericalLabel", ShuffleLabels = true }; @@ -771,16 +771,32 @@ public void TreeEnsembleFeaturizingPipelineMulticlass() TrainerOptions = trainerOptions }; - var pipeline = ML.Transforms.Conversion.Hash("HashedLabel", "Label", numberOfBits: 3). + var lookupData = new[] { + new LookupMap { Category = "AA", Value = 1.0f }, + new LookupMap { Category = "BB", Value = 2.0f }, + new LookupMap { Category = "CC", Value = 3.0f }, + new LookupMap { Category = "DD", Value = 4.0f } + }; + + var lookupIdvMap = ML.Data.LoadFromEnumerable(lookupData); + + var pipeline = ML.Transforms.Conversion.MapValueToKey("KeyLabel", "Label"). + Append(ML.Transforms.Conversion.MapValue("NumericalLabel", lookupIdvMap, lookupIdvMap.Schema["Category"], lookupIdvMap.Schema["Value"], "Label")). Append(ML.Transforms.FeaturizeByFastForestRegression(options)). Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths")). 
- Append(ML.MulticlassClassification.Trainers.SdcaMaximumEntropy("HashedLabel", "CombinedFeatures")); + Append(ML.MulticlassClassification.Trainers.SdcaMaximumEntropy("KeyLabel", "CombinedFeatures")); var model = pipeline.Fit(dataView); var prediction = model.Transform(dataView); - var metrics = ML.MulticlassClassification.Evaluate(prediction, labelColumnName: "HashedLabel"); + var metrics = ML.MulticlassClassification.Evaluate(prediction, labelColumnName: "KeyLabel"); Assert.True(metrics.MacroAccuracy > 0.9); Assert.True(metrics.MicroAccuracy > 0.9); } + + private class LookupMap + { + public float Value { get; set; } + public string Category { get; set; } + } } } From f7ab9ab9c255e6c6c18c2cec04f12a06cb98d204 Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Thu, 6 Jun 2019 14:14:02 -0700 Subject: [PATCH 16/25] Address some comments --- .../io-columns-tree-featurization-binary-classification.md | 6 +++--- .../api-reference/io-columns-tree-featurization-ranking.md | 6 +++--- .../io-columns-tree-featurization-regression.md | 6 +++--- src/Microsoft.ML.FastTree/TreeEnsembleFeaturizer.cs | 7 ++++--- 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/docs/api-reference/io-columns-tree-featurization-binary-classification.md b/docs/api-reference/io-columns-tree-featurization-binary-classification.md index f822a2d734..1fd9a68a89 100644 --- a/docs/api-reference/io-columns-tree-featurization-binary-classification.md +++ b/docs/api-reference/io-columns-tree-featurization-binary-classification.md @@ -6,9 +6,9 @@ This estimator outputs the following columns: | Output Column Name | Column Type | Description| | -- | -- | -- | -| `Trees` | Vector of | The output values of all trees. | -| `Leaves` | Vector of | The IDs of all leaves where the input feature vector falls into. | -| `Paths` | Vector of | The paths the input feature vector passed through to reach the leaves. | +| `Trees` | Known-sized vector of | The output values of all trees. 
Its size is identical to the total number of trees in the tree ensemble model. | +| `Leaves` | Known-sized vector of | 0-1 vector representation of the IDs of all leaves that the input feature vector falls into. Its size is the total number of leaves in the tree ensemble model. | +| `Paths` | Known-sized vector of | 0-1 vector representation of the paths the input feature vector passed through to reach the leaves. Its size is the number of non-leaf nodes in the tree ensemble model. | Those output columns are all optional and user can change their names. Please set the names of skipped columns to null so that they would not be produced. \ No newline at end of file diff --git a/docs/api-reference/io-columns-tree-featurization-ranking.md b/docs/api-reference/io-columns-tree-featurization-ranking.md index 55b153a94a..375ad18f9c 100644 --- a/docs/api-reference/io-columns-tree-featurization-ranking.md +++ b/docs/api-reference/io-columns-tree-featurization-ranking.md @@ -12,9 +12,9 @@ This estimator outputs the following columns: | Output Column Name | Column Type | Description| | -- | -- | -- | -| `Trees` | Vector of | The output values of all trees. | -| `Leaves` | Vector of | The IDs of all leaves where the input feature vector falls into. | -| `Paths` | Vector of | The paths the input feature vector passed through to reach the leaves. | +| `Trees` | Known-sized vector of | The output values of all trees. Its size is identical to the total number of trees in the tree ensemble model. | +| `Leaves` | Known-sized vector of | 0-1 vector representation of the IDs of all leaves that the input feature vector falls into. Its size is the total number of leaves in the tree ensemble model. | +| `Paths` | Known-sized vector of | 0-1 vector representation of the paths the input feature vector passed through to reach the leaves. Its size is the number of non-leaf nodes in the tree ensemble model. | Those output columns are all optional and user can change their names.
Please set the names of skipped columns to null so that they would not be produced. \ No newline at end of file diff --git a/docs/api-reference/io-columns-tree-featurization-regression.md b/docs/api-reference/io-columns-tree-featurization-regression.md index 616962559b..d4acf06f39 100644 --- a/docs/api-reference/io-columns-tree-featurization-regression.md +++ b/docs/api-reference/io-columns-tree-featurization-regression.md @@ -6,9 +6,9 @@ This estimator outputs the following columns: | Output Column Name | Column Type | Description| | -- | -- | -- | -| `Trees` | Vector of | The output values of all trees. | -| `Leaves` | Vector of | The IDs of all leaves where the input feature vector falls into. | -| `Paths` | Vector of | The paths the input feature vector passed through to reach the leaves. | +| `Trees` | Known-sized vector of | The output values of all trees. Its size is identical to the total number of trees in the tree ensemble model. | +| `Leaves` | Known-sized vector of | 0-1 vector representation of the IDs of all leaves that the input feature vector falls into. Its size is the total number of leaves in the tree ensemble model. | +| `Paths` | Known-sized vector of | 0-1 vector representation of the paths the input feature vector passed through to reach the leaves. Its size is the number of non-leaf nodes in the tree ensemble model. | Those output columns are all optional and user can change their names. Please set the names of skipped columns to null so that they would not be produced.
\ No newline at end of file diff --git a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizer.cs b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizer.cs index 62b3c14f72..249cd47b46 100644 --- a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizer.cs +++ b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizer.cs @@ -379,9 +379,10 @@ private static VersionInfo GetVersionInfo() return new VersionInfo( modelSignature: "TREEMAPR", // verWrittenCur: 0x00010001, // Initial - verWrittenCur: 0x00010002, // Add _defaultValueForMissing - verReadableCur: 0x00010002, - verWeCanReadBack: 0x00010001, + // verWrittenCur: 0x00010002, // Add _defaultValueForMissing + verWrittenCur: 0x00010003, // Add output column names (_treesColumnName, _leavesColumnName, _pathsColumnName) + verReadableCur: 0x00010003, + verWeCanReadBack: 0x00010002, loaderSignature: LoaderSignature, loaderAssemblyName: typeof(TreeEnsembleFeaturizerBindableMapper).Assembly.FullName); } From a8c0c6e825be198e44e3404175193322b4910e3e Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Thu, 6 Jun 2019 14:17:10 -0700 Subject: [PATCH 17/25] Address some comments --- .../TreeEnsembleFeaturizationEstimator.cs | 34 +++++++++---------- .../TreeEnsembleFeaturizationTransformer.cs | 6 ++-- .../TreeEnsembleFeaturizer.cs | 6 ++-- .../TreeTrainersCatalog.cs | 14 ++++---- 4 files changed, 30 insertions(+), 30 deletions(-) diff --git a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationEstimator.cs b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationEstimator.cs index 84b3d53d86..c7248fe3c1 100644 --- a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationEstimator.cs +++ b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationEstimator.cs @@ -23,7 +23,7 @@ public abstract class TreeEnsembleFeaturizationEstimatorBase : IEstimator, , /// , , and . /// - public class CommonOptions + public abstract class OptionsBase { /// /// The name of feature column in the when calling . @@ -58,22 +58,22 @@ public class CommonOptions }; /// - /// See . + /// See . 
/// private protected readonly string FeatureColumnName; /// - /// See . + /// See . /// private protected readonly string TreesColumnName; /// - /// See . + /// See . /// private protected readonly string LeavesColumnName; /// - /// See . + /// See . /// private protected readonly string PathsColumnName; @@ -82,14 +82,14 @@ public class CommonOptions /// private protected readonly IHostEnvironment Env; - private protected TreeEnsembleFeaturizationEstimatorBase(IHostEnvironment env, CommonOptions options) + private protected TreeEnsembleFeaturizationEstimatorBase(IHostEnvironment env, OptionsBase options) { Env = env; if (options.InputColumnName == null) throw Env.Except(nameof(options), "The " + nameof(options.InputColumnName) + " cannot be null."); if (options.TreesColumnName == null && options.LeavesColumnName == null && options.PathsColumnName == null) - throw Env.Except($"{nameof(CommonOptions.TreesColumnName)}, {nameof(CommonOptions.LeavesColumnName)}, and {nameof(CommonOptions.PathsColumnName)} cannot be all null at the same time. " + - $"At least one output column name should be provided so that at least one output column may be generated."); + throw Env.Except($"{nameof(OptionsBase.TreesColumnName)}, {nameof(OptionsBase.LeavesColumnName)}, and {nameof(OptionsBase.PathsColumnName)} cannot be all null at the same time. " + + $"At least one output column name should be provided so at least one output column may be generated."); FeatureColumnName = options.InputColumnName; TreesColumnName = options.TreesColumnName; @@ -106,7 +106,7 @@ private protected TreeEnsembleFeaturizationEstimatorBase(IHostEnvironment env, C private protected abstract TreeEnsembleModelParameters PrepareModel(IDataView input); /// - /// Produce a which maps the column called in + /// Produce a which maps the column called in /// to three output columns. 
/// public TreeEnsembleFeaturizationTransformer Fit(IDataView input) @@ -122,7 +122,7 @@ public TreeEnsembleFeaturizationTransformer Fit(IDataView input) /// vector falls into, and the paths to those leaves. /// /// A schema which contains a feature column. Note that feature column name can be specified - /// by . + /// by . /// Output produced by . public SchemaShape GetOutputSchema(SchemaShape inputSchema) { @@ -182,7 +182,7 @@ public sealed class PretrainedTreeFeaturizationEstimator : TreeEnsembleFeaturiza /// of as /// used when calling . /// - public sealed class Options : CommonOptions + public sealed class Options : OptionsBase { /// /// The pretrained tree model used to do tree-based featurization. Note that contains a collection of decision trees. @@ -225,7 +225,7 @@ public sealed class FastTreeBinaryFeaturizationEstimator : TreeEnsembleFeaturiza /// /// Options for the . /// - public sealed class Options : CommonOptions + public sealed class Options : OptionsBase { /// /// The configuration of used to train the underlying . @@ -268,7 +268,7 @@ public sealed class FastTreeRegressionFeaturizationEstimator : TreeEnsembleFeatu /// /// Options for the . /// - public sealed class Options : CommonOptions + public sealed class Options : OptionsBase { /// /// The configuration of used to train the underlying . @@ -311,7 +311,7 @@ public sealed class FastForestBinaryFeaturizationEstimator : TreeEnsembleFeaturi /// /// Options for the . /// - public sealed class Options : CommonOptions + public sealed class Options : OptionsBase { /// /// The configuration of used to train the underlying . @@ -354,7 +354,7 @@ public sealed class FastForestRegressionFeaturizationEstimator : TreeEnsembleFea /// /// Options for the . /// - public sealed class Options : CommonOptions + public sealed class Options : OptionsBase { /// /// The configuration of used to train the underlying . 
@@ -397,7 +397,7 @@ public sealed class FastTreeRankingFeaturizationEstimator : TreeEnsembleFeaturiz /// /// Options for the . /// - public sealed class Options : CommonOptions + public sealed class Options : OptionsBase { /// /// The configuration of used to train the underlying . @@ -440,7 +440,7 @@ public sealed class FastTreeTweedieFeaturizationEstimator : TreeEnsembleFeaturiz /// /// Options for the . /// - public sealed class Options : CommonOptions + public sealed class Options : OptionsBase { /// /// The configuration of used to train the underlying . diff --git a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationTransformer.cs b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationTransformer.cs index 551ca2791a..d30337b07e 100644 --- a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationTransformer.cs +++ b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizationTransformer.cs @@ -25,15 +25,15 @@ public sealed class TreeEnsembleFeaturizationTransformer : PredictionTransformer private readonly TreeEnsembleFeaturizerBindableMapper.Arguments _scorerArgs; private readonly DataViewSchema.DetachedColumn _featureDetachedColumn; /// - /// See . + /// See . /// private readonly string _treesColumnName; /// - /// See . + /// See . /// private readonly string _leavesColumnName; /// - /// See . + /// See . /// private readonly string _pathsColumnName; /// diff --git a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizer.cs b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizer.cs index 249cd47b46..d6fb9bce72 100644 --- a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizer.cs +++ b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizer.cs @@ -51,15 +51,15 @@ internal sealed class TreeEnsembleFeaturizerBindableMapper : ISchemaBindableMapp public sealed class Arguments : ScorerArgumentsBase { /// - /// See . + /// See . /// public string TreesColumnName; /// - /// See . + /// See . /// public string LeavesColumnName; /// - /// See . + /// See . 
/// public string PathsColumnName; } diff --git a/src/Microsoft.ML.FastTree/TreeTrainersCatalog.cs b/src/Microsoft.ML.FastTree/TreeTrainersCatalog.cs index 23d22b6804..cdd4f5f87f 100644 --- a/src/Microsoft.ML.FastTree/TreeTrainersCatalog.cs +++ b/src/Microsoft.ML.FastTree/TreeTrainersCatalog.cs @@ -442,7 +442,7 @@ public static FastForestBinaryTrainer FastForest(this BinaryClassificationCatalo /// /// The context to create . /// The options to configure . See and - /// for available settings. + /// for available settings. /// /// /// /// The context to create . /// The options to configure . See and - /// for available settings. + /// for available settings. /// /// /// /// The context to create . /// The options to configure . See and - /// for available settings. + /// for available settings. /// /// /// /// The context to create . /// The options to configure . See and - /// for available settings. + /// for available settings. /// /// /// /// The context to create . /// The options to configure . See and - /// for available settings. + /// for available settings. /// /// /// /// The context to create . /// The options to configure . See and - /// for available settings. + /// for available settings. /// /// /// /// The context to create . /// The options to configure . See and - /// for available settings. + /// for available settings. 
/// /// /// Date: Thu, 6 Jun 2019 14:18:52 -0700 Subject: [PATCH 18/25] Address comments --- ...inaryClassificationFeaturization.ttinclude | 16 ++++++------ ...astForestBinaryFeaturizationWithOptions.cs | 26 +++++++++---------- ...astForestBinaryFeaturizationWithOptions.tt | 10 +++---- ...orestRegressionFeaturizationWithOptions.cs | 18 ++++++------- ...orestRegressionFeaturizationWithOptions.tt | 6 ++--- .../FastTreeBinaryFeaturizationWithOptions.cs | 26 +++++++++---------- .../FastTreeBinaryFeaturizationWithOptions.tt | 10 +++---- ...FastTreeRankingFeaturizationWithOptions.cs | 26 +++++++++---------- ...FastTreeRankingFeaturizationWithOptions.tt | 10 +++---- ...tTreeRegressionFeaturizationWithOptions.cs | 18 ++++++------- ...tTreeRegressionFeaturizationWithOptions.tt | 6 ++--- ...FastTreeTweedieFeaturizationWithOptions.cs | 18 ++++++------- ...FastTreeTweedieFeaturizationWithOptions.tt | 6 ++--- .../RankingFeaturization.ttinclude | 16 ++++++------ .../RegressionFeaturization.ttinclude | 12 ++++----- 15 files changed, 112 insertions(+), 112 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/BinaryClassificationFeaturization.ttinclude b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/BinaryClassificationFeaturization.ttinclude index f88c683540..46d385ca7f 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/BinaryClassificationFeaturization.ttinclude +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/BinaryClassificationFeaturization.ttinclude @@ -39,10 +39,10 @@ namespace Samples.Dynamic.Transforms.TreeFeaturization string leavesColumnName = nameof(TransformedDataPoint.Leaves); string pathsColumnName = nameof(TransformedDataPoint.Paths); - // Define the configuration of the trainer used to train a tree-based model. + // Define the configuration of the trainer used to train a tree-based model. 
var trainerOptions = new <#=TrainerOptions#>; - // Define the tree-based featurizer's configuration. + // Define the tree-based featurizer's configuration. var options = new <#=Options#>; // Define the featurizer. @@ -51,10 +51,10 @@ namespace Samples.Dynamic.Transforms.TreeFeaturization // Train the model. var model = pipeline.Fit(dataView); - // Apply the trained transformer to the considered data set. - var transformed = model.Transform(dataView); + // Apply the trained transformer to the considered data set. + var transformed = model.Transform(dataView); - // Convert IDataView object to a list. Each element in the resulted list corresponds to a row in the IDataView. + // Convert IDataView object to a list. Each element in the resulted list corresponds to a row in the IDataView. var transformedDataPoints = mlContext.Data.CreateEnumerable(transformed, false).ToList(); // Print out the transformation of the first 3 data points. @@ -99,11 +99,11 @@ namespace Samples.Dynamic.Transforms.TreeFeaturization // Class used to capture the output of tree-base featurization. private class TransformedDataPoint : DataPoint { - // The i-th value is the output value of the i-th decision tree. + // The i-th value is the output value of the i-th decision tree. public float[] Trees { get; set; } - // The 0-1 encoding of leaves the input feature vector falls into. + // The 0-1 encoding of leaves the input feature vector falls into. public float[] Leaves { get; set; } - // The 0-1 encoding of paths the input feature vector reaches the leaves. + // The 0-1 encoding of paths the input feature vector reaches the leaves. 
public float[] Paths { get; set; } } } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestBinaryFeaturizationWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestBinaryFeaturizationWithOptions.cs index 4b58ff44d4..4e74099081 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestBinaryFeaturizationWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestBinaryFeaturizationWithOptions.cs @@ -37,7 +37,7 @@ public static void Example() string leavesColumnName = nameof(TransformedDataPoint.Leaves); string pathsColumnName = nameof(TransformedDataPoint.Paths); - // Define the configuration of the trainer used to train a tree-based model. + // Define the configuration of the trainer used to train a tree-based model. var trainerOptions = new FastForestBinaryTrainer.Options { // Create a simpler model by penalizing usage of new features. @@ -46,16 +46,16 @@ public static void Example() NumberOfTrees = 3, // Number of leaves per tree. NumberOfLeaves = 6, - // Feature column name. - FeatureColumnName = featureColumnName, - // Label column name. - LabelColumnName = labelColumnName + // Feature column name. + FeatureColumnName = featureColumnName, + // Label column name. + LabelColumnName = labelColumnName }; - // Define the tree-based featurizer's configuration. + // Define the tree-based featurizer's configuration. var options = new FastForestBinaryFeaturizationEstimator.Options { - InputColumnName = featureColumnName, + InputColumnName = featureColumnName, TreesColumnName = treesColumnName, LeavesColumnName = leavesColumnName, PathsColumnName = pathsColumnName, @@ -68,10 +68,10 @@ public static void Example() // Train the model. var model = pipeline.Fit(dataView); - // Apply the trained transformer to the considered data set. 
- var transformed = model.Transform(dataView); + // Apply the trained transformer to the considered data set. + var transformed = model.Transform(dataView); - // Convert IDataView object to a list. Each element in the resulted list corresponds to a row in the IDataView. + // Convert IDataView object to a list. Each element in the resulted list corresponds to a row in the IDataView. var transformedDataPoints = mlContext.Data.CreateEnumerable(transformed, false).ToList(); // Print out the transformation of the first 3 data points. @@ -128,11 +128,11 @@ private class DataPoint // Class used to capture the output of tree-base featurization. private class TransformedDataPoint : DataPoint { - // The i-th value is the output value of the i-th decision tree. + // The i-th value is the output value of the i-th decision tree. public float[] Trees { get; set; } - // The 0-1 encoding of leaves the input feature vector falls into. + // The 0-1 encoding of leaves the input feature vector falls into. public float[] Leaves { get; set; } - // The 0-1 encoding of paths the input feature vector reaches the leaves. + // The 0-1 encoding of paths the input feature vector reaches the leaves. public float[] Paths { get; set; } } } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestBinaryFeaturizationWithOptions.tt b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestBinaryFeaturizationWithOptions.tt index 11e49ad5df..171fd34702 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestBinaryFeaturizationWithOptions.tt +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestBinaryFeaturizationWithOptions.tt @@ -18,15 +18,15 @@ string TrainerOptions = @"FastForestBinaryTrainer.Options NumberOfTrees = 3, // Number of leaves per tree. NumberOfLeaves = 6, - // Feature column name. - FeatureColumnName = featureColumnName, - // Label column name. 
- LabelColumnName = labelColumnName + // Feature column name. + FeatureColumnName = featureColumnName, + // Label column name. + LabelColumnName = labelColumnName }"; string Options = @"FastForestBinaryFeaturizationEstimator.Options { - InputColumnName = featureColumnName, + InputColumnName = featureColumnName, TreesColumnName = treesColumnName, LeavesColumnName = leavesColumnName, PathsColumnName = pathsColumnName, diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestRegressionFeaturizationWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestRegressionFeaturizationWithOptions.cs index 359f6c1d6f..42162e40de 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestRegressionFeaturizationWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestRegressionFeaturizationWithOptions.cs @@ -37,7 +37,7 @@ public static void Example() string leavesColumnName = nameof(TransformedDataPoint.Leaves); string pathsColumnName = nameof(TransformedDataPoint.Paths); - // Define the configuration of the trainer used to train a tree-based model. + // Define the configuration of the trainer used to train a tree-based model. var trainerOptions = new FastForestRegressionTrainer.Options { // Only use 80% of features to reduce over-fitting. @@ -48,14 +48,14 @@ public static void Example() NumberOfTrees = 3, // Number of leaves per tree. NumberOfLeaves = 6, - LabelColumnName = labelColumnName, - FeatureColumnName = featureColumnName + LabelColumnName = labelColumnName, + FeatureColumnName = featureColumnName }; - // Define the tree-based featurizer's configuration. + // Define the tree-based featurizer's configuration. 
var options = new FastForestRegressionFeaturizationEstimator.Options { - InputColumnName = featureColumnName, + InputColumnName = featureColumnName, TreesColumnName = treesColumnName, LeavesColumnName = leavesColumnName, PathsColumnName = pathsColumnName, @@ -71,7 +71,7 @@ public static void Example() // Create testing data. Use different random seed to make it different from training data. var transformed = model.Transform(dataView); - // Convert IDataView object to a list. Each element in the resulted list corresponds to a row in the IDataView. + // Convert IDataView object to a list. Each element in the resulted list corresponds to a row in the IDataView. var transformedDataPoints = mlContext.Data.CreateEnumerable(transformed, false).ToList(); // Print out the transformation of the first 3 data points. @@ -126,11 +126,11 @@ private class DataPoint // Class used to capture the output of tree-base featurization. private class TransformedDataPoint : DataPoint { - // The i-th value is the output value of the i-th decision tree. + // The i-th value is the output value of the i-th decision tree. public float[] Trees { get; set; } - // The 0-1 encoding of leaves the input feature vector falls into. + // The 0-1 encoding of leaves the input feature vector falls into. public float[] Leaves { get; set; } - // The 0-1 encoding of paths the input feature vector reaches the leaves. + // The 0-1 encoding of paths the input feature vector reaches the leaves. 
public float[] Paths { get; set; } } } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestRegressionFeaturizationWithOptions.tt b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestRegressionFeaturizationWithOptions.tt index 493d46cfae..00466cc269 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestRegressionFeaturizationWithOptions.tt +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestRegressionFeaturizationWithOptions.tt @@ -18,13 +18,13 @@ string TrainerOptions = @"FastForestRegressionTrainer.Options NumberOfTrees = 3, // Number of leaves per tree. NumberOfLeaves = 6, - LabelColumnName = labelColumnName, - FeatureColumnName = featureColumnName + LabelColumnName = labelColumnName, + FeatureColumnName = featureColumnName }"; string Options = @"FastForestRegressionFeaturizationEstimator.Options { - InputColumnName = featureColumnName, + InputColumnName = featureColumnName, TreesColumnName = treesColumnName, LeavesColumnName = leavesColumnName, PathsColumnName = pathsColumnName, diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeBinaryFeaturizationWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeBinaryFeaturizationWithOptions.cs index fa34c64c90..2e646fc4cf 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeBinaryFeaturizationWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeBinaryFeaturizationWithOptions.cs @@ -37,7 +37,7 @@ public static void Example() string leavesColumnName = nameof(TransformedDataPoint.Leaves); string pathsColumnName = nameof(TransformedDataPoint.Paths); - // Define the configuration of the trainer used to train a tree-based model. + // Define the configuration of the trainer used to train a tree-based model. 
var trainerOptions = new FastTreeBinaryTrainer.Options { // Use L2Norm for early stopping. @@ -48,16 +48,16 @@ public static void Example() NumberOfTrees = 3, // Number of leaves per tree. NumberOfLeaves = 6, - // Feature column name. - FeatureColumnName = featureColumnName, - // Label column name. - LabelColumnName = labelColumnName + // Feature column name. + FeatureColumnName = featureColumnName, + // Label column name. + LabelColumnName = labelColumnName }; - // Define the tree-based featurizer's configuration. + // Define the tree-based featurizer's configuration. var options = new FastTreeBinaryFeaturizationEstimator.Options { - InputColumnName = featureColumnName, + InputColumnName = featureColumnName, TreesColumnName = treesColumnName, LeavesColumnName = leavesColumnName, PathsColumnName = pathsColumnName, @@ -70,10 +70,10 @@ public static void Example() // Train the model. var model = pipeline.Fit(dataView); - // Apply the trained transformer to the considered data set. - var transformed = model.Transform(dataView); + // Apply the trained transformer to the considered data set. + var transformed = model.Transform(dataView); - // Convert IDataView object to a list. Each element in the resulted list corresponds to a row in the IDataView. + // Convert IDataView object to a list. Each element in the resulted list corresponds to a row in the IDataView. var transformedDataPoints = mlContext.Data.CreateEnumerable(transformed, false).ToList(); // Print out the transformation of the first 3 data points. @@ -130,11 +130,11 @@ private class DataPoint // Class used to capture the output of tree-base featurization. private class TransformedDataPoint : DataPoint { - // The i-th value is the output value of the i-th decision tree. + // The i-th value is the output value of the i-th decision tree. public float[] Trees { get; set; } - // The 0-1 encoding of leaves the input feature vector falls into. + // The 0-1 encoding of leaves the input feature vector falls into. 
public float[] Leaves { get; set; } - // The 0-1 encoding of paths the input feature vector reaches the leaves. + // The 0-1 encoding of paths the input feature vector reaches the leaves. public float[] Paths { get; set; } } } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeBinaryFeaturizationWithOptions.tt b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeBinaryFeaturizationWithOptions.tt index 9f22863e1a..a26d79cf86 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeBinaryFeaturizationWithOptions.tt +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeBinaryFeaturizationWithOptions.tt @@ -19,15 +19,15 @@ string TrainerOptions = @"FastTreeBinaryTrainer.Options NumberOfTrees = 3, // Number of leaves per tree. NumberOfLeaves = 6, - // Feature column name. - FeatureColumnName = featureColumnName, - // Label column name. - LabelColumnName = labelColumnName + // Feature column name. + FeatureColumnName = featureColumnName, + // Label column name. 
+ LabelColumnName = labelColumnName }"; string Options = @"FastTreeBinaryFeaturizationEstimator.Options { - InputColumnName = featureColumnName, + InputColumnName = featureColumnName, TreesColumnName = treesColumnName, LeavesColumnName = leavesColumnName, PathsColumnName = pathsColumnName, diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRankingFeaturizationWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRankingFeaturizationWithOptions.cs index bc7fbb3f89..6a7e059870 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRankingFeaturizationWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRankingFeaturizationWithOptions.cs @@ -37,23 +37,23 @@ public static void Example() string leavesColumnName = nameof(TransformedDataPoint.Leaves); string pathsColumnName = nameof(TransformedDataPoint.Paths); - // Define the configuration of the trainer used to train a tree-based model. + // Define the configuration of the trainer used to train a tree-based model. var trainerOptions = new FastTreeRankingTrainer.Options { // Reduce the number of trees to 3. NumberOfTrees = 3, // Number of leaves per tree. NumberOfLeaves = 6, - // Feature column name. - FeatureColumnName = featureColumnName, - // Label column name. - LabelColumnName = labelColumnName + // Feature column name. + FeatureColumnName = featureColumnName, + // Label column name. + LabelColumnName = labelColumnName }; - // Define the tree-based featurizer's configuration. + // Define the tree-based featurizer's configuration. var options = new FastTreeRankingFeaturizationEstimator.Options { - InputColumnName = featureColumnName, + InputColumnName = featureColumnName, TreesColumnName = treesColumnName, LeavesColumnName = leavesColumnName, PathsColumnName = pathsColumnName, @@ -66,10 +66,10 @@ public static void Example() // Train the model. 
var model = pipeline.Fit(dataView); - // Apply the trained transformer to the considered data set. - var transformed = model.Transform(dataView); + // Apply the trained transformer to the considered data set. + var transformed = model.Transform(dataView); - // Convert IDataView object to a list. Each element in the resulted list corresponds to a row in the IDataView. + // Convert IDataView object to a list. Each element in the resulted list corresponds to a row in the IDataView. var transformedDataPoints = mlContext.Data.CreateEnumerable(transformed, false).ToList(); // Print out the transformation of the first 3 data points. @@ -130,11 +130,11 @@ private class DataPoint // Class used to capture the output of tree-base featurization. private class TransformedDataPoint : DataPoint { - // The i-th value is the output value of the i-th decision tree. + // The i-th value is the output value of the i-th decision tree. public float[] Trees { get; set; } - // The 0-1 encoding of leaves the input feature vector falls into. + // The 0-1 encoding of leaves the input feature vector falls into. public float[] Leaves { get; set; } - // The 0-1 encoding of paths the input feature vector reaches the leaves. + // The 0-1 encoding of paths the input feature vector reaches the leaves. public float[] Paths { get; set; } } } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRankingFeaturizationWithOptions.tt b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRankingFeaturizationWithOptions.tt index 3a885e0783..a3f3bf6b7d 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRankingFeaturizationWithOptions.tt +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRankingFeaturizationWithOptions.tt @@ -10,15 +10,15 @@ string TrainerOptions = @"FastTreeRankingTrainer.Options NumberOfTrees = 3, // Number of leaves per tree. 
NumberOfLeaves = 6, - // Feature column name. - FeatureColumnName = featureColumnName, - // Label column name. - LabelColumnName = labelColumnName + // Feature column name. + FeatureColumnName = featureColumnName, + // Label column name. + LabelColumnName = labelColumnName }"; string Options = @"FastTreeRankingFeaturizationEstimator.Options { - InputColumnName = featureColumnName, + InputColumnName = featureColumnName, TreesColumnName = treesColumnName, LeavesColumnName = leavesColumnName, PathsColumnName = pathsColumnName, diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRegressionFeaturizationWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRegressionFeaturizationWithOptions.cs index 0187ea8e3e..51f854a404 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRegressionFeaturizationWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRegressionFeaturizationWithOptions.cs @@ -37,7 +37,7 @@ public static void Example() string leavesColumnName = nameof(TransformedDataPoint.Leaves); string pathsColumnName = nameof(TransformedDataPoint.Paths); - // Define the configuration of the trainer used to train a tree-based model. + // Define the configuration of the trainer used to train a tree-based model. var trainerOptions = new FastTreeRegressionTrainer.Options { // Only use 80% of features to reduce over-fitting. @@ -48,14 +48,14 @@ public static void Example() NumberOfTrees = 3, // Number of leaves per tree. NumberOfLeaves = 6, - LabelColumnName = labelColumnName, - FeatureColumnName = featureColumnName + LabelColumnName = labelColumnName, + FeatureColumnName = featureColumnName }; - // Define the tree-based featurizer's configuration. + // Define the tree-based featurizer's configuration. 
var options = new FastTreeRegressionFeaturizationEstimator.Options { - InputColumnName = featureColumnName, + InputColumnName = featureColumnName, TreesColumnName = treesColumnName, LeavesColumnName = leavesColumnName, PathsColumnName = pathsColumnName, @@ -71,7 +71,7 @@ public static void Example() // Create testing data. Use different random seed to make it different from training data. var transformed = model.Transform(dataView); - // Convert IDataView object to a list. Each element in the resulted list corresponds to a row in the IDataView. + // Convert IDataView object to a list. Each element in the resulted list corresponds to a row in the IDataView. var transformedDataPoints = mlContext.Data.CreateEnumerable(transformed, false).ToList(); // Print out the transformation of the first 3 data points. @@ -126,11 +126,11 @@ private class DataPoint // Class used to capture the output of tree-base featurization. private class TransformedDataPoint : DataPoint { - // The i-th value is the output value of the i-th decision tree. + // The i-th value is the output value of the i-th decision tree. public float[] Trees { get; set; } - // The 0-1 encoding of leaves the input feature vector falls into. + // The 0-1 encoding of leaves the input feature vector falls into. public float[] Leaves { get; set; } - // The 0-1 encoding of paths the input feature vector reaches the leaves. + // The 0-1 encoding of paths the input feature vector reaches the leaves. 
public float[] Paths { get; set; } } } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRegressionFeaturizationWithOptions.tt b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRegressionFeaturizationWithOptions.tt index 298c6b2488..7d51495679 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRegressionFeaturizationWithOptions.tt +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRegressionFeaturizationWithOptions.tt @@ -18,13 +18,13 @@ string TrainerOptions = @"FastTreeRegressionTrainer.Options NumberOfTrees = 3, // Number of leaves per tree. NumberOfLeaves = 6, - LabelColumnName = labelColumnName, - FeatureColumnName = featureColumnName + LabelColumnName = labelColumnName, + FeatureColumnName = featureColumnName }"; string Options = @"FastTreeRegressionFeaturizationEstimator.Options { - InputColumnName = featureColumnName, + InputColumnName = featureColumnName, TreesColumnName = treesColumnName, LeavesColumnName = leavesColumnName, PathsColumnName = pathsColumnName, diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeTweedieFeaturizationWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeTweedieFeaturizationWithOptions.cs index 2aea92e538..9b9d9ec683 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeTweedieFeaturizationWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeTweedieFeaturizationWithOptions.cs @@ -37,7 +37,7 @@ public static void Example() string leavesColumnName = nameof(TransformedDataPoint.Leaves); string pathsColumnName = nameof(TransformedDataPoint.Paths); - // Define the configuration of the trainer used to train a tree-based model. + // Define the configuration of the trainer used to train a tree-based model. 
var trainerOptions = new FastTreeTweedieTrainer.Options { // Only use 80% of features to reduce over-fitting. @@ -48,14 +48,14 @@ public static void Example() NumberOfTrees = 3, // Number of leaves per tree. NumberOfLeaves = 6, - LabelColumnName = labelColumnName, - FeatureColumnName = featureColumnName + LabelColumnName = labelColumnName, + FeatureColumnName = featureColumnName }; - // Define the tree-based featurizer's configuration. + // Define the tree-based featurizer's configuration. var options = new FastTreeTweedieFeaturizationEstimator.Options { - InputColumnName = featureColumnName, + InputColumnName = featureColumnName, TreesColumnName = treesColumnName, LeavesColumnName = leavesColumnName, PathsColumnName = pathsColumnName, @@ -71,7 +71,7 @@ public static void Example() // Create testing data. Use different random seed to make it different from training data. var transformed = model.Transform(dataView); - // Convert IDataView object to a list. Each element in the resulted list corresponds to a row in the IDataView. + // Convert IDataView object to a list. Each element in the resulted list corresponds to a row in the IDataView. var transformedDataPoints = mlContext.Data.CreateEnumerable(transformed, false).ToList(); // Print out the transformation of the first 3 data points. @@ -126,11 +126,11 @@ private class DataPoint // Class used to capture the output of tree-base featurization. private class TransformedDataPoint : DataPoint { - // The i-th value is the output value of the i-th decision tree. + // The i-th value is the output value of the i-th decision tree. public float[] Trees { get; set; } - // The 0-1 encoding of leaves the input feature vector falls into. + // The 0-1 encoding of leaves the input feature vector falls into. public float[] Leaves { get; set; } - // The 0-1 encoding of paths the input feature vector reaches the leaves. + // The 0-1 encoding of paths the input feature vector reaches the leaves. 
public float[] Paths { get; set; } } } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeTweedieFeaturizationWithOptions.tt b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeTweedieFeaturizationWithOptions.tt index 93ce26e80c..171473ab66 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeTweedieFeaturizationWithOptions.tt +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeTweedieFeaturizationWithOptions.tt @@ -18,13 +18,13 @@ string TrainerOptions = @"FastTreeTweedieTrainer.Options NumberOfTrees = 3, // Number of leaves per tree. NumberOfLeaves = 6, - LabelColumnName = labelColumnName, - FeatureColumnName = featureColumnName + LabelColumnName = labelColumnName, + FeatureColumnName = featureColumnName }"; string Options = @"FastTreeTweedieFeaturizationEstimator.Options { - InputColumnName = featureColumnName, + InputColumnName = featureColumnName, TreesColumnName = treesColumnName, LeavesColumnName = leavesColumnName, PathsColumnName = pathsColumnName, diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/RankingFeaturization.ttinclude b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/RankingFeaturization.ttinclude index 346614b668..89d0f48741 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/RankingFeaturization.ttinclude +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/RankingFeaturization.ttinclude @@ -39,10 +39,10 @@ namespace Samples.Dynamic.Transforms.TreeFeaturization string leavesColumnName = nameof(TransformedDataPoint.Leaves); string pathsColumnName = nameof(TransformedDataPoint.Paths); - // Define the configuration of the trainer used to train a tree-based model. + // Define the configuration of the trainer used to train a tree-based model. 
var trainerOptions = new <#=TrainerOptions#>; - // Define the tree-based featurizer's configuration. + // Define the tree-based featurizer's configuration. var options = new <#=Options#>; // Define the featurizer. @@ -51,10 +51,10 @@ namespace Samples.Dynamic.Transforms.TreeFeaturization // Train the model. var model = pipeline.Fit(dataView); - // Apply the trained transformer to the considered data set. - var transformed = model.Transform(dataView); + // Apply the trained transformer to the considered data set. + var transformed = model.Transform(dataView); - // Convert IDataView object to a list. Each element in the resulted list corresponds to a row in the IDataView. + // Convert IDataView object to a list. Each element in the resulted list corresponds to a row in the IDataView. var transformedDataPoints = mlContext.Data.CreateEnumerable(transformed, false).ToList(); // Print out the transformation of the first 3 data points. @@ -103,11 +103,11 @@ namespace Samples.Dynamic.Transforms.TreeFeaturization // Class used to capture the output of tree-base featurization. private class TransformedDataPoint : DataPoint { - // The i-th value is the output value of the i-th decision tree. + // The i-th value is the output value of the i-th decision tree. public float[] Trees { get; set; } - // The 0-1 encoding of leaves the input feature vector falls into. + // The 0-1 encoding of leaves the input feature vector falls into. public float[] Leaves { get; set; } - // The 0-1 encoding of paths the input feature vector reaches the leaves. + // The 0-1 encoding of paths the input feature vector reaches the leaves. 
public float[] Paths { get; set; } } } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/RegressionFeaturization.ttinclude b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/RegressionFeaturization.ttinclude index 49fef2cc0b..a03692786f 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/RegressionFeaturization.ttinclude +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/RegressionFeaturization.ttinclude @@ -42,10 +42,10 @@ namespace Samples.Dynamic.Transforms.TreeFeaturization string leavesColumnName = nameof(TransformedDataPoint.Leaves); string pathsColumnName = nameof(TransformedDataPoint.Paths); - // Define the configuration of the trainer used to train a tree-based model. + // Define the configuration of the trainer used to train a tree-based model. var trainerOptions = new <#=TrainerOptions#>; - // Define the tree-based featurizer's configuration. + // Define the tree-based featurizer's configuration. var options = new <#=Options#>; // Define the featurizer. @@ -57,7 +57,7 @@ namespace Samples.Dynamic.Transforms.TreeFeaturization // Create testing data. Use different random seed to make it different from training data. var transformed = model.Transform(dataView); - // Convert IDataView object to a list. Each element in the resulted list corresponds to a row in the IDataView. + // Convert IDataView object to a list. Each element in the resulted list corresponds to a row in the IDataView. var transformedDataPoints = mlContext.Data.CreateEnumerable(transformed, false).ToList(); // Print out the transformation of the first 3 data points. @@ -100,11 +100,11 @@ namespace Samples.Dynamic.Transforms.TreeFeaturization // Class used to capture the output of tree-base featurization. private class TransformedDataPoint : DataPoint { - // The i-th value is the output value of the i-th decision tree. 
+ // The i-th value is the output value of the i-th decision tree. public float[] Trees { get; set; } - // The 0-1 encoding of leaves the input feature vector falls into. + // The 0-1 encoding of leaves the input feature vector falls into. public float[] Leaves { get; set; } - // The 0-1 encoding of paths the input feature vector reaches the leaves. + // The 0-1 encoding of paths the input feature vector reaches the leaves. public float[] Paths { get; set; } } } From 1f261c5e10ec71da6a1e2421ff1416c2a37d6542 Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Thu, 6 Jun 2019 15:10:01 -0700 Subject: [PATCH 19/25] Comment --- src/Microsoft.ML.FastTree/TreeEnsembleFeaturizer.cs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizer.cs b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizer.cs index d6fb9bce72..bbb69aa222 100644 --- a/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizer.cs +++ b/src/Microsoft.ML.FastTree/TreeEnsembleFeaturizer.cs @@ -425,6 +425,7 @@ public TreeEnsembleFeaturizerBindableMapper(IHostEnvironment env, ModelLoadConte Contracts.CheckValue(env, nameof(env)); _host = env.Register(LoaderSignature); _host.AssertValue(ctx); + ctx.CheckAtModel(GetVersionInfo()); // *** Binary format *** // ensemble From 241b3ad3513cb01aea3566df17085da7372c5dfe Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Thu, 6 Jun 2019 17:41:45 -0700 Subject: [PATCH 20/25] Add cache points --- .../TrainerEstimators/TreeEnsembleFeaturizerTest.cs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs b/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs index bd50929fb2..aaf55ba310 100644 --- a/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs +++ b/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs @@ -229,6 +229,7 @@ public void TestPretrainedTreeFeaturizationEstimator() int dataPointCount = 20; 
var data = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(dataPointCount).ToList(); var dataView = ML.Data.LoadFromEnumerable(data); + dataView = ML.Data.Cache(dataView); // Define a tree model whose trees will be extracted to construct a tree featurizer. var trainer = ML.BinaryClassification.Trainers.FastTree( @@ -296,6 +297,7 @@ public void TreeEnsembleFeaturizingPipeline() int dataPointCount = 200; var data = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(dataPointCount).ToList(); var dataView = ML.Data.LoadFromEnumerable(data); + dataView = ML.Data.Cache(dataView); // Define a tree model whose trees will be extracted to construct a tree featurizer. var trainer = ML.BinaryClassification.Trainers.FastTree( @@ -346,6 +348,7 @@ public void TestFastTreeBinaryFeaturizationInPipeline() int dataPointCount = 200; var data = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(dataPointCount).ToList(); var dataView = ML.Data.LoadFromEnumerable(data); + dataView = ML.Data.Cache(dataView); var trainerOptions = new FastTreeBinaryTrainer.Options { @@ -384,6 +387,7 @@ public void TestFastForestBinaryFeaturizationInPipeline() int dataPointCount = 200; var data = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(dataPointCount).ToList(); var dataView = ML.Data.LoadFromEnumerable(data); + dataView = ML.Data.Cache(dataView); var trainerOptions = new FastForestBinaryTrainer.Options { @@ -422,6 +426,7 @@ public void TestFastTreeRegressionFeaturizationInPipeline() int dataPointCount = 200; var data = SamplesUtils.DatasetUtils.GenerateFloatLabelFloatFeatureVectorSamples(dataPointCount).ToList(); var dataView = ML.Data.LoadFromEnumerable(data); + dataView = ML.Data.Cache(dataView); var trainerOptions = new FastTreeRegressionTrainer.Options { @@ -459,6 +464,7 @@ public void TestFastForestRegressionFeaturizationInPipeline() int dataPointCount = 200; var data 
= SamplesUtils.DatasetUtils.GenerateFloatLabelFloatFeatureVectorSamples(dataPointCount).ToList(); var dataView = ML.Data.LoadFromEnumerable(data); + dataView = ML.Data.Cache(dataView); var trainerOptions = new FastForestRegressionTrainer.Options { @@ -496,6 +502,7 @@ public void TestFastTreeTweedieFeaturizationInPipeline() int dataPointCount = 200; var data = SamplesUtils.DatasetUtils.GenerateFloatLabelFloatFeatureVectorSamples(dataPointCount).ToList(); var dataView = ML.Data.LoadFromEnumerable(data); + dataView = ML.Data.Cache(dataView); var trainerOptions = new FastTreeTweedieTrainer.Options { @@ -533,6 +540,7 @@ public void TestFastTreeRankingFeaturizationInPipeline() int dataPointCount = 200; var data = SamplesUtils.DatasetUtils.GenerateFloatLabelFloatFeatureVectorSamples(dataPointCount).ToList(); var dataView = ML.Data.LoadFromEnumerable(data); + dataView = ML.Data.Cache(dataView); var trainerOptions = new FastTreeRankingTrainer.Options { @@ -570,6 +578,7 @@ public void TestSaveAndLoadTreeFeaturizer() int dataPointCount = 200; var data = SamplesUtils.DatasetUtils.GenerateFloatLabelFloatFeatureVectorSamples(dataPointCount).ToList(); var dataView = ML.Data.LoadFromEnumerable(data); + dataView = ML.Data.Cache(dataView); var trainerOptions = new FastForestRegressionTrainer.Options { @@ -624,6 +633,7 @@ public void TestSaveAndLoadDoubleTreeFeaturizer() int dataPointCount = 200; var data = SamplesUtils.DatasetUtils.GenerateFloatLabelFloatFeatureVectorSamples(dataPointCount).ToList(); var dataView = ML.Data.LoadFromEnumerable(data); + dataView = ML.Data.Cache(dataView); var trainerOptions = new FastForestRegressionTrainer.Options { @@ -697,6 +707,7 @@ public void TestFastTreeBinaryFeaturizationInPipelineWithOptionalOutputs() int dataPointCount = 200; var data = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(dataPointCount).ToList(); var dataView = ML.Data.LoadFromEnumerable(data); + dataView = ML.Data.Cache(dataView); var 
trainerOptions = new FastTreeBinaryTrainer.Options { @@ -750,6 +761,7 @@ public void TreeEnsembleFeaturizingPipelineMulticlass() int dataPointCount = 200; var data = SamplesUtils.DatasetUtils.GenerateRandomMulticlassClassificationExamples(dataPointCount).ToList(); var dataView = ML.Data.LoadFromEnumerable(data); + dataView = ML.Data.Cache(dataView); var trainerOptions = new FastForestRegressionTrainer.Options { From 6850b8e8ee4faa8a4933fe58c11c3ea3ce9bba07 Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Fri, 7 Jun 2019 08:15:39 -0700 Subject: [PATCH 21/25] Update test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs Co-Authored-By: Justin Ormont --- .../TrainerEstimators/TreeEnsembleFeaturizerTest.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs b/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs index aaf55ba310..f00e36c9f4 100644 --- a/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs +++ b/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs @@ -739,7 +739,7 @@ public void TestFastTreeBinaryFeaturizationInPipelineWithOptionalOutputs() } catch { - isWrong = true; // Only "Leaves" is produced by tree featurizer, so accessing "Trees" and "Paths" may lead to an error. + isWrong = true; // Only "Leaves" is produced by the tree featurizer, so accessing "Trees" and "Paths" will lead to an error. 
} Assert.True(isWrong); From d337aa59b4666dd9c5a2ab798819fed8d8b8071b Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Fri, 7 Jun 2019 08:21:02 -0700 Subject: [PATCH 22/25] Address comment --- .../TreeEnsembleFeaturizerTest.cs | 88 +++++++++---------- 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs b/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs index aaf55ba310..b35e6469e2 100644 --- a/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs +++ b/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs @@ -323,9 +323,9 @@ public void TreeEnsembleFeaturizingPipeline() PathsColumnName = "Paths", ModelParameters = treeModel.Model.SubModel }; - var pipeline = ML.Transforms.FeaturizeByPretrainTreeEnsemble(options). - Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths")). - Append(ML.BinaryClassification.Trainers.SdcaLogisticRegression("Label", "CombinedFeatures")); + var pipeline = ML.Transforms.FeaturizeByPretrainTreeEnsemble(options) + .Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths")) + .Append(ML.BinaryClassification.Trainers.SdcaLogisticRegression("Label", "CombinedFeatures")); var model = pipeline.Fit(dataView); var prediction = model.Transform(dataView); var metrics = ML.BinaryClassification.Evaluate(prediction); @@ -369,9 +369,9 @@ public void TestFastTreeBinaryFeaturizationInPipeline() TrainerOptions = trainerOptions }; - var pipeline = ML.Transforms.FeaturizeByFastTreeBinary(options). - Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths")). 
- Append(ML.BinaryClassification.Trainers.SdcaLogisticRegression("Label", "CombinedFeatures")); + var pipeline = ML.Transforms.FeaturizeByFastTreeBinary(options) + .Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths")) + .Append(ML.BinaryClassification.Trainers.SdcaLogisticRegression("Label", "CombinedFeatures")); var model = pipeline.Fit(dataView); var prediction = model.Transform(dataView); var metrics = ML.BinaryClassification.Evaluate(prediction); @@ -408,9 +408,9 @@ public void TestFastForestBinaryFeaturizationInPipeline() TrainerOptions = trainerOptions }; - var pipeline = ML.Transforms.FeaturizeByFastForestBinary(options). - Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths")). - Append(ML.BinaryClassification.Trainers.SdcaLogisticRegression("Label", "CombinedFeatures")); + var pipeline = ML.Transforms.FeaturizeByFastForestBinary(options) + .Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths")) + .Append(ML.BinaryClassification.Trainers.SdcaLogisticRegression("Label", "CombinedFeatures")); var model = pipeline.Fit(dataView); var prediction = model.Transform(dataView); var metrics = ML.BinaryClassification.Evaluate(prediction); @@ -447,9 +447,9 @@ public void TestFastTreeRegressionFeaturizationInPipeline() TrainerOptions = trainerOptions }; - var pipeline = ML.Transforms.FeaturizeByFastTreeRegression(options). - Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths")). 
- Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures")); + var pipeline = ML.Transforms.FeaturizeByFastTreeRegression(options) + .Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths")) + .Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures")); var model = pipeline.Fit(dataView); var prediction = model.Transform(dataView); var metrics = ML.Regression.Evaluate(prediction); @@ -485,9 +485,9 @@ public void TestFastForestRegressionFeaturizationInPipeline() TrainerOptions = trainerOptions }; - var pipeline = ML.Transforms.FeaturizeByFastForestRegression(options). - Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths")). - Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures")); + var pipeline = ML.Transforms.FeaturizeByFastForestRegression(options) + .Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths")) + .Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures")); var model = pipeline.Fit(dataView); var prediction = model.Transform(dataView); var metrics = ML.Regression.Evaluate(prediction); @@ -523,9 +523,9 @@ public void TestFastTreeTweedieFeaturizationInPipeline() TrainerOptions = trainerOptions }; - var pipeline = ML.Transforms.FeaturizeByFastTreeTweedie(options). - Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths")). 
- Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures")); + var pipeline = ML.Transforms.FeaturizeByFastTreeTweedie(options) + .Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths")) + .Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures")); var model = pipeline.Fit(dataView); var prediction = model.Transform(dataView); var metrics = ML.Regression.Evaluate(prediction); @@ -561,9 +561,9 @@ public void TestFastTreeRankingFeaturizationInPipeline() TrainerOptions = trainerOptions }; - var pipeline = ML.Transforms.FeaturizeByFastTreeRanking(options). - Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths")). - Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures")); + var pipeline = ML.Transforms.FeaturizeByFastTreeRanking(options) + .Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths")) + .Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures")); var model = pipeline.Fit(dataView); var prediction = model.Transform(dataView); var metrics = ML.Regression.Evaluate(prediction); @@ -599,9 +599,9 @@ public void TestSaveAndLoadTreeFeaturizer() TrainerOptions = trainerOptions }; - var pipeline = ML.Transforms.FeaturizeByFastForestRegression(options). - Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths")). 
- Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures")); + var pipeline = ML.Transforms.FeaturizeByFastForestRegression(options) + .Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths")) + .Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures")); var model = pipeline.Fit(dataView); var prediction = model.Transform(dataView); var metrics = ML.Regression.Evaluate(prediction); @@ -655,10 +655,10 @@ public void TestSaveAndLoadDoubleTreeFeaturizer() PathsColumnName = "OhMyPaths" }; - var pipeline = ML.Transforms.CopyColumns("CopiedFeatures", "Features"). - Append(ML.Transforms.FeaturizeByFastForestRegression(options)). - Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "OhMyTrees", "OhMyLeaves", "OhMyPaths")). - Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures")); + var pipeline = ML.Transforms.CopyColumns("CopiedFeatures", "Features") + .Append(ML.Transforms.FeaturizeByFastForestRegression(options)) + .Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "OhMyTrees", "OhMyLeaves", "OhMyPaths")) + .Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures")); var model = pipeline.Fit(dataView); var prediction = model.Transform(dataView); var metrics = ML.Regression.Evaluate(prediction); @@ -686,11 +686,11 @@ public void TestSaveAndLoadDoubleTreeFeaturizer() Assert.Equal(metrics.MeanAbsoluteError, loadedMetrics.MeanAbsoluteError); Assert.Equal(metrics.MeanSquaredError, loadedMetrics.MeanSquaredError); - var secondPipeline = ML.Transforms.CopyColumns("CopiedFeatures", "Features"). - Append(ML.Transforms.NormalizeBinning("CopiedFeatures")). - Append(ML.Transforms.FeaturizeByFastForestRegression(options)). - Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "OhMyTrees", "OhMyLeaves", "OhMyPaths")). 
- Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures")); + var secondPipeline = ML.Transforms.CopyColumns("CopiedFeatures", "Features") + .Append(ML.Transforms.NormalizeBinning("CopiedFeatures")) + .Append(ML.Transforms.FeaturizeByFastForestRegression(options)) + .Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "OhMyTrees", "OhMyLeaves", "OhMyPaths")) + .Append(ML.Regression.Trainers.Sdca("Label", "CombinedFeatures")); var secondModel = secondPipeline.Fit(dataView); var secondPrediction = secondModel.Transform(dataView); var secondMetrics = ML.Regression.Evaluate(secondPrediction); @@ -732,9 +732,9 @@ public void TestFastTreeBinaryFeaturizationInPipelineWithOptionalOutputs() bool isWrong = false; try { - var wrongPipeline = ML.Transforms.FeaturizeByFastTreeBinary(options). - Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths")). - Append(ML.BinaryClassification.Trainers.SdcaLogisticRegression("Label", "CombinedFeatures")); + var wrongPipeline = ML.Transforms.FeaturizeByFastTreeBinary(options) + .Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths")) + .Append(ML.BinaryClassification.Trainers.SdcaLogisticRegression("Label", "CombinedFeatures")); var wrongModel = wrongPipeline.Fit(dataView); } catch @@ -743,9 +743,9 @@ public void TestFastTreeBinaryFeaturizationInPipelineWithOptionalOutputs() } Assert.True(isWrong); - var pipeline = ML.Transforms.FeaturizeByFastTreeBinary(options). - Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Leaves")). 
- Append(ML.BinaryClassification.Trainers.SdcaLogisticRegression("Label", "CombinedFeatures")); + var pipeline = ML.Transforms.FeaturizeByFastTreeBinary(options) + .Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Leaves")) + .Append(ML.BinaryClassification.Trainers.SdcaLogisticRegression("Label", "CombinedFeatures")); var model = pipeline.Fit(dataView); var prediction = model.Transform(dataView); var metrics = ML.BinaryClassification.Evaluate(prediction); @@ -792,11 +792,11 @@ public void TreeEnsembleFeaturizingPipelineMulticlass() var lookupIdvMap = ML.Data.LoadFromEnumerable(lookupData); - var pipeline = ML.Transforms.Conversion.MapValueToKey("KeyLabel", "Label"). - Append(ML.Transforms.Conversion.MapValue("NumericalLabel", lookupIdvMap, lookupIdvMap.Schema["Category"], lookupIdvMap.Schema["Value"], "Label")). - Append(ML.Transforms.FeaturizeByFastForestRegression(options)). - Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths")). 
- Append(ML.MulticlassClassification.Trainers.SdcaMaximumEntropy("KeyLabel", "CombinedFeatures")); + var pipeline = ML.Transforms.Conversion.MapValueToKey("KeyLabel", "Label") + .Append(ML.Transforms.Conversion.MapValue("NumericalLabel", lookupIdvMap, lookupIdvMap.Schema["Category"], lookupIdvMap.Schema["Value"], "Label")) + .Append(ML.Transforms.FeaturizeByFastForestRegression(options)) + .Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths")) + .Append(ML.MulticlassClassification.Trainers.SdcaMaximumEntropy("KeyLabel", "CombinedFeatures")); var model = pipeline.Fit(dataView); var prediction = model.Transform(dataView); var metrics = ML.MulticlassClassification.Evaluate(prediction, labelColumnName: "KeyLabel"); From 7b2d65479a857d93b0b4dea56cef6bca86d193d6 Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Tue, 11 Jun 2019 08:40:20 -0700 Subject: [PATCH 23/25] Add Justin's test --- .../TreeEnsembleFeaturizerTest.cs | 51 +++++++++++-------- 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs b/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs index bd50929fb2..5486e5b24d 100644 --- a/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs +++ b/test/Microsoft.ML.Tests/TrainerEstimators/TreeEnsembleFeaturizerTest.cs @@ -744,12 +744,17 @@ public void TestFastTreeBinaryFeaturizationInPipelineWithOptionalOutputs() Assert.True(metrics.AreaUnderPrecisionRecallCurve > 0.98); } + /// + /// Apply tree-based featurization on multiclass classification by converting key-typed labels to floats and training + /// a regression tree model for featurization. 
+ /// [Fact] public void TreeEnsembleFeaturizingPipelineMulticlass() { - int dataPointCount = 200; + int dataPointCount = 1000; var data = SamplesUtils.DatasetUtils.GenerateRandomMulticlassClassificationExamples(dataPointCount).ToList(); var dataView = ML.Data.LoadFromEnumerable(data); + dataView = ML.Data.Cache(dataView); var trainerOptions = new FastForestRegressionTrainer.Options { @@ -758,7 +763,7 @@ public void TreeEnsembleFeaturizingPipelineMulticlass() NumberOfLeaves = 4, MinimumExampleCountPerLeaf = 10, FeatureColumnName = "Features", - LabelColumnName = "NumericalLabel", + LabelColumnName = "FloatLabel", ShuffleLabels = true }; @@ -771,32 +776,38 @@ public void TreeEnsembleFeaturizingPipelineMulticlass() TrainerOptions = trainerOptions }; - var lookupData = new[] { - new LookupMap { Category = "AA", Value = 1.0f }, - new LookupMap { Category = "BB", Value = 2.0f }, - new LookupMap { Category = "CC", Value = 3.0f }, - new LookupMap { Category = "DD", Value = 4.0f } + Action actionConvertKeyToFloat = (RowWithKey rowWithKey, RowWithFloat rowWithFloat) => + { + rowWithFloat.FloatLabel = rowWithKey.KeyLabel == 0 ? float.NaN : rowWithKey.KeyLabel - 1; }; - var lookupIdvMap = ML.Data.LoadFromEnumerable(lookupData); + var split = ML.Data.TrainTestSplit(dataView, 0.5); + var trainData = split.TrainSet; + var testData = split.TestSet; - var pipeline = ML.Transforms.Conversion.MapValueToKey("KeyLabel", "Label"). - Append(ML.Transforms.Conversion.MapValue("NumericalLabel", lookupIdvMap, lookupIdvMap.Schema["Category"], lookupIdvMap.Schema["Value"], "Label")). - Append(ML.Transforms.FeaturizeByFastForestRegression(options)). - Append(ML.Transforms.Concatenate("CombinedFeatures", "Features", "Trees", "Leaves", "Paths")). 
- Append(ML.MulticlassClassification.Trainers.SdcaMaximumEntropy("KeyLabel", "CombinedFeatures")); - var model = pipeline.Fit(dataView); - var prediction = model.Transform(dataView); + var pipeline = ML.Transforms.Conversion.MapValueToKey("KeyLabel", "Label") + .Append(ML.Transforms.CustomMapping(actionConvertKeyToFloat, "KeyLabel")) + .Append(ML.Transforms.FeaturizeByFastForestRegression(options)) + .Append(ML.Transforms.Concatenate("CombinedFeatures", "Trees", "Leaves", "Paths")) + .Append(ML.MulticlassClassification.Trainers.SdcaMaximumEntropy("KeyLabel", "CombinedFeatures")); + + var model = pipeline.Fit(trainData); + var prediction = model.Transform(testData); var metrics = ML.MulticlassClassification.Evaluate(prediction, labelColumnName: "KeyLabel"); - Assert.True(metrics.MacroAccuracy > 0.9); - Assert.True(metrics.MicroAccuracy > 0.9); + Assert.True(metrics.MacroAccuracy > 0.6); + Assert.True(metrics.MicroAccuracy > 0.6); + } + + private class RowWithKey + { + [KeyType()] + public uint KeyLabel { get; set; } } - private class LookupMap + private class RowWithFloat { - public float Value { get; set; } - public string Category { get; set; } + public float FloatLabel { get; set; } } } } From d1d6813128cd1d21643accabae9f28ece7be3cad Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Tue, 11 Jun 2019 09:17:38 -0700 Subject: [PATCH 24/25] Reduce sample size --- .../BinaryClassificationFeaturization.ttinclude | 2 +- .../FastForestBinaryFeaturizationWithOptions.cs | 2 +- .../FastForestRegressionFeaturizationWithOptions.cs | 2 +- .../FastTreeBinaryFeaturizationWithOptions.cs | 2 +- .../FastTreeRankingFeaturizationWithOptions.cs | 2 +- .../FastTreeRegressionFeaturizationWithOptions.cs | 2 +- .../FastTreeTweedieFeaturizationWithOptions.cs | 2 +- .../TreeFeaturization/RankingFeaturization.ttinclude | 2 +- .../TreeFeaturization/RegressionFeaturization.ttinclude | 2 +- docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj | 4 ++-- 10 files changed, 11 
insertions(+), 11 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/BinaryClassificationFeaturization.ttinclude b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/BinaryClassificationFeaturization.ttinclude index 46d385ca7f..ec5507c3a7 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/BinaryClassificationFeaturization.ttinclude +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/BinaryClassificationFeaturization.ttinclude @@ -19,7 +19,7 @@ namespace Samples.Dynamic.Transforms.TreeFeaturization var mlContext = new MLContext(seed: 0); // Create a list of data points to be transformed. - var dataPoints = GenerateRandomDataPoints(1000).ToList(); + var dataPoints = GenerateRandomDataPoints(100).ToList(); // Convert the list of data points to an IDataView object, which is consumable by ML.NET API. var dataView = mlContext.Data.LoadFromEnumerable(dataPoints); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestBinaryFeaturizationWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestBinaryFeaturizationWithOptions.cs index 4e74099081..a23fcdc64a 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestBinaryFeaturizationWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestBinaryFeaturizationWithOptions.cs @@ -19,7 +19,7 @@ public static void Example() var mlContext = new MLContext(seed: 0); // Create a list of data points to be transformed. - var dataPoints = GenerateRandomDataPoints(1000).ToList(); + var dataPoints = GenerateRandomDataPoints(100).ToList(); // Convert the list of data points to an IDataView object, which is consumable by ML.NET API. 
var dataView = mlContext.Data.LoadFromEnumerable(dataPoints); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestRegressionFeaturizationWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestRegressionFeaturizationWithOptions.cs index 42162e40de..f4e9193849 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestRegressionFeaturizationWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestRegressionFeaturizationWithOptions.cs @@ -19,7 +19,7 @@ public static void Example() var mlContext = new MLContext(seed: 0); // Create a list of training data points. - var dataPoints = GenerateRandomDataPoints(1000).ToList(); + var dataPoints = GenerateRandomDataPoints(100).ToList(); // Convert the list of data points to an IDataView object, which is consumable by ML.NET API. var dataView = mlContext.Data.LoadFromEnumerable(dataPoints); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeBinaryFeaturizationWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeBinaryFeaturizationWithOptions.cs index 2e646fc4cf..65c8ac097f 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeBinaryFeaturizationWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeBinaryFeaturizationWithOptions.cs @@ -19,7 +19,7 @@ public static void Example() var mlContext = new MLContext(seed: 0); // Create a list of data points to be transformed. - var dataPoints = GenerateRandomDataPoints(1000).ToList(); + var dataPoints = GenerateRandomDataPoints(100).ToList(); // Convert the list of data points to an IDataView object, which is consumable by ML.NET API. 
var dataView = mlContext.Data.LoadFromEnumerable(dataPoints); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRankingFeaturizationWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRankingFeaturizationWithOptions.cs index 6a7e059870..58392815e1 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRankingFeaturizationWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRankingFeaturizationWithOptions.cs @@ -19,7 +19,7 @@ public static void Example() var mlContext = new MLContext(seed: 0); // Create a list of training data points. - var dataPoints = GenerateRandomDataPoints(1000).ToList(); + var dataPoints = GenerateRandomDataPoints(100).ToList(); // Convert the list of data points to an IDataView object, which is consumable by ML.NET API. var dataView = mlContext.Data.LoadFromEnumerable(dataPoints); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRegressionFeaturizationWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRegressionFeaturizationWithOptions.cs index 51f854a404..4fb0be9658 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRegressionFeaturizationWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRegressionFeaturizationWithOptions.cs @@ -19,7 +19,7 @@ public static void Example() var mlContext = new MLContext(seed: 0); // Create a list of training data points. - var dataPoints = GenerateRandomDataPoints(1000).ToList(); + var dataPoints = GenerateRandomDataPoints(100).ToList(); // Convert the list of data points to an IDataView object, which is consumable by ML.NET API. 
var dataView = mlContext.Data.LoadFromEnumerable(dataPoints); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeTweedieFeaturizationWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeTweedieFeaturizationWithOptions.cs index 9b9d9ec683..952b129dd2 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeTweedieFeaturizationWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeTweedieFeaturizationWithOptions.cs @@ -19,7 +19,7 @@ public static void Example() var mlContext = new MLContext(seed: 0); // Create a list of training data points. - var dataPoints = GenerateRandomDataPoints(1000).ToList(); + var dataPoints = GenerateRandomDataPoints(100).ToList(); // Convert the list of data points to an IDataView object, which is consumable by ML.NET API. var dataView = mlContext.Data.LoadFromEnumerable(dataPoints); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/RankingFeaturization.ttinclude b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/RankingFeaturization.ttinclude index 89d0f48741..16d6858c91 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/RankingFeaturization.ttinclude +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/RankingFeaturization.ttinclude @@ -19,7 +19,7 @@ namespace Samples.Dynamic.Transforms.TreeFeaturization var mlContext = new MLContext(seed: 0); // Create a list of training data points. - var dataPoints = GenerateRandomDataPoints(1000).ToList(); + var dataPoints = GenerateRandomDataPoints(100).ToList(); // Convert the list of data points to an IDataView object, which is consumable by ML.NET API. 
var dataView = mlContext.Data.LoadFromEnumerable(dataPoints); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/RegressionFeaturization.ttinclude b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/RegressionFeaturization.ttinclude index a03692786f..28ee91ffaf 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/RegressionFeaturization.ttinclude +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/RegressionFeaturization.ttinclude @@ -22,7 +22,7 @@ namespace Samples.Dynamic.Transforms.TreeFeaturization var mlContext = new MLContext(seed: 0); // Create a list of training data points. - var dataPoints = GenerateRandomDataPoints(1000).ToList(); + var dataPoints = GenerateRandomDataPoints(100).ToList(); // Convert the list of data points to an IDataView object, which is consumable by ML.NET API. var dataView = mlContext.Data.LoadFromEnumerable(dataPoints); diff --git a/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj b/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj index 8e1002eda2..6793313c63 100644 --- a/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj +++ b/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj @@ -453,12 +453,12 @@ FastForestBinaryFeaturizationWithOptions.cs - FastForestRegressionFeaturizationWithOptions.cs TextTemplatingFileGenerator + FastForestRegressionFeaturizationWithOptions.cs - FastTreeBinaryFeaturizationWithOptions.cs TextTemplatingFileGenerator + FastTreeBinaryFeaturizationWithOptions.cs TextTemplatingFileGenerator From cc2d531c96d61e7fc7bebe89e28e658283684b7d Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Tue, 11 Jun 2019 14:57:36 -0700 Subject: [PATCH 25/25] Update sample output --- ...astForestBinaryFeaturizationWithOptions.cs | 18 +++++++------- ...astForestBinaryFeaturizationWithOptions.tt | 18 +++++++------- ...orestRegressionFeaturizationWithOptions.cs | 24 
+++++++++---------- ...orestRegressionFeaturizationWithOptions.tt | 24 +++++++++---------- .../FastTreeBinaryFeaturizationWithOptions.cs | 24 +++++++++---------- .../FastTreeBinaryFeaturizationWithOptions.tt | 24 +++++++++---------- ...FastTreeRankingFeaturizationWithOptions.cs | 18 +++++++------- ...FastTreeRankingFeaturizationWithOptions.tt | 18 +++++++------- ...tTreeRegressionFeaturizationWithOptions.cs | 18 +++++++------- ...tTreeRegressionFeaturizationWithOptions.tt | 18 +++++++------- ...FastTreeTweedieFeaturizationWithOptions.cs | 18 +++++++------- ...FastTreeTweedieFeaturizationWithOptions.tt | 18 +++++++------- 12 files changed, 120 insertions(+), 120 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestBinaryFeaturizationWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestBinaryFeaturizationWithOptions.cs index a23fcdc64a..6c9b100fa4 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestBinaryFeaturizationWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestBinaryFeaturizationWithOptions.cs @@ -87,17 +87,17 @@ public static void Example() // Expected output: // The original feature vector [0.8173254,0.7680227,0.5581612] is transformed to three different tree-based feature vectors: - // Trees' output values: [-0.3333333,0.005309734]. - // Leave IDs' 0-1 representation: [0,0,0,0,0,1,0,0,0,1,0,0]. - // Paths IDs' 0-1 representation: [1,1,1,1,1,1,1,1,1,1]. + // Trees' output values: [0.1111111,0.8823529]. + // Leave IDs' 0-1 representation: [0,0,0,0,1,0,0,0,0,1,0]. + // Paths IDs' 0-1 representation: [1,1,1,1,1,1,0,1,0]. // The original feature vector [0.5888848,0.9360271,0.4721779] is transformed to three different tree-based feature vectors: - // Trees' output values: [-0.02077151,0.005309734]. - // Leave IDs' 0-1 representation: [0,1,0,0,0,0,0,0,0,1,0,0]. 
- // Paths IDs' 0-1 representation: [1,1,1,1,1,1,1,1,1,1]. + // Trees' output values: [0.4545455,0.8]. + // Leave IDs' 0-1 representation: [0,0,0,1,0,0,0,0,0,0,1]. + // Paths IDs' 0-1 representation: [1,1,1,1,0,1,0,1,1]. // The original feature vector [0.2737045,0.2919063,0.4673147] is transformed to three different tree-based feature vectors: - // Trees' output values: [-0.02077151,0.005309734]. - // Leave IDs' 0-1 representation: [0,1,0,0,0,0,0,0,0,1,0,0]. - // Paths IDs' 0-1 representation: [1,1,1,1,1,1,1,1,1,1]. + // Trees' output values: [0.4545455,0.1111111]. + // Leave IDs' 0-1 representation: [0,0,0,1,0,0,1,0,0,0,0]. + // Paths IDs' 0-1 representation: [1,1,1,1,0,1,0,1,1]. } private static IEnumerable GenerateRandomDataPoints(int count, int seed=0) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestBinaryFeaturizationWithOptions.tt b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestBinaryFeaturizationWithOptions.tt index 171fd34702..a2640ea95e 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestBinaryFeaturizationWithOptions.tt +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestBinaryFeaturizationWithOptions.tt @@ -35,15 +35,15 @@ string Options = @"FastForestBinaryFeaturizationEstimator.Options string ExpectedOutput = @"// Expected output: // The original feature vector [0.8173254,0.7680227,0.5581612] is transformed to three different tree-based feature vectors: - // Trees' output values: [-0.3333333,0.005309734]. - // Leave IDs' 0-1 representation: [0,0,0,0,0,1,0,0,0,1,0,0]. - // Paths IDs' 0-1 representation: [1,1,1,1,1,1,1,1,1,1]. + // Trees' output values: [0.1111111,0.8823529]. + // Leave IDs' 0-1 representation: [0,0,0,0,1,0,0,0,0,1,0]. + // Paths IDs' 0-1 representation: [1,1,1,1,1,1,0,1,0]. 
// The original feature vector [0.5888848,0.9360271,0.4721779] is transformed to three different tree-based feature vectors: - // Trees' output values: [-0.02077151,0.005309734]. - // Leave IDs' 0-1 representation: [0,1,0,0,0,0,0,0,0,1,0,0]. - // Paths IDs' 0-1 representation: [1,1,1,1,1,1,1,1,1,1]. + // Trees' output values: [0.4545455,0.8]. + // Leave IDs' 0-1 representation: [0,0,0,1,0,0,0,0,0,0,1]. + // Paths IDs' 0-1 representation: [1,1,1,1,0,1,0,1,1]. // The original feature vector [0.2737045,0.2919063,0.4673147] is transformed to three different tree-based feature vectors: - // Trees' output values: [-0.02077151,0.005309734]. - // Leave IDs' 0-1 representation: [0,1,0,0,0,0,0,0,0,1,0,0]. - // Paths IDs' 0-1 representation: [1,1,1,1,1,1,1,1,1,1]."; + // Trees' output values: [0.4545455,0.1111111]. + // Leave IDs' 0-1 representation: [0,0,0,1,0,0,1,0,0,0,0]. + // Paths IDs' 0-1 representation: [1,1,1,1,0,1,0,1,1]."; #> \ No newline at end of file diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestRegressionFeaturizationWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestRegressionFeaturizationWithOptions.cs index f4e9193849..ca6c5a27c8 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestRegressionFeaturizationWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestRegressionFeaturizationWithOptions.cs @@ -86,18 +86,18 @@ public static void Example() } // Expected output: - // The original feature vector [1.543569,1.494266,1.284405] is transformed to three different tree-based feature vectors: - // Trees' output values: [0.6001529,0.8102381,0.7916333]. - // Leave IDs' 0-1 representation: [0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0]. - // Paths IDs' 0-1 representation: [1,1,0,0,1,1,1,0,1,1,1,1,0,1,0]. 
- // The original feature vector [0.764918,1.11206,0.648211] is transformed to three different tree-based feature vectors: - // Trees' output values: [0.329645,0.4225699,0.4536894]. - // Leave IDs' 0-1 representation: [0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0]. - // Paths IDs' 0-1 representation: [1,0,1,0,0,1,1,0,0,0,1,1,0,0,0]. - // The original feature vector [1.251254,1.269456,1.444864] is transformed to three different tree-based feature vectors: - // Trees' output values: [0.7266843,0.6299202,0.7916333]. - // Leave IDs' 0-1 representation: [0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0]. - // Paths IDs' 0-1 representation: [1,1,0,1,0,1,1,0,1,1,1,1,0,1,0]. + // The original feature vector[1.543569, 1.494266, 1.284405] is transformed to three different tree - based feature vectors: + // Trees' output values: [0.7291142,0.7825329,0.8764582]. + // Leave IDs' 0-1 representation: [0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0]. + // Paths IDs' 0-1 representation: [1,0,0,0,1,1,1,0,0,1,1,1,0,1]. + // The original feature vector[0.764918, 1.11206, 0.648211] is transformed to three different tree - based feature vectors: + // Trees' output values: [0.3802337,0.584159,0.5648927]. + // Leave IDs' 0-1 representation: [0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0]. + // Paths IDs' 0-1 representation: [1,1,1,0,0,1,1,0,0,0,1,1,0,0]. + // The original feature vector[1.251254, 1.269456, 1.444864] is transformed to three different tree - based feature vectors: + // Trees' output values: [0.7591804,0.7825329,0.7443035]. + // Leave IDs' 0-1 representation: [0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1]. + // Paths IDs' 0-1 representation: [1,0,0,0,1,1,1,0,0,1,1,1,0,1]. 
} private static IEnumerable GenerateRandomDataPoints(int count, int seed=0) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestRegressionFeaturizationWithOptions.tt b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestRegressionFeaturizationWithOptions.tt index 00466cc269..1d949629d4 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestRegressionFeaturizationWithOptions.tt +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastForestRegressionFeaturizationWithOptions.tt @@ -32,16 +32,16 @@ string Options = @"FastForestRegressionFeaturizationEstimator.Options }"; string ExpectedOutput = @"// Expected output: - // The original feature vector [1.543569,1.494266,1.284405] is transformed to three different tree-based feature vectors: - // Trees' output values: [0.6001529,0.8102381,0.7916333]. - // Leave IDs' 0-1 representation: [0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0]. - // Paths IDs' 0-1 representation: [1,1,0,0,1,1,1,0,1,1,1,1,0,1,0]. - // The original feature vector [0.764918,1.11206,0.648211] is transformed to three different tree-based feature vectors: - // Trees' output values: [0.329645,0.4225699,0.4536894]. - // Leave IDs' 0-1 representation: [0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0]. - // Paths IDs' 0-1 representation: [1,0,1,0,0,1,1,0,0,0,1,1,0,0,0]. - // The original feature vector [1.251254,1.269456,1.444864] is transformed to three different tree-based feature vectors: - // Trees' output values: [0.7266843,0.6299202,0.7916333]. - // Leave IDs' 0-1 representation: [0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0]. - // Paths IDs' 0-1 representation: [1,1,0,1,0,1,1,0,1,1,1,1,0,1,0]."; + // The original feature vector[1.543569, 1.494266, 1.284405] is transformed to three different tree - based feature vectors: + // Trees' output values: [0.7291142,0.7825329,0.8764582]. + // Leave IDs' 0-1 representation: [0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0]. 
+ // Paths IDs' 0-1 representation: [1,0,0,0,1,1,1,0,0,1,1,1,0,1]. + // The original feature vector[0.764918, 1.11206, 0.648211] is transformed to three different tree - based feature vectors: + // Trees' output values: [0.3802337,0.584159,0.5648927]. + // Leave IDs' 0-1 representation: [0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0]. + // Paths IDs' 0-1 representation: [1,1,1,0,0,1,1,0,0,0,1,1,0,0]. + // The original feature vector[1.251254, 1.269456, 1.444864] is transformed to three different tree - based feature vectors: + // Trees' output values: [0.7591804,0.7825329,0.7443035]. + // Leave IDs' 0-1 representation: [0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1]. + // Paths IDs' 0-1 representation: [1,0,0,0,1,1,1,0,0,1,1,1,0,1]."; #> \ No newline at end of file diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeBinaryFeaturizationWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeBinaryFeaturizationWithOptions.cs index 65c8ac097f..c8c52e1490 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeBinaryFeaturizationWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeBinaryFeaturizationWithOptions.cs @@ -88,18 +88,18 @@ public static void Example() } // Expected output: - // The original feature vector [0.8173254,0.7680227,0.5581612] is transformed to three different tree-based feature vectors: - // Trees' output values: [-0.02519168,0.0004318157,-0.0002457525]. - // Leave IDs' 0-1 representation: [0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1]. - // Paths IDs' 0-1 representation: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]. - // The original feature vector [0.5888848,0.9360271,0.4721779] is transformed to three different tree-based feature vectors: - // Trees' output values: [-0.02519168,0.0004318157,-0.0002457525]. - // Leave IDs' 0-1 representation: [0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1]. 
- // Paths IDs' 0-1 representation: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]. - // The original feature vector [0.2737045,0.2919063,0.4673147] is transformed to three different tree-based feature vectors: - // Trees' output values: [-0.02519168,0.0004318157,-0.0002457525]. - // Leave IDs' 0-1 representation: [0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1]. - // Paths IDs' 0-1 representation: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]. + // The original feature vector [0.8173254,0.7680227,0.5581612] is transformed to three different tree-based feature vectors: + // Trees' output values: [0.5714286,0.4636412,0.535588]. + // Leave IDs' 0-1 representation: [0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1]. + // Paths IDs' 0-1 representation: [1,0,0,1,1,1,0,1,0,1,1,1,1,1,1]. + // The original feature vector [0.5888848,0.9360271,0.4721779] is transformed to three different tree-based feature vectors: + // Trees' output values: [0.2352941,-0.1382389,0.535588]. + // Leave IDs' 0-1 representation: [0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1]. + // Paths IDs' 0-1 representation: [1,0,0,1,1,1,0,1,0,1,1,1,1,1,1]. + // The original feature vector [0.2737045,0.2919063,0.4673147] is transformed to three different tree-based feature vectors: + // Trees' output values: [0.2352941,-0.1382389,-0.2184284]. + // Leave IDs' 0-1 representation: [0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0]. + // Paths IDs' 0-1 representation: [1,0,0,1,1,1,0,1,0,1,1,1,0,0,0]. 
} private static IEnumerable GenerateRandomDataPoints(int count, int seed=0) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeBinaryFeaturizationWithOptions.tt b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeBinaryFeaturizationWithOptions.tt index a26d79cf86..ec055986d5 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeBinaryFeaturizationWithOptions.tt +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeBinaryFeaturizationWithOptions.tt @@ -42,16 +42,16 @@ string ExpectedOutputPerInstance= @"// Expected output: // Label: False, Prediction: False"; string ExpectedOutput = @"// Expected output: - // The original feature vector [0.8173254,0.7680227,0.5581612] is transformed to three different tree-based feature vectors: - // Trees' output values: [-0.02519168,0.0004318157,-0.0002457525]. - // Leave IDs' 0-1 representation: [0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1]. - // Paths IDs' 0-1 representation: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]. - // The original feature vector [0.5888848,0.9360271,0.4721779] is transformed to three different tree-based feature vectors: - // Trees' output values: [-0.02519168,0.0004318157,-0.0002457525]. - // Leave IDs' 0-1 representation: [0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1]. - // Paths IDs' 0-1 representation: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]. - // The original feature vector [0.2737045,0.2919063,0.4673147] is transformed to three different tree-based feature vectors: - // Trees' output values: [-0.02519168,0.0004318157,-0.0002457525]. - // Leave IDs' 0-1 representation: [0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1]. - // Paths IDs' 0-1 representation: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]."; + // The original feature vector [0.8173254,0.7680227,0.5581612] is transformed to three different tree-based feature vectors: + // Trees' output values: [0.5714286,0.4636412,0.535588]. 
+ // Leave IDs' 0-1 representation: [0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1]. + // Paths IDs' 0-1 representation: [1,0,0,1,1,1,0,1,0,1,1,1,1,1,1]. + // The original feature vector [0.5888848,0.9360271,0.4721779] is transformed to three different tree-based feature vectors: + // Trees' output values: [0.2352941,-0.1382389,0.535588]. + // Leave IDs' 0-1 representation: [0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1]. + // Paths IDs' 0-1 representation: [1,0,0,1,1,1,0,1,0,1,1,1,1,1,1]. + // The original feature vector [0.2737045,0.2919063,0.4673147] is transformed to three different tree-based feature vectors: + // Trees' output values: [0.2352941,-0.1382389,-0.2184284]. + // Leave IDs' 0-1 representation: [0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0]. + // Paths IDs' 0-1 representation: [1,0,0,1,1,1,0,1,0,1,1,1,0,0,0]."; #> \ No newline at end of file diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRankingFeaturizationWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRankingFeaturizationWithOptions.cs index 58392815e1..9e525643d3 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRankingFeaturizationWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRankingFeaturizationWithOptions.cs @@ -85,17 +85,17 @@ public static void Example() // Expected output: // The original feature vector [1.117325,1.068023,0.8581612] is transformed to three different tree-based feature vectors: - // Trees' output values: [0.226105,0.006197017,0.06219412]. - // Leave IDs' 0-1 representation: [0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1]. - // Paths IDs' 0-1 representation: [1,1,1,1,1,1,1,1,0,0,1,1,1,0,1]. + // Trees' output values: [0.4095458,0.2061437,0.2364294]. + // Leave IDs' 0-1 representation: [0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1]. + // Paths IDs' 0-1 representation: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]. 
// The original feature vector [0.6588848,1.006027,0.5421779] is transformed to three different tree-based feature vectors: - // Trees' output values: [0.226105,-0.3102316,-0.3081155]. - // Leave IDs' 0-1 representation: [0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0]. - // Paths IDs' 0-1 representation: [1,1,1,1,1,1,1,1,0,0,1,1,1,0,1]. + // Trees' output values: [0.2543825,-0.06570309,-0.1456212]. + // Leave IDs' 0-1 representation: [0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0]. + // Paths IDs' 0-1 representation: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,0]. // The original feature vector [0.6737045,0.6919063,0.8673147] is transformed to three different tree-based feature vectors: - // Trees' output values: [0.226105,-0.3102316,-0.3081155]. - // Leave IDs' 0-1 representation: [0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0]. - // Paths IDs' 0-1 representation: [1,1,1,1,1,1,1,1,0,0,1,1,1,0,1]. + // Trees' output values: [0.2543825,-0.06570309,0.01300209]. + // Leave IDs' 0-1 representation: [0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0]. + // Paths IDs' 0-1 representation: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]. } private static IEnumerable GenerateRandomDataPoints(int count, int seed = 0, int groupSize = 10) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRankingFeaturizationWithOptions.tt b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRankingFeaturizationWithOptions.tt index a3f3bf6b7d..8be69bf2df 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRankingFeaturizationWithOptions.tt +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRankingFeaturizationWithOptions.tt @@ -33,15 +33,15 @@ string Comments= @" string ExpectedOutput = @"// Expected output: // The original feature vector [1.117325,1.068023,0.8581612] is transformed to three different tree-based feature vectors: - // Trees' output values: [0.226105,0.006197017,0.06219412]. 
- // Leave IDs' 0-1 representation: [0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1]. - // Paths IDs' 0-1 representation: [1,1,1,1,1,1,1,1,0,0,1,1,1,0,1]. + // Trees' output values: [0.4095458,0.2061437,0.2364294]. + // Leave IDs' 0-1 representation: [0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1]. + // Paths IDs' 0-1 representation: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]. // The original feature vector [0.6588848,1.006027,0.5421779] is transformed to three different tree-based feature vectors: - // Trees' output values: [0.226105,-0.3102316,-0.3081155]. - // Leave IDs' 0-1 representation: [0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0]. - // Paths IDs' 0-1 representation: [1,1,1,1,1,1,1,1,0,0,1,1,1,0,1]. + // Trees' output values: [0.2543825,-0.06570309,-0.1456212]. + // Leave IDs' 0-1 representation: [0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0]. + // Paths IDs' 0-1 representation: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,0]. // The original feature vector [0.6737045,0.6919063,0.8673147] is transformed to three different tree-based feature vectors: - // Trees' output values: [0.226105,-0.3102316,-0.3081155]. - // Leave IDs' 0-1 representation: [0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0]. - // Paths IDs' 0-1 representation: [1,1,1,1,1,1,1,1,0,0,1,1,1,0,1]."; + // Trees' output values: [0.2543825,-0.06570309,0.01300209]. + // Leave IDs' 0-1 representation: [0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0]. 
+ // Paths IDs' 0-1 representation: [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]."; #> \ No newline at end of file diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRegressionFeaturizationWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRegressionFeaturizationWithOptions.cs index 4fb0be9658..c8660e8127 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRegressionFeaturizationWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRegressionFeaturizationWithOptions.cs @@ -87,17 +87,17 @@ public static void Example() // Expected output: // The original feature vector [1.543569,1.494266,1.284405] is transformed to three different tree-based feature vectors: - // Trees' output values: [0.1263802,0.1333696,0.1057345]. - // Leave IDs' 0-1 representation: [0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1]. - // Paths IDs' 0-1 representation: [1,1,0,1,0,1,1,0,0,1,1,0,1,0,1]. + // Trees' output values: [0.1507567,0.1372715,0.1019326]. + // Leave IDs' 0-1 representation: [0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0]. + // Paths IDs' 0-1 representation: [1,0,0,0,0,1,1,1,0,0,1,1,1,1,0]. // The original feature vector [0.764918,1.11206,0.648211] is transformed to three different tree-based feature vectors: - // Trees' output values: [0.06627099,0.06554828,0.04006118]. - // Leave IDs' 0-1 representation: [0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0]. - // Paths IDs' 0-1 representation: [1,0,1,0,0,1,1,0,0,0,1,1,0,1,0]. + // Trees' output values: [0.07604675,0.08244576,0.03080027]. + // Leave IDs' 0-1 representation: [0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1]. + // Paths IDs' 0-1 representation: [1,1,1,0,0,1,1,0,0,0,1,0,0,0,1]. // The original feature vector [1.251254,1.269456,1.444864] is transformed to three different tree-based feature vectors: - // Trees' output values: [0.151194,0.1061093,0.1057345]. 
- // Leave IDs' 0-1 representation: [0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1]. - // Paths IDs' 0-1 representation: [1,1,0,0,1,1,1,0,0,1,1,0,1,0,1]. + // Trees' output values: [0.1507567,0.1090626,0.0731837]. + // Leave IDs' 0-1 representation: [0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0]. + // Paths IDs' 0-1 representation: [1,0,0,0,0,1,1,1,0,0,1,1,1,1,0]. } private static IEnumerable GenerateRandomDataPoints(int count, int seed=0) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRegressionFeaturizationWithOptions.tt b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRegressionFeaturizationWithOptions.tt index 7d51495679..e22153c900 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRegressionFeaturizationWithOptions.tt +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeRegressionFeaturizationWithOptions.tt @@ -33,15 +33,15 @@ string Options = @"FastTreeRegressionFeaturizationEstimator.Options string ExpectedOutput = @"// Expected output: // The original feature vector [1.543569,1.494266,1.284405] is transformed to three different tree-based feature vectors: - // Trees' output values: [0.1263802,0.1333696,0.1057345]. - // Leave IDs' 0-1 representation: [0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1]. - // Paths IDs' 0-1 representation: [1,1,0,1,0,1,1,0,0,1,1,0,1,0,1]. + // Trees' output values: [0.1507567,0.1372715,0.1019326]. + // Leave IDs' 0-1 representation: [0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0]. + // Paths IDs' 0-1 representation: [1,0,0,0,0,1,1,1,0,0,1,1,1,1,0]. // The original feature vector [0.764918,1.11206,0.648211] is transformed to three different tree-based feature vectors: - // Trees' output values: [0.06627099,0.06554828,0.04006118]. - // Leave IDs' 0-1 representation: [0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0]. - // Paths IDs' 0-1 representation: [1,0,1,0,0,1,1,0,0,0,1,1,0,1,0]. 
+ // Trees' output values: [0.07604675,0.08244576,0.03080027]. + // Leave IDs' 0-1 representation: [0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1]. + // Paths IDs' 0-1 representation: [1,1,1,0,0,1,1,0,0,0,1,0,0,0,1]. // The original feature vector [1.251254,1.269456,1.444864] is transformed to three different tree-based feature vectors: - // Trees' output values: [0.151194,0.1061093,0.1057345]. - // Leave IDs' 0-1 representation: [0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1]. - // Paths IDs' 0-1 representation: [1,1,0,0,1,1,1,0,0,1,1,0,1,0,1]."; + // Trees' output values: [0.1507567,0.1090626,0.0731837]. + // Leave IDs' 0-1 representation: [0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0]. + // Paths IDs' 0-1 representation: [1,0,0,0,0,1,1,1,0,0,1,1,1,1,0]."; #> \ No newline at end of file diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeTweedieFeaturizationWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeTweedieFeaturizationWithOptions.cs index 952b129dd2..b6624560a3 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeTweedieFeaturizationWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeTweedieFeaturizationWithOptions.cs @@ -87,17 +87,17 @@ public static void Example() // Expected output: // The original feature vector [1.543569,1.494266,1.284405] is transformed to three different tree-based feature vectors: - // Trees' output values: [-0.09180452,-0.04118096,-0.01008826]. - // Leave IDs' 0-1 representation: [0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0]. - // Paths IDs' 0-1 representation: [1,1,0,1,0,1,1,0,0,0,1,0,1,0,0]. + // Trees' output values: [-0.05652997,-0.02312196,-0.01179363]. + // Leave IDs' 0-1 representation: [0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0]. + // Paths IDs' 0-1 representation: [1,0,0,0,0,1,1,0,1,0,1,1,0,0,0]. 
// The original feature vector [0.764918,1.11206,0.648211] is transformed to three different tree-based feature vectors: - // Trees' output values: [-0.220913,-0.1675234,-0.1447738]. - // Leave IDs' 0-1 representation: [0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0]. - // Paths IDs' 0-1 representation: [1,0,1,0,0,1,1,0,1,0,1,1,0,1,0]. + // Trees' output values: [-0.1933938,-0.1042738,-0.2312837]. + // Leave IDs' 0-1 representation: [0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0]. + // Paths IDs' 0-1 representation: [1,1,1,0,0,1,1,0,0,0,1,0,0,0,0]. // The original feature vector [1.251254,1.269456,1.444864] is transformed to three different tree-based feature vectors: - // Trees' output values: [-0.05595072,-0.04118096,-0.04182037]. - // Leave IDs' 0-1 representation: [0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1]. - // Paths IDs' 0-1 representation: [1,1,0,0,1,1,1,0,0,0,1,0,1,0,1]. + // Trees' output values: [-0.05652997,-0.06082304,-0.04528879]. + // Leave IDs' 0-1 representation: [0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0]. + // Paths IDs' 0-1 representation: [1,0,0,0,0,1,1,0,1,0,1,1,1,0,1]. } private static IEnumerable GenerateRandomDataPoints(int count, int seed=0) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeTweedieFeaturizationWithOptions.tt b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeTweedieFeaturizationWithOptions.tt index 171473ab66..a075887d1e 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeTweedieFeaturizationWithOptions.tt +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/TreeFeaturization/FastTreeTweedieFeaturizationWithOptions.tt @@ -40,15 +40,15 @@ string ExpectedOutputPerInstance= @"// Expected output: string ExpectedOutput = @"// Expected output: // The original feature vector [1.543569,1.494266,1.284405] is transformed to three different tree-based feature vectors: - // Trees' output values: [-0.09180452,-0.04118096,-0.01008826]. 
- // Leave IDs' 0-1 representation: [0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0]. - // Paths IDs' 0-1 representation: [1,1,0,1,0,1,1,0,0,0,1,0,1,0,0]. + // Trees' output values: [-0.05652997,-0.02312196,-0.01179363]. + // Leave IDs' 0-1 representation: [0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0]. + // Paths IDs' 0-1 representation: [1,0,0,0,0,1,1,0,1,0,1,1,0,0,0]. // The original feature vector [0.764918,1.11206,0.648211] is transformed to three different tree-based feature vectors: - // Trees' output values: [-0.220913,-0.1675234,-0.1447738]. - // Leave IDs' 0-1 representation: [0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0]. - // Paths IDs' 0-1 representation: [1,0,1,0,0,1,1,0,1,0,1,1,0,1,0]. + // Trees' output values: [-0.1933938,-0.1042738,-0.2312837]. + // Leave IDs' 0-1 representation: [0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0]. + // Paths IDs' 0-1 representation: [1,1,1,0,0,1,1,0,0,0,1,0,0,0,0]. // The original feature vector [1.251254,1.269456,1.444864] is transformed to three different tree-based feature vectors: - // Trees' output values: [-0.05595072,-0.04118096,-0.04182037]. - // Leave IDs' 0-1 representation: [0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1]. - // Paths IDs' 0-1 representation: [1,1,0,0,1,1,1,0,0,0,1,0,1,0,1]."; + // Trees' output values: [-0.05652997,-0.06082304,-0.04528879]. + // Leave IDs' 0-1 representation: [0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0]. + // Paths IDs' 0-1 representation: [1,0,0,0,0,1,1,0,1,0,1,1,1,0,1]."; #> \ No newline at end of file