Tree-based featurization #3812

Merged · 27 commits · Jun 26, 2019
@@ -0,0 +1,14 @@
### Input and Output Columns
The input label column data must be <xref:System.Boolean>.
The input features column data must be a known-sized vector of <xref:System.Single>.

This estimator outputs the following columns:

| Output Column Name | Column Type | Description |
| -- | -- | -- |
| `Trees` | Known-sized vector of <xref:System.Single> | The output values of all trees. Its size is identical to the total number of trees in the tree ensemble model. |
| `Leaves` | Known-sized vector of <xref:System.Single> | 0-1 vector representation of the IDs of the leaves that the input feature vector falls into. Its size is the total number of leaves in the tree ensemble model. |
| `Paths` | Known-sized vector of <xref:System.Single> | 0-1 vector representation of the paths the input feature vector passes through to reach the leaves. Its size is the number of non-leaf nodes in the tree ensemble model. |

These output columns are all optional, and the user can change their names.
Set the name of any column you want to skip to null so that it is not produced.
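
For illustration, here is a minimal sketch of skipping columns, assuming the `FastForestBinaryFeaturizationEstimator.Options` used in the samples added by this PR (the column name strings and surrounding variables are illustrative): only `Trees` is produced, while `Leaves` and `Paths` are skipped by setting their names to null.

```csharp
// Sketch only: produce the Trees column and skip Leaves/Paths by setting their names to null.
var options = new FastForestBinaryFeaturizationEstimator.Options
{
    InputColumnName = "Features",
    TreesColumnName = "Trees",
    LeavesColumnName = null,        // the Leaves column is not produced
    PathsColumnName = null,         // the Paths column is not produced
    TrainerOptions = trainerOptions // trainer options defined as in the samples below
};
var pipeline = mlContext.Transforms.FeaturizeByFastForestBinary(options);
```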
20 changes: 20 additions & 0 deletions docs/api-reference/io-columns-tree-featurization-ranking.md
@@ -0,0 +1,20 @@
### Input and Output Columns
The input label column data must be of [key](xref:Microsoft.ML.Data.KeyDataViewType)
type or <xref:System.Single>. The value of the label determines relevance, where
higher values indicate higher relevance. If the label is a
[key](xref:Microsoft.ML.Data.KeyDataViewType) type, the key index is the
relevance value, with the smallest index being the least relevant. If the label is a
<xref:System.Single>, larger values indicate higher relevance. The input features
column must be a known-sized vector of <xref:System.Single>, and the input row group
column must be of [key](xref:Microsoft.ML.Data.KeyDataViewType) type.
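
As an illustration, here is a minimal sketch of an input row class for this ranking featurizer; the class name, property names, and key counts are hypothetical, and the [key](xref:Microsoft.ML.Data.KeyDataViewType) columns are declared with the `KeyType` attribute:

```csharp
using Microsoft.ML.Data;

// Hypothetical input schema: key-typed label (relevance), key-typed row group ID,
// and a known-sized vector of System.Single as features.
public class RankingDataPoint
{
    [KeyType(5)]      // relevance label as a key type with 5 levels (smallest index = least relevant)
    public uint Label { get; set; }

    [KeyType(100)]    // row group (query) identifier
    public uint GroupId { get; set; }

    [VectorType(3)]   // known-sized feature vector of System.Single
    public float[] Features { get; set; }
}
```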

This estimator outputs the following columns:

| Output Column Name | Column Type | Description |
| -- | -- | -- |
| `Trees` | Known-sized vector of <xref:System.Single> | The output values of all trees. Its size is identical to the total number of trees in the tree ensemble model. |
| `Leaves` | Known-sized vector of <xref:System.Single> | 0-1 vector representation of the IDs of the leaves that the input feature vector falls into. Its size is the total number of leaves in the tree ensemble model. |
| `Paths` | Known-sized vector of <xref:System.Single> | 0-1 vector representation of the paths the input feature vector passes through to reach the leaves. Its size is the number of non-leaf nodes in the tree ensemble model. |

These output columns are all optional, and the user can change their names.
Set the name of any column you want to skip to null so that it is not produced.
14 changes: 14 additions & 0 deletions docs/api-reference/io-columns-tree-featurization-regression.md
@@ -0,0 +1,14 @@
### Input and Output Columns
The input label column data must be <xref:System.Single>.
The input features column data must be a known-sized vector of <xref:System.Single>.

This estimator outputs the following columns:

| Output Column Name | Column Type | Description |
| -- | -- | -- |
| `Trees` | Known-sized vector of <xref:System.Single> | The output values of all trees. Its size is identical to the total number of trees in the tree ensemble model. |
| `Leaves` | Known-sized vector of <xref:System.Single> | 0-1 vector representation of the IDs of the leaves that the input feature vector falls into. Its size is the total number of leaves in the tree ensemble model. |
| `Paths` | Known-sized vector of <xref:System.Single> | 0-1 vector representation of the paths the input feature vector passes through to reach the leaves. Its size is the number of non-leaf nodes in the tree ensemble model. |

These output columns are all optional, and the user can change their names.
Set the name of any column you want to skip to null so that it is not produced.
25 changes: 25 additions & 0 deletions docs/api-reference/tree-featurization-prediction.md
@@ -0,0 +1,25 @@
### Prediction Details
This estimator produces several output columns from a tree ensemble model. Assume that the model contains only one decision tree:

                     Node 0
                    /      \
                   /        \
              Node 1        Node 2
              /    \        /    \
             /      \      /      \
       Leaf -1   Leaf -2  Leaf -3  Node 3
                                   /    \
                                  /      \
                             Leaf -4   Leaf -5

Assume that the input feature vector falls into `Leaf -1`. The output `Trees` may be a 1-element vector whose
only value is the decision value carried by `Leaf -1`. The output `Leaves` is a 0-1 vector. If the reached
leaf is the $i$-th leaf in the tree (leaves are indexed by $-(i+1)$, so the first leaf is `Leaf -1`), then the
$i$-th value in `Leaves` is 1 and all other values are 0. The output `Paths` is a 0-1 representation of the nodes
passed through before reaching the leaf; the $i$-th element in `Paths` indicates whether the $i$-th node (indexed by $i$) is visited.
For example, reaching `Leaf -1` leads to $[1, 1, 0, 0]$ as the `Paths` value. If there are multiple trees, this estimator
simply concatenates the `Trees`, `Leaves`, and `Paths` of all trees (the first tree's information comes first in the concatenated vectors).
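
To make the encoding concrete, here is a minimal sketch (not an ML.NET API call) of the three vectors produced for the single-tree example above when the input falls into `Leaf -1`. The `Trees` value of 0.7 is a hypothetical leaf output; only the vector shapes and the 0-1 patterns follow from the description.

```csharp
float[] trees  = { 0.7f };          // one output value per tree; here, the value carried by Leaf -1 (hypothetical)
float[] leaves = { 1, 0, 0, 0, 0 }; // one slot per leaf: Leaf -1 ... Leaf -5
float[] paths  = { 1, 1, 0, 0 };    // one slot per non-leaf node: Node 0 ... Node 3 (Node 0 and Node 1 are visited)
```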

Check the See Also section for links to usage examples.
@@ -0,0 +1,110 @@
using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.ML;
using Microsoft.ML.Data;
<# if (TrainerOptions != null) { #>
<#=OptionsInclude#>
<# } #>

namespace Samples.Dynamic.Transforms.TreeFeaturization
{
    public static class <#=ClassName#>
    {<#=Comments#>
        public static void Example()
        {
            // Create a new context for ML.NET operations. It can be used for exception tracking and logging,
            // as a catalog of available operations and as the source of randomness.
            // Setting the seed to a fixed number in this example to make outputs deterministic.
            var mlContext = new MLContext(seed: 0);

            // Create a list of data points to be transformed.
            var dataPoints = GenerateRandomDataPoints(100).ToList();

            // Convert the list of data points to an IDataView object, which is consumable by the ML.NET API.
            var dataView = mlContext.Data.LoadFromEnumerable(dataPoints);
<# if (CacheData) { #>

            // ML.NET doesn't cache data sets by default. Therefore, if one reads a data set from a file and accesses it many times,
            // it can be slow due to expensive featurization and disk operations. When the considered data can fit into memory,
            // a solution is to cache the data in memory. Caching is especially helpful when working with iterative algorithms
            // which need many data passes.
            dataView = mlContext.Data.Cache(dataView);
<# } #>

            // Define input and output columns of the tree-based featurizer.
            string labelColumnName = nameof(DataPoint.Label);
            string featureColumnName = nameof(DataPoint.Features);
            string treesColumnName = nameof(TransformedDataPoint.Trees);
            string leavesColumnName = nameof(TransformedDataPoint.Leaves);
            string pathsColumnName = nameof(TransformedDataPoint.Paths);

            // Define the configuration of the trainer used to train a tree-based model.
            var trainerOptions = new <#=TrainerOptions#>;

            // Define the tree-based featurizer's configuration.
            var options = new <#=Options#>;

            // Define the featurizer.
            var pipeline = mlContext.Transforms.<#=Trainer#>(options);

            // Train the model.
            var model = pipeline.Fit(dataView);

            // Apply the trained transformer to the considered data set.
            var transformed = model.Transform(dataView);

            // Convert the IDataView object to a list. Each element in the resulting list corresponds to a row in the IDataView.
            var transformedDataPoints = mlContext.Data.CreateEnumerable<TransformedDataPoint>(transformed, false).ToList();

            // Print out the transformation of the first 3 data points.
            for (int i = 0; i < 3; ++i)
            {
                var dataPoint = dataPoints[i];
                var transformedDataPoint = transformedDataPoints[i];
                Console.WriteLine($"The original feature vector [{String.Join(",", dataPoint.Features)}] is transformed to three different tree-based feature vectors:");
                Console.WriteLine($"  Trees' output values: [{String.Join(",", transformedDataPoint.Trees)}].");
                Console.WriteLine($"  Leaf IDs' 0-1 representation: [{String.Join(",", transformedDataPoint.Leaves)}].");
                Console.WriteLine($"  Paths' 0-1 representation: [{String.Join(",", transformedDataPoint.Paths)}].");
            }

            <#=ExpectedOutput#>
        }

        private static IEnumerable<DataPoint> GenerateRandomDataPoints(int count, int seed = 0)
        {
            var random = new Random(seed);
            float randomFloat() => (float)random.NextDouble();
            for (int i = 0; i < count; i++)
            {
                var label = randomFloat() > <#=LabelThreshold#>;
                yield return new DataPoint
                {
                    Label = label,
                    // Create random features that are correlated with the label.
                    // For data points with false label, the feature values are slightly increased by adding a constant.
                    Features = Enumerable.Repeat(label, 3).Select(x => x ? randomFloat() : randomFloat() + <#=DataSepValue#>).ToArray()
                };
            }
        }

        // Example with label and 3 feature values. A data set is a collection of such examples.
        private class DataPoint
        {
            public bool Label { get; set; }
            [VectorType(3)]
            public float[] Features { get; set; }
        }

        // Class used to capture the output of tree-based featurization.
        private class TransformedDataPoint : DataPoint
        {
            // The i-th value is the output value of the i-th decision tree.
            public float[] Trees { get; set; }
            // The 0-1 encoding of the leaves the input feature vector falls into.
            public float[] Leaves { get; set; }
            // The 0-1 encoding of the paths the input feature vector follows to reach the leaves.
            public float[] Paths { get; set; }
        }
    }
}
@@ -0,0 +1,139 @@
using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.ML.Trainers.FastTree;

namespace Samples.Dynamic.Transforms.TreeFeaturization
{
    public static class FastForestBinaryFeaturizationWithOptions
    {
        // This example requires installation of the additional NuGet package
        // <a href="https://www.nuget.org/packages/Microsoft.ML.FastTree/">Microsoft.ML.FastTree</a>.
        public static void Example()
        {
            // Create a new context for ML.NET operations. It can be used for exception tracking and logging,
            // as a catalog of available operations and as the source of randomness.
            // Setting the seed to a fixed number in this example to make outputs deterministic.
            var mlContext = new MLContext(seed: 0);

            // Create a list of data points to be transformed.
            var dataPoints = GenerateRandomDataPoints(100).ToList();

            // Convert the list of data points to an IDataView object, which is consumable by the ML.NET API.
            var dataView = mlContext.Data.LoadFromEnumerable(dataPoints);

            // ML.NET doesn't cache data sets by default. Therefore, if one reads a data set from a file and accesses it many times,
            // it can be slow due to expensive featurization and disk operations. When the considered data can fit into memory,
            // a solution is to cache the data in memory. Caching is especially helpful when working with iterative algorithms
            // which need many data passes.
            dataView = mlContext.Data.Cache(dataView);

            // Define input and output columns of the tree-based featurizer.
            string labelColumnName = nameof(DataPoint.Label);
            string featureColumnName = nameof(DataPoint.Features);
            string treesColumnName = nameof(TransformedDataPoint.Trees);
            string leavesColumnName = nameof(TransformedDataPoint.Leaves);
            string pathsColumnName = nameof(TransformedDataPoint.Paths);

            // Define the configuration of the trainer used to train a tree-based model.
            var trainerOptions = new FastForestBinaryTrainer.Options
            {
                // Create a simpler model by penalizing usage of new features.
                FeatureFirstUsePenalty = 0.1,
                // Reduce the number of trees to 3.
                NumberOfTrees = 3,
                // Number of leaves per tree.
                NumberOfLeaves = 6,
                // Feature column name.
                FeatureColumnName = featureColumnName,
                // Label column name.
                LabelColumnName = labelColumnName
            };

            // Define the tree-based featurizer's configuration.
            var options = new FastForestBinaryFeaturizationEstimator.Options
            {
                InputColumnName = featureColumnName,
                TreesColumnName = treesColumnName,
                LeavesColumnName = leavesColumnName,
                PathsColumnName = pathsColumnName,
                TrainerOptions = trainerOptions
            };

            // Define the featurizer.
            var pipeline = mlContext.Transforms.FeaturizeByFastForestBinary(options);

            // Train the model.
            var model = pipeline.Fit(dataView);

            // Apply the trained transformer to the considered data set.
            var transformed = model.Transform(dataView);

            // Convert the IDataView object to a list. Each element in the resulting list corresponds to a row in the IDataView.
            var transformedDataPoints = mlContext.Data.CreateEnumerable<TransformedDataPoint>(transformed, false).ToList();

            // Print out the transformation of the first 3 data points.
            for (int i = 0; i < 3; ++i)
            {
                var dataPoint = dataPoints[i];
                var transformedDataPoint = transformedDataPoints[i];
                Console.WriteLine($"The original feature vector [{String.Join(",", dataPoint.Features)}] is transformed to three different tree-based feature vectors:");
                Console.WriteLine($"  Trees' output values: [{String.Join(",", transformedDataPoint.Trees)}].");
                Console.WriteLine($"  Leaf IDs' 0-1 representation: [{String.Join(",", transformedDataPoint.Leaves)}].");
                Console.WriteLine($"  Paths' 0-1 representation: [{String.Join(",", transformedDataPoint.Paths)}].");
            }

            // Expected output:
            //   The original feature vector [0.8173254,0.7680227,0.5581612] is transformed to three different tree-based feature vectors:
            //     Trees' output values: [0.1111111,0.8823529].
            //     Leaf IDs' 0-1 representation: [0,0,0,0,1,0,0,0,0,1,0].
            //     Paths' 0-1 representation: [1,1,1,1,1,1,0,1,0].
            //   The original feature vector [0.5888848,0.9360271,0.4721779] is transformed to three different tree-based feature vectors:
            //     Trees' output values: [0.4545455,0.8].
            //     Leaf IDs' 0-1 representation: [0,0,0,1,0,0,0,0,0,0,1].
            //     Paths' 0-1 representation: [1,1,1,1,0,1,0,1,1].
            //   The original feature vector [0.2737045,0.2919063,0.4673147] is transformed to three different tree-based feature vectors:
            //     Trees' output values: [0.4545455,0.1111111].
            //     Leaf IDs' 0-1 representation: [0,0,0,1,0,0,1,0,0,0,0].
            //     Paths' 0-1 representation: [1,1,1,1,0,1,0,1,1].
        }

        private static IEnumerable<DataPoint> GenerateRandomDataPoints(int count, int seed = 0)
        {
            var random = new Random(seed);
            float randomFloat() => (float)random.NextDouble();
            for (int i = 0; i < count; i++)
            {
                var label = randomFloat() > 0.5f;
                yield return new DataPoint
                {
                    Label = label,
                    // Create random features that are correlated with the label.
                    // For data points with false label, the feature values are slightly increased by adding a constant.
                    Features = Enumerable.Repeat(label, 3).Select(x => x ? randomFloat() : randomFloat() + 0.03f).ToArray()
                };
            }
        }

        // Example with label and 3 feature values. A data set is a collection of such examples.
        private class DataPoint
        {
            public bool Label { get; set; }
            [VectorType(3)]
            public float[] Features { get; set; }
        }

        // Class used to capture the output of tree-based featurization.
        private class TransformedDataPoint : DataPoint
        {
            // The i-th value is the output value of the i-th decision tree.
            public float[] Trees { get; set; }
            // The 0-1 encoding of the leaves the input feature vector falls into.
            public float[] Leaves { get; set; }
            // The 0-1 encoding of the paths the input feature vector follows to reach the leaves.
            public float[] Paths { get; set; }
Review comment from Contributor @artidoro, Jun 20, 2019:
nit (same in other places if you are doing a revision of the PR):
// The 0-1 encoding of paths the input feature vector follows to reach the leaves.
        }
    }
}