
Commit 6e9023f

Add an example of random PCA using in-memory data structure (#2780)
1 parent fbf282d commit 6e9023f

File tree

6 files changed, +283 -2 lines changed

docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/AnomalyDetection/RandomizedPcaSample.cs (+86)
docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/AnomalyDetection/RandomizedPcaSampleWithOptions.cs (+93)
docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Recommendation/MatrixFactorization.cs (-1)
docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Recommendation/MatrixFactorizationWithOptions.cs (-1)
src/Microsoft.ML.PCA/PCACatalog.cs (+12)
test/Microsoft.ML.Tests/AnomalyDetectionTests.cs (+92)
docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/AnomalyDetection/RandomizedPcaSample.cs

+86
@@ -0,0 +1,86 @@
using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.ML.Data;

namespace Microsoft.ML.Samples.Dynamic.Trainers.AnomalyDetection
{
    public static class RandomizedPcaSample
    {
        public static void Example()
        {
            // Create a new context for ML.NET operations. It can be used for exception tracking and logging,
            // as a catalog of available operations and as the source of randomness.
            // Setting the seed to a fixed number in this example to make outputs deterministic.
            var mlContext = new MLContext(seed: 0);

            // Training data.
            var samples = new List<DataPoint>()
            {
                new DataPoint(){ Features = new float[3] {1, 0, 0} },
                new DataPoint(){ Features = new float[3] {0, 2, 1} },
                new DataPoint(){ Features = new float[3] {1, 2, 3} },
                new DataPoint(){ Features = new float[3] {0, 1, 0} },
                new DataPoint(){ Features = new float[3] {0, 2, 1} },
                new DataPoint(){ Features = new float[3] {-100, 50, -100} }
            };

            // Convert the List<DataPoint> to IDataView, a consumable format for ML.NET functions.
            var data = mlContext.Data.LoadFromEnumerable(samples);

            // Create an anomaly detector. Its underlying algorithm is randomized PCA.
            var pipeline = mlContext.AnomalyDetection.Trainers.RandomizedPca(featureColumnName: nameof(DataPoint.Features), rank: 1, center: false);

            // Train the anomaly detector.
            var model = pipeline.Fit(data);

            // Apply the trained model on the training data.
            var transformed = model.Transform(data);

            // Read ML.NET predictions into IEnumerable<Result>.
            var results = mlContext.Data.CreateEnumerable<Result>(transformed, reuseRowObject: false).ToList();

            // Let's go through all predictions.
            for (int i = 0; i < samples.Count; ++i)
            {
                // The i-th example's prediction result.
                var result = results[i];

                // The i-th example's feature vector in text format.
                var featuresInText = string.Join(',', samples[i].Features);

                if (result.PredictedLabel)
                    // The i-th sample is predicted as an inlier.
                    Console.WriteLine("The {0}-th example with features [{1}] is an inlier with a score of being inlier {2}",
                        i, featuresInText, result.Score);
                else
                    // The i-th sample is predicted as an outlier.
                    Console.WriteLine("The {0}-th example with features [{1}] is an outlier with a score of being inlier {2}",
                        i, featuresInText, result.Score);
            }
            // Lines printed out should be
            //   The 0-th example with features [1,0,0] is an inlier with a score of being inlier 0.7453707
            //   The 1-th example with features [0,2,1] is an inlier with a score of being inlier 0.9999999
            //   The 2-th example with features [1,2,3] is an inlier with a score of being inlier 0.8450122
            //   The 3-th example with features [0,1,0] is an inlier with a score of being inlier 0.9428905
            //   The 4-th example with features [0,2,1] is an inlier with a score of being inlier 0.9999999
            //   The 5-th example with features [-100,50,-100] is an outlier with a score of being inlier 0
        }

        // Example with 3 feature values. A training data set is a collection of such examples.
        private class DataPoint
        {
            [VectorType(3)]
            public float[] Features { get; set; }
        }

        // Class used to capture prediction of DataPoint.
        private class Result
        {
            // Outlier gets false while inlier has true.
            public bool PredictedLabel { get; set; }
            // Outlier gets smaller score.
            public float Score { get; set; }
        }
    }
}
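
The expected output above shows the point [-100,50,-100] dropping to an inlier score of 0 while the other five stay close to 1. The intuition, with rank: 1 and center: false, is that the trainer keeps a single principal direction and scores each point by how well its projection onto that direction reconstructs it. The toy snippet below illustrates only that idea; it is not part of the commit, the direction vector is a made-up approximation, and ML.NET's actual score normalization differs.

// Toy illustration of PCA-style anomaly scoring (hypothetical numbers, not ML.NET internals):
// project each point onto one fixed, roughly unit-length direction and measure how far the
// projection is from the original point (the reconstruction error).
using System;

public static class PcaIntuition
{
    public static void Main()
    {
        // Made-up direction roughly aligned with the inlier points in the sample above.
        double[] v = { 0.27, 0.53, 0.80 };
        double[][] points = { new double[] { 1, 2, 3 }, new double[] { -100, 50, -100 } };

        foreach (var x in points)
        {
            // Projection coefficient of x onto v.
            double dot = 0;
            for (int i = 0; i < 3; ++i)
                dot += x[i] * v[i];

            // Reconstruction error: distance between x and its projection dot * v.
            double error = 0;
            for (int i = 0; i < 3; ++i)
            {
                double diff = x[i] - dot * v[i];
                error += diff * diff;
            }

            Console.WriteLine("[{0}] reconstruction error = {1:F2}", string.Join(",", x), Math.Sqrt(error));
        }
        // The inlier-like point reconstructs almost exactly; [-100,50,-100] does not,
        // which is why a PCA-based detector flags it as an outlier.
    }
}
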
docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/AnomalyDetection/RandomizedPcaSampleWithOptions.cs

+93
@@ -0,0 +1,93 @@
using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.ML.Data;

namespace Microsoft.ML.Samples.Dynamic.Trainers.AnomalyDetection
{
    public static class RandomizedPcaSampleWithOptions
    {
        public static void Example()
        {
            // Create a new context for ML.NET operations. It can be used for exception tracking and logging,
            // as a catalog of available operations and as the source of randomness.
            // Setting the seed to a fixed number in this example to make outputs deterministic.
            var mlContext = new MLContext(seed: 0);

            // Training data.
            var samples = new List<DataPoint>()
            {
                new DataPoint(){ Features = new float[3] {1, 0, 0} },
                new DataPoint(){ Features = new float[3] {0, 2, 1} },
                new DataPoint(){ Features = new float[3] {1, 2, 3} },
                new DataPoint(){ Features = new float[3] {0, 1, 0} },
                new DataPoint(){ Features = new float[3] {0, 2, 1} },
                new DataPoint(){ Features = new float[3] {-100, 50, -100} }
            };

            // Convert the List<DataPoint> to IDataView, a consumable format for ML.NET functions.
            var data = mlContext.Data.LoadFromEnumerable(samples);

            var options = new ML.Trainers.RandomizedPcaTrainer.Options()
            {
                FeatureColumnName = nameof(DataPoint.Features),
                Rank = 1,
                Seed = 10,
            };

            // Create an anomaly detector. Its underlying algorithm is randomized PCA.
            var pipeline = mlContext.AnomalyDetection.Trainers.RandomizedPca(options);

            // Train the anomaly detector.
            var model = pipeline.Fit(data);

            // Apply the trained model on the training data.
            var transformed = model.Transform(data);

            // Read ML.NET predictions into IEnumerable<Result>.
            var results = mlContext.Data.CreateEnumerable<Result>(transformed, reuseRowObject: false).ToList();

            // Let's go through all predictions.
            for (int i = 0; i < samples.Count; ++i)
            {
                // The i-th example's prediction result.
                var result = results[i];

                // The i-th example's feature vector in text format.
                var featuresInText = string.Join(',', samples[i].Features);

                if (result.PredictedLabel)
                    // The i-th sample is predicted as an inlier.
                    Console.WriteLine("The {0}-th example with features [{1}] is an inlier with a score of being inlier {2}",
                        i, featuresInText, result.Score);
                else
                    // The i-th sample is predicted as an outlier.
                    Console.WriteLine("The {0}-th example with features [{1}] is an outlier with a score of being inlier {2}",
                        i, featuresInText, result.Score);
            }
            // Lines printed out should be
            //   The 0-th example with features [1,0,0] is an inlier with a score of being inlier 0.7453707
            //   The 1-th example with features [0,2,1] is an inlier with a score of being inlier 0.9999999
            //   The 2-th example with features [1,2,3] is an inlier with a score of being inlier 0.8450122
            //   The 3-th example with features [0,1,0] is an inlier with a score of being inlier 0.9428905
            //   The 4-th example with features [0,2,1] is an inlier with a score of being inlier 0.9999999
            //   The 5-th example with features [-100,50,-100] is an outlier with a score of being inlier 0
        }

        // Example with 3 feature values. A training data set is a collection of such examples.
        private class DataPoint
        {
            [VectorType(3)]
            public float[] Features { get; set; }
        }

        // Class used to capture prediction of DataPoint.
        private class Result
        {
            // Outlier gets false while inlier has true.
            public bool PredictedLabel { get; set; }
            // Outlier gets smaller score.
            public float Score { get; set; }
        }
    }
}
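
A usage note, not part of this commit: after Fit in either sample above, single in-memory examples can also be scored one at a time with a prediction engine. The sketch below is hedged: it assumes the mlContext.Model.CreatePredictionEngine<TSrc, TDst> API available in ML.NET 1.x and reuses the DataPoint and Result classes defined in the samples.

// Sketch only: continues from the end of Example() above, where `mlContext` and the
// fitted `model` are still in scope. Assumes ML.NET 1.x's CreatePredictionEngine API.
var engine = mlContext.Model.CreatePredictionEngine<DataPoint, Result>(model);
var single = engine.Predict(new DataPoint() { Features = new float[3] { 0, 2, 1 } });
Console.WriteLine("Single prediction: PredictedLabel={0}, Score={1}", single.PredictedLabel, single.Score);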

docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Recommendation/MatrixFactorization.cs

-1
@@ -1,6 +1,5 @@
 using System;
 using System.Collections.Generic;
-using Microsoft.ML.Data;
 using static Microsoft.ML.SamplesUtils.DatasetUtils;
 
 namespace Microsoft.ML.Samples.Dynamic.Trainers.Recommendation

docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Recommendation/MatrixFactorizationWithOptions.cs

-1
@@ -1,6 +1,5 @@
 using System;
 using System.Collections.Generic;
-using Microsoft.ML.Data;
 using Microsoft.ML.Trainers;
 using static Microsoft.ML.SamplesUtils.DatasetUtils;

src/Microsoft.ML.PCA/PCACatalog.cs

+12
@@ -47,6 +47,12 @@ public static PrincipalComponentAnalysisEstimator ProjectToPrincipalComponents(t
         /// <param name="oversampling">Oversampling parameter for randomized PCA training.</param>
         /// <param name="center">If enabled, data is centered to be zero mean.</param>
         /// <param name="seed">The seed for random number generation.</param>
+        /// <example>
+        /// <format type="text/markdown">
+        /// <![CDATA[
+        /// [!code-csharp[RPCA](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/AnomalyDetection/RandomizedPcaSample.cs)]
+        /// ]]></format>
+        /// </example>
         public static RandomizedPcaTrainer RandomizedPca(this AnomalyDetectionCatalog.AnomalyDetectionTrainers catalog,
             string featureColumnName = DefaultColumnNames.Features,
             string exampleWeightColumnName = null,
@@ -65,6 +71,12 @@ public static RandomizedPcaTrainer RandomizedPca(this AnomalyDetectionCatalog.An
         /// </summary>
         /// <param name="catalog">The anomaly detection catalog trainer object.</param>
         /// <param name="options">Advanced options to the algorithm.</param>
+        /// <example>
+        /// <format type="text/markdown">
+        /// <![CDATA[
+        /// [!code-csharp[RPCA](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/AnomalyDetection/RandomizedPcaSampleWithOptions.cs)]
+        /// ]]></format>
+        /// </example>
         public static RandomizedPcaTrainer RandomizedPca(this AnomalyDetectionCatalog.AnomalyDetectionTrainers catalog, Options options)
         {
             Contracts.CheckValue(catalog, nameof(catalog));
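
The oversampling, center, and seed parameters documented above correspond to properties on RandomizedPcaTrainer.Options. As a hedged sketch (not part of this commit), an options object that sets all of them could look like the following; FeatureColumnName, Rank, and Seed appear in RandomizedPcaSampleWithOptions.cs, Center in the new test, and the Oversampling property name is inferred from the parameter doc above.

// Sketch: every documented knob in one options object, in the context of the samples above
// (same mlContext and DataPoint class). Oversampling's property name is an assumption
// inferred from the <param name="oversampling"> doc rather than taken from this commit.
var options = new ML.Trainers.RandomizedPcaTrainer.Options()
{
    FeatureColumnName = nameof(DataPoint.Features),
    Rank = 2,           // number of principal components to keep
    Oversampling = 20,  // oversampling parameter for randomized PCA training
    Center = true,      // center data to be zero mean before training
    Seed = 10           // seed for random number generation
};
var trainer = mlContext.AnomalyDetection.Trainers.RandomizedPca(options);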

test/Microsoft.ML.Tests/AnomalyDetectionTests.cs

+92
@@ -3,6 +3,8 @@
 // See the LICENSE file in the project root for more information.
 
 using System;
+using System.Collections.Generic;
+using System.Linq;
 using Microsoft.Data.DataView;
 using Microsoft.ML.Data;
 using Microsoft.ML.RunTests;
@@ -48,6 +50,96 @@ public void NoAnomalyTest()
             Assert.Throws<ArgumentOutOfRangeException>(() => ML.AnomalyDetection.Evaluate(transformedData));
         }
 
+        [Fact]
+        public static void RandomizedPcaInMemory()
+        {
+            // Create a new context for ML.NET operations. It can be used for exception tracking and logging,
+            // as a catalog of available operations and as the source of randomness.
+            // Setting the seed to a fixed number in this example to make outputs deterministic.
+            var mlContext = new MLContext(seed: 0);
+
+            // Create an anomaly detector. Its underlying algorithm is randomized PCA.
+            var trainer1 = mlContext.AnomalyDetection.Trainers.RandomizedPca(featureColumnName: nameof(DataPoint.Features), rank: 1, center: false);
+
+            // Test the first detector.
+            ExecutePipelineWithGivenRandomizedPcaTrainer(mlContext, trainer1);
+
+            // Object required in the creation of another detector.
+            var options = new Trainers.RandomizedPcaTrainer.Options()
+            {
+                FeatureColumnName = nameof(DataPoint.Features),
+                Rank = 1,
+                Center = false
+            };
+
+            // Create another anomaly detector. Its underlying algorithm is randomized PCA.
+            var trainer2 = mlContext.AnomalyDetection.Trainers.RandomizedPca(options);
+
+            // Test the second detector.
+            ExecutePipelineWithGivenRandomizedPcaTrainer(mlContext, trainer2);
+        }
+
+        /// <summary>
+        /// Example with 3 feature values used in <see cref="ExecutePipelineWithGivenRandomizedPcaTrainer"/>.
+        /// </summary>
+        private class DataPoint
+        {
+            [VectorType(3)]
+            public float[] Features { get; set; }
+        }
+
+        /// <summary>
+        /// Class used to capture prediction of <see cref="DataPoint"/> in <see cref="ExecutePipelineWithGivenRandomizedPcaTrainer"/>.
+        /// </summary>
+        private class Result
+        {
+            // Outlier gets false while inlier has true.
+            public bool PredictedLabel { get; set; }
+            // Outlier gets smaller score.
+            public float Score { get; set; }
+        }
+
+        /// <summary>
+        /// Helper function used to execute trainers defined in <see cref="RandomizedPcaInMemory"/>.
+        /// </summary>
+        private static void ExecutePipelineWithGivenRandomizedPcaTrainer(MLContext mlContext, Trainers.RandomizedPcaTrainer trainer)
+        {
+            var samples = new List<DataPoint>()
+            {
+                new DataPoint(){ Features = new float[3] {1, 0, 0} },
+                new DataPoint(){ Features = new float[3] {0, 2, 1} },
+                new DataPoint(){ Features = new float[3] {1, 2, 3} },
+                new DataPoint(){ Features = new float[3] {0, 1, 0} },
+                new DataPoint(){ Features = new float[3] {0, 2, 1} },
+                new DataPoint(){ Features = new float[3] {-100, 50, -100} }
+            };
+
+            // Convert the List<DataPoint> to IDataView, a consumable format for ML.NET functions.
+            var data = mlContext.Data.LoadFromEnumerable(samples);
+
+            // Train the anomaly detector.
+            var model = trainer.Fit(data);
+
+            // Apply the trained model on the training data.
+            var transformed = model.Transform(data);
+
+            // Read ML.NET predictions into IEnumerable<Result>.
+            var results = mlContext.Data.CreateEnumerable<Result>(transformed, reuseRowObject: false).ToList();
+
+            // First 5 examples are inliers.
+            for (int i = 0; i < 5; ++i)
+            {
+                // Inlier should be predicted as true.
+                Assert.True(results[i].PredictedLabel);
+                // Higher score means closer to inlier.
+                Assert.InRange(results[i].Score, 0.3, 1);
+            }
+
+            // Last example is an outlier. Note that an outlier should be predicted as false.
+            Assert.False(results[5].PredictedLabel);
+            Assert.InRange(results[5].Score, 0, 0.3);
+        }
+
         private IDataView DetectAnomalyInMnistOneClass(string trainPath, string testPath)
         {
             var loader = ML.Data.CreateTextLoader(new[]
