diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/AnomalyDetection/RandomizedPcaSample.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/AnomalyDetection/RandomizedPcaSample.cs new file mode 100644 index 0000000000..b734b1581b --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/AnomalyDetection/RandomizedPcaSample.cs @@ -0,0 +1,86 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML.Data; + +namespace Microsoft.ML.Samples.Dynamic.Trainers.AnomalyDetection +{ + public static class RandomizedPcaSample + { + public static void Example() + { + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, + // as a catalog of available operations and as the source of randomness. + // Setting the seed to a fixed number in this example to make outputs deterministic. + var mlContext = new MLContext(seed: 0); + + // Training data. + var samples = new List() + { + new DataPoint(){ Features = new float[3] {1, 0, 0} }, + new DataPoint(){ Features = new float[3] {0, 2, 1} }, + new DataPoint(){ Features = new float[3] {1, 2, 3} }, + new DataPoint(){ Features = new float[3] {0, 1, 0} }, + new DataPoint(){ Features = new float[3] {0, 2, 1} }, + new DataPoint(){ Features = new float[3] {-100, 50, -100} } + }; + + // Convert the List to IDataView, a consumable format to ML.NET functions. + var data = mlContext.Data.LoadFromEnumerable(samples); + + // Create an anomaly detector. Its underlying algorithm is randomized PCA. + var pipeline = mlContext.AnomalyDetection.Trainers.RandomizedPca(featureColumnName: nameof(DataPoint.Features), rank: 1, center: false); + + // Train the anomaly detector. + var model = pipeline.Fit(data); + + // Apply the trained model on the training data. + var transformed = model.Transform(data); + + // Read ML.NET predictions into IEnumerable. 
+ var results = mlContext.Data.CreateEnumerable(transformed, reuseRowObject: false).ToList(); + + // Let's go through all predictions. + for (int i = 0; i < samples.Count; ++i) + { + // The i-th example's prediction result. + var result = results[i]; + + // The i-th example's feature vector in text format. + var featuresInText = string.Join(',', samples[i].Features); + + if (result.PredictedLabel) + // The i-th sample is predicted as an inlier. + Console.WriteLine("The {0}-th example with features [{1}] is an inlier with a score of being inlier {2}", + i, featuresInText, result.Score); + else + // The i-th sample is predicted as an outlier. + Console.WriteLine("The {0}-th example with features [{1}] is an outlier with a score of being inlier {2}", + i, featuresInText, result.Score); + } + // Lines printed out should be + // The 0-th example with features [1,0,0] is an inlier with a score of being inlier 0.7453707 + // The 1-th example with features [0,2,1] is an inlier with a score of being inlier 0.9999999 + // The 2-th example with features [1,2,3] is an inlier with a score of being inlier 0.8450122 + // The 3-th example with features [0,1,0] is an inlier with a score of being inlier 0.9428905 + // The 4-th example with features [0,2,1] is an inlier with a score of being inlier 0.9999999 + // The 5-th example with features [-100,50,-100] is an outlier with a score of being inlier 0 + } + + // Example with 3 feature values. A training data set is a collection of such examples. + private class DataPoint + { + [VectorType(3)] + public float[] Features { get; set; } + } + + // Class used to capture prediction of DataPoint. + private class Result + { + // Outlier gets false while inlier has true. + public bool PredictedLabel { get; set; } + // Outlier gets smaller score. 
+ public float Score { get; set; } + } + } +} diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/AnomalyDetection/RandomizedPcaSampleWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/AnomalyDetection/RandomizedPcaSampleWithOptions.cs new file mode 100644 index 0000000000..f9160570c9 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/AnomalyDetection/RandomizedPcaSampleWithOptions.cs @@ -0,0 +1,93 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using Microsoft.ML.Data; + +namespace Microsoft.ML.Samples.Dynamic.Trainers.AnomalyDetection +{ + public static class RandomizedPcaSampleWithOptions + { + public static void Example() + { + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, + // as a catalog of available operations and as the source of randomness. + // Setting the seed to a fixed number in this example to make outputs deterministic. + var mlContext = new MLContext(seed: 0); + + // Training data. + var samples = new List() + { + new DataPoint(){ Features = new float[3] {1, 0, 0} }, + new DataPoint(){ Features = new float[3] {0, 2, 1} }, + new DataPoint(){ Features = new float[3] {1, 2, 3} }, + new DataPoint(){ Features = new float[3] {0, 1, 0} }, + new DataPoint(){ Features = new float[3] {0, 2, 1} }, + new DataPoint(){ Features = new float[3] {-100, 50, -100} } + }; + + // Convert the List to IDataView, a consumable format to ML.NET functions. + var data = mlContext.Data.LoadFromEnumerable(samples); + + var options = new ML.Trainers.RandomizedPcaTrainer.Options() + { + FeatureColumnName = nameof(DataPoint.Features), + Rank = 1, + Seed = 10, + }; + + // Create an anomaly detector. Its underlying algorithm is randomized PCA. + var pipeline = mlContext.AnomalyDetection.Trainers.RandomizedPca(options); + + // Train the anomaly detector. + var model = pipeline.Fit(data); + + // Apply the trained model on the training data. 
+ var transformed = model.Transform(data); + + // Read ML.NET predictions into IEnumerable. + var results = mlContext.Data.CreateEnumerable(transformed, reuseRowObject: false).ToList(); + + // Let's go through all predictions. + for (int i = 0; i < samples.Count; ++i) + { + // The i-th example's prediction result. + var result = results[i]; + + // The i-th example's feature vector in text format. + var featuresInText = string.Join(',', samples[i].Features); + + if (result.PredictedLabel) + // The i-th sample is predicted as an inlier. + Console.WriteLine("The {0}-th example with features [{1}] is an inlier with a score of being inlier {2}", + i, featuresInText, result.Score); + else + // The i-th sample is predicted as an outlier. + Console.WriteLine("The {0}-th example with features [{1}] is an outlier with a score of being inlier {2}", + i, featuresInText, result.Score); + } + // Lines printed out should be + // The 0-th example with features [1,0,0] is an inlier with a score of being inlier 0.7453707 + // The 1-th example with features [0,2,1] is an inlier with a score of being inlier 0.9999999 + // The 2-th example with features [1,2,3] is an inlier with a score of being inlier 0.8450122 + // The 3-th example with features [0,1,0] is an inlier with a score of being inlier 0.9428905 + // The 4-th example with features [0,2,1] is an inlier with a score of being inlier 0.9999999 + // The 5-th example with features [-100,50,-100] is an outlier with a score of being inlier 0 + } + + // Example with 3 feature values. A training data set is a collection of such examples. + private class DataPoint + { + [VectorType(3)] + public float[] Features { get; set; } + } + + // Class used to capture prediction of DataPoint. + private class Result + { + // Outlier gets false while inlier has true. + public bool PredictedLabel { get; set; } + // Outlier gets smaller score. 
+ public float Score { get; set; } + } + } +} diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Recommendation/MatrixFactorization.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Recommendation/MatrixFactorization.cs index 5c98e36f69..e0340dca20 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Recommendation/MatrixFactorization.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Recommendation/MatrixFactorization.cs @@ -1,6 +1,5 @@ using System; using System.Collections.Generic; -using Microsoft.ML.Data; using static Microsoft.ML.SamplesUtils.DatasetUtils; namespace Microsoft.ML.Samples.Dynamic.Trainers.Recommendation diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Recommendation/MatrixFactorizationWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Recommendation/MatrixFactorizationWithOptions.cs index c78a20cc9a..f20bbe95dd 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Recommendation/MatrixFactorizationWithOptions.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Recommendation/MatrixFactorizationWithOptions.cs @@ -1,6 +1,5 @@ using System; using System.Collections.Generic; -using Microsoft.ML.Data; using Microsoft.ML.Trainers; using static Microsoft.ML.SamplesUtils.DatasetUtils; diff --git a/src/Microsoft.ML.PCA/PCACatalog.cs b/src/Microsoft.ML.PCA/PCACatalog.cs index e4eb886448..6a5443aa84 100644 --- a/src/Microsoft.ML.PCA/PCACatalog.cs +++ b/src/Microsoft.ML.PCA/PCACatalog.cs @@ -47,6 +47,12 @@ public static PrincipalComponentAnalysisEstimator ProjectToPrincipalComponents(t /// Oversampling parameter for randomized PCA training. /// If enabled, data is centered to be zero mean. /// The seed for random number generation. 
+ /// + /// + /// + /// public static RandomizedPcaTrainer RandomizedPca(this AnomalyDetectionCatalog.AnomalyDetectionTrainers catalog, string featureColumnName = DefaultColumnNames.Features, string exampleWeightColumnName = null, @@ -65,6 +71,12 @@ public static RandomizedPcaTrainer RandomizedPca(this AnomalyDetectionCatalog.An /// /// The anomaly detection catalog trainer object. /// Advanced options to the algorithm. + /// + /// + /// + /// public static RandomizedPcaTrainer RandomizedPca(this AnomalyDetectionCatalog.AnomalyDetectionTrainers catalog, Options options) { Contracts.CheckValue(catalog, nameof(catalog)); diff --git a/test/Microsoft.ML.Tests/AnomalyDetectionTests.cs b/test/Microsoft.ML.Tests/AnomalyDetectionTests.cs index fcf5ce470e..fbf3db60d0 100644 --- a/test/Microsoft.ML.Tests/AnomalyDetectionTests.cs +++ b/test/Microsoft.ML.Tests/AnomalyDetectionTests.cs @@ -3,6 +3,8 @@ // See the LICENSE file in the project root for more information. using System; +using System.Collections.Generic; +using System.Linq; using Microsoft.Data.DataView; using Microsoft.ML.Data; using Microsoft.ML.RunTests; @@ -48,6 +50,96 @@ public void NoAnomalyTest() Assert.Throws(() => ML.AnomalyDetection.Evaluate(transformedData)); } + [Fact] + public static void RandomizedPcaInMemory() + { + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, + // as a catalog of available operations and as the source of randomness. + // Setting the seed to a fixed number in this example to make outputs deterministic. + var mlContext = new MLContext(seed: 0); + + // Create an anomaly detector. Its underlying algorithm is randomized PCA. + var trainer1 = mlContext.AnomalyDetection.Trainers.RandomizedPca(featureColumnName: nameof(DataPoint.Features), rank: 1, center: false); + + // Test the first detector. + ExecutePipelineWithGivenRandomizedPcaTrainer(mlContext, trainer1); + + // Object required in the creation of another detector. 
+ var options = new Trainers.RandomizedPcaTrainer.Options() + { + FeatureColumnName = nameof(DataPoint.Features), + Rank = 1, + Center = false + }; + + // Create another anomaly detector. Its underlying algorithm is randomized PCA. + var trainer2 = mlContext.AnomalyDetection.Trainers.RandomizedPca(options); + + // Test the second detector. + ExecutePipelineWithGivenRandomizedPcaTrainer(mlContext, trainer2); + } + + /// + /// Example with 3 feature values used in RandomizedPcaInMemory. + /// + private class DataPoint + { + [VectorType(3)] + public float[] Features { get; set; } + } + + /// + /// Class used to capture prediction of DataPoint in RandomizedPcaInMemory. + /// + private class Result + { + // Outlier gets false while inlier has true. + public bool PredictedLabel { get; set; } + // Outlier gets smaller score. + public float Score { get; set; } + } + + /// + /// Helper function used to execute trainers defined in RandomizedPcaInMemory. + /// + private static void ExecutePipelineWithGivenRandomizedPcaTrainer(MLContext mlContext, Trainers.RandomizedPcaTrainer trainer) + { + var samples = new List() + { + new DataPoint(){ Features= new float[3] {1, 0, 0} }, + new DataPoint(){ Features= new float[3] {0, 2, 1} }, + new DataPoint(){ Features= new float[3] {1, 2, 3} }, + new DataPoint(){ Features= new float[3] {0, 1, 0} }, + new DataPoint(){ Features= new float[3] {0, 2, 1} }, + new DataPoint(){ Features= new float[3] {-100, 50, -100} } + }; + + // Convert the List to IDataView, a consumable format to ML.NET functions. + var data = mlContext.Data.LoadFromEnumerable(samples); + + // Train the anomaly detector. + var model = trainer.Fit(data); + + // Apply the trained model on the training data. + var transformed = model.Transform(data); + + // Read ML.NET predictions into IEnumerable. + var results = mlContext.Data.CreateEnumerable(transformed, reuseRowObject: false).ToList(); + + // First 5 examples are inliers. + for (int i = 0; i < 5; ++i) + { + // Inlier should be predicted as true. 
+ Assert.True(results[i].PredictedLabel); + // Higher score means closer to inlier. + Assert.InRange(results[i].Score, 0.3, 1); + } + + // Last example is outlier. Note that outlier should be predicted as false. + Assert.False(results[5].PredictedLabel); + Assert.InRange(results[5].Score, 0, 0.3); + } + private IDataView DetectAnomalyInMnistOneClass(string trainPath, string testPath) { var loader = ML.Data.CreateTextLoader(new[]