Skip to content

Add an example of random PCA using in-memory data structure #2780

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Feb 28, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.ML.Data;

namespace Microsoft.ML.Samples.Dynamic.Trainers.AnomalyDetection
{
/// <summary>
/// Sample showing how to train a randomized PCA anomaly detector on in-memory data
/// and print the prediction obtained for every training example.
/// </summary>
public static class RandomizedPcaSample
{
/// <summary>
/// Builds six in-memory examples, fits a rank-1 randomized PCA trainer with centering disabled,
/// scores the same examples, and writes one line per example to the console.
/// </summary>
public static void Example()
{
// Create a new context for ML.NET operations. It can be used for exception tracking and logging,
// as a catalog of available operations and as the source of randomness.
// Setting the seed to a fixed number in this example to make outputs deterministic.
var mlContext = new MLContext(seed: 0);

// Training data: five small-valued examples plus one example with much larger magnitudes.
var samples = new List<DataPoint>()
{
new DataPoint(){ Features = new float[3] {1, 0, 0} },
new DataPoint(){ Features = new float[3] {0, 2, 1} },
new DataPoint(){ Features = new float[3] {1, 2, 3} },
new DataPoint(){ Features = new float[3] {0, 1, 0} },
new DataPoint(){ Features = new float[3] {0, 2, 1} },
new DataPoint(){ Features = new float[3] {-100, 50, -100} }
};

// Convert the List<DataPoint> to IDataView, a format consumable by ML.NET functions.
var data = mlContext.Data.LoadFromEnumerable(samples);

// Create an anomaly detector. Its underlying algorithm is randomized PCA.
var pipeline = mlContext.AnomalyDetection.Trainers.RandomizedPca(featureColumnName: nameof(DataPoint.Features), rank: 1, center: false);

// Train the anomaly detector.
var model = pipeline.Fit(data);

// Apply the trained model on the training data.
var transformed = model.Transform(data);

// Read ML.NET predictions into a List<Result> (the enumerable is materialized by ToList()).
var results = mlContext.Data.CreateEnumerable<Result>(transformed, reuseRowObject: false).ToList();

// Let's go through all predictions.
for (int i = 0; i < samples.Count; ++i)
{
// The i-th example's prediction result.
var result = results[i];

// The i-th example's feature vector in text format (values separated by ',' with no spaces).
var featuresInText = string.Join(',', samples[i].Features);

if (result.PredictedLabel)
// The i-th sample is predicted as an inlier.
Console.WriteLine("The {0}-th example with features [{1}] is an inlier with a score of being inlier {2}",
i, featuresInText, result.Score);
else
// The i-th sample is predicted as an outlier.
Console.WriteLine("The {0}-th example with features [{1}] is an outlier with a score of being inlier {2}",
i, featuresInText, result.Score);
}
Copy link
Member

@sfilipi sfilipi Feb 28, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please put in comment below what would the output of this look. (The results of those WriteLine) #Closed

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good. Please see iteration 3.


In reply to: 261047636 [](ancestors = 261047636)

// Lines printed out should be (layout follows the format strings above; scores as recorded by the sample author):
// The 0-th example with features [1,0,0] is an inlier with a score of being inlier 0.7453707
// The 1-th example with features [0,2,1] is an inlier with a score of being inlier 0.9999999
// The 2-th example with features [1,2,3] is an inlier with a score of being inlier 0.8450122
// The 3-th example with features [0,1,0] is an inlier with a score of being inlier 0.9428905
// The 4-th example with features [0,2,1] is an inlier with a score of being inlier 0.9999999
// The 5-th example with features [-100,50,-100] is an outlier with a score of being inlier 0
}

// Example with 3 feature values. A training data set is a collection of such examples.
private class DataPoint
{
// Feature vector; annotated with VectorType(3) and always populated with 3 values in this sample.
[VectorType(3)]
public float[] Features { get; set; }
}

// Class used to capture prediction of DataPoint.
private class Result
{
// False for an outlier, true for an inlier.
public bool PredictedLabel { get; set; }
// Score of being an inlier; the outlier gets the smaller score.
public float Score { get; set; }
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.ML.Data;

namespace Microsoft.ML.Samples.Dynamic.Trainers.AnomalyDetection
{
/// <summary>
/// Sample that configures the randomized PCA anomaly detector through an options object,
/// trains it on in-memory data, and prints the prediction obtained for every training example.
/// </summary>
public static class RandomizedPcaSampleWithOptions
{
    /// <summary>
    /// Runs the sample end to end: load the in-memory data, train, score, and report to the console.
    /// </summary>
    public static void Example()
    {
        // The MLContext gives access to the ML.NET catalogs used below.
        // A fixed seed is supplied so that the sample's outputs are deterministic.
        var mlContext = new MLContext(seed: 0);

        // In-memory training set: five small-valued examples and one with much larger magnitudes.
        var trainingPoints = new List<DataPoint>
        {
            new DataPoint { Features = new float[] { 1, 0, 0 } },
            new DataPoint { Features = new float[] { 0, 2, 1 } },
            new DataPoint { Features = new float[] { 1, 2, 3 } },
            new DataPoint { Features = new float[] { 0, 1, 0 } },
            new DataPoint { Features = new float[] { 0, 2, 1 } },
            new DataPoint { Features = new float[] { -100, 50, -100 } }
        };

        // Wrap the list in an IDataView, the format consumable by ML.NET functions.
        var trainingView = mlContext.Data.LoadFromEnumerable(trainingPoints);

        // Trainer settings: which column holds the features, the rank, and the trainer's own seed.
        var trainerOptions = new ML.Trainers.RandomizedPcaTrainer.Options
        {
            FeatureColumnName = nameof(DataPoint.Features),
            Rank = 1,
            Seed = 10
        };

        // The anomaly detector; its underlying algorithm is randomized PCA.
        var detector = mlContext.AnomalyDetection.Trainers.RandomizedPca(trainerOptions);

        // Fit on the training data, then score that same data with the fitted model.
        var fittedModel = detector.Fit(trainingView);
        var scoredView = fittedModel.Transform(trainingView);

        // Materialize the predictions so they can be indexed alongside the input list.
        var predictions = mlContext.Data
            .CreateEnumerable<Result>(scoredView, reuseRowObject: false)
            .ToList();

        // Report every example together with its prediction.
        for (var index = 0; index < trainingPoints.Count; index++)
        {
            var prediction = predictions[index];

            // Feature values joined by ',' (no spaces) for display.
            var featuresInText = string.Join(',', trainingPoints[index].Features);

            // PredictedLabel is true for an inlier and false for an outlier.
            var messageFormat = prediction.PredictedLabel
                ? "The {0}-th example with features [{1}] is an inlier with a score of being inlier {2}"
                : "The {0}-th example with features [{1}] is an outlier with a score of being inlier {2}";

            Console.WriteLine(messageFormat, index, featuresInText, prediction.Score);
        }

        // Lines printed out should be (layout follows the format strings above; scores as recorded by the sample author):
        // The 0-th example with features [1,0,0] is an inlier with a score of being inlier 0.7453707
        // The 1-th example with features [0,2,1] is an inlier with a score of being inlier 0.9999999
        // The 2-th example with features [1,2,3] is an inlier with a score of being inlier 0.8450122
        // The 3-th example with features [0,1,0] is an inlier with a score of being inlier 0.9428905
        // The 4-th example with features [0,2,1] is an inlier with a score of being inlier 0.9999999
        // The 5-th example with features [-100,50,-100] is an outlier with a score of being inlier 0
    }

    /// <summary>
    /// One example with 3 feature values; a training data set is a collection of these.
    /// </summary>
    private class DataPoint
    {
        [VectorType(3)]
        public float[] Features { get; set; }
    }

    /// <summary>
    /// Receives the prediction produced for a <see cref="DataPoint"/>.
    /// </summary>
    private class Result
    {
        // False for an outlier, true for an inlier.
        public bool PredictedLabel { get; set; }

        // Score of being an inlier; the outlier gets the smaller score.
        public float Score { get; set; }
    }
}
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
using System;
using System.Collections.Generic;
using Microsoft.ML.Data;
using static Microsoft.ML.SamplesUtils.DatasetUtils;

namespace Microsoft.ML.Samples.Dynamic.Trainers.Recommendation
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
using System;
using System.Collections.Generic;
using Microsoft.ML.Data;
using Microsoft.ML.Trainers;
using static Microsoft.ML.SamplesUtils.DatasetUtils;

Expand Down
12 changes: 12 additions & 0 deletions src/Microsoft.ML.PCA/PCACatalog.cs
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,12 @@ public static PrincipalComponentAnalysisEstimator ProjectToPrincipalComponents(t
/// <param name="oversampling">Oversampling parameter for randomized PCA training.</param>
/// <param name="center">If enabled, data is centered to be zero mean.</param>
/// <param name="seed">The seed for random number generation.</param>
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// [!code-csharp[RPCA](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/AnomalyDetection/RandomizedPcaSample.cs)]
/// ]]></format>
/// </example>
public static RandomizedPcaTrainer RandomizedPca(this AnomalyDetectionCatalog.AnomalyDetectionTrainers catalog,
string featureColumnName = DefaultColumnNames.Features,
string exampleWeightColumnName = null,
Expand All @@ -65,6 +71,12 @@ public static RandomizedPcaTrainer RandomizedPca(this AnomalyDetectionCatalog.An
/// </summary>
/// <param name="catalog">The anomaly detection catalog trainer object.</param>
/// <param name="options">Advanced options to the algorithm.</param>
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// [!code-csharp[RPCA](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/AnomalyDetection/RandomizedPcaSampleWithOptions.cs)]
Copy link
Member

@sfilipi sfilipi Feb 28, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

RandomizedPcaSampleWithOptions [](start = 119, length = 30)

i don't think this exists #Resolved

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added. Thanks!


In reply to: 261047854 [](ancestors = 261047854)

/// ]]></format>
/// </example>
public static RandomizedPcaTrainer RandomizedPca(this AnomalyDetectionCatalog.AnomalyDetectionTrainers catalog, Options options)
{
Contracts.CheckValue(catalog, nameof(catalog));
Expand Down
92 changes: 92 additions & 0 deletions test/Microsoft.ML.Tests/AnomalyDetectionTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
// See the LICENSE file in the project root for more information.

using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.Data.DataView;
using Microsoft.ML.Data;
using Microsoft.ML.RunTests;
Expand Down Expand Up @@ -48,6 +50,96 @@ public void NoAnomalyTest()
Assert.Throws<ArgumentOutOfRangeException>(() => ML.AnomalyDetection.Evaluate(transformedData));
}

/// <summary>
/// Creates two randomized PCA trainers with the same settings (feature column, rank 1, no centering),
/// one through named arguments and one through an options object, and runs each of them through
/// <see cref="ExecutePipelineWithGivenRandomizedPcaTrainer"/>.
/// </summary>
[Fact]
public static void RandomizedPcaInMemory()
Copy link
Member

@sfilipi sfilipi Feb 28, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

RandomizedPcaInMemory [](start = 27, length = 21)

love it :) #WontFix

{
// Create a new context for ML.NET operations. It can be used for exception tracking and logging,
// as a catalog of available operations and as the source of randomness.
// Setting the seed to a fixed number in this example to make outputs deterministic.
var mlContext = new MLContext(seed: 0);

// Create an anomaly detector. Its underlying algorithm is randomized PCA.
var trainer1 = mlContext.AnomalyDetection.Trainers.RandomizedPca(featureColumnName: nameof(DataPoint.Features), rank: 1, center: false);

// Test the first detector.
ExecutePipelineWithGivenRandomizedPcaTrainer(mlContext, trainer1);

// Options object required to create the second detector; it mirrors the arguments used for the first one.
var options = new Trainers.RandomizedPcaTrainer.Options()
{
FeatureColumnName = nameof(DataPoint.Features),
Rank = 1,
Center = false
};

// Create another anomaly detector. Its underlying algorithm is randomized PCA.
var trainer2 = mlContext.AnomalyDetection.Trainers.RandomizedPca(options);

// Test the second detector.
ExecutePipelineWithGivenRandomizedPcaTrainer(mlContext, trainer2);
}

/// <summary>
/// Example with 3 feature values used in <see cref="ExecutePipelineWithGivenRandomizedPcaTrainer"/>.
/// </summary>
private class DataPoint
{
// Feature vector; annotated with VectorType(3) and always populated with 3 values by the test.
[VectorType(3)]
public float[] Features { get; set; }
}

/// <summary>
/// Class used to capture prediction of <see cref="DataPoint"/> in <see cref="ExecutePipelineWithGivenRandomizedPcaTrainer"/>.
/// </summary>
private class Result
{
// False for an outlier, true for an inlier.
public bool PredictedLabel { get; set; }
// Score of being an inlier; the test expects the outlier to get the smaller score.
public float Score { get; set; }
}

/// <summary>
/// Helper shared by the trainers defined in <see cref="RandomizedPcaInMemory"/>: fits the given trainer
/// on six in-memory examples, scores those same examples, and asserts that the first five are
/// predicted as inliers and the last one as an outlier.
/// </summary>
/// <param name="mlContext">Context used to load the data and to read back the predictions.</param>
/// <param name="trainer">The randomized PCA trainer under test.</param>
private static void ExecutePipelineWithGivenRandomizedPcaTrainer(MLContext mlContext, Trainers.RandomizedPcaTrainer trainer)
{
    // Number of leading examples expected to be inliers, and the index of the single outlier.
    const int inlierCount = 5;
    const int outlierIndex = 5;

    // Score separating the expected outlier range [0, 0.3] from the expected inlier range [0.3, 1].
    const double scoreBoundary = 0.3;

    // Five small-valued examples followed by one with much larger magnitudes.
    var trainingPoints = new List<DataPoint>
    {
        new DataPoint { Features = new float[] { 1, 0, 0 } },
        new DataPoint { Features = new float[] { 0, 2, 1 } },
        new DataPoint { Features = new float[] { 1, 2, 3 } },
        new DataPoint { Features = new float[] { 0, 1, 0 } },
        new DataPoint { Features = new float[] { 0, 2, 1 } },
        new DataPoint { Features = new float[] { -100, 50, -100 } }
    };

    // Wrap the list in an IDataView, the format consumable by ML.NET functions.
    var trainingView = mlContext.Data.LoadFromEnumerable(trainingPoints);

    // Fit the detector, then score the very data it was trained on.
    var fittedModel = trainer.Fit(trainingView);
    var scoredView = fittedModel.Transform(trainingView);

    // Materialize the predictions so they can be addressed by position.
    var predictions = mlContext.Data
        .CreateEnumerable<Result>(scoredView, reuseRowObject: false)
        .ToList();

    // Inliers: predicted label is true and the score falls in the upper range.
    foreach (var index in Enumerable.Range(0, inlierCount))
    {
        var inlier = predictions[index];
        Assert.True(inlier.PredictedLabel);
        Assert.InRange(inlier.Score, scoreBoundary, 1);
    }

    // Outlier: predicted label is false and the score falls in the lower range.
    var outlier = predictions[outlierIndex];
    Assert.False(outlier.PredictedLabel);
    Assert.InRange(outlier.Score, 0, scoreBoundary);
}

private IDataView DetectAnomalyInMnistOneClass(string trainPath, string testPath)
{
var loader = ML.Data.CreateTextLoader(new[]
Expand Down