
Commit 6e9023f

Add an example of random PCA using in-memory data structure (#2780)
1 parent fbf282d commit 6e9023f

File tree

6 files changed, +283 -2 lines changed

docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/AnomalyDetection/RandomizedPcaSample.cs (+86)
docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/AnomalyDetection/RandomizedPcaSampleWithOptions.cs (+93)
docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Recommendation/MatrixFactorization.cs (-1)
docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Recommendation/MatrixFactorizationWithOptions.cs (-1)
src/Microsoft.ML.PCA/PCACatalog.cs (+12)
test/Microsoft.ML.Tests/AnomalyDetectionTests.cs (+92)
docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/AnomalyDetection/RandomizedPcaSample.cs

+86
@@ -0,0 +1,86 @@
using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.ML.Data;

namespace Microsoft.ML.Samples.Dynamic.Trainers.AnomalyDetection
{
    public static class RandomizedPcaSample
    {
        public static void Example()
        {
            // Create a new context for ML.NET operations. It can be used for exception tracking and logging,
            // as a catalog of available operations and as the source of randomness.
            // Setting the seed to a fixed number in this example to make outputs deterministic.
            var mlContext = new MLContext(seed: 0);

            // Training data.
            var samples = new List<DataPoint>()
            {
                new DataPoint(){ Features = new float[3] {1, 0, 0} },
                new DataPoint(){ Features = new float[3] {0, 2, 1} },
                new DataPoint(){ Features = new float[3] {1, 2, 3} },
                new DataPoint(){ Features = new float[3] {0, 1, 0} },
                new DataPoint(){ Features = new float[3] {0, 2, 1} },
                new DataPoint(){ Features = new float[3] {-100, 50, -100} }
            };

            // Convert the List<DataPoint> to IDataView, a consumable format for ML.NET functions.
            var data = mlContext.Data.LoadFromEnumerable(samples);

            // Create an anomaly detector. Its underlying algorithm is randomized PCA.
            var pipeline = mlContext.AnomalyDetection.Trainers.RandomizedPca(featureColumnName: nameof(DataPoint.Features), rank: 1, center: false);

            // Train the anomaly detector.
            var model = pipeline.Fit(data);

            // Apply the trained model on the training data.
            var transformed = model.Transform(data);

            // Read ML.NET predictions into IEnumerable<Result>.
            var results = mlContext.Data.CreateEnumerable<Result>(transformed, reuseRowObject: false).ToList();

            // Let's go through all predictions.
            for (int i = 0; i < samples.Count; ++i)
            {
                // The i-th example's prediction result.
                var result = results[i];

                // The i-th example's feature vector in text format.
                var featuresInText = string.Join(',', samples[i].Features);

                if (result.PredictedLabel)
                    // The i-th sample is predicted as an inlier.
                    Console.WriteLine("The {0}-th example with features [{1}] is an inlier with a score of being inlier {2}",
                        i, featuresInText, result.Score);
                else
                    // The i-th sample is predicted as an outlier.
                    Console.WriteLine("The {0}-th example with features [{1}] is an outlier with a score of being inlier {2}",
                        i, featuresInText, result.Score);
            }
            // Lines printed out should be
            //   The 0-th example with features [1,0,0] is an inlier with a score of being inlier 0.7453707
            //   The 1-th example with features [0,2,1] is an inlier with a score of being inlier 0.9999999
            //   The 2-th example with features [1,2,3] is an inlier with a score of being inlier 0.8450122
            //   The 3-th example with features [0,1,0] is an inlier with a score of being inlier 0.9428905
            //   The 4-th example with features [0,2,1] is an inlier with a score of being inlier 0.9999999
            //   The 5-th example with features [-100,50,-100] is an outlier with a score of being inlier 0
        }

        // Example with 3 feature values. A training data set is a collection of such examples.
        private class DataPoint
        {
            [VectorType(3)]
            public float[] Features { get; set; }
        }

        // Class used to capture prediction of DataPoint.
        private class Result
        {
            // Outlier gets false while inlier has true.
            public bool PredictedLabel { get; set; }
            // Outlier gets smaller score.
            public float Score { get; set; }
        }
    }
}
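
The expected output above shows the point [-100,50,-100] dropping to an inlier score of 0 while the other five stay close to 1. The intuition, with rank: 1 and center: false, is that the trainer keeps a single principal direction and scores each point by how well its projection onto that direction reconstructs it. The toy snippet below illustrates only that idea; it is not part of the commit, the direction vector is a made-up approximation, and ML.NET's actual score normalization differs.

// Toy illustration of PCA-style anomaly scoring (hypothetical numbers, not ML.NET internals):
// project each point onto one fixed, roughly unit-length direction and measure how far the
// projection is from the original point (the reconstruction error).
using System;

public static class PcaIntuition
{
    public static void Main()
    {
        // Made-up direction roughly aligned with the inlier points in the sample above.
        double[] v = { 0.27, 0.53, 0.80 };
        double[][] points = { new double[] { 1, 2, 3 }, new double[] { -100, 50, -100 } };

        foreach (var x in points)
        {
            // Projection coefficient of x onto v.
            double dot = 0;
            for (int i = 0; i < 3; ++i)
                dot += x[i] * v[i];

            // Reconstruction error: distance between x and its projection dot * v.
            double error = 0;
            for (int i = 0; i < 3; ++i)
            {
                double diff = x[i] - dot * v[i];
                error += diff * diff;
            }

            Console.WriteLine("[{0}] reconstruction error = {1:F2}", string.Join(",", x), Math.Sqrt(error));
        }
        // The inlier-like point reconstructs almost exactly; [-100,50,-100] does not,
        // which is why a PCA-based detector flags it as an outlier.
    }
}
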
docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/AnomalyDetection/RandomizedPcaSampleWithOptions.cs

+93
@@ -0,0 +1,93 @@
using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.ML.Data;

namespace Microsoft.ML.Samples.Dynamic.Trainers.AnomalyDetection
{
    public static class RandomizedPcaSampleWithOptions
    {
        public static void Example()
        {
            // Create a new context for ML.NET operations. It can be used for exception tracking and logging,
            // as a catalog of available operations and as the source of randomness.
            // Setting the seed to a fixed number in this example to make outputs deterministic.
            var mlContext = new MLContext(seed: 0);

            // Training data.
            var samples = new List<DataPoint>()
            {
                new DataPoint(){ Features = new float[3] {1, 0, 0} },
                new DataPoint(){ Features = new float[3] {0, 2, 1} },
                new DataPoint(){ Features = new float[3] {1, 2, 3} },
                new DataPoint(){ Features = new float[3] {0, 1, 0} },
                new DataPoint(){ Features = new float[3] {0, 2, 1} },
                new DataPoint(){ Features = new float[3] {-100, 50, -100} }
            };

            // Convert the List<DataPoint> to IDataView, a consumable format for ML.NET functions.
            var data = mlContext.Data.LoadFromEnumerable(samples);

            var options = new ML.Trainers.RandomizedPcaTrainer.Options()
            {
                FeatureColumnName = nameof(DataPoint.Features),
                Rank = 1,
                Seed = 10,
            };

            // Create an anomaly detector. Its underlying algorithm is randomized PCA.
            var pipeline = mlContext.AnomalyDetection.Trainers.RandomizedPca(options);

            // Train the anomaly detector.
            var model = pipeline.Fit(data);

            // Apply the trained model on the training data.
            var transformed = model.Transform(data);

            // Read ML.NET predictions into IEnumerable<Result>.
            var results = mlContext.Data.CreateEnumerable<Result>(transformed, reuseRowObject: false).ToList();

            // Let's go through all predictions.
            for (int i = 0; i < samples.Count; ++i)
            {
                // The i-th example's prediction result.
                var result = results[i];

                // The i-th example's feature vector in text format.
                var featuresInText = string.Join(',', samples[i].Features);

                if (result.PredictedLabel)
                    // The i-th sample is predicted as an inlier.
                    Console.WriteLine("The {0}-th example with features [{1}] is an inlier with a score of being inlier {2}",
                        i, featuresInText, result.Score);
                else
                    // The i-th sample is predicted as an outlier.
                    Console.WriteLine("The {0}-th example with features [{1}] is an outlier with a score of being inlier {2}",
                        i, featuresInText, result.Score);
            }
            // Lines printed out should be
            //   The 0-th example with features [1,0,0] is an inlier with a score of being inlier 0.7453707
            //   The 1-th example with features [0,2,1] is an inlier with a score of being inlier 0.9999999
            //   The 2-th example with features [1,2,3] is an inlier with a score of being inlier 0.8450122
            //   The 3-th example with features [0,1,0] is an inlier with a score of being inlier 0.9428905
            //   The 4-th example with features [0,2,1] is an inlier with a score of being inlier 0.9999999
            //   The 5-th example with features [-100,50,-100] is an outlier with a score of being inlier 0
        }

        // Example with 3 feature values. A training data set is a collection of such examples.
        private class DataPoint
        {
            [VectorType(3)]
            public float[] Features { get; set; }
        }

        // Class used to capture prediction of DataPoint.
        private class Result
        {
            // Outlier gets false while inlier has true.
            public bool PredictedLabel { get; set; }
            // Outlier gets smaller score.
            public float Score { get; set; }
        }
    }
}
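
A usage note, not part of this commit: after Fit in either sample above, single in-memory examples can also be scored one at a time with a prediction engine. The sketch below is hedged: it assumes the mlContext.Model.CreatePredictionEngine<TSrc, TDst> API available in ML.NET 1.x and reuses the DataPoint and Result classes defined in the samples.

// Sketch only: continues from the end of Example() above, where `mlContext` and the
// fitted `model` are still in scope. Assumes ML.NET 1.x's CreatePredictionEngine API.
var engine = mlContext.Model.CreatePredictionEngine<DataPoint, Result>(model);
var single = engine.Predict(new DataPoint() { Features = new float[3] { 0, 2, 1 } });
Console.WriteLine("Single prediction: PredictedLabel={0}, Score={1}", single.PredictedLabel, single.Score);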

docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Recommendation/MatrixFactorization.cs

-1
@@ -1,6 +1,5 @@
 using System;
 using System.Collections.Generic;
-using Microsoft.ML.Data;
 using static Microsoft.ML.SamplesUtils.DatasetUtils;
 
 namespace Microsoft.ML.Samples.Dynamic.Trainers.Recommendation

docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/Recommendation/MatrixFactorizationWithOptions.cs

-1
@@ -1,6 +1,5 @@
 using System;
 using System.Collections.Generic;
-using Microsoft.ML.Data;
 using Microsoft.ML.Trainers;
 using static Microsoft.ML.SamplesUtils.DatasetUtils;

src/Microsoft.ML.PCA/PCACatalog.cs

+12
@@ -47,6 +47,12 @@ public static PrincipalComponentAnalysisEstimator ProjectToPrincipalComponents(t
         /// <param name="oversampling">Oversampling parameter for randomized PCA training.</param>
         /// <param name="center">If enabled, data is centered to be zero mean.</param>
         /// <param name="seed">The seed for random number generation.</param>
+        /// <example>
+        /// <format type="text/markdown">
+        /// <![CDATA[
+        /// [!code-csharp[RPCA](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/AnomalyDetection/RandomizedPcaSample.cs)]
+        /// ]]></format>
+        /// </example>
         public static RandomizedPcaTrainer RandomizedPca(this AnomalyDetectionCatalog.AnomalyDetectionTrainers catalog,
             string featureColumnName = DefaultColumnNames.Features,
             string exampleWeightColumnName = null,
@@ -65,6 +71,12 @@ public static RandomizedPcaTrainer RandomizedPca(this AnomalyDetectionCatalog.An
         /// </summary>
         /// <param name="catalog">The anomaly detection catalog trainer object.</param>
         /// <param name="options">Advanced options to the algorithm.</param>
+        /// <example>
+        /// <format type="text/markdown">
+        /// <![CDATA[
+        /// [!code-csharp[RPCA](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/AnomalyDetection/RandomizedPcaSampleWithOptions.cs)]
+        /// ]]></format>
+        /// </example>
         public static RandomizedPcaTrainer RandomizedPca(this AnomalyDetectionCatalog.AnomalyDetectionTrainers catalog, Options options)
         {
             Contracts.CheckValue(catalog, nameof(catalog));
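
The oversampling, center, and seed parameters documented above correspond to properties on RandomizedPcaTrainer.Options. As a hedged sketch (not part of this commit), an options object that sets all of them could look like the following; FeatureColumnName, Rank, and Seed appear in RandomizedPcaSampleWithOptions.cs, Center in the new test, and the Oversampling property name is inferred from the parameter doc above.

// Sketch: every documented knob in one options object, in the context of the samples above
// (same mlContext and DataPoint class). Oversampling's property name is an assumption
// inferred from the <param name="oversampling"> doc rather than taken from this commit.
var options = new ML.Trainers.RandomizedPcaTrainer.Options()
{
    FeatureColumnName = nameof(DataPoint.Features),
    Rank = 2,           // number of principal components to keep
    Oversampling = 20,  // oversampling parameter for randomized PCA training
    Center = true,      // center data to be zero mean before training
    Seed = 10           // seed for random number generation
};
var trainer = mlContext.AnomalyDetection.Trainers.RandomizedPca(options);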

test/Microsoft.ML.Tests/AnomalyDetectionTests.cs

+92
@@ -3,6 +3,8 @@
 // See the LICENSE file in the project root for more information.
 
 using System;
+using System.Collections.Generic;
+using System.Linq;
 using Microsoft.Data.DataView;
 using Microsoft.ML.Data;
 using Microsoft.ML.RunTests;
@@ -48,6 +50,96 @@ public void NoAnomalyTest()
             Assert.Throws<ArgumentOutOfRangeException>(() => ML.AnomalyDetection.Evaluate(transformedData));
         }
 
+        [Fact]
+        public static void RandomizedPcaInMemory()
+        {
+            // Create a new context for ML.NET operations. It can be used for exception tracking and logging,
+            // as a catalog of available operations and as the source of randomness.
+            // Setting the seed to a fixed number in this example to make outputs deterministic.
+            var mlContext = new MLContext(seed: 0);
+
+            // Create an anomaly detector. Its underlying algorithm is randomized PCA.
+            var trainer1 = mlContext.AnomalyDetection.Trainers.RandomizedPca(featureColumnName: nameof(DataPoint.Features), rank: 1, center: false);
+
+            // Test the first detector.
+            ExecutePipelineWithGivenRandomizedPcaTrainer(mlContext, trainer1);
+
+            // Object required in the creation of another detector.
+            var options = new Trainers.RandomizedPcaTrainer.Options()
+            {
+                FeatureColumnName = nameof(DataPoint.Features),
+                Rank = 1,
+                Center = false
+            };
+
+            // Create another anomaly detector. Its underlying algorithm is randomized PCA.
+            var trainer2 = mlContext.AnomalyDetection.Trainers.RandomizedPca(options);
+
+            // Test the second detector.
+            ExecutePipelineWithGivenRandomizedPcaTrainer(mlContext, trainer2);
+        }
+
+        /// <summary>
+        /// Example with 3 feature values used in <see cref="ExecutePipelineWithGivenRandomizedPcaTrainer"/>.
+        /// </summary>
+        private class DataPoint
+        {
+            [VectorType(3)]
+            public float[] Features { get; set; }
+        }
+
+        /// <summary>
+        /// Class used to capture prediction of <see cref="DataPoint"/> in <see cref="ExecutePipelineWithGivenRandomizedPcaTrainer"/>.
+        /// </summary>
+        private class Result
+        {
+            // Outlier gets false while inlier has true.
+            public bool PredictedLabel { get; set; }
+            // Outlier gets smaller score.
+            public float Score { get; set; }
+        }
+
+        /// <summary>
+        /// Helper function used to execute trainers defined in <see cref="RandomizedPcaInMemory"/>.
+        /// </summary>
+        private static void ExecutePipelineWithGivenRandomizedPcaTrainer(MLContext mlContext, Trainers.RandomizedPcaTrainer trainer)
+        {
+            var samples = new List<DataPoint>()
+            {
+                new DataPoint(){ Features = new float[3] {1, 0, 0} },
+                new DataPoint(){ Features = new float[3] {0, 2, 1} },
+                new DataPoint(){ Features = new float[3] {1, 2, 3} },
+                new DataPoint(){ Features = new float[3] {0, 1, 0} },
+                new DataPoint(){ Features = new float[3] {0, 2, 1} },
+                new DataPoint(){ Features = new float[3] {-100, 50, -100} }
+            };
+
+            // Convert the List<DataPoint> to IDataView, a consumable format for ML.NET functions.
+            var data = mlContext.Data.LoadFromEnumerable(samples);
+
+            // Train the anomaly detector.
+            var model = trainer.Fit(data);
+
+            // Apply the trained model on the training data.
+            var transformed = model.Transform(data);
+
+            // Read ML.NET predictions into IEnumerable<Result>.
+            var results = mlContext.Data.CreateEnumerable<Result>(transformed, reuseRowObject: false).ToList();
+
+            // First 5 examples are inliers.
+            for (int i = 0; i < 5; ++i)
+            {
+                // Inlier should be predicted as true.
+                Assert.True(results[i].PredictedLabel);
+                // Higher score means closer to inlier.
+                Assert.InRange(results[i].Score, 0.3, 1);
+            }
+
+            // Last example is an outlier. Note that an outlier should be predicted as false.
+            Assert.False(results[5].PredictedLabel);
+            Assert.InRange(results[5].Score, 0, 0.3);
+        }
+
         private IDataView DetectAnomalyInMnistOneClass(string trainPath, string testPath)
         {
             var loader = ML.Data.CreateTextLoader(new[]
