-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Normalize documentation #3244
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Normalize documentation #3244
Changes from all commits
3eca87e
aaec479
611172a
9cffe3a
5d2da69
215a4ab
66fe40a
6217b99
6bda83d
ed811fc
43ca3e8
b0b639c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
This file was deleted.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
using System; | ||
using System.Collections.Generic; | ||
using System.Collections.Immutable; | ||
using System.Linq; | ||
using Microsoft.ML; | ||
using Microsoft.ML.Data; | ||
using static Microsoft.ML.Transforms.NormalizingTransformer; | ||
|
||
namespace Samples.Dynamic | ||
{ | ||
public class NormalizeBinning | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
one line comment about what this does. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
{ | ||
public static void Example() | ||
{ | ||
// Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, | ||
// as well as the source of randomness. | ||
var mlContext = new MLContext(); | ||
var samples = new List<DataPoint>() | ||
{ | ||
new DataPoint(){ Features = new float[4] { 8, 1, 3, 0} }, | ||
new DataPoint(){ Features = new float[4] { 6, 2, 2, 0} }, | ||
new DataPoint(){ Features = new float[4] { 4, 0, 1, 0} }, | ||
new DataPoint(){ Features = new float[4] { 2,-1,-1, 1} } | ||
}; | ||
// Convert training data to IDataView, the general data type used in ML.NET. | ||
var data = mlContext.Data.LoadFromEnumerable(samples); | ||
// NormalizeBinning normalizes the data by constructing equidensity bins and produces output based on | ||
// the bin to which the original value belongs. | ||
var normalize = mlContext.Transforms.NormalizeBinning("Features", maximumBinCount: 4, fixZero: false); | ||
|
||
// NormalizeBinning normalizes the data by constructing equidensity bins and produces output based on | ||
// the bin to which the original value belongs, while making sure zero values remain zero after normalization. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. duplicate comment...see line 26-27 #Closed There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. please move the details to similar to your other PR |
||
// Helps preserve sparsity. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe I would not repeat this explanation. Here and also in other samples. |
||
var normalizeFixZero = mlContext.Transforms.NormalizeBinning("Features", maximumBinCount: 4, fixZero: true); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
love the variation on the parameter! #WontFix |
||
|
||
// Now we can transform the data and look at the output to confirm the behavior of the estimator. | ||
// This operation doesn't actually evaluate data until we read the data below. | ||
var normalizeTransform = normalize.Fit(data); | ||
var transformedData = normalizeTransform.Transform(data); | ||
var normalizeFixZeroTransform = normalizeFixZero.Fit(data); | ||
var fixZeroData = normalizeFixZeroTransform.Transform(data); | ||
var column = transformedData.GetColumn<float[]>("Features").ToArray(); | ||
foreach (var row in column) | ||
Console.WriteLine(string.Join(", ", row.Select(x => x.ToString("f4")))); | ||
// Expected output: | ||
// 1.0000, 0.6667, 1.0000, 0.0000 | ||
// 0.6667, 1.0000, 0.6667, 0.0000 | ||
// 0.3333, 0.3333, 0.3333, 0.0000 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I'd emphasize this value. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
// 0.0000, 0.0000, 0.0000, 1.0000 | ||
|
||
var columnFixZero = fixZeroData.GetColumn<float[]>("Features").ToArray(); | ||
foreach (var row in columnFixZero) | ||
Console.WriteLine(string.Join(", ", row.Select(x => x.ToString("f4")))); | ||
// Expected output: | ||
// 1.0000, 0.3333, 1.0000, 0.0000 | ||
// 0.6667, 0.6667, 0.6667, 0.0000 | ||
// 0.3333, 0.0000, 0.3333, 0.0000 | ||
// 0.0000, -0.3333, 0.0000, 1.0000 | ||
|
||
// Let's get transformation parameters. Since we work with only one column we need to pass 0 as parameter for GetNormalizerModelParameters. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we need to pass 0, the index of this column in the dataview, as parameter for.. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
// If multiple columns are transformed, we need to pass the index of the corresponding InputOutputColumnPair. | ||
var transformParams = normalizeTransform.GetNormalizerModelParameters(0) as BinNormalizerModelParameters<ImmutableArray<float>>; | ||
var density = transformParams.Density[0]; | ||
var offset = (transformParams.Offset.Length == 0 ? 0 : transformParams.Offset[0]); | ||
Console.WriteLine($"The 0-index value in resulting array would be produce by: y = (Index(x) / {density}) - {offset}"); | ||
Console.WriteLine("Where Index(x) is the index of the bin to which x belongs"); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. please break super-long lines into two lines #Resolved |
||
Console.WriteLine($"Bins upper bounds are: {string.Join(" ", transformParams.UpperBounds[0])}"); | ||
// Expected output: | ||
// The 0-index value in resulting array would be produce by: y = (Index(x) / 3) - 0 | ||
// Where Index(x) is the index of the bin to which x belongs | ||
// Bins upper bounds are: 3 5 7 ∞ | ||
|
||
// Report the fitted parameters of the fixZero normalizer. Slot 1 of the "Features" vector is
// inspected here (every index below is [1]), so the message must say "1-index".
// A direct cast is used so an unexpected parameter type fails with InvalidCastException
// instead of a NullReferenceException on the next line.
var fixZeroParams = (BinNormalizerModelParameters<ImmutableArray<float>>)
    normalizeFixZeroTransform.GetNormalizerModelParameters(0);
density = fixZeroParams.Density[1];
// An empty Offset array is reported as an offset of 0.
offset = (fixZeroParams.Offset.Length == 0 ? 0 : fixZeroParams.Offset[1]);
Console.WriteLine($"The 1-index value in resulting array would be produce by: y = (Index(x) / {density}) - {offset}");
Console.WriteLine("Where Index(x) is the index of the bin to which x belongs");
Console.WriteLine($"Bins upper bounds are: {string.Join(" ", fixZeroParams.UpperBounds[1])}");
// Expected output:
// The 1-index value in resulting array would be produce by: y = (Index(x) / 3) - 0.3333333
// Where Index(x) is the index of the bin to which x belongs
// Bins upper bounds are: -0.5 0.5 1.5 ∞
} | ||
|
||
/// <summary>
/// Input row for the sample: one vector column named "Features".
/// </summary>
private class DataPoint
{
    /// <summary>Feature vector; every sample above assigns a four-element array.</summary>
    [VectorType(4)] public float[] Features { get; set; }
}
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
using System; | ||
using System.Collections.Generic; | ||
using System.Collections.Immutable; | ||
using System.Linq; | ||
using Microsoft.ML; | ||
using Microsoft.ML.Data; | ||
using static Microsoft.ML.Transforms.NormalizingTransformer; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
ditto #Resolved |
||
|
||
namespace Samples.Dynamic | ||
{ | ||
/// <summary>
/// Sample: normalizing a vector column with NormalizeLogMeanVariance, once with the
/// cumulative distribution function (CDF) output and once without it.
/// </summary>
public class NormalizeLogMeanVariance
{
    /// <summary>
    /// Fits both normalizer variants on four in-memory rows, prints the normalized
    /// "Features" vectors, then prints the fitted parameters for slot 1 of the vector.
    /// </summary>
    public static void Example()
    {
        // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
        // as well as the source of randomness.
        var mlContext = new MLContext();
        var samples = new List<DataPoint>()
        {
            new DataPoint(){ Features = new float[4] { 1, 1, 3, 0} },
            new DataPoint(){ Features = new float[4] { 2, 2, 2, 0} },
            new DataPoint(){ Features = new float[4] { 0, 0, 1, 0} },
            new DataPoint(){ Features = new float[4] {-1,-1,-1, 1} }
        };

        // Convert training data to IDataView, the general data type used in ML.NET.
        var data = mlContext.Data.LoadFromEnumerable(samples);

        // NormalizeLogMeanVariance normalizes the data based on the computed mean and variance of the
        // logarithm of the data. With useCdf: true the cumulative distribution function is used as output.
        var normalize = mlContext.Transforms.NormalizeLogMeanVariance("Features", useCdf: true);

        // The same estimator without the CDF output.
        var normalizeNoCdf = mlContext.Transforms.NormalizeLogMeanVariance("Features", useCdf: false);

        // Now we can transform the data and look at the output to confirm the behavior of the estimator.
        // This operation doesn't actually evaluate data until we read the data below.
        var normalizeTransform = normalize.Fit(data);
        var transformedData = normalizeTransform.Transform(data);
        var normalizeNoCdfTransform = normalizeNoCdf.Fit(data);
        var noCdfData = normalizeNoCdfTransform.Transform(data);

        var column = transformedData.GetColumn<float[]>("Features").ToArray();
        foreach (var row in column)
        {
            Console.WriteLine(string.Join(", ", row.Select(x => x.ToString("f4"))));
        }
        // Expected output:
        // 0.1587, 0.1587, 0.8654, 0.0000
        // 0.8413, 0.8413, 0.5837, 0.0000
        // 0.0000, 0.0000, 0.0940, 0.0000
        // 0.0000, 0.0000, 0.0000, 0.0000

        var noCdfColumn = noCdfData.GetColumn<float[]>("Features").ToArray();
        foreach (var row in noCdfColumn)
        {
            Console.WriteLine(string.Join(", ", row.Select(x => x.ToString("f4"))));
        }
        // Expected output (slots 0 and 1 receive identical inputs, so their outputs are identical):
        // 1.8854, 1.8854, 5.2970, 0.0000
        // 4.7708, 4.7708, 3.0925, 0.0000
        // -1.0000, -1.0000, 0.8879, 0.0000
        // -3.8854, -3.8854, -3.5213, 0.0000

        // Let's get transformation parameters. Since we work with only one column we need to pass 0 as
        // parameter for GetNormalizerModelParameters.
        // If multiple columns are transformed, we need to pass the index of the corresponding InputOutputColumnPair.
        // Direct casts are used so an unexpected parameter type fails with InvalidCastException
        // rather than a NullReferenceException on first use.
        var transformParams = (CdfNormalizerModelParameters<ImmutableArray<float>>)
            normalizeTransform.GetNormalizerModelParameters(0);
        Console.WriteLine("The 1-index value in resulting array would be produce by:");
        Console.WriteLine($"y = 0.5* (1 + ERF((Math.Log(x)- {transformParams.Mean[1]}) / ({transformParams.StandardDeviation[1]} * sqrt(2)))");

        // ERF is https://en.wikipedia.org/wiki/Error_function.
        // Expected output:
        // The 1-index value in resulting array would be produce by:
        // y = 0.5* (1 + ERF((Math.Log(x)- 0.3465736) / (0.3465736 * sqrt(2)))

        var noCdfParams = (AffineNormalizerModelParameters<ImmutableArray<float>>)
            normalizeNoCdfTransform.GetNormalizerModelParameters(0);
        // An empty Offset array is reported as an offset of 0.
        var offset = noCdfParams.Offset.Length == 0 ? 0 : noCdfParams.Offset[1];
        var scale = noCdfParams.Scale[1];
        Console.WriteLine($"The 1-index value in resulting array would be produce by: y = (x - ({offset})) * {scale}");
        // Expected output:
        // The 1-index value in resulting array would be produce by: y = (x - (2.88539)) * 0.3465736
        // NOTE(review): the slot-1 rows printed above (1 -> 1.8854, 2 -> 4.7708, 0 -> -1.0000) fit
        // y = (x - 0.3465736) * 2.88539, so the offset and scale recorded on the line above look
        // swapped - re-run the sample and confirm.
    }

    /// <summary>
    /// Input row for the sample: one vector column named "Features".
    /// </summary>
    private class DataPoint
    {
        [VectorType(4)]
        public float[] Features { get; set; }
    }
}
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
using System; | ||
using System.Collections.Generic; | ||
using System.Collections.Immutable; | ||
using System.Linq; | ||
using Microsoft.ML; | ||
using Microsoft.ML.Data; | ||
using static Microsoft.ML.Transforms.NormalizingTransformer; | ||
|
||
namespace Samples.Dynamic | ||
{ | ||
public class NormalizeMeanVariance | ||
{ | ||
public static void Example() | ||
{ | ||
// Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, | ||
// as well as the source of randomness. | ||
var mlContext = new MLContext(); | ||
var samples = new List<DataPoint>() | ||
{ | ||
new DataPoint(){ Features = new float[4] { 1, 1, 3, 0} }, | ||
new DataPoint(){ Features = new float[4] { 2, 2, 2, 0} }, | ||
new DataPoint(){ Features = new float[4] { 0, 0, 1, 0} }, | ||
new DataPoint(){ Features = new float[4] {-1,-1,-1, 1} } | ||
}; | ||
// Convert training data to IDataView, the general data type used in ML.NET. | ||
var data = mlContext.Data.LoadFromEnumerable(samples); | ||
// NormalizeMeanVariance normalizes the data based on the computed mean and variance of the data. | ||
// Uses Cumulative distribution function as output. | ||
var normalize = mlContext.Transforms.NormalizeMeanVariance("Features", useCdf: true); | ||
|
||
// NormalizeMeanVariance normalizes the data based on the computed mean and variance of the data. | ||
var normalizeNoCdf = mlContext.Transforms.NormalizeMeanVariance("Features", useCdf: false); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would not repeat what |
||
|
||
// Now we can transform the data and look at the output to confirm the behavior of the estimator. | ||
// This operation doesn't actually evaluate data until we read the data below. | ||
var normalizeTransform = normalize.Fit(data); | ||
var transformedData = normalizeTransform.Transform(data); | ||
var normalizeNoCdfTransform = normalizeNoCdf.Fit(data); | ||
var noCdfData = normalizeNoCdfTransform.Transform(data); | ||
var column = transformedData.GetColumn<float[]>("Features").ToArray(); | ||
foreach (var row in column) | ||
Console.WriteLine(string.Join(", ", row.Select(x => x.ToString("f4")))); | ||
// Expected output: | ||
// 0.6726, 0.6726, 0.8816, 0.2819 | ||
// 0.9101, 0.9101, 0.6939, 0.2819 | ||
// 0.3274, 0.3274, 0.4329, 0.2819 | ||
// 0.0899, 0.0899, 0.0641, 0.9584 | ||
|
||
|
||
var columnFixZero = noCdfData.GetColumn<float[]>("Features").ToArray(); | ||
foreach (var row in columnFixZero) | ||
Console.WriteLine(string.Join(", ", row.Select(x => x.ToString("f4")))); | ||
// Expected output: | ||
// 0.8165, 0.8165, 1.5492, 0.0000 | ||
// 1.6330, 1.6330, 1.0328, 0.0000 | ||
// 0.0000, 0.0000, 0.5164, 0.0000 | ||
// -0.8165,-0.8165,-0.5164, 2.0000 | ||
|
||
// Let's get transformation parameters. Since we work with only one column we need to pass 0 as parameter for GetNormalizerModelParameters. | ||
// If multiple columns are transformed, we need to pass the index of the corresponding InputOutputColumnPair. | ||
var transformParams = normalizeTransform.GetNormalizerModelParameters(0) as CdfNormalizerModelParameters<ImmutableArray<float>>; | ||
Console.WriteLine($"The 1-index value in resulting array would be produce by:"); | ||
Console.WriteLine($" y = 0.5* (1 + ERF((x- {transformParams.Mean[1]}) / ({transformParams.StandardDeviation[1]} * sqrt(2)))"); | ||
// ERF is https://en.wikipedia.org/wiki/Error_function. | ||
// Expected output: | ||
// The 1-index value in resulting array would be produce by: | ||
// y = 0.5 * (1 + ERF((x - 0.5) / (1.118034 * sqrt(2))) | ||
|
||
// Report the fitted affine parameters (offset and scale) for slot 1 of the no-CDF normalizer.
// A direct cast is used so an unexpected parameter type fails with InvalidCastException
// instead of a NullReferenceException on the next line.
var noCdfParams = (AffineNormalizerModelParameters<ImmutableArray<float>>)
    normalizeNoCdfTransform.GetNormalizerModelParameters(0);
// An empty Offset array is reported as an offset of 0.
var offset = noCdfParams.Offset.Length == 0 ? 0 : noCdfParams.Offset[1];
var scale = noCdfParams.Scale[1];
Console.WriteLine($"Values for slot 1 would be transformed by applying y = (x - ({offset})) * {scale}");
// Expected output:
// Values for slot 1 would be transformed by applying y = (x - (0)) * 0.8164966
} | ||
|
||
/// <summary>
/// Input row for the sample: one vector column named "Features".
/// </summary>
private class DataPoint
{
    /// <summary>Feature vector; every sample above assigns a four-element array.</summary>
    [VectorType(4)] public float[] Features { get; set; }
}
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
do we need this? #ByDesign
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, otherwise I need to write Microsoft.ML.Transforms.NormalizingTransformer.BinNormalizerModelParameters which takes too much space
In reply to: 274057369 [](ancestors = 274057369)