Cherry pick Projection documentation and Normalize changes to 1.0 #3344

Closed
91 changes: 0 additions & 91 deletions docs/samples/Microsoft.ML.Samples/Dynamic/Normalizer.cs

This file was deleted.

97 changes: 0 additions & 97 deletions docs/samples/Microsoft.ML.Samples/Dynamic/ProjectionTransforms.cs

This file was deleted.

52 changes: 52 additions & 0 deletions docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ApproximatedKernelMap.cs
@@ -0,0 +1,52 @@
using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.ML.Transforms;

namespace Samples.Dynamic
{
public static class ApproximatedKernelMap
{
// Transforms the feature vector into another, non-linear space. See https://people.eecs.berkeley.edu/~brecht/papers/07.rah.rec.nips.pdf.
public static void Example()
{
// Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
// as well as the source of randomness.
var mlContext = new MLContext();
var samples = new List<DataPoint>()
{
new DataPoint(){ Features = new float[7] { 1, 1, 0, 0, 1, 0, 1} },
new DataPoint(){ Features = new float[7] { 0, 0, 1, 0, 0, 1, 1} },
new DataPoint(){ Features = new float[7] {-1, 1, 0,-1,-1, 0,-1} },
new DataPoint(){ Features = new float[7] { 0,-1, 0, 1, 0,-1,-1} }
};
// Convert training data to IDataView, the general data type used in ML.NET.
var data = mlContext.Data.LoadFromEnumerable(samples);
// ApproximatedKernelMap takes the data and maps it to a random low-dimensional space.
var approximation = mlContext.Transforms.ApproximatedKernelMap("Features", rank: 4, generator: new GaussianKernel(gamma: 0.7f), seed: 1);

// Now we can transform the data and look at the output to confirm the behavior of the estimator.
// This operation doesn't actually evaluate data until we read the data below.
var transformer = approximation.Fit(data);
var transformedData = transformer.Transform(data);

var column = transformedData.GetColumn<float[]>("Features").ToArray();
foreach (var row in column)
Console.WriteLine(string.Join(", ", row.Select(x => x.ToString("f4"))));
// Expected output:
// -0.0119, 0.5867, 0.4942, 0.7041
// 0.4720, 0.5639, 0.4346, 0.2671
// -0.2243, 0.7071, 0.7053, -0.1681
// 0.0846, 0.5836, 0.6575, 0.0581
}

private class DataPoint
{
[VectorType(7)]
public float[] Features { get; set; }
}

}
}
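For context, here is a minimal sketch (not part of this PR) of the random Fourier feature idea behind ApproximatedKernelMap, per the Rahimi-Recht paper linked in the sample: draw random directions w ~ N(0, 2γI) and phases b ~ U[0, 2π); then z(x) = sqrt(2/rank) * cos(w·x + b) satisfies z(x)·z(y) ≈ exp(-γ‖x − y‖²) in expectation. All names below are illustrative, and the exact numbers will differ from the sample's expected output because ML.NET uses its own random stream.

using System;
using System.Linq;

public static class RandomFourierSketch
{
    public static void Main()
    {
        int dim = 7, rank = 4;
        float gamma = 0.7f;
        var rng = new Random(1);
        // One Gaussian draw per (output feature, input slot), plus one phase per output feature.
        double[][] w = Enumerable.Range(0, rank)
            .Select(_ => Enumerable.Range(0, dim)
                .Select(__ => Gaussian(rng) * Math.Sqrt(2 * gamma)).ToArray()).ToArray();
        double[] b = Enumerable.Range(0, rank).Select(_ => rng.NextDouble() * 2 * Math.PI).ToArray();

        float[] x = { 1, 1, 0, 0, 1, 0, 1 }; // first data point from the sample above
        var z = Enumerable.Range(0, rank)
            .Select(i => Math.Sqrt(2.0 / rank) * Math.Cos(w[i].Zip(x, (wi, xi) => wi * xi).Sum() + b[i]));
        Console.WriteLine(string.Join(", ", z.Select(v => v.ToString("f4"))));
    }

    // Box-Muller transform: one standard normal sample from two uniform samples.
    private static double Gaussian(Random rng) =>
        Math.Sqrt(-2 * Math.Log(1 - rng.NextDouble())) * Math.Cos(2 * Math.PI * rng.NextDouble());
}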
91 changes: 91 additions & 0 deletions docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/NormalizeBinning.cs
@@ -0,0 +1,91 @@
using System;
using System.Collections.Generic;
using System.Collections.Immutable;
using System.Linq;
using Microsoft.ML;
using Microsoft.ML.Data;
using static Microsoft.ML.Transforms.NormalizingTransformer;

namespace Samples.Dynamic
{
public class NormalizeBinning
{
public static void Example()
{
// Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
// as well as the source of randomness.
var mlContext = new MLContext();
var samples = new List<DataPoint>()
{
new DataPoint(){ Features = new float[4] { 8, 1, 3, 0} },
new DataPoint(){ Features = new float[4] { 6, 2, 2, 0} },
new DataPoint(){ Features = new float[4] { 4, 0, 1, 0} },
new DataPoint(){ Features = new float[4] { 2,-1,-1, 1} }
};
// Convert training data to IDataView, the general data type used in ML.NET.
var data = mlContext.Data.LoadFromEnumerable(samples);
// NormalizeBinning normalizes the data by constructing equidensity bins and produces output based on
// which bin the original value belongs to.
var normalize = mlContext.Transforms.NormalizeBinning("Features", maximumBinCount: 4, fixZero: false);

// NormalizeBinning normalizes the data by constructing equidensity bins and produces output based on
// which bin the original value belongs to, but makes sure that zero values remain zero after normalization.
// This helps preserve sparsity.
var normalizeFixZero = mlContext.Transforms.NormalizeBinning("Features", maximumBinCount: 4, fixZero: true);

// Now we can transform the data and look at the output to confirm the behavior of the estimator.
// This operation doesn't actually evaluate data until we read the data below.
var normalizeTransform = normalize.Fit(data);
var transformedData = normalizeTransform.Transform(data);
var normalizeFixZeroTransform = normalizeFixZero.Fit(data);
var fixZeroData = normalizeFixZeroTransform.Transform(data);
var column = transformedData.GetColumn<float[]>("Features").ToArray();
foreach (var row in column)
Console.WriteLine(string.Join(", ", row.Select(x => x.ToString("f4"))));
// Expected output:
// 1.0000, 0.6667, 1.0000, 0.0000
// 0.6667, 1.0000, 0.6667, 0.0000
// 0.3333, 0.3333, 0.3333, 0.0000
// 0.0000, 0.0000, 0.0000, 1.0000

var columnFixZero = fixZeroData.GetColumn<float[]>("Features").ToArray();
foreach (var row in columnFixZero)
Console.WriteLine(string.Join(", ", row.Select(x => x.ToString("f4"))));
// Expected output:
// 1.0000, 0.3333, 1.0000, 0.0000
// 0.6667, 0.6667, 0.6667, 0.0000
// 0.3333, 0.0000, 0.3333, 0.0000
// 0.0000, -0.3333, 0.0000, 1.0000

// Let's get the transformation parameters. Since we are working with only one column, we pass 0 to GetNormalizerModelParameters.
// If we were working with multiple column transformations, we would pass the index of the relevant InputOutputColumnPair.
var transformParams = normalizeTransform.GetNormalizerModelParameters(0) as BinNormalizerModelParameters<ImmutableArray<float>>;
var density = transformParams.Density[0];
var offset = (transformParams.Offset.Length == 0 ? 0 : transformParams.Offset[0]);
Console.WriteLine($"The 0-index value in resulting array would be produce by: y = (Index(x) / {density}) - {offset}");
Console.WriteLine("Where Index(x) is the index of the bin to which x belongs");
Console.WriteLine($"Bins upper bounds are: {string.Join(" ", transformParams.UpperBounds[0])}");
// Expected output:
// The 0-index value in resulting array would be produce by: y = (Index(x) / 3) - 0
// Where Index(x) is the index of the bin to which x belongs
// Bins upper bounds are: 3 5 7 ∞

var fixZeroParams = (normalizeFixZeroTransform.GetNormalizerModelParameters(0) as BinNormalizerModelParameters<ImmutableArray<float>>);
density = fixZeroParams.Density[1];
offset = (fixZeroParams.Offset.Length == 0 ? 0 : fixZeroParams.Offset[1]);
Console.WriteLine($"The 0-index value in resulting array would be produce by: y = (Index(x) / {density}) - {offset}");
Console.WriteLine("Where Index(x) is the index of the bin to which x belongs");
Console.WriteLine($"Bins upper bounds are: {string.Join(" ", fixZeroParams.UpperBounds[1])}");
// Expected output:
// The 0-index value in resulting array would be produce by: y = (Index(x) / 3) - 0.3333333
// Where Index(x) is the index of the bin to which x belongs
// Bins upper bounds are: -0.5 0.5 1.5 ∞
}

private class DataPoint
{
[VectorType(4)]
public float[] Features { get; set; }
}
}
}
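To see where the first expected-output column comes from, here is a hand verification (not part of the PR) using the printed parameters: with upper bounds {3, 5, 7, ∞}, density 3, and offset 0, slot 0 is y = Index(x) / 3. The boundary test (x <= bound) is an assumption for illustration; none of the sample values fall on a boundary.

using System;

public static class BinningCheck
{
    public static void Main()
    {
        float[] upperBounds = { 3, 5, 7, float.PositiveInfinity };
        float[] slot0 = { 8, 6, 4, 2 }; // slot 0 of the sample data, row by row
        foreach (var x in slot0)
        {
            // Index(x): the first bin whose upper bound is not exceeded by x.
            int index = Array.FindIndex(upperBounds, b => x <= b);
            Console.WriteLine((index / 3f).ToString("f4"));
        }
        // 1.0000, 0.6667, 0.3333, 0.0000 — the first column of the first expected output.
    }
}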
48 changes: 48 additions & 0 deletions docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/NormalizeGlobalContrast.cs
@@ -0,0 +1,48 @@
using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.ML;
using Microsoft.ML.Data;

namespace Samples.Dynamic
{
class NormalizeGlobalContrast
{
public static void Example()
{
// Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
// as well as the source of randomness.
var mlContext = new MLContext();
var samples = new List<DataPoint>()
{
new DataPoint(){ Features = new float[4] { 1, 1, 0, 0} },
new DataPoint(){ Features = new float[4] { 2, 2, 0, 0} },
new DataPoint(){ Features = new float[4] { 1, 0, 1, 0} },
new DataPoint(){ Features = new float[4] { 0, 1, 0, 1} }
};
// Convert training data to IDataView, the general data type used in ML.NET.
var data = mlContext.Data.LoadFromEnumerable(samples);
var approximation = mlContext.Transforms.NormalizeGlobalContrast("Features", ensureZeroMean: false, scale: 2, ensureUnitStandardDeviation: true);

// Now we can transform the data and look at the output to confirm the behavior of the estimator.
// This operation doesn't actually evaluate data until we read the data below.
var transformer = approximation.Fit(data);
var transformedData = transformer.Transform(data);

var column = transformedData.GetColumn<float[]>("Features").ToArray();
foreach (var row in column)
Console.WriteLine(string.Join(", ", row.Select(x => x.ToString("f4"))));
// Expected output:
// 2.0000, 2.0000,-2.0000,-2.0000
// 2.0000, 2.0000,-2.0000,-2.0000
// 2.0000,-2.0000, 2.0000,-2.0000
// -2.0000, 2.0000,-2.0000, 2.0000
}

private class DataPoint
{
[VectorType(4)]
public float[] Features { get; set; }
}
}
}
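A hand check (not part of the PR) of the first expected row, assuming the per-row formula stated in the remarks added to NormalizerCatalog.cs later in this PR: y = scale * (x - mean) / stddev, with scale = 2 and the population standard deviation. Subtracting the mean is an assumption here; it is what the printed values match.

using System;
using System.Linq;

public static class GcnCheck
{
    public static void Main()
    {
        float[] row = { 1, 1, 0, 0 }; // first data point from the sample above
        float mean = row.Average();
        double stddev = Math.Sqrt(row.Average(x => Math.Pow(x - mean, 2))); // population stddev
        Console.WriteLine(string.Join(", ", row.Select(x => (2 * (x - mean) / stddev).ToString("f4"))));
        // 2.0000, 2.0000, -2.0000, -2.0000 — the first expected row.
    }
}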
82 changes: 82 additions & 0 deletions docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/NormalizeLogMeanVariance.cs
@@ -0,0 +1,82 @@
using System;
using System.Collections.Generic;
using System.Collections.Immutable;
using System.Linq;
using Microsoft.ML;
using Microsoft.ML.Data;
using static Microsoft.ML.Transforms.NormalizingTransformer;

namespace Samples.Dynamic
{
public class NormalizeLogMeanVariance
{
public static void Example()
{
// Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
// as well as the source of randomness.
var mlContext = new MLContext();
var samples = new List<DataPoint>()
{
new DataPoint(){ Features = new float[4] { 1, 1, 3, 0} },
new DataPoint(){ Features = new float[4] { 2, 2, 2, 0} },
new DataPoint(){ Features = new float[4] { 0, 0, 1, 0} },
new DataPoint(){ Features = new float[4] {-1,-1,-1, 1} }
};
// Convert training data to IDataView, the general data type used in ML.NET.
var data = mlContext.Data.LoadFromEnumerable(samples);
// NormalizeLogMeanVariance normalizes the data based on the computed mean and variance of the logarithm of the data.
// It uses the cumulative distribution function (CDF) as output.
var normalize = mlContext.Transforms.NormalizeLogMeanVariance("Features", useCdf: true);

// NormalizeLogMeanVariance normalizes the data based on the computed mean and variance of the logarithm of the data.
var normalizeNoCdf = mlContext.Transforms.NormalizeLogMeanVariance("Features", useCdf: false);

// Now we can transform the data and look at the output to confirm the behavior of the estimator.
// This operation doesn't actually evaluate data until we read the data below.
var normalizeTransform = normalize.Fit(data);
var transformedData = normalizeTransform.Transform(data);
var normalizeNoCdfTransform = normalizeNoCdf.Fit(data);
var noCdfData = normalizeNoCdfTransform.Transform(data);
var column = transformedData.GetColumn<float[]>("Features").ToArray();
foreach (var row in column)
Console.WriteLine(string.Join(", ", row.Select(x => x.ToString("f4"))));
// Expected output:
// 0.1587, 0.1587, 0.8654, 0.0000
// 0.8413, 0.8413, 0.5837, 0.0000
// 0.0000, 0.0000, 0.0940, 0.0000
// 0.0000, 0.0000, 0.0000, 0.0000

var columnFixZero = noCdfData.GetColumn<float[]>("Features").ToArray();
foreach (var row in columnFixZero)
Console.WriteLine(string.Join(", ", row.Select(x => x.ToString("f4"))));
// Expected output:
// 1.8854, 1.8854, 5.2970, 0.0000
// 4.7708, 4.7708, 3.0925, 0.0000
// -1.0000,-1.0000, 0.8879, 0.0000
// -3.8854,-3.8854,-3.5213, 0.0000

// Let's get the transformation parameters. Since we are working with only one column, we pass 0 to GetNormalizerModelParameters.
// If we were working with multiple column transformations, we would pass the index of the relevant InputOutputColumnPair.
var transformParams = normalizeTransform.GetNormalizerModelParameters(0) as CdfNormalizerModelParameters<ImmutableArray<float>>;
Console.WriteLine("The 1-index value in resulting array would be produce by:");
Console.WriteLine($"y = 0.5* (1 + ERF((Math.Log(x)- {transformParams.Mean[1]}) / ({transformParams.StandardDeviation[1]} * sqrt(2)))");

// ERF is https://en.wikipedia.org/wiki/Error_function.
// Expected output:
// The 1-index value in resulting array would be produce by:
// y = 0.5* (1 + ERF((Math.Log(x)- 0.3465736) / (0.3465736 * sqrt(2)))
var noCdfParams = normalizeNoCdfTransform.GetNormalizerModelParameters(0) as AffineNormalizerModelParameters<ImmutableArray<float>>;
var offset = noCdfParams.Offset.Length == 0 ? 0 : noCdfParams.Offset[1];
var scale = noCdfParams.Scale[1];
Console.WriteLine($"The 1-index value in resulting array would be produce by: y = (x - ({offset})) * {scale}");
// Expected output:
// The 1-index value in resulting array would be produce by: y = (x - (2.88539)) * 0.3465736
}

private class DataPoint
{
[VectorType(4)]
public float[] Features { get; set; }
}
}
}
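The first CDF value above can be verified by hand (not part of the PR) with the printed parameters mean = stddev = 0.3465736. ERF is approximated here with the Abramowitz-Stegun formula 7.1.26 (maximum error about 1.5e-7), since .NET has no built-in erf.

using System;

public static class CdfCheck
{
    // Abramowitz-Stegun 7.1.26 approximation of the error function.
    private static double Erf(double x)
    {
        double sign = Math.Sign(x);
        x = Math.Abs(x);
        double t = 1.0 / (1.0 + 0.3275911 * x);
        double poly = ((((1.061405429 * t - 1.453152027) * t + 1.421413741) * t - 0.284496736) * t + 0.254829592) * t;
        return sign * (1.0 - poly * Math.Exp(-x * x));
    }

    public static void Main()
    {
        const double mean = 0.3465736, stddev = 0.3465736; // as printed by the sample
        double x = 1;                                      // slot 1 of the first data point
        double z = (Math.Log(x) - mean) / (stddev * Math.Sqrt(2));
        Console.WriteLine((0.5 * (1 + Erf(z))).ToString("f4")); // 0.1587, matching the sample output
    }
}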
49 changes: 49 additions & 0 deletions docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/NormalizeLpNorm.cs
@@ -0,0 +1,49 @@
using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.ML.Transforms;

namespace Samples.Dynamic
{
class NormalizeLpNorm
{
public static void Example()
{
// Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
// as well as the source of randomness.
var mlContext = new MLContext();
var samples = new List<DataPoint>()
{
new DataPoint(){ Features = new float[4] { 1, 1, 0, 0} },
new DataPoint(){ Features = new float[4] { 2, 2, 0, 0} },
new DataPoint(){ Features = new float[4] { 1, 0, 1, 0} },
new DataPoint(){ Features = new float[4] { 0, 1, 0, 1} }
};
// Convert training data to IDataView, the general data type used in ML.NET.
var data = mlContext.Data.LoadFromEnumerable(samples);
var approximation = mlContext.Transforms.NormalizeLpNorm("Features", norm: LpNormNormalizingEstimatorBase.NormFunction.L1, ensureZeroMean: true);

// Now we can transform the data and look at the output to confirm the behavior of the estimator.
// This operation doesn't actually evaluate data until we read the data below.
var transformer = approximation.Fit(data);
var transformedData = transformer.Transform(data);

var column = transformedData.GetColumn<float[]>("Features").ToArray();
foreach (var row in column)
Console.WriteLine(string.Join(", ", row.Select(x => x.ToString("f4"))));
// Expected output:
// 0.2500, 0.2500, -0.2500, -0.2500
// 0.2500, 0.2500, -0.2500, -0.2500
// 0.2500, -0.2500, 0.2500, -0.2500
// -0.2500, 0.2500, -0.2500, 0.2500
}

private class DataPoint
{
[VectorType(4)]
public float[] Features { get; set; }
}
}
}
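A hand check (not part of the PR) of the first expected row, following the remarks added to NormalizerCatalog.cs later in this PR: with ensureZeroMean and the L1 norm selected, y = (x - mean) / ‖x - mean‖₁.

using System;
using System.Linq;

public static class LpNormCheck
{
    public static void Main()
    {
        float[] row = { 1, 1, 0, 0 }; // first data point from the sample above
        float mean = row.Average();
        float l1 = row.Sum(x => Math.Abs(x - mean)); // L1 norm of the mean-centered row
        Console.WriteLine(string.Join(", ", row.Select(x => ((x - mean) / l1).ToString("f4"))));
        // 0.2500, 0.2500, -0.2500, -0.2500 — the first expected row.
    }
}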
83 changes: 83 additions & 0 deletions docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/NormalizeMeanVariance.cs
@@ -0,0 +1,83 @@
using System;
using System.Collections.Generic;
using System.Collections.Immutable;
using System.Linq;
using Microsoft.ML;
using Microsoft.ML.Data;
using static Microsoft.ML.Transforms.NormalizingTransformer;

namespace Samples.Dynamic
{
public class NormalizeMeanVariance
{
public static void Example()
{
// Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
// as well as the source of randomness.
var mlContext = new MLContext();
var samples = new List<DataPoint>()
{
new DataPoint(){ Features = new float[4] { 1, 1, 3, 0} },
new DataPoint(){ Features = new float[4] { 2, 2, 2, 0} },
new DataPoint(){ Features = new float[4] { 0, 0, 1, 0} },
new DataPoint(){ Features = new float[4] {-1,-1,-1, 1} }
};
// Convert training data to IDataView, the general data type used in ML.NET.
var data = mlContext.Data.LoadFromEnumerable(samples);
// NormalizeMeanVariance normalizes the data based on the computed mean and variance of the data.
// It uses the cumulative distribution function (CDF) as output.
var normalize = mlContext.Transforms.NormalizeMeanVariance("Features", useCdf: true);

// NormalizeMeanVariance normalizes the data based on the computed mean and variance of the data.
var normalizeNoCdf = mlContext.Transforms.NormalizeMeanVariance("Features", useCdf: false);

// Now we can transform the data and look at the output to confirm the behavior of the estimator.
// This operation doesn't actually evaluate data until we read the data below.
var normalizeTransform = normalize.Fit(data);
var transformedData = normalizeTransform.Transform(data);
var normalizeNoCdfTransform = normalizeNoCdf.Fit(data);
var noCdfData = normalizeNoCdfTransform.Transform(data);
var column = transformedData.GetColumn<float[]>("Features").ToArray();
foreach (var row in column)
Console.WriteLine(string.Join(", ", row.Select(x => x.ToString("f4"))));
// Expected output:
// 0.6726, 0.6726, 0.8816, 0.2819
// 0.9101, 0.9101, 0.6939, 0.2819
// 0.3274, 0.3274, 0.4329, 0.2819
// 0.0899, 0.0899, 0.0641, 0.9584


var columnFixZero = noCdfData.GetColumn<float[]>("Features").ToArray();
foreach (var row in columnFixZero)
Console.WriteLine(string.Join(", ", row.Select(x => x.ToString("f4"))));
// Expected output:
// 0.8165, 0.8165, 1.5492, 0.0000
// 1.6330, 1.6330, 1.0328, 0.0000
// 0.0000, 0.0000, 0.5164, 0.0000
// -0.8165,-0.8165,-0.5164, 2.0000

// Let's get the transformation parameters. Since we are working with only one column, we pass 0 to GetNormalizerModelParameters.
// If we were working with multiple column transformations, we would pass the index of the relevant InputOutputColumnPair.
var transformParams = normalizeTransform.GetNormalizerModelParameters(0) as CdfNormalizerModelParameters<ImmutableArray<float>>;
Console.WriteLine($"The 1-index value in resulting array would be produce by:");
Console.WriteLine($" y = 0.5* (1 + ERF((x- {transformParams.Mean[1]}) / ({transformParams.StandardDeviation[1]} * sqrt(2)))");
// ERF is https://en.wikipedia.org/wiki/Error_function.
// Expected output:
// The 1-index value in resulting array would be produce by:
// y = 0.5 * (1 + ERF((x - 0.5) / (1.118034 * sqrt(2)))

var noCdfParams = normalizeNoCdfTransform.GetNormalizerModelParameters(0) as AffineNormalizerModelParameters<ImmutableArray<float>>;
var offset = noCdfParams.Offset.Length == 0 ? 0 : noCdfParams.Offset[1];
var scale = noCdfParams.Scale[1];
Console.WriteLine($"Values for slot 1 would be transfromed by applying y = (x - ({offset})) * {scale}");
// Expected output:
// The 1-index value in resulting array would be produce by: y = (x - (0)) * 0.8164966
}

private class DataPoint
{
[VectorType(4)]
public float[] Features { get; set; }
}
}
}
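The affine (no-CDF) column above can be reproduced by hand (not part of the PR): with offset 0 and scale 0.8164966 as printed, slot 1 is simply x * 0.8164966, and 0.8164966 is 1/sqrt(1.5), the reciprocal root of the mean square of slot 1 ({1, 2, 0, -1}), consistent with the zero-preserving default.

using System;
using System.Linq;

public static class AffineCheck
{
    public static void Main()
    {
        float[] slot1 = { 1, 2, 0, -1 }; // slot 1 of the sample data, row by row
        double scale = 1 / Math.Sqrt(slot1.Average(x => (double)x * x)); // 1/sqrt(1.5) = 0.8164966
        foreach (var x in slot1)
            Console.WriteLine((x * scale).ToString("f4"));
        // 0.8165, 1.6330, 0.0000, -0.8165 — the second column of the second expected-output block.
    }
}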
76 changes: 76 additions & 0 deletions docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/NormalizeMinMax.cs
@@ -0,0 +1,76 @@
using System;
using System.Collections.Generic;
using System.Collections.Immutable;
using System.Linq;
using Microsoft.ML;
using Microsoft.ML.Data;
using static Microsoft.ML.Transforms.NormalizingTransformer;

namespace Samples.Dynamic
{
public class NormalizeMinMax
{
public static void Example()
{
// Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
// as well as the source of randomness.
var mlContext = new MLContext();
var samples = new List<DataPoint>()
{
new DataPoint(){ Features = new float[4] { 1, 1, 3, 0} },
new DataPoint(){ Features = new float[4] { 2, 2, 2, 0} },
new DataPoint(){ Features = new float[4] { 0, 0, 1, 0} },
new DataPoint(){ Features = new float[4] {-1,-1,-1, 1} }
};
// Convert training data to IDataView, the general data type used in ML.NET.
var data = mlContext.Data.LoadFromEnumerable(samples);
// NormalizeMinMax normalizes the data by finding the minimum and maximum values in each slot,
// projecting the minimum to 0 and the maximum to 1, and mapping everything else to
// values in between.
var normalize = mlContext.Transforms.NormalizeMinMax("Features", fixZero: false);

// Normalize by finding the minimum and maximum values in each slot, but make sure that
// zero values remain zero after normalization. This helps preserve sparsity.
var normalizeFixZero = mlContext.Transforms.NormalizeMinMax("Features", fixZero: true);

// Now we can transform the data and look at the output to confirm the behavior of the estimator.
// This operation doesn't actually evaluate data until we read the data below.
var normalizeTransform = normalize.Fit(data);
var transformedData = normalizeTransform.Transform(data);
var normalizeFixZeroTransform = normalizeFixZero.Fit(data);
var fixZeroData = normalizeFixZeroTransform.Transform(data);
var column = transformedData.GetColumn<float[]>("Features").ToArray();
foreach (var row in column)
Console.WriteLine(string.Join(", ", row.Select(x => x.ToString("f4"))));
// Expected output:
// 0.6667, 0.6667, 1.0000, 0.0000
// 1.0000, 1.0000, 0.7500, 0.0000
// 0.3333, 0.3333, 0.5000, 0.0000
// 0.0000, 0.0000, 0.0000, 1.0000

var columnFixZero = fixZeroData.GetColumn<float[]>("Features").ToArray();
foreach (var row in columnFixZero)
Console.WriteLine(string.Join(", ", row.Select(x => x.ToString("f4"))));
// Expected output:
// 0.5000, 0.5000, 1.0000, 0.0000
// 1.0000, 1.0000, 0.6667, 0.0000
// 0.0000, 0.0000, 0.3333, 0.0000
// -0.5000,-0.5000,-0.3333, 1.0000

// Let's get the transformation parameters. Since we are working with only one column, we pass 0 to GetNormalizerModelParameters.
// If we were working with multiple column transformations, we would pass the index of the relevant InputOutputColumnPair.
var transformParams = normalizeTransform.GetNormalizerModelParameters(0) as AffineNormalizerModelParameters<ImmutableArray<float>>;
Console.WriteLine($"The 1-index value in resulting array would be produce by:");
Console.WriteLine($" y = (x - ({(transformParams.Offset.Length == 0 ? 0 : transformParams.Offset[1])})) * {transformParams.Scale[1]}");
// Expected output:
// The 1-index value in resulting array would be produce by:
// y = (x - (-1)) * 0.3333333
}

private class DataPoint
{
[VectorType(4)]
public float[] Features { get; set; }
}
}
}
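A hand check (not part of the PR) of the non-fixZero output for slot 1: with min -1 and max 2, y = (x - min) / (max - min), which matches the printed offset -1 and scale 0.3333333.

using System;
using System.Linq;

public static class MinMaxCheck
{
    public static void Main()
    {
        float[] slot1 = { 1, 2, 0, -1 }; // slot 1 of the sample data, row by row
        float min = slot1.Min(), max = slot1.Max();
        foreach (var x in slot1)
            Console.WriteLine(((x - min) / (max - min)).ToString("f4"));
        // 0.6667, 1.0000, 0.3333, 0.0000 — the second column of the first expected output.
    }
}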
98 changes: 98 additions & 0 deletions docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/NormalizeSupervisedBinning.cs
@@ -0,0 +1,98 @@
using System;
using System.Collections.Generic;
using System.Collections.Immutable;
using System.Linq;
using Microsoft.ML;
using Microsoft.ML.Data;
using static Microsoft.ML.Transforms.NormalizingTransformer;

namespace Samples.Dynamic
{
public class NormalizeSupervisedBinning
{
public static void Example()
{
// Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
// as well as the source of randomness.
var mlContext = new MLContext();
var samples = new List<DataPoint>()
{
new DataPoint(){ Features = new float[4] { 8, 1, 3, 0}, Bin="Bin1" },
new DataPoint(){ Features = new float[4] { 6, 2, 2, 1}, Bin="Bin2" },
new DataPoint(){ Features = new float[4] { 5, 3, 0, 2}, Bin="Bin2" },
new DataPoint(){ Features = new float[4] { 4,-8, 1, 3}, Bin="Bin3" },
new DataPoint(){ Features = new float[4] { 2,-5,-1, 4}, Bin="Bin3" }
};
// Convert training data to IDataView, the general data type used in ML.NET.
var data = mlContext.Data.LoadFromEnumerable(samples);
// Let's transform "Bin" column from string to key.
data = mlContext.Transforms.Conversion.MapValueToKey("Bin").Fit(data).Transform(data);
// NormalizeSupervisedBinning normalizes the data by constructing bins based on correlation with the label column
// and produces output based on which bin the original value belongs to.
var normalize = mlContext.Transforms.NormalizeSupervisedBinning("Features", labelColumnName: "Bin", mininimumExamplesPerBin: 1, fixZero: false);

// NormalizeSupervisedBinning normalizes the data by constructing bins based on correlation with the label column
// and produces output based on which bin the original value belongs to, but makes sure that
// zero values remain zero after normalization. This helps preserve sparsity.
var normalizeFixZero = mlContext.Transforms.NormalizeSupervisedBinning("Features", labelColumnName: "Bin", mininimumExamplesPerBin: 1, fixZero: true);

// Now we can transform the data and look at the output to confirm the behavior of the estimator.
// This operation doesn't actually evaluate data until we read the data below.
var normalizeTransform = normalize.Fit(data);
var transformedData = normalizeTransform.Transform(data);
var normalizeFixZeroTransform = normalizeFixZero.Fit(data);
var fixZeroData = normalizeFixZeroTransform.Transform(data);
var column = transformedData.GetColumn<float[]>("Features").ToArray();
foreach (var row in column)
Console.WriteLine(string.Join(", ", row.Select(x => x.ToString("f4"))));
// Expected output:
// 1.0000, 0.5000, 1.0000, 0.0000
// 0.5000, 1.0000, 0.0000, 0.5000
// 0.5000, 1.0000, 0.0000, 0.5000
// 0.0000, 0.0000, 0.0000, 1.0000
// 0.0000, 0.0000, 0.0000, 1.0000

var columnFixZero = fixZeroData.GetColumn<float[]>("Features").ToArray();
foreach (var row in columnFixZero)
Console.WriteLine(string.Join(", ", row.Select(x => x.ToString("f4"))));
// Expected output:
// 1.0000, 0.0000, 1.0000, 0.0000
// 0.5000, 0.5000, 0.0000, 0.5000
// 0.5000, 0.5000, 0.0000, 0.5000
// 0.0000,-0.5000, 0.0000, 1.0000
// 0.0000,-0.5000, 0.0000, 1.0000

// Let's get the transformation parameters. Since we are working with only one column, we pass 0 to GetNormalizerModelParameters.
// If we were working with multiple column transformations, we would pass the index of the relevant InputOutputColumnPair.
var transformParams = normalizeTransform.GetNormalizerModelParameters(0) as BinNormalizerModelParameters<ImmutableArray<float>>;
Console.WriteLine($"The 1-index value in resulting array would be produce by:");
Console.WriteLine($"y = (Index(x) / {transformParams.Density[0]}) - {(transformParams.Offset.Length == 0 ? 0 : transformParams.Offset[0])}");
Console.WriteLine("Where Index(x) is the index of the bin to which x belongs");
Console.WriteLine($"Bins upper borders are: {string.Join(" ", transformParams.UpperBounds[0])}");
// Expected output:
// The 1-index value in resulting array would be produce by:
// y = (Index(x) / 2) - 0
// Where Index(x) is the index of the bin to which x belongs
// Bins upper bounds are: 4.5 7 ∞

var fixZeroParams = normalizeFixZeroTransform.GetNormalizerModelParameters(0) as BinNormalizerModelParameters<ImmutableArray<float>>;
Console.WriteLine($"The 1-index value in resulting array would be produce by:");
Console.WriteLine($" y = (Index(x) / {fixZeroParams.Density[1]}) - {(fixZeroParams.Offset.Length == 0 ? 0 : fixZeroParams.Offset[1])}");
Console.WriteLine("Where Index(x) is the index of the bin to which x belongs");
Console.WriteLine($"Bins upper borders are: {string.Join(" ", fixZeroParams.UpperBounds[1])}");
// Expected output:
// The 1-index value in resulting array would be produce by:
// y = (Index(x) / 2) - 0.5
// Where Index(x) is the index of the bin to which x belongs
// Bins upper bounds are: -2 1.5 ∞
}

private class DataPoint
{
[VectorType(4)]
public float[] Features { get; set; }

public string Bin { get; set; }
}
}
}
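A hand check (not part of the PR) of slot 0 in the first expected output: with upper bounds {4.5, 7, ∞}, density 2, and offset 0, y = Index(x) / 2. The bins line up with the labels (Bin3 ↔ (-∞, 4.5], Bin2 ↔ (4.5, 7], Bin1 ↔ (7, ∞)); the boundary test (x <= bound) is an assumption for illustration.

using System;

public static class SupervisedBinningCheck
{
    public static void Main()
    {
        float[] upperBounds = { 4.5f, 7, float.PositiveInfinity };
        float[] slot0 = { 8, 6, 5, 4, 2 }; // slot 0 of the sample data, row by row
        foreach (var x in slot0)
        {
            // Index(x): the first bin whose upper bound is not exceeded by x.
            int index = Array.FindIndex(upperBounds, b => x <= b);
            Console.WriteLine((index / 2f).ToString("f4"));
        }
        // 1.0000, 0.5000, 0.5000, 0.0000, 0.0000 — the first column of the first expected output.
    }
}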
2 changes: 1 addition & 1 deletion src/Microsoft.ML.Transforms/FourierDistributionSampler.cs
@@ -203,7 +203,7 @@ internal sealed class Options : IComponentFactory<KernelBase>
/// <summary>
/// Create a new instance of a LaplacianKernel.
/// </summary>
/// <param name="a">The coefficient in the exponent of the kernel function</param>
/// <param name="a">The coefficient in the exponent of the kernel function.</param>
public LaplacianKernel(float a = 1)
{
Contracts.CheckParam(a > 0, nameof(a));
2 changes: 1 addition & 1 deletion src/Microsoft.ML.Transforms/KernelCatalog.cs
@@ -26,7 +26,7 @@ public static class KernelExpansionCatalog
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// [!code-csharp[CreateRandomFourierFeatures](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/ProjectionTransforms.cs?range=1-6,12-112)]
/// [!code-csharp[ApproximatedKernelMap](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/ApproximatedKernelMap.cs)]
/// ]]>
/// </format>
/// </example>
59 changes: 42 additions & 17 deletions src/Microsoft.ML.Transforms/NormalizerCatalog.cs
@@ -20,13 +20,6 @@ public static class NormalizationCatalog
/// <param name="catalog">The transform catalog</param>
/// <param name="mode">The <see cref="NormalizingEstimator.NormalizationMode"/> used to map the old values to the new ones. </param>
/// <param name="columns">The pairs of input and output columns.</param>
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// [!code-csharp[Normalize](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Normalizer.cs)]
/// ]]>
/// </format>
/// </example>
[BestFriend]
internal static NormalizingEstimator Normalize(this TransformsCatalog catalog,
NormalizingEstimator.NormalizationMode mode,
@@ -48,7 +41,7 @@ internal static NormalizingEstimator Normalize(this TransformsCatalog catalog,
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// [!code-csharp[Normalize](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Normalizer.cs)]
/// [!code-csharp[NormalizeMinMax](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/NormalizeMinMax.cs)]
/// ]]>
/// </format>
/// </example>
@@ -84,6 +77,13 @@ public static NormalizingEstimator NormalizeMinMax(this TransformsCatalog catalo
/// <param name="maximumExampleCount">Maximum number of examples used to train the normalizer.</param>
/// <param name="fixZero">Whether to map zero to zero, preserving sparsity.</param>
/// <param name="useCdf">Whether to use CDF as the output.</param>
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// [!code-csharp[NormalizeMeanVariance](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/NormalizeMeanVariance.cs)]
/// ]]>
/// </format>
/// </example>
public static NormalizingEstimator NormalizeMeanVariance(this TransformsCatalog catalog,
string outputColumnName, string inputColumnName = null,
long maximumExampleCount = NormalizingEstimator.Defaults.MaximumExampleCount,
@@ -118,6 +118,13 @@ public static NormalizingEstimator NormalizeMeanVariance(this TransformsCatalog
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
/// <param name="maximumExampleCount">Maximum number of examples used to train the normalizer.</param>
/// <param name="useCdf">Whether to use CDF as the output.</param>
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// [!code-csharp[NormalizeLogMeanVariance](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/NormalizeLogMeanVariance.cs)]
/// ]]>
/// </format>
/// </example>
public static NormalizingEstimator NormalizeLogMeanVariance(this TransformsCatalog catalog,
string outputColumnName, string inputColumnName = null,
long maximumExampleCount = NormalizingEstimator.Defaults.MaximumExampleCount,
@@ -134,13 +141,6 @@ public static NormalizingEstimator NormalizeLogMeanVariance(this TransformsCatal
/// <param name="columns">List of Output and Input column pairs.</param>
/// <param name="maximumExampleCount">Maximum number of examples used to train the normalizer.</param>
/// <param name="useCdf">Whether to use CDF as the output.</param>
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// [!code-csharp[Normalize](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Normalizer.cs)]
/// ]]>
/// </format>
/// </example>
public static NormalizingEstimator NormalizeLogMeanVariance(this TransformsCatalog catalog, InputOutputColumnPair[] columns,
long maximumExampleCount = NormalizingEstimator.Defaults.MaximumExampleCount,
bool useCdf = NormalizingEstimator.Defaults.LogMeanVarCdf) =>
@@ -157,6 +157,13 @@ public static NormalizingEstimator NormalizeLogMeanVariance(this TransformsCatal
/// <param name="maximumExampleCount">Maximum number of examples used to train the normalizer.</param>
/// <param name="fixZero">Whether to map zero to zero, preserving sparsity.</param>
/// <param name="maximumBinCount">Maximum number of bins (power of 2 recommended).</param>
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// [!code-csharp[NormalizeBinning](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/NormalizeBinning.cs)]
/// ]]>
/// </format>
/// </example>
public static NormalizingEstimator NormalizeBinning(this TransformsCatalog catalog,
string outputColumnName, string inputColumnName = null,
long maximumExampleCount = NormalizingEstimator.Defaults.MaximumExampleCount,
@@ -194,6 +201,13 @@ public static NormalizingEstimator NormalizeBinning(this TransformsCatalog catal
/// <param name="fixZero">Whether to map zero to zero, preserving sparsity.</param>
/// <param name="maximumBinCount">Maximum number of bins (power of 2 recommended).</param>
/// <param name="mininimumExamplesPerBin">Minimum number of examples per bin.</param>
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// [!code-csharp[NormalizeSupervisedBinning](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/NormalizeSupervisedBinning.cs)]
/// ]]>
/// </format>
/// </example>
public static NormalizingEstimator NormalizeSupervisedBinning(this TransformsCatalog catalog,
string outputColumnName, string inputColumnName = null,
string labelColumnName = DefaultColumnNames.Label,
@@ -246,10 +260,15 @@ internal static NormalizingEstimator Normalize(this TransformsCatalog catalog,
/// <param name="inputColumnName">Name of column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
/// <param name="norm">Type of norm to use to normalize each sample. The indicated norm of the resulted vector will be normalized to one.</param>
/// <param name="ensureZeroMean">If <see langword="true"/>, subtract mean from each value before normalizing and use the raw input otherwise.</param>
/// <remarks>
/// This transform performs the following operation on each row X: Y = (X - M(X)) / D(X),
/// where M(X) is the scalar mean of all elements in the current row if <paramref name="ensureZeroMean"/> is set to <see langword="true"/>, or <value>0</value> otherwise,
/// and D(X) is the scalar value of the selected <paramref name="norm"/>.
/// </remarks>
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// [!code-csharp[LpNormalize](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/ProjectionTransforms.cs?range=1-6,12-112)]
/// [!code-csharp[NormalizeLpNorm](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/NormalizeLpNorm.cs)]
/// ]]>
/// </format>
/// </example>
@@ -276,10 +295,16 @@ internal static LpNormNormalizingEstimator NormalizeLpNorm(this TransformsCatalo
/// <param name="ensureZeroMean">If <see langword="true"/>, subtract mean from each value before normalizing and use the raw input otherwise.</param>
/// <param name="ensureUnitStandardDeviation">If <see langword="true"/>, resulted vector's standard deviation would be one. Otherwise, resulted vector's L2-norm would be one.</param>
/// <param name="scale">Scale features by this value.</param>
/// <remarks>
/// This transform performs the following operation on each row X: Y = scale * (X - M(X)) / D(X),
/// where M(X) is the scalar mean of all elements in the current row if <paramref name="ensureZeroMean"/> is set to <see langword="true"/>, or <value>0</value> otherwise,
/// D(X) is the scalar standard deviation of the row if <paramref name="ensureUnitStandardDeviation"/> is set to <see langword="true"/>,
/// or the L2 norm of the row vector if it is set to <see langword="false"/>, and scale is <paramref name="scale"/>.
/// </remarks>
/// </remarks>
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// [!code-csharp[GlobalContrastNormalize](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/ProjectionTransforms.cs?range=1-6,12-112)]
/// [!code-csharp[NormalizeGlobalContrast](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/NormalizeGlobalContrast.cs)]
/// ]]>
/// </format>
/// </example>