Skip to content

Multi-column mapping API for normalizer estimators. #3172

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Apr 3, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions docs/samples/Microsoft.ML.Samples/Dynamic/Normalizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,8 @@ public static void Example()

// Composing a different pipeline if we wanted to normalize more than one column at a time.
// Using log scale as the normalization mode.
var multiColPipeline = ml.Transforms.NormalizeMinMax("LogInduced", "Induced")
.Append(ml.Transforms.NormalizeMinMax("LogSpontaneous", "Spontaneous"));
var multiColPipeline = ml.Transforms.NormalizeLogMeanVariance(new[] { new InputOutputColumnPair("LogInduced", "Induced"), new InputOutputColumnPair("LogSpontaneous", "Spontaneous") });

// The transformed data.
var multiColtransformer = multiColPipeline.Fit(trainData);
var multiColtransformedData = multiColtransformer.Transform(trainData);
Expand Down
95 changes: 94 additions & 1 deletion src/Microsoft.ML.Transforms/NormalizerCatalog.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
using Microsoft.ML.Data;
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.Linq;
using Microsoft.ML.Data;
using Microsoft.ML.Runtime;
using Microsoft.ML.Transforms;

Expand Down Expand Up @@ -56,6 +61,20 @@ public static NormalizingEstimator NormalizeMinMax(this TransformsCatalog catalo
return new NormalizingEstimator(CatalogUtils.GetEnvironment(catalog), columnOptions);
}

/// <summary>
/// Builds a <see cref="NormalizingEstimator"/> for several columns at once, normalizing the data based on
/// its observed minimum and maximum values.
/// </summary>
/// <param name="catalog">The transform catalog.</param>
/// <param name="columns">The output/input column pairs to normalize. One set of min-max options is created
/// per pair, and every pair shares the remaining arguments.</param>
/// <param name="maximumExampleCount">Maximum number of examples used to train the normalizer.</param>
/// <param name="fixZero">Whether to map zero to zero, preserving sparsity.</param>
/// <returns>An estimator configured with one <see cref="NormalizingEstimator.MinMaxColumnOptions"/>
/// per entry of <paramref name="columns"/>.</returns>
public static NormalizingEstimator NormalizeMinMax(this TransformsCatalog catalog, InputOutputColumnPair[] columns,
    long maximumExampleCount = NormalizingEstimator.Defaults.MaximumExampleCount,
    bool fixZero = NormalizingEstimator.Defaults.EnsureZeroUntouched)
{
    // Resolve the environment first, then project the pairs, mirroring the argument evaluation order.
    var env = CatalogUtils.GetEnvironment(catalog);
    var columnOptions = columns
        .Select(pair => new NormalizingEstimator.MinMaxColumnOptions(
            pair.OutputColumnName, pair.InputColumnName, maximumExampleCount, fixZero))
        .ToArray();
    return new NormalizingEstimator(env, columnOptions);
}

/// <summary>
/// It normalizes the data based on the computed mean and variance of the data.
/// </summary>
Expand All @@ -75,6 +94,22 @@ public static NormalizingEstimator NormalizeMeanVariance(this TransformsCatalog
return new NormalizingEstimator(CatalogUtils.GetEnvironment(catalog), columnOptions);
}

/// <summary>
/// Builds a <see cref="NormalizingEstimator"/> for several columns at once, normalizing the data based on
/// its computed mean and variance.
/// </summary>
/// <param name="catalog">The transform catalog.</param>
/// <param name="columns">The output/input column pairs to normalize. One set of mean-variance options is
/// created per pair, and every pair shares the remaining arguments.</param>
/// <param name="maximumExampleCount">Maximum number of examples used to train the normalizer.</param>
/// <param name="fixZero">Whether to map zero to zero, preserving sparsity.</param>
/// <param name="useCdf">Whether to use CDF as the output.</param>
/// <returns>An estimator configured with one <see cref="NormalizingEstimator.MeanVarianceColumnOptions"/>
/// per entry of <paramref name="columns"/>.</returns>
public static NormalizingEstimator NormalizeMeanVariance(this TransformsCatalog catalog, InputOutputColumnPair[] columns,
    long maximumExampleCount = NormalizingEstimator.Defaults.MaximumExampleCount,
    bool fixZero = NormalizingEstimator.Defaults.EnsureZeroUntouched,
    bool useCdf = NormalizingEstimator.Defaults.MeanVarCdf)
{
    // Resolve the environment first, then project the pairs, mirroring the argument evaluation order.
    var env = CatalogUtils.GetEnvironment(catalog);
    var columnOptions = columns
        .Select(pair => new NormalizingEstimator.MeanVarianceColumnOptions(
            pair.OutputColumnName, pair.InputColumnName, maximumExampleCount, fixZero, useCdf))
        .ToArray();
    return new NormalizingEstimator(env, columnOptions);
}

/// <summary>
/// It normalizes the data based on the computed mean and variance of the logarithm of the data.
/// </summary>
Expand All @@ -92,6 +127,27 @@ public static NormalizingEstimator NormalizeLogMeanVariance(this TransformsCatal
return new NormalizingEstimator(CatalogUtils.GetEnvironment(catalog), columnOptions);
}

/// <summary>
/// Builds a <see cref="NormalizingEstimator"/> for several columns at once, normalizing the data based on
/// the computed mean and variance of the logarithm of the data.
/// </summary>
/// <param name="catalog">The transform catalog.</param>
/// <param name="columns">The output/input column pairs to normalize. One set of log-mean-variance options is
/// created per pair, and every pair shares the remaining arguments.</param>
/// <param name="maximumExampleCount">Maximum number of examples used to train the normalizer.</param>
/// <param name="useCdf">Whether to use CDF as the output.</param>
/// <returns>An estimator configured with one <see cref="NormalizingEstimator.LogMeanVarianceColumnOptions"/>
/// per entry of <paramref name="columns"/>.</returns>
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// [!code-csharp[Normalize](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Normalizer.cs)]
/// ]]>
/// </format>
/// </example>
public static NormalizingEstimator NormalizeLogMeanVariance(this TransformsCatalog catalog, InputOutputColumnPair[] columns,
    long maximumExampleCount = NormalizingEstimator.Defaults.MaximumExampleCount,
    bool useCdf = NormalizingEstimator.Defaults.LogMeanVarCdf)
{
    // Resolve the environment first, then project the pairs, mirroring the argument evaluation order.
    var env = CatalogUtils.GetEnvironment(catalog);
    var columnOptions = columns
        .Select(pair => new NormalizingEstimator.LogMeanVarianceColumnOptions(
            pair.OutputColumnName, pair.InputColumnName, maximumExampleCount, useCdf))
        .ToArray();
    return new NormalizingEstimator(env, columnOptions);
}

/// <summary>
/// The values are assigned into bins with equal density.
/// </summary>
Expand All @@ -111,6 +167,22 @@ public static NormalizingEstimator NormalizeBinning(this TransformsCatalog catal
return new NormalizingEstimator(CatalogUtils.GetEnvironment(catalog), columnOptions);
}

/// <summary>
/// Builds a <see cref="NormalizingEstimator"/> for several columns at once, assigning the values into bins
/// with equal density.
/// </summary>
/// <param name="catalog">The transform catalog.</param>
/// <param name="columns">The output/input column pairs to normalize. One set of binning options is created
/// per pair, and every pair shares the remaining arguments.</param>
/// <param name="maximumExampleCount">Maximum number of examples used to train the normalizer.</param>
/// <param name="fixZero">Whether to map zero to zero, preserving sparsity.</param>
/// <param name="maximumBinCount">Maximum number of bins (power of 2 recommended).</param>
/// <returns>An estimator configured with one <see cref="NormalizingEstimator.BinningColumnOptions"/>
/// per entry of <paramref name="columns"/>.</returns>
public static NormalizingEstimator NormalizeBinning(this TransformsCatalog catalog, InputOutputColumnPair[] columns,
    long maximumExampleCount = NormalizingEstimator.Defaults.MaximumExampleCount,
    bool fixZero = NormalizingEstimator.Defaults.EnsureZeroUntouched,
    int maximumBinCount = NormalizingEstimator.Defaults.MaximumBinCount)
{
    // Resolve the environment first, then project the pairs, mirroring the argument evaluation order.
    var env = CatalogUtils.GetEnvironment(catalog);
    var columnOptions = columns
        .Select(pair => new NormalizingEstimator.BinningColumnOptions(
            pair.OutputColumnName, pair.InputColumnName, maximumExampleCount, fixZero, maximumBinCount))
        .ToArray();
    return new NormalizingEstimator(env, columnOptions);
}

/// <summary>
/// The values are assigned into bins based on correlation with the <paramref name="labelColumnName"/> column.
/// </summary>
Expand All @@ -134,6 +206,27 @@ public static NormalizingEstimator NormalizeSupervisedBinning(this TransformsCat
return new NormalizingEstimator(CatalogUtils.GetEnvironment(catalog), columnOptions);
}

/// <summary>
/// Builds a <see cref="NormalizingEstimator"/> for several columns at once, assigning the values into bins
/// based on correlation with the <paramref name="labelColumnName"/> column.
/// </summary>
/// <param name="catalog">The transform catalog.</param>
/// <param name="columns">The output/input column pairs to normalize. One set of supervised-binning options is
/// created per pair, and every pair shares the remaining arguments, including the label column.</param>
/// <param name="labelColumnName">Name of the label column for supervised binning.</param>
/// <param name="maximumExampleCount">Maximum number of examples used to train the normalizer.</param>
/// <param name="fixZero">Whether to map zero to zero, preserving sparsity.</param>
/// <param name="maximumBinCount">Maximum number of bins (power of 2 recommended).</param>
/// <param name="mininimumExamplesPerBin">Minimum number of examples per bin.</param>
/// <returns>An estimator configured with one <see cref="NormalizingEstimator.SupervisedBinningColumOptions"/>
/// per entry of <paramref name="columns"/>.</returns>
public static NormalizingEstimator NormalizeSupervisedBinning(this TransformsCatalog catalog, InputOutputColumnPair[] columns,
    string labelColumnName = DefaultColumnNames.Label,
    long maximumExampleCount = NormalizingEstimator.Defaults.MaximumExampleCount,
    bool fixZero = NormalizingEstimator.Defaults.EnsureZeroUntouched,
    int maximumBinCount = NormalizingEstimator.Defaults.MaximumBinCount,
    int mininimumExamplesPerBin = NormalizingEstimator.Defaults.MininimumBinSize)
{
    // Resolve the environment first, then project the pairs, mirroring the argument evaluation order.
    var env = CatalogUtils.GetEnvironment(catalog);
    var columnOptions = columns
        .Select(pair => new NormalizingEstimator.SupervisedBinningColumOptions(
            pair.OutputColumnName,
            pair.InputColumnName,
            labelColumnName,
            maximumExampleCount,
            fixZero,
            maximumBinCount,
            mininimumExamplesPerBin))
        .ToArray();
    return new NormalizingEstimator(env, columnOptions);
}

/// <summary>
/// Normalize (rescale) columns according to specified custom parameters.
/// </summary>
Expand Down
Loading