diff --git a/docs/code/MlNetCookBook.md b/docs/code/MlNetCookBook.md
index f509ebfe57..78447515e1 100644
--- a/docs/code/MlNetCookBook.md
+++ b/docs/code/MlNetCookBook.md
@@ -344,7 +344,7 @@ var cachedTrainData = mlContext.Data.Cache(trainData);
 var pipeline =
     // First 'normalize' the data (rescale to be
     // between -1 and 1 for all examples)
-    mlContext.Transforms.Normalize("FeatureVector")
+    mlContext.Transforms.NormalizeMinMax("FeatureVector")
     // We add a step for caching data in memory so that the downstream iterative training
     // algorithm can efficiently scan through the data multiple times. Otherwise, the following
     // trainer will load data from disk multiple times. The caching mechanism uses an on-demand strategy.
@@ -625,18 +625,15 @@ var trainData = mlContext.Data.LoadFromTextFile(dataPath,
     separatorChar: ','
 );
 
-// Apply all kinds of standard ML.NET normalization to the raw features.
+// Apply MinMax normalization to the raw features.
 var pipeline =
-    mlContext.Transforms.Normalize(
-        new NormalizingEstimator.MinMaxColumnOptions("MinMaxNormalized", "Features", fixZero: true),
-        new NormalizingEstimator.MeanVarianceColumnOptions("MeanVarNormalized", "Features", fixZero: true),
-        new NormalizingEstimator.BinningColumnOptions("BinNormalized", "Features", maximumBinCount: 256));
+    mlContext.Transforms.NormalizeMinMax("MinMaxNormalized", "Features");
 
-// Let's train our pipeline of normalizers, and then apply it to the same data.
+// Let's train our normalization pipeline, and then apply it to the same data.
 var normalizedData = pipeline.Fit(trainData).Transform(trainData);
 
 // Inspect one column of the resulting dataset.
-var meanVarValues = normalizedData.GetColumn<float[]>(normalizedData.Schema["MeanVarNormalized"]).ToArray();
+var minMaxValues = normalizedData.GetColumn<float[]>(normalizedData.Schema["MinMaxNormalized"]).ToArray();
 ```
 
 ## How do I train my model on categorical data?
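The cookbook hunk above narrows the sample to a single `NormalizeMinMax` call. A reader who still wants several differently normalized copies of one column can chain the per-mode extensions this change introduces; a minimal sketch under that assumption, reusing the cookbook's column names (needs `using System.Linq;` for `ToArray`):

```csharp
// Sketch only, not part of the patch: one normalized output column per mode.
var pipeline =
    mlContext.Transforms.NormalizeMinMax("MinMaxNormalized", "Features")
    .Append(mlContext.Transforms.NormalizeMeanVariance("MeanVarNormalized", "Features"))
    .Append(mlContext.Transforms.NormalizeBinning("BinNormalized", "Features", maximumBinCount: 256));

var normalizedData = pipeline.Fit(trainData).Transform(trainData);

// Each output column can then be inspected independently.
var minMaxValues = normalizedData.GetColumn<float[]>(normalizedData.Schema["MinMaxNormalized"]).ToArray();
```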
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Normalizer.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Normalizer.cs
index 55f3c89845..2c0fcce6bb 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/Normalizer.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Normalizer.cs
@@ -1,8 +1,6 @@
 using System;
 using System.Collections.Generic;
-using System.Linq;
 using Microsoft.ML.Data;
-using Microsoft.ML.Transforms;
 
 namespace Microsoft.ML.Samples.Dynamic
 {
@@ -28,7 +26,7 @@ public static void Example()
             // 35  1       6-11yrs  1        3     32  5 ...
 
             // A pipeline for normalizing the Induced column.
-            var pipeline = ml.Transforms.Normalize("Induced");
+            var pipeline = ml.Transforms.NormalizeMinMax("Induced");
 
-            // The transformed (normalized according to Normalizer.NormalizerMode.MinMax) data.
+            // The transformed (min-max normalized) data.
             var transformer = pipeline.Fit(trainData);
@@ -58,8 +56,8 @@ public static void Example()
 
             // Composing a different pipeline if we wanted to normalize more than one column at a time.
             // Using log scale as the normalization mode.
-            var multiColPipeline = ml.Transforms.Normalize("LogInduced", "Induced", NormalizingEstimator.NormalizationMode.LogMeanVariance)
-                .Append(ml.Transforms.Normalize("LogSpontaneous", "Spontaneous", NormalizingEstimator.NormalizationMode.LogMeanVariance))
+            var multiColPipeline = ml.Transforms.NormalizeLogMeanVariance("LogInduced", "Induced")
+                .Append(ml.Transforms.NormalizeLogMeanVariance("LogSpontaneous", "Spontaneous"));
 
             // The transformed data.
             var multiColtransformer = multiColPipeline.Fit(trainData);
             var multiColtransformedData = multiColtransformer.Transform(trainData);
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance/PFIRegressionExample.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance/PFIRegressionExample.cs
index 4afa964850..46b5bc65a6 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance/PFIRegressionExample.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance/PFIRegressionExample.cs
@@ -19,7 +19,7 @@ public static void Example()
             // Normalize the data set so that for each feature, its maximum value is 1 while its minimum value is 0.
             // Then append a linear regression trainer.
             var pipeline = mlContext.Transforms.Concatenate("Features", featureNames)
-                .Append(mlContext.Transforms.Normalize("Features"))
+                .Append(mlContext.Transforms.NormalizeMinMax("Features"))
                 .Append(mlContext.Regression.Trainers.Ols(
                     labelColumnName: labelName, featureColumnName: "Features"));
             var model = pipeline.Fit(data);
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance/PfiBinaryClassificationExample.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance/PfiBinaryClassificationExample.cs
index 09fb640f30..8e109890e1 100644
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance/PfiBinaryClassificationExample.cs
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance/PfiBinaryClassificationExample.cs
@@ -21,7 +21,7 @@ public static void Example()
             // Normalize the data set so that for each feature, its maximum value is 1 while its minimum value is 0.
             // Then append a logistic regression trainer.
             var pipeline = mlContext.Transforms.Concatenate("Features", featureNames)
-                .Append(mlContext.Transforms.Normalize("Features"))
+                .Append(mlContext.Transforms.NormalizeMinMax("Features"))
                 .Append(mlContext.BinaryClassification.Trainers.LbfgsLogisticRegression(
                     labelColumnName: labelName, featureColumnName: "Features"));
             var model = pipeline.Fit(data);
diff --git a/src/Microsoft.ML.Data/Transforms/Normalizer.cs b/src/Microsoft.ML.Data/Transforms/Normalizer.cs
index 114f0a328c..e0f5d5f019 100644
--- a/src/Microsoft.ML.Data/Transforms/Normalizer.cs
+++ b/src/Microsoft.ML.Data/Transforms/Normalizer.cs
@@ -39,7 +39,8 @@ internal static class Defaults
             public const long MaximumExampleCount = 1000000000;
         }
 
-        public enum NormalizationMode
+        [BestFriend]
+        internal enum NormalizationMode
         {
             /// <summary>
             /// Linear rescale such that minimum and maximum values are mapped between -1 and 1.
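Both PFI samples a few hunks above stop at `pipeline.Fit(data)`. For orientation, a hedged sketch of how the fitted model is typically handed to the permutation-feature-importance computation afterwards (ML.NET 1.x API shape; `featureNames`, `labelName`, `data`, and `model` come from the samples, the rest is illustrative):

```csharp
// Sketch only, not part of the patch.
var transformedData = model.Transform(data);
var linearPredictor = model.LastTransformer;

// Permute each feature in turn and record how much each metric degrades.
var permutationMetrics = mlContext.Regression.PermutationFeatureImportance(
    linearPredictor, transformedData, labelColumnName: labelName, permutationCount: 3);

for (int i = 0; i < featureNames.Length; i++)
    Console.WriteLine($"{featureNames[i]}: change in R-squared {permutationMetrics[i].RSquared.Mean:G4}");
```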
diff --git a/src/Microsoft.ML.Experimental/TransformsCatalogExtensions.cs b/src/Microsoft.ML.Experimental/TransformsCatalogExtensions.cs
deleted file mode 100644
index 1c811b8243..0000000000
--- a/src/Microsoft.ML.Experimental/TransformsCatalogExtensions.cs
+++ /dev/null
@@ -1,112 +0,0 @@
-// Licensed to the .NET Foundation under one or more agreements.
-// The .NET Foundation licenses this file to you under the MIT license.
-// See the LICENSE file in the project root for more information.
-
-using Microsoft.ML.Data;
-using Microsoft.ML.Transforms;
-
-namespace Microsoft.ML.Experimental
-{
-    public static class TransformsCatalogExtensions
-    {
-        /// <summary>
-        /// Normalize (rescale) the column according to the <see cref="NormalizingEstimator.NormalizationMode.MinMax"/> mode.
-        /// It normalizes the data based on the observed minimum and maximum values of the data.
-        /// </summary>
-        /// <param name="catalog">The transform catalog</param>
-        /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
-        /// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
-        /// <param name="maximumExampleCount">Maximum number of examples used to train the normalizer.</param>
-        /// <param name="fixZero">Whether to map zero to zero, preserving sparsity.</param>
-        public static NormalizingEstimator NormalizeMinMax(this TransformsCatalog catalog,
-            string outputColumnName, string inputColumnName = null,
-            long maximumExampleCount = NormalizingEstimator.Defaults.MaximumExampleCount,
-            bool fixZero = NormalizingEstimator.Defaults.EnsureZeroUntouched)
-        {
-            var columnOptions = new NormalizingEstimator.MinMaxColumnOptions(outputColumnName, inputColumnName, maximumExampleCount, fixZero);
-            return new NormalizingEstimator(CatalogUtils.GetEnvironment(catalog), columnOptions);
-        }
-
-        /// <summary>
-        /// Normalize (rescale) the column according to the <see cref="NormalizingEstimator.NormalizationMode.MeanVariance"/> mode.
-        /// It normalizes the data based on the computed mean and variance of the data.
-        /// </summary>
-        /// <param name="catalog">The transform catalog</param>
-        /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
-        /// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
-        /// <param name="maximumExampleCount">Maximum number of examples used to train the normalizer.</param>
-        /// <param name="fixZero">Whether to map zero to zero, preserving sparsity.</param>
-        /// <param name="useCdf">Whether to use CDF as the output.</param>
-        public static NormalizingEstimator NormalizeMeanVariance(this TransformsCatalog catalog,
-            string outputColumnName, string inputColumnName = null,
-            long maximumExampleCount = NormalizingEstimator.Defaults.MaximumExampleCount,
-            bool fixZero = NormalizingEstimator.Defaults.EnsureZeroUntouched,
-            bool useCdf = NormalizingEstimator.Defaults.MeanVarCdf)
-        {
-            var columnOptions = new NormalizingEstimator.MeanVarianceColumnOptions(outputColumnName, inputColumnName, maximumExampleCount, fixZero, useCdf);
-            return new NormalizingEstimator(CatalogUtils.GetEnvironment(catalog), columnOptions);
-        }
-
-        /// <summary>
-        /// Normalize (rescale) the column according to the <see cref="NormalizingEstimator.NormalizationMode.LogMeanVariance"/> mode.
-        /// It normalizes the data based on the computed mean and variance of the logarithm of the data.
-        /// </summary>
-        /// <param name="catalog">The transform catalog</param>
-        /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
-        /// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
-        /// <param name="maximumExampleCount">Maximum number of examples used to train the normalizer.</param>
-        /// <param name="useCdf">Whether to use CDF as the output.</param>
-        public static NormalizingEstimator NormalizeLogMeanVariance(this TransformsCatalog catalog,
-            string outputColumnName, string inputColumnName = null,
-            long maximumExampleCount = NormalizingEstimator.Defaults.MaximumExampleCount,
-            bool useCdf = NormalizingEstimator.Defaults.LogMeanVarCdf)
-        {
-            var columnOptions = new NormalizingEstimator.LogMeanVarianceColumnOptions(outputColumnName, inputColumnName, maximumExampleCount, useCdf);
-            return new NormalizingEstimator(CatalogUtils.GetEnvironment(catalog), columnOptions);
-        }
-
-        /// <summary>
-        /// Normalize (rescale) the column according to the <see cref="NormalizingEstimator.NormalizationMode.Binning"/> mode.
-        /// The values are assigned into bins with equal density.
-        /// </summary>
-        /// <param name="catalog">The transform catalog</param>
-        /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
-        /// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
-        /// <param name="maximumExampleCount">Maximum number of examples used to train the normalizer.</param>
-        /// <param name="fixZero">Whether to map zero to zero, preserving sparsity.</param>
-        /// <param name="maximumBinCount">Maximum number of bins (power of 2 recommended).</param>
-        public static NormalizingEstimator NormalizeBinning(this TransformsCatalog catalog,
-            string outputColumnName, string inputColumnName = null,
-            long maximumExampleCount = NormalizingEstimator.Defaults.MaximumExampleCount,
-            bool fixZero = NormalizingEstimator.Defaults.EnsureZeroUntouched,
-            int maximumBinCount = NormalizingEstimator.Defaults.MaximumBinCount)
-        {
-            var columnOptions = new NormalizingEstimator.BinningColumnOptions(outputColumnName, inputColumnName, maximumExampleCount, fixZero, maximumBinCount);
-            return new NormalizingEstimator(CatalogUtils.GetEnvironment(catalog), columnOptions);
-        }
-
-        /// <summary>
-        /// Normalize (rescale) the column according to the <see cref="NormalizingEstimator.NormalizationMode.SupervisedBinning"/> mode.
-        /// The values are assigned into bins based on correlation with the <paramref name="labelColumnName"/> column.
-        /// </summary>
-        /// <param name="catalog">The transform catalog</param>
-        /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
-        /// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
-        /// <param name="labelColumnName">Name of the label column for supervised binning.</param>
-        /// <param name="maximumExampleCount">Maximum number of examples used to train the normalizer.</param>
-        /// <param name="fixZero">Whether to map zero to zero, preserving sparsity.</param>
-        /// <param name="maximumBinCount">Maximum number of bins (power of 2 recommended).</param>
-        /// <param name="mininimumExamplesPerBin">Minimum number of examples per bin.</param>
-        public static NormalizingEstimator NormalizeSupervisedBinning(this TransformsCatalog catalog,
-            string outputColumnName, string inputColumnName = null,
-            string labelColumnName = DefaultColumnNames.Label,
-            long maximumExampleCount = NormalizingEstimator.Defaults.MaximumExampleCount,
-            bool fixZero = NormalizingEstimator.Defaults.EnsureZeroUntouched,
-            int maximumBinCount = NormalizingEstimator.Defaults.MaximumBinCount,
-            int mininimumExamplesPerBin = NormalizingEstimator.Defaults.MininimumBinSize)
-        {
-            var columnOptions = new NormalizingEstimator.SupervisedBinningColumOptions(outputColumnName, inputColumnName, labelColumnName, maximumExampleCount, fixZero, maximumBinCount, mininimumExamplesPerBin);
-            return new NormalizingEstimator(CatalogUtils.GetEnvironment(catalog), columnOptions);
-        }
-    }
-}
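For readers tracking what these relocated helpers actually compute: min-max (with `fixZero`, the default) and mean-variance normalization are simple per-slot rescalings. A self-contained illustration of the arithmetic, independent of any ML.NET API:

```csharp
using System;
using System.Linq;

class NormalizationArithmetic
{
    static void Main()
    {
        var values = new[] { 1f, 5f, 10f };

        // Min-max with fixZero: divide by the largest observed magnitude, so the
        // extremes land within [-1, 1] and zero stays exactly zero (sparsity preserved).
        float maxAbs = values.Max(x => Math.Abs(x));
        var minMax = values.Select(x => x / maxAbs).ToArray();   // 0.1, 0.5, 1.0

        // Mean-variance with fixZero disabled: subtract the observed mean, then divide
        // by the standard deviation. (With fixZero, the offset is dropped so zero stays zero.)
        float mean = values.Average();
        float std = (float)Math.Sqrt(values.Select(x => (x - mean) * (x - mean)).Average());
        var meanVar = values.Select(x => (x - mean) / std).ToArray();

        Console.WriteLine(string.Join(", ", minMax));
        Console.WriteLine(string.Join(", ", meanVar));
    }
}
```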
diff --git a/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs b/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs
index 3d2aa09791..6396ecee86 100644
--- a/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs
+++ b/src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs
@@ -171,7 +171,7 @@ public static IDataView LoadFeaturizedAdultDataset(MLContext mlContext)
                     "occupation", "relationship", "ethnicity", "native-country", "age", "education-num",
                     "capital-gain", "capital-loss", "hours-per-week"))
                 // Min-max normalize all the features
-                .Append(mlContext.Transforms.Normalize("Features"));
+                .Append(mlContext.Transforms.NormalizeMinMax("Features"));
 
             var data = loader.Load(dataFile);
             var featurizedData = pipeline.Fit(data).Transform(data);
diff --git a/src/Microsoft.ML.Transforms/NormalizerCatalog.cs b/src/Microsoft.ML.Transforms/NormalizerCatalog.cs
index dd12c452c4..22696851c8 100644
--- a/src/Microsoft.ML.Transforms/NormalizerCatalog.cs
+++ b/src/Microsoft.ML.Transforms/NormalizerCatalog.cs
@@ -10,12 +10,11 @@ namespace Microsoft.ML
     public static class NormalizationCatalog
     {
         /// <summary>
-        /// Normalize (rescale) the column according to the specified <see cref="NormalizingEstimator.NormalizationMode"/>.
+        /// Normalize (rescale) several columns according to the specified <see cref="NormalizingEstimator.NormalizationMode"/>.
         /// </summary>
         /// <param name="catalog">The transform catalog</param>
-        /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
-        /// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
-        /// <param name="mode">The <see cref="NormalizingEstimator.NormalizationMode"/> used to map the old values in the new scale.</param>
+        /// <param name="mode">The <see cref="NormalizingEstimator.NormalizationMode"/> used to map the old values to the new ones.</param>
+        /// <param name="columns">The pairs of input and output columns.</param>
         /// <example>
         /// <format type="text/markdown">
         /// <![CDATA[
         /// [!code-csharp[Normalize](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Normalizer.cs)]
         /// ]]>
         /// </format>
         /// </example>
-        public static NormalizingEstimator Normalize(this TransformsCatalog catalog,
-            string outputColumnName, string inputColumnName = null,
-            NormalizingEstimator.NormalizationMode mode = NormalizingEstimator.NormalizationMode.MinMax)
-            => new NormalizingEstimator(CatalogUtils.GetEnvironment(catalog), outputColumnName, inputColumnName ?? outputColumnName, mode);
+        [BestFriend]
+        internal static NormalizingEstimator Normalize(this TransformsCatalog catalog,
+            NormalizingEstimator.NormalizationMode mode,
+            params InputOutputColumnPair[] columns)
+        {
+            var env = CatalogUtils.GetEnvironment(catalog);
+            env.CheckValue(columns, nameof(columns));
+            return new NormalizingEstimator(env, mode, InputOutputColumnPair.ConvertToValueTuples(columns));
+        }
 
         /// <summary>
-        /// Normalize (rescale) several columns according to the specified <see cref="NormalizingEstimator.NormalizationMode"/>.
+        /// It normalizes the data based on the observed minimum and maximum values of the data.
         /// </summary>
         /// <param name="catalog">The transform catalog</param>
-        /// <param name="mode">The <see cref="NormalizingEstimator.NormalizationMode"/> used to map the old values to the new ones.</param>
-        /// <param name="columns">The pairs of input and output columns.</param>
+        /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
+        /// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
+        /// <param name="maximumExampleCount">Maximum number of examples used to train the normalizer.</param>
+        /// <param name="fixZero">Whether to map zero to zero, preserving sparsity.</param>
         /// <example>
         /// <format type="text/markdown">
         /// <![CDATA[
         /// [!code-csharp[Normalize](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Normalizer.cs)]
         /// ]]>
         /// </format>
         /// </example>
-        [BestFriend]
-        internal static NormalizingEstimator Normalize(this TransformsCatalog catalog,
-            NormalizingEstimator.NormalizationMode mode,
-            params InputOutputColumnPair[] columns)
+        public static NormalizingEstimator NormalizeMinMax(this TransformsCatalog catalog,
+            string outputColumnName, string inputColumnName = null,
+            long maximumExampleCount = NormalizingEstimator.Defaults.MaximumExampleCount,
+            bool fixZero = NormalizingEstimator.Defaults.EnsureZeroUntouched)
         {
-            var env = CatalogUtils.GetEnvironment(catalog);
-            env.CheckValue(columns, nameof(columns));
-            return new NormalizingEstimator(env, mode, InputOutputColumnPair.ConvertToValueTuples(columns));
+            var columnOptions = new NormalizingEstimator.MinMaxColumnOptions(outputColumnName, inputColumnName, maximumExampleCount, fixZero);
+            return new NormalizingEstimator(CatalogUtils.GetEnvironment(catalog), columnOptions);
+        }
+
+        /// <summary>
+        /// It normalizes the data based on the computed mean and variance of the data.
+        /// </summary>
+        /// <param name="catalog">The transform catalog</param>
+        /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
+        /// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
+        /// <param name="maximumExampleCount">Maximum number of examples used to train the normalizer.</param>
+        /// <param name="fixZero">Whether to map zero to zero, preserving sparsity.</param>
+        /// <param name="useCdf">Whether to use CDF as the output.</param>
+        public static NormalizingEstimator NormalizeMeanVariance(this TransformsCatalog catalog,
+            string outputColumnName, string inputColumnName = null,
+            long maximumExampleCount = NormalizingEstimator.Defaults.MaximumExampleCount,
+            bool fixZero = NormalizingEstimator.Defaults.EnsureZeroUntouched,
+            bool useCdf = NormalizingEstimator.Defaults.MeanVarCdf)
+        {
+            var columnOptions = new NormalizingEstimator.MeanVarianceColumnOptions(outputColumnName, inputColumnName, maximumExampleCount, fixZero, useCdf);
+            return new NormalizingEstimator(CatalogUtils.GetEnvironment(catalog), columnOptions);
+        }
+
+        /// <summary>
+        /// It normalizes the data based on the computed mean and variance of the logarithm of the data.
+        /// </summary>
+        /// <param name="catalog">The transform catalog</param>
+        /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
+        /// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
+        /// <param name="maximumExampleCount">Maximum number of examples used to train the normalizer.</param>
+        /// <param name="useCdf">Whether to use CDF as the output.</param>
+        public static NormalizingEstimator NormalizeLogMeanVariance(this TransformsCatalog catalog,
+            string outputColumnName, string inputColumnName = null,
+            long maximumExampleCount = NormalizingEstimator.Defaults.MaximumExampleCount,
+            bool useCdf = NormalizingEstimator.Defaults.LogMeanVarCdf)
+        {
+            var columnOptions = new NormalizingEstimator.LogMeanVarianceColumnOptions(outputColumnName, inputColumnName, maximumExampleCount, useCdf);
+            return new NormalizingEstimator(CatalogUtils.GetEnvironment(catalog), columnOptions);
+        }
+
+        /// <summary>
+        /// The values are assigned into bins with equal density.
+        /// </summary>
+        /// <param name="catalog">The transform catalog</param>
+        /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
+        /// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
+        /// <param name="maximumExampleCount">Maximum number of examples used to train the normalizer.</param>
+        /// <param name="fixZero">Whether to map zero to zero, preserving sparsity.</param>
+        /// <param name="maximumBinCount">Maximum number of bins (power of 2 recommended).</param>
+        public static NormalizingEstimator NormalizeBinning(this TransformsCatalog catalog,
+            string outputColumnName, string inputColumnName = null,
+            long maximumExampleCount = NormalizingEstimator.Defaults.MaximumExampleCount,
+            bool fixZero = NormalizingEstimator.Defaults.EnsureZeroUntouched,
+            int maximumBinCount = NormalizingEstimator.Defaults.MaximumBinCount)
+        {
+            var columnOptions = new NormalizingEstimator.BinningColumnOptions(outputColumnName, inputColumnName, maximumExampleCount, fixZero, maximumBinCount);
+            return new NormalizingEstimator(CatalogUtils.GetEnvironment(catalog), columnOptions);
+        }
+
+        /// <summary>
+        /// The values are assigned into bins based on correlation with the <paramref name="labelColumnName"/> column.
+        /// </summary>
+        /// <param name="catalog">The transform catalog</param>
+        /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
+        /// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
+        /// <param name="labelColumnName">Name of the label column for supervised binning.</param>
+        /// <param name="maximumExampleCount">Maximum number of examples used to train the normalizer.</param>
+        /// <param name="fixZero">Whether to map zero to zero, preserving sparsity.</param>
+        /// <param name="maximumBinCount">Maximum number of bins (power of 2 recommended).</param>
+        /// <param name="mininimumExamplesPerBin">Minimum number of examples per bin.</param>
+        public static NormalizingEstimator NormalizeSupervisedBinning(this TransformsCatalog catalog,
+            string outputColumnName, string inputColumnName = null,
+            string labelColumnName = DefaultColumnNames.Label,
+            long maximumExampleCount = NormalizingEstimator.Defaults.MaximumExampleCount,
+            bool fixZero = NormalizingEstimator.Defaults.EnsureZeroUntouched,
+            int maximumBinCount = NormalizingEstimator.Defaults.MaximumBinCount,
+            int mininimumExamplesPerBin = NormalizingEstimator.Defaults.MininimumBinSize)
+        {
+            var columnOptions = new NormalizingEstimator.SupervisedBinningColumOptions(outputColumnName, inputColumnName, labelColumnName, maximumExampleCount, fixZero, maximumBinCount, mininimumExamplesPerBin);
+            return new NormalizingEstimator(CatalogUtils.GetEnvironment(catalog), columnOptions);
         }
 
         /// <summary>
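With the experimental file gone, the five mode-specific estimators now live on the main transforms catalog shown above, and the mode-taking `Normalize` overload is internal. A minimal sketch of the resulting public surface (an `MLContext` plus hypothetical "Features" and "Label" columns assumed):

```csharp
// Sketch only, not part of the patch. Each call returns a NormalizingEstimator,
// so these compose with Append like any other estimator.
var mlContext = new MLContext();

var minMax     = mlContext.Transforms.NormalizeMinMax("Features");
var meanVar    = mlContext.Transforms.NormalizeMeanVariance("Features", useCdf: false);
var logMeanVar = mlContext.Transforms.NormalizeLogMeanVariance("Features");
var binning    = mlContext.Transforms.NormalizeBinning("Features", maximumBinCount: 256);
var supervised = mlContext.Transforms.NormalizeSupervisedBinning("Features", labelColumnName: "Label");
```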
diff --git a/test/Microsoft.ML.Benchmarks/KMeansAndLogisticRegressionBench.cs b/test/Microsoft.ML.Benchmarks/KMeansAndLogisticRegressionBench.cs
index 663c1383d3..812e01242a 100644
--- a/test/Microsoft.ML.Benchmarks/KMeansAndLogisticRegressionBench.cs
+++ b/test/Microsoft.ML.Benchmarks/KMeansAndLogisticRegressionBench.cs
@@ -35,7 +35,7 @@ public CalibratedModelParametersBase
-
+
diff --git a/test/Microsoft.ML.Functional.Tests/DataTransformation.cs b/test/Microsoft.ML.Functional.Tests/DataTransformation.cs
index 3790d84cdd..5689be7320 100644
--- a/test/Microsoft.ML.Functional.Tests/DataTransformation.cs
+++ b/test/Microsoft.ML.Functional.Tests/DataTransformation.cs
@@ -7,7 +7,6 @@
 using Microsoft.ML.RunTests;
 using Microsoft.ML.TestFramework;
 using Microsoft.ML.Trainers;
-using Microsoft.ML.Transforms;
 using Microsoft.ML.Transforms.Text;
 using Xunit;
 using Xunit.Abstractions;
@@ -174,7 +173,7 @@ void ExtensibilityNormalizeColumns()
             // Compose the transformation.
             var pipeline = mlContext.Transforms.Concatenate("Features", Iris.Features)
-                .Append(mlContext.Transforms.Normalize("Features", mode: NormalizingEstimator.NormalizationMode.MinMax));
+                .Append(mlContext.Transforms.NormalizeMinMax("Features"));
 
             // Transform the data.
             var transformedData = pipeline.Fit(data).Transform(data);
diff --git a/test/Microsoft.ML.Functional.Tests/IntrospectiveTraining.cs b/test/Microsoft.ML.Functional.Tests/IntrospectiveTraining.cs
index 89caf8c4f2..c60165558f 100644
--- a/test/Microsoft.ML.Functional.Tests/IntrospectiveTraining.cs
+++ b/test/Microsoft.ML.Functional.Tests/IntrospectiveTraining.cs
@@ -254,7 +254,7 @@ void IntrospectNormalization()
             // Compose the transformation.
             var pipeline = mlContext.Transforms.Concatenate("Features", Iris.Features)
-                .Append(mlContext.Transforms.Normalize("Features", mode: NormalizingEstimator.NormalizationMode.MinMax));
+                .Append(mlContext.Transforms.NormalizeMinMax("Features"));
 
             // Fit the pipeline.
             var model = pipeline.Fit(data);
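The IntrospectNormalization test above depends on reading the learned scale and offset back out of the fitted transformer. A hedged sketch of that introspection pattern (needs `Microsoft.ML.Transforms` and `System.Collections.Immutable`; the cast target is illustrative, since the exact model-parameters subtype depends on the normalizer mode and column type):

```csharp
// Sketch only, not part of the patch. `model` is the TransformerChain fitted above.
var normalizer = (NormalizingTransformer)model.LastTransformer;

// For an affine (min-max) normalizer over a vector column, the learned state is a
// per-slot scale (and possibly offset); index 0 is the first normalized column.
var parameters = normalizer.GetNormalizerModelParameters(0)
    as NormalizingTransformer.AffineNormalizerModelParameters<ImmutableArray<float>>;

Console.WriteLine($"Scale: {string.Join(", ", parameters.Scale)}");
```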
diff --git a/test/Microsoft.ML.Functional.Tests/ModelFiles.cs b/test/Microsoft.ML.Functional.Tests/ModelFiles.cs
index e1fbe98749..78d4b003a9 100644
--- a/test/Microsoft.ML.Functional.Tests/ModelFiles.cs
+++ b/test/Microsoft.ML.Functional.Tests/ModelFiles.cs
@@ -275,7 +275,7 @@ public void LoadSchemaAndCreateNewData()
             var data = loader.Load(file);
 
             // Pipeline.
-            var pipeline = ML.Transforms.Normalize("Features");
+            var pipeline = ML.Transforms.NormalizeMinMax("Features");
 
             // Train.
             var model = pipeline.Fit(data);
@@ -330,7 +330,7 @@ public void SaveCompositeLoaderAndLoad()
         {
             var file = new MultiFileSource(GetDataPath(TestDatasets.adult.trainFilename));
             var loader = ML.Data.CreateTextLoader(hasHeader: true, dataSample: file);
-            var composite = loader.Append(ML.Transforms.Normalize("Features"));
+            var composite = loader.Append(ML.Transforms.NormalizeMinMax("Features"));
             var loaderWithEmbeddedModel = composite.Fit(file);
 
             string modelPath = GetOutputPath(FullTestName + "-model.zip");
@@ -368,7 +368,7 @@ public void SaveLoaderAndTransformerAndLoad()
         {
             var file = new MultiFileSource(GetDataPath(TestDatasets.adult.trainFilename));
             var loader = ML.Data.CreateTextLoader(hasHeader: true, dataSample: file);
-            var estimator = ML.Transforms.Normalize("Features");
+            var estimator = ML.Transforms.NormalizeMinMax("Features");
 
             var data = loader.Load(file);
             var model = estimator.Fit(data);
@@ -401,7 +401,7 @@ public void SaveTransformerAndSchemaAndLoad()
         {
             var file = new MultiFileSource(GetDataPath(TestDatasets.adult.trainFilename));
             var loader = ML.Data.CreateTextLoader(hasHeader: true, dataSample: file);
-            var estimator = ML.Transforms.Normalize("Features");
+            var estimator = ML.Transforms.NormalizeMinMax("Features");
             var model = estimator.Fit(loader.Load(file));
 
             string modelPath = GetOutputPath(FullTestName + "-model.zip");
diff --git a/test/Microsoft.ML.Functional.Tests/ONNX.cs b/test/Microsoft.ML.Functional.Tests/ONNX.cs
index 3ece5658b8..88438305bd 100644
--- a/test/Microsoft.ML.Functional.Tests/ONNX.cs
+++ b/test/Microsoft.ML.Functional.Tests/ONNX.cs
@@ -33,7 +33,7 @@ public void SaveOnnxModelLoadAndScoreFastTree()
             // Create a pipeline to train on the housing data.
             var pipeline = mlContext.Transforms.Concatenate("Features", HousingRegression.Features)
-                .Append(mlContext.Transforms.Normalize("Features"))
+                .Append(mlContext.Transforms.NormalizeMinMax("Features"))
                 .AppendCacheCheckpoint(mlContext)
                 .Append(mlContext.Regression.Trainers.FastTree(
                     new FastTreeRegressionTrainer.Options { NumberOfThreads = 1, NumberOfTrees = 10 }));
@@ -85,7 +85,7 @@ public void SaveOnnxModelLoadAndScoreKMeans()
             // Create a pipeline to train on the housing data.
             var pipeline = mlContext.Transforms.Concatenate("Features", HousingRegression.Features)
-                .Append(mlContext.Transforms.Normalize("Features"))
+                .Append(mlContext.Transforms.NormalizeMinMax("Features"))
                 .AppendCacheCheckpoint(mlContext)
                 .Append(mlContext.Clustering.Trainers.KMeans(
                     new KMeansTrainer.Options { NumberOfThreads = 1, MaximumNumberOfIterations = 10 }));
@@ -137,7 +137,7 @@ public void SaveOnnxModelLoadAndScoreSDCA()
             // Create a pipeline to train on the housing data.
             var pipeline = mlContext.Transforms.Concatenate("Features", HousingRegression.Features)
-                .Append(mlContext.Transforms.Normalize("Features"))
+                .Append(mlContext.Transforms.NormalizeMinMax("Features"))
                 .AppendCacheCheckpoint(mlContext)
                 .Append(mlContext.Regression.Trainers.Sdca(
                     new SdcaRegressionTrainer.Options { NumberOfThreads = 1, MaximumNumberOfIterations = 10 }));
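The four ModelFiles tests above all guard variations of the same round trip. Outside the test harness, the everyday pattern is the 1.x model-file API; a minimal sketch (file name hypothetical):

```csharp
// Sketch only, not part of the patch. `model` is a fitted ITransformer, `data` its input view.
mlContext.Model.Save(model, data.Schema, "normalization-model.zip");

var loadedModel = mlContext.Model.Load("normalization-model.zip", out DataViewSchema inputSchema);
var transformed = loadedModel.Transform(data);
```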
diff --git a/test/Microsoft.ML.Functional.Tests/Training.cs b/test/Microsoft.ML.Functional.Tests/Training.cs
index 165c57dc20..6f8d264805 100644
--- a/test/Microsoft.ML.Functional.Tests/Training.cs
+++ b/test/Microsoft.ML.Functional.Tests/Training.cs
@@ -2,7 +2,6 @@
 // The .NET Foundation licenses this file to you under the MIT license.
 // See the LICENSE file in the project root for more information.
 
-using System;
 using System.Linq;
 using Microsoft.ML.Data;
 using Microsoft.ML.Functional.Tests.Datasets;
@@ -316,7 +315,7 @@ public void ContinueTrainingOnlineGradientDescent()
             // Create a transformation pipeline.
             var featurizationPipeline = mlContext.Transforms.Concatenate("Features", HousingRegression.Features)
-                .Append(mlContext.Transforms.Normalize("Features"))
+                .Append(mlContext.Transforms.NormalizeMinMax("Features"))
                 .AppendCacheCheckpoint(mlContext);
 
             var trainer = mlContext.Regression.Trainers.OnlineGradientDescent(
@@ -360,7 +359,7 @@ public void ContinueTrainingPoissonRegression()
             // Create a transformation pipeline.
             var featurizationPipeline = mlContext.Transforms.Concatenate("Features", HousingRegression.Features)
-                .Append(mlContext.Transforms.Normalize("Features"))
+                .Append(mlContext.Transforms.NormalizeMinMax("Features"))
                 .AppendCacheCheckpoint(mlContext);
 
             var trainer = mlContext.Regression.Trainers.LbfgsPoissonRegression(
diff --git a/test/Microsoft.ML.Tests/CachingTests.cs b/test/Microsoft.ML.Tests/CachingTests.cs
index 1b58848391..46d1c7149e 100644
--- a/test/Microsoft.ML.Tests/CachingTests.cs
+++ b/test/Microsoft.ML.Tests/CachingTests.cs
@@ -43,8 +43,8 @@ public void CacheCheckpointTest()
             var trainData = Enumerable.Range(0, 100).Select(c => new MyData()).ToArray();
 
             var pipe = ML.Transforms.CopyColumns("F1", "Features")
-                .Append(ML.Transforms.Normalize("Norm1", "F1"))
-                .Append(ML.Transforms.Normalize("Norm2", "F1", Transforms.NormalizingEstimator.NormalizationMode.MeanVariance));
+                .Append(ML.Transforms.NormalizeMinMax("Norm1", "F1"))
+                .Append(ML.Transforms.NormalizeMeanVariance("Norm2", "F1"));
 
             pipe.Fit(ML.Data.LoadFromEnumerable(trainData));
 
@@ -53,8 +53,8 @@ public void CacheCheckpointTest()
             trainData = Enumerable.Range(0, 100).Select(c => new MyData()).ToArray();
 
             pipe = ML.Transforms.CopyColumns("F1", "Features")
                 .AppendCacheCheckpoint(ML)
-                .Append(ML.Transforms.Normalize("Norm1", "F1"))
-                .Append(ML.Transforms.Normalize("Norm2", "F1", Transforms.NormalizingEstimator.NormalizationMode.MeanVariance));
+                .Append(ML.Transforms.NormalizeMinMax("Norm1", "F1"))
+                .Append(ML.Transforms.NormalizeMeanVariance("Norm2", "F1"));
 
             pipe.Fit(ML.Data.LoadFromEnumerable(trainData));
 
diff --git a/test/Microsoft.ML.Tests/FeatureContributionTests.cs b/test/Microsoft.ML.Tests/FeatureContributionTests.cs
index d1f691d8c5..02cf7b3d1b 100644
--- a/test/Microsoft.ML.Tests/FeatureContributionTests.cs
+++ b/test/Microsoft.ML.Tests/FeatureContributionTests.cs
@@ -8,11 +8,9 @@
 using Microsoft.ML.Data;
 using Microsoft.ML.Data.IO;
 using Microsoft.ML.Internal.Utilities;
-using Microsoft.ML.Model;
 using Microsoft.ML.RunTests;
 using Microsoft.ML.TestFramework.Attributes;
 using Microsoft.ML.Trainers;
-using Microsoft.ML.Transforms;
 using Xunit;
 using Xunit.Abstractions;
 
@@ -306,7 +304,7 @@ private IDataView GetSparseDataset(TaskType task = TaskType.Regression, int numb
             var srcDV = bldr.GetDataView();
 
             var pipeline = ML.Transforms.Concatenate("Features", "X1", "X2VBuffer", "X3Important")
-                .Append(ML.Transforms.Normalize("Features"));
+                .Append(ML.Transforms.NormalizeMinMax("Features"));
 
             if (task == TaskType.BinaryClassification)
                 return pipeline.Append(ML.Transforms.Conversion.ConvertType("Label", outputKind: DataKind.Boolean))
diff --git a/test/Microsoft.ML.Tests/OnnxConversionTest.cs b/test/Microsoft.ML.Tests/OnnxConversionTest.cs
index 723ab59055..1cd5a2a0d0 100644
--- a/test/Microsoft.ML.Tests/OnnxConversionTest.cs
+++ b/test/Microsoft.ML.Tests/OnnxConversionTest.cs
@@ -57,7 +57,7 @@ public void SimpleEndToEndOnnxConversionTest()
                 hasHeader: true);
             var cachedTrainData = mlContext.Data.Cache(data);
             var dynamicPipeline =
-                mlContext.Transforms.Normalize("FeatureVector")
+                mlContext.Transforms.NormalizeMinMax("FeatureVector")
                 .AppendCacheCheckpoint(mlContext)
                 .Append(mlContext.Regression.Trainers.Sdca(new SdcaRegressionTrainer.Options() {
                     LabelColumnName = "Target",
@@ -137,7 +137,7 @@ public void KmeansOnnxConversionTest()
                 separatorChar: '\t',
                 hasHeader: true);
 
-            var pipeline = mlContext.Transforms.Normalize("Features").
+            var pipeline = mlContext.Transforms.NormalizeMinMax("Features").
                 Append(mlContext.Clustering.Trainers.KMeans(new Trainers.KMeansTrainer.Options
                 {
                     FeatureColumnName = DefaultColumnNames.Features,
@@ -315,7 +315,7 @@ public void LogisticRegressionOnnxConversionTest()
                 hasHeader: true);
             var cachedTrainData = mlContext.Data.Cache(data);
             var dynamicPipeline =
-                mlContext.Transforms.Normalize("FeatureVector")
+                mlContext.Transforms.NormalizeMinMax("FeatureVector")
                 .AppendCacheCheckpoint(mlContext)
                 .Append(mlContext.Regression.Trainers.Sdca(new SdcaRegressionTrainer.Options() {
                     LabelColumnName = "Target",
@@ -352,7 +352,7 @@ public void LightGbmBinaryClassificationOnnxConversionTest()
                 hasHeader: true);
             var cachedTrainData = mlContext.Data.Cache(data);
             var dynamicPipeline =
-                mlContext.Transforms.Normalize("FeatureVector")
+                mlContext.Transforms.NormalizeMinMax("FeatureVector")
                 .AppendCacheCheckpoint(mlContext)
                 .Append(mlContext.Regression.Trainers.LightGbm(labelColumnName: "Target", featureColumnName: "FeatureVector", numberOfIterations: 3, numberOfLeaves: 16, minimumExampleCountPerLeaf: 100));
             var model = dynamicPipeline.Fit(data);
@@ -383,7 +383,7 @@ public void MulticlassLogisticRegressionOnnxConversionTest()
                 separatorChar: '\t',
                 hasHeader: true);
 
-            var pipeline = mlContext.Transforms.Normalize("Features").
+            var pipeline = mlContext.Transforms.NormalizeMinMax("Features").
                 Append(mlContext.Transforms.Conversion.MapValueToKey("Label")).
                 Append(mlContext.MulticlassClassification.Trainers.LbfgsMaximumEntropy(new LbfgsMaximumEntropyMulticlassTrainer.Options() { NumberOfThreads = 1 }));
 
@@ -416,7 +416,7 @@ public void RemoveVariablesInPipelineTest()
             var pipeline = mlContext.Transforms.Categorical.OneHotEncoding("F2", "F2", Transforms.OneHotEncodingEstimator.OutputKind.Bag)
                 .Append(mlContext.Transforms.ReplaceMissingValues(new MissingValueReplacingEstimator.ColumnOptions("F2")))
                 .Append(mlContext.Transforms.Concatenate("Features", "F1", "F2"))
-                .Append(mlContext.Transforms.Normalize("Features"))
+                .Append(mlContext.Transforms.NormalizeMinMax("Features"))
                 .Append(mlContext.BinaryClassification.Trainers.FastTree(labelColumnName: "Label", featureColumnName: "Features", numberOfLeaves: 2, numberOfTrees: 1, minimumExampleCountPerLeaf: 2));
 
             var model = pipeline.Fit(data);
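The conversion tests above pair a `NormalizeMinMax` step with various trainers and then export. The export call they exercise looks roughly like this (requires the Microsoft.ML.OnnxConverter package; file name hypothetical):

```csharp
// Sketch only, not part of the patch. `model` is the fitted pipeline, `data` its training view.
using (var stream = System.IO.File.Create("model.onnx"))
    mlContext.Model.ConvertToOnnx(model, data, stream);
```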
diff --git a/test/Microsoft.ML.Tests/PermutationFeatureImportanceTests.cs b/test/Microsoft.ML.Tests/PermutationFeatureImportanceTests.cs
index db49bbf161..ac86a8703f 100644
--- a/test/Microsoft.ML.Tests/PermutationFeatureImportanceTests.cs
+++ b/test/Microsoft.ML.Tests/PermutationFeatureImportanceTests.cs
@@ -421,7 +421,7 @@ private IDataView GetDenseDataset(TaskType task = TaskType.Regression)
             var srcDV = bldr.GetDataView();
 
             var pipeline = ML.Transforms.Concatenate("Features", "X1", "X2Important", "X3", "X4Rand")
-                .Append(ML.Transforms.Normalize("Features"));
+                .Append(ML.Transforms.NormalizeMinMax("Features"));
 
             if (task == TaskType.BinaryClassification)
                 return pipeline.Append(ML.Transforms.Conversion.ConvertType("Label", outputKind: DataKind.Boolean))
                     .Fit(srcDV).Transform(srcDV);
@@ -501,7 +501,7 @@ private IDataView GetSparseDataset(TaskType task = TaskType.Regression)
             var srcDV = bldr.GetDataView();
 
             var pipeline = ML.Transforms.Concatenate("Features", "X1", "X2VBuffer", "X3Important")
-                .Append(ML.Transforms.Normalize("Features"));
+                .Append(ML.Transforms.NormalizeMinMax("Features"));
 
             if (task == TaskType.BinaryClassification)
             {
                 return pipeline.Append(ML.Transforms.Conversion.ConvertType("Label", outputKind: DataKind.Boolean))
diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs
index cce1de27b5..d6c80e9482 100644
--- a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs
+++ b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs
@@ -5,7 +5,6 @@
 using System;
 using System.Collections.Generic;
 using System.Collections.Immutable;
-using System.IO;
 using System.Linq;
 using Microsoft.ML;
 using Microsoft.ML.Data;
diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs
index 867925788f..cb4ad1f999 100644
--- a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs
+++ b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs
@@ -4,7 +4,6 @@
 
 using System;
 using System.Collections.Generic;
-using System.IO;
 using System.Linq;
 using Microsoft.ML.Data;
 using Microsoft.ML.RunTests;
@@ -93,7 +92,7 @@ private void TrainRegression(string trainDataPath, string testDataPath, string m
             var pipeline =
                 // First 'normalize' the data (rescale to be
                 // between -1 and 1 for all examples), and then train the model.
-                mlContext.Transforms.Normalize("FeatureVector")
+                mlContext.Transforms.NormalizeMinMax("FeatureVector")
                 // We add a step for caching data in memory so that the downstream iterative training
                 // algorithm can efficiently scan through the data multiple times. Otherwise, the following
                 // trainer will read data from disk multiple times. The caching mechanism uses an on-demand strategy.
diff --git a/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationTests.cs b/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationTests.cs
index b92b95ad32..5f89e1ad79 100644
--- a/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationTests.cs
+++ b/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationTests.cs
@@ -29,7 +29,7 @@ public void TrainAndPredictIrisModelTest()
             );
 
             var pipe = mlContext.Transforms.Concatenate("Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth")
-                .Append(mlContext.Transforms.Normalize("Features"))
+                .Append(mlContext.Transforms.NormalizeMinMax("Features"))
                 .Append(mlContext.Transforms.Conversion.MapValueToKey("Label"))
                 .AppendCacheCheckpoint(mlContext)
                 .Append(mlContext.MulticlassClassification.Trainers.SdcaMaximumEntropy(
diff --git a/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationWithStringLabelTests.cs b/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationWithStringLabelTests.cs
index 088f164b28..fdef9d0513 100644
--- a/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationWithStringLabelTests.cs
+++ b/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationWithStringLabelTests.cs
@@ -34,7 +34,7 @@ public void TrainAndPredictIrisModelWithStringLabelTest()
             // Create Estimator
             var pipe = mlContext.Transforms.Concatenate("Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth")
-                .Append(mlContext.Transforms.Normalize("Features"))
+                .Append(mlContext.Transforms.NormalizeMinMax("Features"))
                 .Append(mlContext.Transforms.Conversion.MapValueToKey("Label", "IrisPlantType"), TransformerScope.TrainTest)
                 .AppendCacheCheckpoint(mlContext)
                 .Append(mlContext.MulticlassClassification.Trainers.SdcaMaximumEntropy(
diff --git a/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/IrisPlantClassificationTests.cs b/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/IrisPlantClassificationTests.cs
index f2098c68e5..36eb2d1b6b 100644
--- a/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/IrisPlantClassificationTests.cs
+++ b/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/IrisPlantClassificationTests.cs
@@ -27,7 +27,7 @@ public void TrainAndPredictIrisModelUsingDirectInstantiationTest()
             );
 
             var pipe = mlContext.Transforms.Concatenate("Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth")
-                .Append(mlContext.Transforms.Normalize("Features"))
+                .Append(mlContext.Transforms.NormalizeMinMax("Features"))
                 .Append(mlContext.Transforms.Conversion.MapValueToKey("Label"))
                 .AppendCacheCheckpoint(mlContext)
                 .Append(mlContext.MulticlassClassification.Trainers.SdcaMaximumEntropy(
diff --git a/test/Microsoft.ML.Tests/Transformers/NormalizerTests.cs b/test/Microsoft.ML.Tests/Transformers/NormalizerTests.cs
index 775289ca07..17d41c418e 100644
--- a/test/Microsoft.ML.Tests/Transformers/NormalizerTests.cs
+++ b/test/Microsoft.ML.Tests/Transformers/NormalizerTests.cs
@@ -225,8 +225,8 @@ public void SimpleConstructorsAndExtensions()
             var est1 = new NormalizingEstimator(Env, "float4");
             var est2 = new NormalizingEstimator(Env, NormalizingEstimator.NormalizationMode.MinMax, ("float4", "float4"));
             var est3 = new NormalizingEstimator(Env, new NormalizingEstimator.MinMaxColumnOptions("float4"));
-            var est4 = ML.Transforms.Normalize("float4", "float4", NormalizingEstimator.NormalizationMode.MinMax);
-            var est5 = ML.Transforms.Normalize("float4");
+            var est4 = ML.Transforms.NormalizeMinMax("float4", "float4");
+            var est5 = ML.Transforms.NormalizeMinMax("float4");
 
             var data1 = est1.Fit(data).Transform(data);
             var data2 = est2.Fit(data).Transform(data);
@@ -246,7 +246,7 @@ public void SimpleConstructorsAndExtensions()
             // Tests for MeanVariance
             var est6 = new NormalizingEstimator(Env, NormalizingEstimator.NormalizationMode.MeanVariance, ("float4", "float4"));
             var est7 = new NormalizingEstimator(Env, new NormalizingEstimator.MeanVarianceColumnOptions("float4"));
-            var est8 = ML.Transforms.Normalize("float4", "float4", NormalizingEstimator.NormalizationMode.MeanVariance);
+            var est8 = ML.Transforms.NormalizeMeanVariance("float4", "float4");
 
             var data6 = est6.Fit(data).Transform(data);
             var data7 = est7.Fit(data).Transform(data);
@@ -259,7 +259,7 @@ public void SimpleConstructorsAndExtensions()
             // Tests for LogMeanVariance
             var est9 = new NormalizingEstimator(Env, NormalizingEstimator.NormalizationMode.LogMeanVariance, ("float4", "float4"));
             var est10 = new NormalizingEstimator(Env, new NormalizingEstimator.LogMeanVarianceColumnOptions("float4"));
-            var est11 = ML.Transforms.Normalize("float4", "float4", NormalizingEstimator.NormalizationMode.LogMeanVariance);
+            var est11 = ML.Transforms.NormalizeLogMeanVariance("float4", "float4");
 
             var data9 = est9.Fit(data).Transform(data);
             var data10 = est10.Fit(data).Transform(data);
@@ -272,7 +272,7 @@ public void SimpleConstructorsAndExtensions()
             // Tests for Binning
             var est12 = new NormalizingEstimator(Env, NormalizingEstimator.NormalizationMode.Binning, ("float4", "float4"));
             var est13 = new NormalizingEstimator(Env, new NormalizingEstimator.BinningColumnOptions("float4"));
-            var est14 = ML.Transforms.Normalize("float4", "float4", NormalizingEstimator.NormalizationMode.Binning);
+            var est14 = ML.Transforms.NormalizeBinning("float4", "float4");
 
             var data12 = est12.Fit(data).Transform(data);
             var data13 = est13.Fit(data).Transform(data);
@@ -285,7 +285,7 @@ public void SimpleConstructorsAndExtensions()
             // Tests for SupervisedBinning
             var est15 = new NormalizingEstimator(Env, NormalizingEstimator.NormalizationMode.SupervisedBinning, ("float4", "float4"));
             var est16 = new NormalizingEstimator(Env, new NormalizingEstimator.SupervisedBinningColumOptions("float4"));
-            var est17 = ML.Transforms.Normalize("float4", "float4", NormalizingEstimator.NormalizationMode.SupervisedBinning);
+            var est17 = ML.Transforms.NormalizeSupervisedBinning("float4", "float4");
 
             var data15 = est15.Fit(data).Transform(data);
             var data16 = est16.Fit(data).Transform(data);
@@ -314,11 +314,11 @@ public void NormalizerExperimentalExtensions()
             var data = loader.Load(dataPath);
 
             // Normalizer Extensions
-            var est1 = ML.Transforms.Normalize("float4", "float4", NormalizingEstimator.NormalizationMode.MinMax);
-            var est2 = ML.Transforms.Normalize("float4", "float4", NormalizingEstimator.NormalizationMode.MeanVariance);
-            var est3 = ML.Transforms.Normalize("float4", "float4", NormalizingEstimator.NormalizationMode.LogMeanVariance);
-            var est4 = ML.Transforms.Normalize("float4", "float4", NormalizingEstimator.NormalizationMode.Binning);
-            var est5 = ML.Transforms.Normalize("float4", "float4", NormalizingEstimator.NormalizationMode.SupervisedBinning);
+            var est1 = ML.Transforms.NormalizeMinMax("float4", "float4");
+            var est2 = ML.Transforms.NormalizeMeanVariance("float4", "float4");
+            var est3 = ML.Transforms.NormalizeLogMeanVariance("float4", "float4");
+            var est4 = ML.Transforms.NormalizeBinning("float4", "float4");
+            var est5 = ML.Transforms.NormalizeSupervisedBinning("float4", "float4");
 
             // Normalizer Extensions (Experimental)
             var est6 = ML.Transforms.NormalizeMinMax("float4", "float4");
@@ -370,7 +370,7 @@ public void NormalizerExperimentalExtensionGetColumnPairs()
             });
             var data = loader.Load(dataPath);
 
-            var est = ML.Transforms.Normalize("output", "input", NormalizingEstimator.NormalizationMode.MinMax);
+            var est = ML.Transforms.NormalizeMinMax("output", "input");
             var t = est.Fit(data);
             Assert.Single(t.GetColumnPairs());
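The final test asserts that an estimator built through the new extension still reports exactly one input/output pair. Assuming the pairs come back as (output, input) tuples, as elsewhere in the API, consuming that mapping looks roughly like:

```csharp
// Sketch only, not part of the patch. `data` is any IDataView with an "input" column.
var estimator = mlContext.Transforms.NormalizeMinMax("output", "input");
var transformer = estimator.Fit(data);

foreach (var (outputColumnName, inputColumnName) in transformer.GetColumnPairs())
    Console.WriteLine($"{inputColumnName} -> {outputColumnName}");
```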