diff --git a/src/Microsoft.ML.Data/Transforms/Normalizer.cs b/src/Microsoft.ML.Data/Transforms/Normalizer.cs index e0f5d5f019..bf2c84c4c4 100644 --- a/src/Microsoft.ML.Data/Transforms/Normalizer.cs +++ b/src/Microsoft.ML.Data/Transforms/Normalizer.cs @@ -26,6 +26,43 @@ namespace Microsoft.ML.Transforms { + /// + /// for the . + /// + /// + /// or or a known-sized vector of those types. | + /// | Output column data type | The same data type as the input column | + /// + /// The resulting NormalizingEstimator will normalize the data in one of the following ways based upon how it was created: + /// * Min Max - A linear rescale that is based upon the minimum and maximum values for each row. + /// * Mean Variance - Rescale each row to unit variance and, optionally, zero mean. + /// * Log Mean Variance - Rescale each row to unit variance based on a log scale. + /// * Binning - Bucketizes the data in each row and performs a linear rescale based on the calculated bins. + /// * Supervised Binning - Bucketizes the data in each row and performs a linear rescale based on the calculated bins. The bin calculation is based on correlation of the Label column. + /// + /// ### Estimator Details + /// The interval of the normalized data depends on whether fixZero is specified or not. fixZero defaults to true. + /// When fixZero is false, the normalized interval is $[0,1]$ and the distribution of the normalized values depends on the normalization mode. For example, with Min Max, the minimum + /// and maximum values are mapped to 0 and 1 respectively and remaining values fall in between. + /// When fixZero is set, the normalized interval is $[-1,1]$ with the distribution of the normalized values depending on the normalization mode, but the behavior is different. + /// With Min Max, the distribution depends on how far away the number is from 0, resulting in the number with the largest distance being mapped to 1 if it's a positive number + /// or -1 if it's a negative number. 
The distance from 0 will affect the distribution with a majority of numbers that are closer together normalizing towards 0. + /// + /// To create this estimator use one of the following: + /// * [NormalizeMinMax](xref:Microsoft.ML.NormalizationCatalog.NormalizeMinMax(Microsoft.ML.TransformsCatalog, System.String, System.String, System.Int64, System.Boolean)) + /// * [NormalizeMeanVariance](xref:Microsoft.ML.NormalizationCatalog.NormalizeMeanVariance(Microsoft.ML.TransformsCatalog,System.String,System.String,System.Int64,System.Boolean,System.Boolean)) + /// * [NormalizeLogMeanVariance](xref:Microsoft.ML.NormalizationCatalog.NormalizeLogMeanVariance(Microsoft.ML.TransformsCatalog,System.String,System.String,System.Int64,System.Boolean)) + /// * [NormalizeBinning](xref:Microsoft.ML.NormalizationCatalog.NormalizeBinning(Microsoft.ML.TransformsCatalog,System.String,System.String,System.Int64,System.Boolean,System.Int32)) + /// * [NormalizeSupervisedBinning](xref:Microsoft.ML.NormalizationCatalog.NormalizeSupervisedBinning(Microsoft.ML.TransformsCatalog,System.String,System.String,System.String,System.Int64,System.Boolean,System.Int32,System.Int32)) + /// ]]> + /// + /// public sealed class NormalizingEstimator : IEstimator { [BestFriend] @@ -284,6 +321,9 @@ public SchemaShape GetOutputSchema(SchemaShape inputSchema) } } + /// + /// resulting from fitting an . + /// public sealed partial class NormalizingTransformer : OneToOneTransformerBase { internal const string LoaderSignature = "Normalizer"; diff --git a/src/Microsoft.ML.Transforms/NormalizerCatalog.cs b/src/Microsoft.ML.Transforms/NormalizerCatalog.cs index 7453efbaac..ccf0f35bb7 100644 --- a/src/Microsoft.ML.Transforms/NormalizerCatalog.cs +++ b/src/Microsoft.ML.Transforms/NormalizerCatalog.cs @@ -31,11 +31,13 @@ internal static NormalizingEstimator Normalize(this TransformsCatalog catalog, } /// - /// It normalizes the data based on the observed minimum and maximum values of the data. 
+ /// Create a , which normalizes based on the observed minimum and maximum values of the data. /// /// The transform catalog - /// Name of the column resulting from the transformation of . - /// Name of the column to transform. If set to , the value of the will be used as source. + /// Name of the column resulting from the transformation of . + /// The data type on this column is the same as the input column. + /// Name of the column to transform. If set to , the value of the will be used as source. + /// The data type on this column should be , or a known-sized vector of those types. /// Maximum number of examples used to train the normalizer. /// Whether to map zero to zero, preserving sparsity. /// @@ -55,10 +57,12 @@ public static NormalizingEstimator NormalizeMinMax(this TransformsCatalog catalo } /// - /// It normalizes the data based on the observed minimum and maximum values of the data. + /// Create a , which normalizes based on the observed minimum and maximum values of the data. /// /// The transform catalog - /// List of Output and Input column pairs. + /// The pairs of input and output columns. + /// The input columns must be of data type , or a known-sized vector of those types. + /// The data type for the output column will be the same as the associated input column. /// Maximum number of examples used to train the normalizer. /// Whether to map zero to zero, preserving sparsity. public static NormalizingEstimator NormalizeMinMax(this TransformsCatalog catalog, InputOutputColumnPair[] columns, @@ -69,11 +73,13 @@ public static NormalizingEstimator NormalizeMinMax(this TransformsCatalog catalo new NormalizingEstimator.MinMaxColumnOptions(column.OutputColumnName, column.InputColumnName, maximumExampleCount, fixZero)).ToArray()); /// - /// It normalizes the data based on the computed mean and variance of the data. + /// Create a , which normalizes based on the computed mean and variance of the data. 
/// /// The transform catalog - /// Name of the column resulting from the transformation of . - /// Name of the column to transform. If set to , the value of the will be used as source. + /// Name of the column resulting from the transformation of . + /// The data type on this column is the same as the input column. + /// Name of the column to transform. If set to , the value of the will be used as source. + /// The data type on this column should be , or a known-sized vector of those types. /// Maximum number of examples used to train the normalizer. /// Whether to map zero to zero, preserving sparsity. /// Whether to use CDF as the output. @@ -95,10 +101,12 @@ public static NormalizingEstimator NormalizeMeanVariance(this TransformsCatalog } /// - /// It normalizes the data based on the computed mean and variance of the data. + /// Create a , which normalizes based on the computed mean and variance of the data. /// /// The transform catalog - /// List of Output and Input column pairs. + /// The pairs of input and output columns. + /// The input columns must be of data type , or a known-sized vector of those types. + /// The data type for the output column will be the same as the associated input column. /// Maximum number of examples used to train the normalizer. /// Whether to map zero to zero, preserving sparsity. /// Whether to use CDF as the output. @@ -111,11 +119,13 @@ public static NormalizingEstimator NormalizeMeanVariance(this TransformsCatalog new NormalizingEstimator.MeanVarianceColumnOptions(column.OutputColumnName, column.InputColumnName, maximumExampleCount, fixZero, useCdf)).ToArray()); /// - /// It normalizes the data based on the computed mean and variance of the logarithm of the data. + /// Create a , which normalizes based on the computed mean and variance of the logarithm of the data. /// /// The transform catalog - /// Name of the column resulting from the transformation of . - /// Name of the column to transform. 
If set to , the value of the will be used as source. + /// Name of the column resulting from the transformation of . + /// The data type on this column is the same as the input column. + /// Name of the column to transform. If set to , the value of the will be used as source. + /// The data type on this column should be , or a known-sized vector of those types. /// Maximum number of examples used to train the normalizer. /// Whether to use CDF as the output. /// @@ -135,10 +145,12 @@ public static NormalizingEstimator NormalizeLogMeanVariance(this TransformsCatal } /// - /// It normalizes the data based on the computed mean and variance of the logarithm of the data. + /// Create a , which normalizes based on the computed mean and variance of the logarithm of the data. /// /// The transform catalog - /// List of Output and Input column pairs. + /// The pairs of input and output columns. + /// The input columns must be of data type , or a known-sized vector of those types. + /// The data type for the output column will be the same as the associated input column. /// Maximum number of examples used to train the normalizer. /// Whether to use CDF as the output. public static NormalizingEstimator NormalizeLogMeanVariance(this TransformsCatalog catalog, InputOutputColumnPair[] columns, @@ -149,11 +161,13 @@ public static NormalizingEstimator NormalizeLogMeanVariance(this TransformsCatal new NormalizingEstimator.LogMeanVarianceColumnOptions(column.OutputColumnName, column.InputColumnName, maximumExampleCount, useCdf)).ToArray()); /// - /// The values are assigned into bins with equal density. + /// Create a , which normalizes by assigning the data into bins with equal density. /// /// The transform catalog - /// Name of the column resulting from the transformation of . - /// Name of the column to transform. If set to , the value of the will be used as source. + /// Name of the column resulting from the transformation of . 
+ /// The data type on this column is the same as the input column. + /// Name of the column to transform. If set to , the value of the will be used as source. + /// The data type on this column should be , or a known-sized vector of those types. /// Maximum number of examples used to train the normalizer. /// Whether to map zero to zero, preserving sparsity. /// Maximum number of bins (power of 2 recommended). @@ -175,10 +189,12 @@ public static NormalizingEstimator NormalizeBinning(this TransformsCatalog catal } /// - /// The values are assigned into bins with equal density. + /// Create a , which normalizes by assigning the data into bins with equal density. /// /// The transform catalog - /// List of Output and Input column pairs. + /// The pairs of input and output columns. + /// The input columns must be of data type , or a known-sized vector of those types. + /// The data type for the output column will be the same as the associated input column. /// Maximum number of examples used to train the normalizer. /// Whether to map zero to zero, preserving sparsity. /// Maximum number of bins (power of 2 recommended). @@ -191,11 +207,13 @@ public static NormalizingEstimator NormalizeBinning(this TransformsCatalog catal new NormalizingEstimator.BinningColumnOptions(column.OutputColumnName, column.InputColumnName, maximumExampleCount, fixZero, maximumBinCount)).ToArray()); /// - /// The values are assigned into bins based on correlation with the column. + /// Create a , which normalizes by assigning the data into bins based on correlation with the column. /// /// The transform catalog - /// Name of the column resulting from the transformation of . - /// Name of the column to transform. If set to , the value of the will be used as source. + /// Name of the column resulting from the transformation of . + /// The data type on this column is the same as the input column. + /// Name of the column to transform. If set to , the value of the will be used as source. 
+ /// The data type on this column should be , or a known-sized vector of those types. /// Name of the label column for supervised binning. /// Maximum number of examples used to train the normalizer. /// Whether to map zero to zero, preserving sparsity. @@ -221,10 +239,12 @@ public static NormalizingEstimator NormalizeSupervisedBinning(this TransformsCat } /// - /// The values are assigned into bins based on correlation with the column. + /// Create a , which normalizes by assigning the data into bins based on correlation with the column. /// /// The transform catalog - /// List of Output and Input column pairs. + /// The pairs of input and output columns. + /// The input columns must be of data type , or a known-sized vector of those types. + /// The data type for the output column will be the same as the associated input column. /// Name of the label column for supervised binning. /// Maximum number of examples used to train the normalizer. /// Whether to map zero to zero, preserving sparsity. @@ -256,7 +276,8 @@ internal static NormalizingEstimator Normalize(this TransformsCatalog catalog, /// a pre-processing step would be applied to make the specified column's mean be a zero vector. /// /// The transform's catalog. - /// Name of the column resulting from the transformation of . + /// Name of the column resulting from the transformation of . + /// The data type on this column is the same as the input column. /// Name of column to transform. If set to , the value of the will be used as source. /// Type of norm to use to normalize each sample. The indicated norm of the resulted vector will be normalized to one. /// If , subtract mean from each value before normalizing and use the raw input otherwise. @@ -290,7 +311,8 @@ internal static LpNormNormalizingEstimator NormalizeLpNorm(this TransformsCatalo /// a pre-processing step would be applied to make the specified column's mean be a zero vector. /// /// The transform's catalog. 
- /// Name of the column resulting from the transformation of . + /// Name of the column resulting from the transformation of . + /// The data type on this column is the same as the input column. /// Name of column to transform. If set to , the value of the will be used as source. /// If , subtract mean from each value before normalizing and use the raw input otherwise. /// If , resulted vector's standard deviation would be one. Otherwise, resulted vector's L2-norm would be one.