diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/FeatureSelectionTransform.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/FeatureSelectionTransform.cs index 113b4794fb..de98b1ddb0 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/FeatureSelectionTransform.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/FeatureSelectionTransform.cs @@ -55,7 +55,7 @@ public static void Example() // specify the parameter `numBins', which controls the number of bins used in the approximation of the mutual information // between features and label. var mutualInfoEst = ml.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation( - outputColumnName: "FeaturesMISelect", inputColumnName: "FeaturesCountSelect", labelColumn: "Label", slotsInOutput: 5); + outputColumnName: "FeaturesMISelect", inputColumnName: "FeaturesCountSelect", labelColumnName: "Label", slotsInOutput: 5); // Now, we can put the previous two transformations together in a pipeline. var pipeline = countSelectEst.Append(mutualInfoEst); diff --git a/src/Microsoft.ML.Transforms/CountFeatureSelection.cs b/src/Microsoft.ML.Transforms/CountFeatureSelection.cs index 4e719a4cd9..7d16a7936e 100644 --- a/src/Microsoft.ML.Transforms/CountFeatureSelection.cs +++ b/src/Microsoft.ML.Transforms/CountFeatureSelection.cs @@ -10,7 +10,6 @@ using Microsoft.ML; using Microsoft.ML.CommandLine; using Microsoft.ML.Data; -using Microsoft.ML.EntryPoints; using Microsoft.ML.Internal.Utilities; using Microsoft.ML.Transforms.FeatureSelection; @@ -54,23 +53,25 @@ public sealed class ColumnOptions public readonly string Name; /// <summary> Name of the column to transform.</summary> public readonly string InputColumnName; - /// <summary> If the count of non-default values for a slot is greater than or equal to this threshold in the training data, the slot is preserved.</summary> - public readonly long MinCount; + /// <summary>If the count of non-default values for a slot is greater than or equal to this threshold in the training 
data, the slot is preserved.</summary> + public readonly long Count; /// <summary> /// Describes the parameters of the feature selection process for a column pair. /// </summary> /// <param name="name">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param> /// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="name"/> will be used as source.</param> - /// <param name="minCount">If the count of non-default values for a slot is greater than or equal to this threshold in the training data, the slot is preserved.</param> - public ColumnOptions(string name, string inputColumnName = null, long minCount = Defaults.Count) + /// <param name="count">If the count of non-default values for a slot is greater than or equal to this threshold in the training data, the slot is preserved.</param> + + public ColumnOptions(string name, string inputColumnName = null, long count = Defaults.Count) { Name = name; Contracts.CheckValue(Name, nameof(Name)); InputColumnName = inputColumnName ?? 
name; Contracts.CheckValue(InputColumnName, nameof(InputColumnName)); - MinCount = minCount; + Contracts.CheckParam(count >= 0, nameof(count), "Must be non-negative."); + Count = count; } } @@ -183,7 +184,7 @@ internal static IDataTransform Create(IHostEnvironment env, Options options, IDa host.CheckUserArg(Utils.Size(options.Columns) > 0, nameof(options.Columns)); host.CheckUserArg(options.Count > 0, nameof(options.Count)); - var columnOptions = options.Columns.Select(inColName => new ColumnOptions(inColName, minCount: options.Count)).ToArray(); + var columnOptions = options.Columns.Select(inColName => new ColumnOptions(inColName, count: options.Count)).ToArray(); return new CountFeatureSelectingEstimator(env, columnOptions).Fit(input).Transform(input) as IDataTransform; } @@ -206,11 +207,11 @@ private static void CreateDropAndCopyColumns(ColumnOptions[] columnOptions, int selectedCount[i] = 0; for (int j = 0; j < score.Length; j++) { - if (score[j] < columnOptions[i].MinCount) + if (score[j] < columnOptions[i].Count) { // Adjacent slots are combined into a single range. 
int min = j; - while (j < score.Length && score[j] < columnOptions[i].MinCount) + while (j < score.Length && score[j] < columnOptions[i].Count) j++; int max = j - 1; slots.Add((min, max)); diff --git a/src/Microsoft.ML.Transforms/FeatureSelectionCatalog.cs b/src/Microsoft.ML.Transforms/FeatureSelectionCatalog.cs index 5217043a50..32a977be53 100644 --- a/src/Microsoft.ML.Transforms/FeatureSelectionCatalog.cs +++ b/src/Microsoft.ML.Transforms/FeatureSelectionCatalog.cs @@ -14,9 +14,9 @@ public static class FeatureSelectionCatalog { /// <include file='doc.xml' path='doc/members/member[@name="MutualInformationFeatureSelection"]/*' /> /// <param name="catalog">The transform's catalog.</param> - /// <param name="labelColumn">Name of the column to use for labels.</param> + /// <param name="labelColumnName">The name of the label column.</param> /// <param name="slotsInOutput">The maximum number of slots to preserve in the output. The number of slots to preserve is taken across all input columns.</param> - /// <param name="numBins">Max number of bins used to approximate mutual information between each input column and the label column. Power of 2 recommended.</param> + /// <param name="numberOfBins">Max number of bins used to approximate mutual information between each input column and the label column. 
Power of 2 recommended.</param> /// <param name="columns">Specifies the names of the input columns for the transformation, and their respective output column names.</param> /// <example> /// <format type="text/markdown"> @@ -26,20 +26,20 @@ public static class FeatureSelectionCatalog /// </format> /// </example> public static MutualInformationFeatureSelectingEstimator SelectFeaturesBasedOnMutualInformation(this TransformsCatalog.FeatureSelectionTransforms catalog, - string labelColumn = MutualInfoSelectDefaults.LabelColumn, + string labelColumnName = MutualInfoSelectDefaults.LabelColumn, int slotsInOutput = MutualInfoSelectDefaults.SlotsInOutput, - int numBins = MutualInfoSelectDefaults.NumBins, + int numberOfBins = MutualInfoSelectDefaults.NumBins, params ColumnOptions[] columns) - => new MutualInformationFeatureSelectingEstimator(CatalogUtils.GetEnvironment(catalog), labelColumn, slotsInOutput, numBins, + => new MutualInformationFeatureSelectingEstimator(CatalogUtils.GetEnvironment(catalog), labelColumnName, slotsInOutput, numberOfBins, ColumnOptions.ConvertToValueTuples(columns)); /// <include file='doc.xml' path='doc/members/member[@name="MutualInformationFeatureSelection"]/*' /> /// <param name="catalog">The transform's catalog.</param> /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param> /// <param name="inputColumnName">Name of column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param> - /// <param name="labelColumn">Name of the column to use for labels.</param> + /// <param name="labelColumnName">The name of the label column.</param> /// <param name="slotsInOutput">The maximum number of slots to preserve in the output. 
The number of slots to preserve is taken across all input columns.</param> - /// <param name="numBins">Max number of bins used to approximate mutual information between each input column and the label column. Power of 2 recommended.</param> + /// <param name="numberOfBins">Max number of bins used to approximate mutual information between each input column and the label column. Power of 2 recommended.</param> /// <example> /// <format type="text/markdown"> /// <![CDATA[ @@ -49,10 +49,10 @@ public static MutualInformationFeatureSelectingEstimator SelectFeaturesBasedOnMu /// </example> public static MutualInformationFeatureSelectingEstimator SelectFeaturesBasedOnMutualInformation(this TransformsCatalog.FeatureSelectionTransforms catalog, string outputColumnName, string inputColumnName = null, - string labelColumn = MutualInfoSelectDefaults.LabelColumn, + string labelColumnName = MutualInfoSelectDefaults.LabelColumn, int slotsInOutput = MutualInfoSelectDefaults.SlotsInOutput, - int numBins = MutualInfoSelectDefaults.NumBins) - => new MutualInformationFeatureSelectingEstimator(CatalogUtils.GetEnvironment(catalog), outputColumnName, inputColumnName, labelColumn, slotsInOutput, numBins); + int numberOfBins = MutualInfoSelectDefaults.NumBins) + => new MutualInformationFeatureSelectingEstimator(CatalogUtils.GetEnvironment(catalog), outputColumnName, inputColumnName, labelColumnName, slotsInOutput, numberOfBins); /// <include file='doc.xml' path='doc/members/member[@name="CountFeatureSelection"]' /> /// <param name="catalog">The transform's catalog.</param> diff --git a/src/Microsoft.ML.Transforms/MutualInformationFeatureSelection.cs b/src/Microsoft.ML.Transforms/MutualInformationFeatureSelection.cs index 4c0066e1b9..05d67116a2 100644 --- a/src/Microsoft.ML.Transforms/MutualInformationFeatureSelection.cs +++ b/src/Microsoft.ML.Transforms/MutualInformationFeatureSelection.cs @@ -66,7 +66,7 @@ internal sealed class Options : TransformInputBase /// <param name="env">The 
environment to use.</param> /// <param name="labelColumn">Name of the column to use for labels.</param> /// <param name="slotsInOutput">The maximum number of slots to preserve in the output. The number of slots to preserve is taken across all input columns.</param> - /// <param name="numBins">Max number of bins used to approximate mutual information between each input column and the label column. Power of 2 recommended.</param> + /// <param name="numberOfBins">Max number of bins used to approximate mutual information between each input column and the label column. Power of 2 recommended.</param> /// <param name="columns">Specifies the names of the input columns for the transformation, and their respective output column names.</param> /// <example> /// <format type="text/markdown"> @@ -78,7 +78,7 @@ internal sealed class Options : TransformInputBase internal MutualInformationFeatureSelectingEstimator(IHostEnvironment env, string labelColumn = Defaults.LabelColumn, int slotsInOutput = Defaults.SlotsInOutput, - int numBins = Defaults.NumBins, + int numberOfBins = Defaults.NumBins, params (string outputColumnName, string inputColumnName)[] columns) { Contracts.CheckValue(env, nameof(env)); @@ -87,12 +87,12 @@ internal MutualInformationFeatureSelectingEstimator(IHostEnvironment env, _host.CheckUserArg(Utils.Size(columns) > 0, nameof(columns)); _host.CheckUserArg(slotsInOutput > 0, nameof(slotsInOutput)); _host.CheckNonWhiteSpace(labelColumn, nameof(labelColumn)); - _host.Check(numBins > 1, "numBins must be greater than 1."); + _host.Check(numberOfBins > 1, "numberOfBins must be greater than 1."); _columns = columns; _labelColumn = labelColumn; _slotsInOutput = slotsInOutput; - _numBins = numBins; + _numBins = numberOfBins; } /// <include file='doc.xml' path='doc/members/member[@name="MutualInformationFeatureSelection"]/*' /> diff --git a/test/Microsoft.ML.Tests/Transformers/FeatureSelectionTests.cs b/test/Microsoft.ML.Tests/Transformers/FeatureSelectionTests.cs index 
f5c143d793..55d802bc83 100644 --- a/test/Microsoft.ML.Tests/Transformers/FeatureSelectionTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/FeatureSelectionTests.cs @@ -42,7 +42,7 @@ public void FeatureSelectionWorkout() var est = new WordBagEstimator(ML, "bag_of_words", "text") .AppendCacheCheckpoint(ML) .Append(ML.Transforms.FeatureSelection.SelectFeaturesBasedOnCount("bag_of_words_count", "bag_of_words", 10) - .Append(ML.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation("bag_of_words_mi", "bag_of_words", labelColumn: "label"))); + .Append(ML.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation("bag_of_words_mi", "bag_of_words", labelColumnName: "label"))); var outputPath = GetOutputPath("FeatureSelection", "featureselection.tsv"); using (var ch = Env.Start("save")) @@ -115,11 +115,11 @@ public void CountFeatureSelectionWorkout() var data = ML.Data.Cache(reader.Load(new MultiFileSource(dataPath)).AsDynamic); var columns = new[] { - new CountFeatureSelectingEstimator.ColumnOptions("FeatureSelectDouble", "VectorDouble", minCount: 1), - new CountFeatureSelectingEstimator.ColumnOptions("ScalFeatureSelectMissing690", "ScalarFloat", minCount: 690), - new CountFeatureSelectingEstimator.ColumnOptions("ScalFeatureSelectMissing100", "ScalarFloat", minCount: 100), - new CountFeatureSelectingEstimator.ColumnOptions("VecFeatureSelectMissing690", "VectorDouble", minCount: 690), - new CountFeatureSelectingEstimator.ColumnOptions("VecFeatureSelectMissing100", "VectorDouble", minCount: 100) + new CountFeatureSelectingEstimator.ColumnOptions("FeatureSelectDouble", "VectorDouble", count: 1), + new CountFeatureSelectingEstimator.ColumnOptions("ScalFeatureSelectMissing690", "ScalarFloat", count: 690), + new CountFeatureSelectingEstimator.ColumnOptions("ScalFeatureSelectMissing100", "ScalarFloat", count: 100), + new CountFeatureSelectingEstimator.ColumnOptions("VecFeatureSelectMissing690", "VectorDouble", count: 690), + new 
CountFeatureSelectingEstimator.ColumnOptions("VecFeatureSelectMissing100", "VectorDouble", count: 100) }; var est = ML.Transforms.FeatureSelection.SelectFeaturesBasedOnCount("FeatureSelect", "VectorFloat", count: 1) .Append(ML.Transforms.FeatureSelection.SelectFeaturesBasedOnCount(columns)); @@ -182,8 +182,8 @@ public void MutualInformationSelectionWorkout() var data = reader.Load(new MultiFileSource(dataPath)).AsDynamic; - var est = ML.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation("FeatureSelect", "VectorFloat", slotsInOutput: 1, labelColumn: "Label") - .Append(ML.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation(labelColumn: "Label", slotsInOutput: 2, numBins: 100, + var est = ML.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation("FeatureSelect", "VectorFloat", slotsInOutput: 1, labelColumnName: "Label") + .Append(ML.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation(labelColumnName: "Label", slotsInOutput: 2, numberOfBins: 100, columns: new ColumnOptions[] { ("out1", "VectorFloat"), ("out2", "VectorDouble") @@ -220,7 +220,7 @@ public void TestMutualInformationOldSavingAndLoading() var dataView = reader.Load(new MultiFileSource(dataPath)).AsDynamic; - var pipe = ML.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation("FeatureSelect", "VectorFloat", slotsInOutput: 1, labelColumn: "Label"); + var pipe = ML.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation("FeatureSelect", "VectorFloat", slotsInOutput: 1, labelColumnName: "Label"); var result = pipe.Fit(dataView).Transform(dataView); var resultRoles = new RoleMappedData(result);