Skip to content

Scrubbing feature selection #2852

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ public static void Example()
// specify the parameter `numberOfBins`, which controls the number of bins used in the approximation of the mutual information
// between features and label.
var mutualInfoEst = ml.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation(
outputColumnName: "FeaturesMISelect", inputColumnName: "FeaturesCountSelect", labelColumn: "Label", slotsInOutput: 5);
outputColumnName: "FeaturesMISelect", inputColumnName: "FeaturesCountSelect", labelColumnName: "Label", slotsInOutput: 5);

// Now, we can put the previous two transformations together in a pipeline.
var pipeline = countSelectEst.Append(mutualInfoEst);
Expand Down
19 changes: 10 additions & 9 deletions src/Microsoft.ML.Transforms/CountFeatureSelection.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
using Microsoft.ML;
using Microsoft.ML.CommandLine;
using Microsoft.ML.Data;
using Microsoft.ML.EntryPoints;
using Microsoft.ML.Internal.Utilities;
using Microsoft.ML.Transforms.FeatureSelection;

Expand Down Expand Up @@ -54,23 +53,25 @@ public sealed class ColumnOptions
public readonly string Name;
/// <summary> Name of the column to transform.</summary>
public readonly string InputColumnName;
/// <summary> If the count of non-default values for a slot is greater than or equal to this threshold in the training data, the slot is preserved.</summary>
public readonly long MinCount;
/// <summary>If the count of non-default values for a slot is greater than or equal to this threshold in the training data, the slot is preserved.</summary>
public readonly long Count;

/// <summary>
/// Describes the parameters of the feature selection process for a column pair.
/// </summary>
/// <param name="name">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="name"/> will be used as source.</param>
/// <param name="minCount">If the count of non-default values for a slot is greater than or equal to this threshold in the training data, the slot is preserved.</param>
public ColumnOptions(string name, string inputColumnName = null, long minCount = Defaults.Count)
/// <param name="count">If the count of non-default values for a slot is greater than or equal to this threshold in the training data, the slot is preserved.</param>

public ColumnOptions(string name, string inputColumnName = null, long count = Defaults.Count)
{
Name = name;
Contracts.CheckValue(Name, nameof(Name));

InputColumnName = inputColumnName ?? name;
Contracts.CheckValue(InputColumnName, nameof(InputColumnName));
MinCount = minCount;
Contracts.CheckParam(count >= 0, nameof(count), "Must be non-negative.");
Count = count;
}
}

Expand Down Expand Up @@ -183,7 +184,7 @@ internal static IDataTransform Create(IHostEnvironment env, Options options, IDa
host.CheckUserArg(Utils.Size(options.Columns) > 0, nameof(options.Columns));
host.CheckUserArg(options.Count > 0, nameof(options.Count));

var columnOptions = options.Columns.Select(inColName => new ColumnOptions(inColName, minCount: options.Count)).ToArray();
var columnOptions = options.Columns.Select(inColName => new ColumnOptions(inColName, count: options.Count)).ToArray();

return new CountFeatureSelectingEstimator(env, columnOptions).Fit(input).Transform(input) as IDataTransform;
}
Expand All @@ -206,11 +207,11 @@ private static void CreateDropAndCopyColumns(ColumnOptions[] columnOptions, int
selectedCount[i] = 0;
for (int j = 0; j < score.Length; j++)
{
if (score[j] < columnOptions[i].MinCount)
if (score[j] < columnOptions[i].Count)
{
// Adjacent slots are combined into a single range.
int min = j;
while (j < score.Length && score[j] < columnOptions[i].MinCount)
while (j < score.Length && score[j] < columnOptions[i].Count)
j++;
int max = j - 1;
slots.Add((min, max));
Expand Down
20 changes: 10 additions & 10 deletions src/Microsoft.ML.Transforms/FeatureSelectionCatalog.cs
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@ public static class FeatureSelectionCatalog
{
/// <include file='doc.xml' path='doc/members/member[@name="MutualInformationFeatureSelection"]/*' />
/// <param name="catalog">The transform's catalog.</param>
/// <param name="labelColumn">Name of the column to use for labels.</param>
/// <param name="labelColumnName">The name of the label column.</param>
/// <param name="slotsInOutput">The maximum number of slots to preserve in the output. The number of slots to preserve is taken across all input columns.</param>
/// <param name="numBins">Max number of bins used to approximate mutual information between each input column and the label column. Power of 2 recommended.</param>
/// <param name="numberOfBins">Max number of bins used to approximate mutual information between each input column and the label column. Power of 2 recommended.</param>
/// <param name="columns">Specifies the names of the input columns for the transformation, and their respective output column names.</param>
/// <example>
/// <format type="text/markdown">
Expand All @@ -26,20 +26,20 @@ public static class FeatureSelectionCatalog
/// </format>
/// </example>
public static MutualInformationFeatureSelectingEstimator SelectFeaturesBasedOnMutualInformation(this TransformsCatalog.FeatureSelectionTransforms catalog,
string labelColumn = MutualInfoSelectDefaults.LabelColumn,
string labelColumnName = MutualInfoSelectDefaults.LabelColumn,
int slotsInOutput = MutualInfoSelectDefaults.SlotsInOutput,
int numBins = MutualInfoSelectDefaults.NumBins,
int numberOfBins = MutualInfoSelectDefaults.NumBins,
params ColumnOptions[] columns)
=> new MutualInformationFeatureSelectingEstimator(CatalogUtils.GetEnvironment(catalog), labelColumn, slotsInOutput, numBins,
=> new MutualInformationFeatureSelectingEstimator(CatalogUtils.GetEnvironment(catalog), labelColumnName, slotsInOutput, numberOfBins,
ColumnOptions.ConvertToValueTuples(columns));

/// <include file='doc.xml' path='doc/members/member[@name="MutualInformationFeatureSelection"]/*' />
/// <param name="catalog">The transform's catalog.</param>
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
/// <param name="labelColumn">Name of the column to use for labels.</param>
/// <param name="labelColumnName">The name of the label column.</param>
/// <param name="slotsInOutput">The maximum number of slots to preserve in the output. The number of slots to preserve is taken across all input columns.</param>
/// <param name="numBins">Max number of bins used to approximate mutual information between each input column and the label column. Power of 2 recommended.</param>
/// <param name="numberOfBins">Max number of bins used to approximate mutual information between each input column and the label column. Power of 2 recommended.</param>
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
Expand All @@ -49,10 +49,10 @@ public static MutualInformationFeatureSelectingEstimator SelectFeaturesBasedOnMu
/// </example>
public static MutualInformationFeatureSelectingEstimator SelectFeaturesBasedOnMutualInformation(this TransformsCatalog.FeatureSelectionTransforms catalog,
string outputColumnName, string inputColumnName = null,
string labelColumn = MutualInfoSelectDefaults.LabelColumn,
string labelColumnName = MutualInfoSelectDefaults.LabelColumn,
int slotsInOutput = MutualInfoSelectDefaults.SlotsInOutput,
int numBins = MutualInfoSelectDefaults.NumBins)
=> new MutualInformationFeatureSelectingEstimator(CatalogUtils.GetEnvironment(catalog), outputColumnName, inputColumnName, labelColumn, slotsInOutput, numBins);
int numberOfBins = MutualInfoSelectDefaults.NumBins)
=> new MutualInformationFeatureSelectingEstimator(CatalogUtils.GetEnvironment(catalog), outputColumnName, inputColumnName, labelColumnName, slotsInOutput, numberOfBins);

/// <include file='doc.xml' path='doc/members/member[@name="CountFeatureSelection"]' />
/// <param name="catalog">The transform's catalog.</param>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ internal sealed class Options : TransformInputBase
/// <param name="env">The environment to use.</param>
/// <param name="labelColumn">Name of the column to use for labels.</param>
/// <param name="slotsInOutput">The maximum number of slots to preserve in the output. The number of slots to preserve is taken across all input columns.</param>
/// <param name="numBins">Max number of bins used to approximate mutual information between each input column and the label column. Power of 2 recommended.</param>
/// <param name="numberOfBins">Max number of bins used to approximate mutual information between each input column and the label column. Power of 2 recommended.</param>
/// <param name="columns">Specifies the names of the input columns for the transformation, and their respective output column names.</param>
/// <example>
/// <format type="text/markdown">
Expand All @@ -78,7 +78,7 @@ internal sealed class Options : TransformInputBase
internal MutualInformationFeatureSelectingEstimator(IHostEnvironment env,
string labelColumn = Defaults.LabelColumn,
int slotsInOutput = Defaults.SlotsInOutput,
int numBins = Defaults.NumBins,
int numberOfBins = Defaults.NumBins,
params (string outputColumnName, string inputColumnName)[] columns)
{
Contracts.CheckValue(env, nameof(env));
Expand All @@ -87,12 +87,12 @@ internal MutualInformationFeatureSelectingEstimator(IHostEnvironment env,
_host.CheckUserArg(Utils.Size(columns) > 0, nameof(columns));
_host.CheckUserArg(slotsInOutput > 0, nameof(slotsInOutput));
_host.CheckNonWhiteSpace(labelColumn, nameof(labelColumn));
_host.Check(numBins > 1, "numBins must be greater than 1.");
_host.Check(numberOfBins > 1, "numBins must be greater than 1.");

_columns = columns;
_labelColumn = labelColumn;
_slotsInOutput = slotsInOutput;
_numBins = numBins;
_numBins = numberOfBins;
}

/// <include file='doc.xml' path='doc/members/member[@name="MutualInformationFeatureSelection"]/*' />
Expand Down
18 changes: 9 additions & 9 deletions test/Microsoft.ML.Tests/Transformers/FeatureSelectionTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ public void FeatureSelectionWorkout()
var est = new WordBagEstimator(ML, "bag_of_words", "text")
.AppendCacheCheckpoint(ML)
.Append(ML.Transforms.FeatureSelection.SelectFeaturesBasedOnCount("bag_of_words_count", "bag_of_words", 10)
.Append(ML.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation("bag_of_words_mi", "bag_of_words", labelColumn: "label")));
.Append(ML.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation("bag_of_words_mi", "bag_of_words", labelColumnName: "label")));

var outputPath = GetOutputPath("FeatureSelection", "featureselection.tsv");
using (var ch = Env.Start("save"))
Expand Down Expand Up @@ -115,11 +115,11 @@ public void CountFeatureSelectionWorkout()
var data = ML.Data.Cache(reader.Load(new MultiFileSource(dataPath)).AsDynamic);

var columns = new[] {
new CountFeatureSelectingEstimator.ColumnOptions("FeatureSelectDouble", "VectorDouble", minCount: 1),
new CountFeatureSelectingEstimator.ColumnOptions("ScalFeatureSelectMissing690", "ScalarFloat", minCount: 690),
new CountFeatureSelectingEstimator.ColumnOptions("ScalFeatureSelectMissing100", "ScalarFloat", minCount: 100),
new CountFeatureSelectingEstimator.ColumnOptions("VecFeatureSelectMissing690", "VectorDouble", minCount: 690),
new CountFeatureSelectingEstimator.ColumnOptions("VecFeatureSelectMissing100", "VectorDouble", minCount: 100)
new CountFeatureSelectingEstimator.ColumnOptions("FeatureSelectDouble", "VectorDouble", count: 1),
new CountFeatureSelectingEstimator.ColumnOptions("ScalFeatureSelectMissing690", "ScalarFloat", count: 690),
new CountFeatureSelectingEstimator.ColumnOptions("ScalFeatureSelectMissing100", "ScalarFloat", count: 100),
new CountFeatureSelectingEstimator.ColumnOptions("VecFeatureSelectMissing690", "VectorDouble", count: 690),
new CountFeatureSelectingEstimator.ColumnOptions("VecFeatureSelectMissing100", "VectorDouble", count: 100)
};
var est = ML.Transforms.FeatureSelection.SelectFeaturesBasedOnCount("FeatureSelect", "VectorFloat", count: 1)
.Append(ML.Transforms.FeatureSelection.SelectFeaturesBasedOnCount(columns));
Expand Down Expand Up @@ -182,8 +182,8 @@ public void MutualInformationSelectionWorkout()

var data = reader.Load(new MultiFileSource(dataPath)).AsDynamic;

var est = ML.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation("FeatureSelect", "VectorFloat", slotsInOutput: 1, labelColumn: "Label")
.Append(ML.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation(labelColumn: "Label", slotsInOutput: 2, numBins: 100,
var est = ML.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation("FeatureSelect", "VectorFloat", slotsInOutput: 1, labelColumnName: "Label")
.Append(ML.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation(labelColumnName: "Label", slotsInOutput: 2, numberOfBins: 100,
columns: new ColumnOptions[] {
("out1", "VectorFloat"),
("out2", "VectorDouble")
Expand Down Expand Up @@ -220,7 +220,7 @@ public void TestMutualInformationOldSavingAndLoading()

var dataView = reader.Load(new MultiFileSource(dataPath)).AsDynamic;

var pipe = ML.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation("FeatureSelect", "VectorFloat", slotsInOutput: 1, labelColumn: "Label");
var pipe = ML.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation("FeatureSelect", "VectorFloat", slotsInOutput: 1, labelColumnName: "Label");

var result = pipe.Fit(dataView).Transform(dataView);
var resultRoles = new RoleMappedData(result);
Expand Down