Skip to content

Commit 61aa540

Browse files
authored
First iteration (#2852)
1 parent b70b424 commit 61aa540

File tree

5 files changed

+34
-33
lines changed

5 files changed

+34
-33
lines changed

docs/samples/Microsoft.ML.Samples/Dynamic/FeatureSelectionTransform.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ public static void Example()
5555
// specify the parameter `numberOfBins`, which controls the number of bins used in the approximation of the mutual information
5656
// between features and label.
5757
var mutualInfoEst = ml.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation(
58-
outputColumnName: "FeaturesMISelect", inputColumnName: "FeaturesCountSelect", labelColumn: "Label", slotsInOutput: 5);
58+
outputColumnName: "FeaturesMISelect", inputColumnName: "FeaturesCountSelect", labelColumnName: "Label", slotsInOutput: 5);
5959

6060
// Now, we can put the previous two transformations together in a pipeline.
6161
var pipeline = countSelectEst.Append(mutualInfoEst);

src/Microsoft.ML.Transforms/CountFeatureSelection.cs

+10-9
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
using Microsoft.ML;
1111
using Microsoft.ML.CommandLine;
1212
using Microsoft.ML.Data;
13-
using Microsoft.ML.EntryPoints;
1413
using Microsoft.ML.Internal.Utilities;
1514
using Microsoft.ML.Transforms.FeatureSelection;
1615

@@ -54,23 +53,25 @@ public sealed class ColumnOptions
5453
public readonly string Name;
5554
/// <summary> Name of the column to transform.</summary>
5655
public readonly string InputColumnName;
57-
/// <summary> If the count of non-default values for a slot is greater than or equal to this threshold in the training data, the slot is preserved.</summary>
58-
public readonly long MinCount;
56+
/// <summary>If the count of non-default values for a slot is greater than or equal to this threshold in the training data, the slot is preserved.</summary>
57+
public readonly long Count;
5958

6059
/// <summary>
6160
/// Describes the parameters of the feature selection process for a column pair.
6261
/// </summary>
6362
/// <param name="name">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
6463
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="name"/> will be used as source.</param>
65-
/// <param name="minCount">If the count of non-default values for a slot is greater than or equal to this threshold in the training data, the slot is preserved.</param>
66-
public ColumnOptions(string name, string inputColumnName = null, long minCount = Defaults.Count)
64+
/// <param name="count">If the count of non-default values for a slot is greater than or equal to this threshold in the training data, the slot is preserved.</param>
65+
66+
public ColumnOptions(string name, string inputColumnName = null, long count = Defaults.Count)
6767
{
6868
Name = name;
6969
Contracts.CheckValue(Name, nameof(Name));
7070

7171
InputColumnName = inputColumnName ?? name;
7272
Contracts.CheckValue(InputColumnName, nameof(InputColumnName));
73-
MinCount = minCount;
73+
Contracts.CheckParam(count >= 0, nameof(count), "Must be non-negative.");
74+
Count = count;
7475
}
7576
}
7677

@@ -183,7 +184,7 @@ internal static IDataTransform Create(IHostEnvironment env, Options options, IDa
183184
host.CheckUserArg(Utils.Size(options.Columns) > 0, nameof(options.Columns));
184185
host.CheckUserArg(options.Count > 0, nameof(options.Count));
185186

186-
var columnOptions = options.Columns.Select(inColName => new ColumnOptions(inColName, minCount: options.Count)).ToArray();
187+
var columnOptions = options.Columns.Select(inColName => new ColumnOptions(inColName, count: options.Count)).ToArray();
187188

188189
return new CountFeatureSelectingEstimator(env, columnOptions).Fit(input).Transform(input) as IDataTransform;
189190
}
@@ -206,11 +207,11 @@ private static void CreateDropAndCopyColumns(ColumnOptions[] columnOptions, int
206207
selectedCount[i] = 0;
207208
for (int j = 0; j < score.Length; j++)
208209
{
209-
if (score[j] < columnOptions[i].MinCount)
210+
if (score[j] < columnOptions[i].Count)
210211
{
211212
// Adjacent slots are combined into a single range.
212213
int min = j;
213-
while (j < score.Length && score[j] < columnOptions[i].MinCount)
214+
while (j < score.Length && score[j] < columnOptions[i].Count)
214215
j++;
215216
int max = j - 1;
216217
slots.Add((min, max));

src/Microsoft.ML.Transforms/FeatureSelectionCatalog.cs

+10-10
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,9 @@ public static class FeatureSelectionCatalog
1414
{
1515
/// <include file='doc.xml' path='doc/members/member[@name="MutualInformationFeatureSelection"]/*' />
1616
/// <param name="catalog">The transform's catalog.</param>
17-
/// <param name="labelColumn">Name of the column to use for labels.</param>
17+
/// <param name="labelColumnName">The name of the label column.</param>
1818
/// <param name="slotsInOutput">The maximum number of slots to preserve in the output. The number of slots to preserve is taken across all input columns.</param>
19-
/// <param name="numBins">Max number of bins used to approximate mutual information between each input column and the label column. Power of 2 recommended.</param>
19+
/// <param name="numberOfBins">Max number of bins used to approximate mutual information between each input column and the label column. Power of 2 recommended.</param>
2020
/// <param name="columns">Specifies the names of the input columns for the transformation, and their respective output column names.</param>
2121
/// <example>
2222
/// <format type="text/markdown">
@@ -26,20 +26,20 @@ public static class FeatureSelectionCatalog
2626
/// </format>
2727
/// </example>
2828
public static MutualInformationFeatureSelectingEstimator SelectFeaturesBasedOnMutualInformation(this TransformsCatalog.FeatureSelectionTransforms catalog,
29-
string labelColumn = MutualInfoSelectDefaults.LabelColumn,
29+
string labelColumnName = MutualInfoSelectDefaults.LabelColumn,
3030
int slotsInOutput = MutualInfoSelectDefaults.SlotsInOutput,
31-
int numBins = MutualInfoSelectDefaults.NumBins,
31+
int numberOfBins = MutualInfoSelectDefaults.NumBins,
3232
params ColumnOptions[] columns)
33-
=> new MutualInformationFeatureSelectingEstimator(CatalogUtils.GetEnvironment(catalog), labelColumn, slotsInOutput, numBins,
33+
=> new MutualInformationFeatureSelectingEstimator(CatalogUtils.GetEnvironment(catalog), labelColumnName, slotsInOutput, numberOfBins,
3434
ColumnOptions.ConvertToValueTuples(columns));
3535

3636
/// <include file='doc.xml' path='doc/members/member[@name="MutualInformationFeatureSelection"]/*' />
3737
/// <param name="catalog">The transform's catalog.</param>
3838
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
3939
/// <param name="inputColumnName">Name of column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
40-
/// <param name="labelColumn">Name of the column to use for labels.</param>
40+
/// <param name="labelColumnName">The name of the label column.</param>
4141
/// <param name="slotsInOutput">The maximum number of slots to preserve in the output. The number of slots to preserve is taken across all input columns.</param>
42-
/// <param name="numBins">Max number of bins used to approximate mutual information between each input column and the label column. Power of 2 recommended.</param>
42+
/// <param name="numberOfBins">Max number of bins used to approximate mutual information between each input column and the label column. Power of 2 recommended.</param>
4343
/// <example>
4444
/// <format type="text/markdown">
4545
/// <![CDATA[
@@ -49,10 +49,10 @@ public static MutualInformationFeatureSelectingEstimator SelectFeaturesBasedOnMu
4949
/// </example>
5050
public static MutualInformationFeatureSelectingEstimator SelectFeaturesBasedOnMutualInformation(this TransformsCatalog.FeatureSelectionTransforms catalog,
5151
string outputColumnName, string inputColumnName = null,
52-
string labelColumn = MutualInfoSelectDefaults.LabelColumn,
52+
string labelColumnName = MutualInfoSelectDefaults.LabelColumn,
5353
int slotsInOutput = MutualInfoSelectDefaults.SlotsInOutput,
54-
int numBins = MutualInfoSelectDefaults.NumBins)
55-
=> new MutualInformationFeatureSelectingEstimator(CatalogUtils.GetEnvironment(catalog), outputColumnName, inputColumnName, labelColumn, slotsInOutput, numBins);
54+
int numberOfBins = MutualInfoSelectDefaults.NumBins)
55+
=> new MutualInformationFeatureSelectingEstimator(CatalogUtils.GetEnvironment(catalog), outputColumnName, inputColumnName, labelColumnName, slotsInOutput, numberOfBins);
5656

5757
/// <include file='doc.xml' path='doc/members/member[@name="CountFeatureSelection"]/*' />
5858
/// <param name="catalog">The transform's catalog.</param>

src/Microsoft.ML.Transforms/MutualInformationFeatureSelection.cs

+4-4
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ internal sealed class Options : TransformInputBase
6666
/// <param name="env">The environment to use.</param>
6767
/// <param name="labelColumn">Name of the column to use for labels.</param>
6868
/// <param name="slotsInOutput">The maximum number of slots to preserve in the output. The number of slots to preserve is taken across all input columns.</param>
69-
/// <param name="numBins">Max number of bins used to approximate mutual information between each input column and the label column. Power of 2 recommended.</param>
69+
/// <param name="numberOfBins">Max number of bins used to approximate mutual information between each input column and the label column. Power of 2 recommended.</param>
7070
/// <param name="columns">Specifies the names of the input columns for the transformation, and their respective output column names.</param>
7171
/// <example>
7272
/// <format type="text/markdown">
@@ -78,7 +78,7 @@ internal sealed class Options : TransformInputBase
7878
internal MutualInformationFeatureSelectingEstimator(IHostEnvironment env,
7979
string labelColumn = Defaults.LabelColumn,
8080
int slotsInOutput = Defaults.SlotsInOutput,
81-
int numBins = Defaults.NumBins,
81+
int numberOfBins = Defaults.NumBins,
8282
params (string outputColumnName, string inputColumnName)[] columns)
8383
{
8484
Contracts.CheckValue(env, nameof(env));
@@ -87,12 +87,12 @@ internal MutualInformationFeatureSelectingEstimator(IHostEnvironment env,
8787
_host.CheckUserArg(Utils.Size(columns) > 0, nameof(columns));
8888
_host.CheckUserArg(slotsInOutput > 0, nameof(slotsInOutput));
8989
_host.CheckNonWhiteSpace(labelColumn, nameof(labelColumn));
90-
_host.Check(numBins > 1, "numBins must be greater than 1.");
90+
_host.Check(numberOfBins > 1, "numBins must be greater than 1.");
9191

9292
_columns = columns;
9393
_labelColumn = labelColumn;
9494
_slotsInOutput = slotsInOutput;
95-
_numBins = numBins;
95+
_numBins = numberOfBins;
9696
}
9797

9898
/// <include file='doc.xml' path='doc/members/member[@name="MutualInformationFeatureSelection"]/*' />

test/Microsoft.ML.Tests/Transformers/FeatureSelectionTests.cs

+9-9
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ public void FeatureSelectionWorkout()
4242
var est = new WordBagEstimator(ML, "bag_of_words", "text")
4343
.AppendCacheCheckpoint(ML)
4444
.Append(ML.Transforms.FeatureSelection.SelectFeaturesBasedOnCount("bag_of_words_count", "bag_of_words", 10)
45-
.Append(ML.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation("bag_of_words_mi", "bag_of_words", labelColumn: "label")));
45+
.Append(ML.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation("bag_of_words_mi", "bag_of_words", labelColumnName: "label")));
4646

4747
var outputPath = GetOutputPath("FeatureSelection", "featureselection.tsv");
4848
using (var ch = Env.Start("save"))
@@ -115,11 +115,11 @@ public void CountFeatureSelectionWorkout()
115115
var data = ML.Data.Cache(reader.Load(new MultiFileSource(dataPath)).AsDynamic);
116116

117117
var columns = new[] {
118-
new CountFeatureSelectingEstimator.ColumnOptions("FeatureSelectDouble", "VectorDouble", minCount: 1),
119-
new CountFeatureSelectingEstimator.ColumnOptions("ScalFeatureSelectMissing690", "ScalarFloat", minCount: 690),
120-
new CountFeatureSelectingEstimator.ColumnOptions("ScalFeatureSelectMissing100", "ScalarFloat", minCount: 100),
121-
new CountFeatureSelectingEstimator.ColumnOptions("VecFeatureSelectMissing690", "VectorDouble", minCount: 690),
122-
new CountFeatureSelectingEstimator.ColumnOptions("VecFeatureSelectMissing100", "VectorDouble", minCount: 100)
118+
new CountFeatureSelectingEstimator.ColumnOptions("FeatureSelectDouble", "VectorDouble", count: 1),
119+
new CountFeatureSelectingEstimator.ColumnOptions("ScalFeatureSelectMissing690", "ScalarFloat", count: 690),
120+
new CountFeatureSelectingEstimator.ColumnOptions("ScalFeatureSelectMissing100", "ScalarFloat", count: 100),
121+
new CountFeatureSelectingEstimator.ColumnOptions("VecFeatureSelectMissing690", "VectorDouble", count: 690),
122+
new CountFeatureSelectingEstimator.ColumnOptions("VecFeatureSelectMissing100", "VectorDouble", count: 100)
123123
};
124124
var est = ML.Transforms.FeatureSelection.SelectFeaturesBasedOnCount("FeatureSelect", "VectorFloat", count: 1)
125125
.Append(ML.Transforms.FeatureSelection.SelectFeaturesBasedOnCount(columns));
@@ -182,8 +182,8 @@ public void MutualInformationSelectionWorkout()
182182

183183
var data = reader.Load(new MultiFileSource(dataPath)).AsDynamic;
184184

185-
var est = ML.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation("FeatureSelect", "VectorFloat", slotsInOutput: 1, labelColumn: "Label")
186-
.Append(ML.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation(labelColumn: "Label", slotsInOutput: 2, numBins: 100,
185+
var est = ML.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation("FeatureSelect", "VectorFloat", slotsInOutput: 1, labelColumnName: "Label")
186+
.Append(ML.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation(labelColumnName: "Label", slotsInOutput: 2, numberOfBins: 100,
187187
columns: new ColumnOptions[] {
188188
("out1", "VectorFloat"),
189189
("out2", "VectorDouble")
@@ -220,7 +220,7 @@ public void TestMutualInformationOldSavingAndLoading()
220220

221221
var dataView = reader.Load(new MultiFileSource(dataPath)).AsDynamic;
222222

223-
var pipe = ML.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation("FeatureSelect", "VectorFloat", slotsInOutput: 1, labelColumn: "Label");
223+
var pipe = ML.Transforms.FeatureSelection.SelectFeaturesBasedOnMutualInformation("FeatureSelect", "VectorFloat", slotsInOutput: 1, labelColumnName: "Label");
224224

225225
var result = pipe.Fit(dataView).Transform(dataView);
226226
var resultRoles = new RoleMappedData(result);

0 commit comments

Comments
 (0)