Skip to content

Commit 0064b05

Browse files
committed
Addressing PR comments
1 parent 5fae68c commit 0064b05

File tree

16 files changed

+116
-120
lines changed

16 files changed

+116
-120
lines changed

docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValue_Term.cs

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,11 @@ public static void KeyToValue_Term()
1212
{
1313
// Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
1414
// as well as the source of randomness.
15-
var ml = new MLContext();
15+
var mlContext = new MLContext();
1616

1717
// Get a small dataset as an IEnumerable.
1818
IEnumerable<SamplesUtils.DatasetUtils.SampleTopicsData> data = SamplesUtils.DatasetUtils.GetTopicsData();
19-
var trainData = ml.CreateStreamingDataView(data);
19+
var trainData = mlContext.CreateStreamingDataView(data);
2020

2121
// Preview of one of the columns of the topics data.
2222
// The Review column contains the keys associated with a particular body of text.
@@ -31,16 +31,16 @@ public static void KeyToValue_Term()
3131
// making use of default settings.
3232
string defaultColumnName = "DefaultKeys";
3333
// REVIEW create through the catalog extension
34-
var default_pipeline = new WordTokenizingEstimator(ml, "Review")
35-
.Append(new ValueToKeyMappingEstimator(ml, "Review", defaultColumnName));
34+
var default_pipeline = mlContext.Transforms.Text.TokenizeWords("Review")
35+
.Append(mlContext.Transforms.Conversion.MapValueToKey(defaultColumnName, "Review"));
3636

3737
// Another pipeline, that customizes the advanced settings of the TermEstimator.
3838
// We can change the maxNumTerm to limit how many keys will get generated out of the set of words,
3939
// and condition the order in which they get evaluated by changing sort from the default Occurrence (order in which they get encountered)
4040
// to value/alphabetically.
4141
string customizedColumnName = "CustomizedKeys";
42-
var customized_pipeline = new WordTokenizingEstimator(ml, "Review")
43-
.Append(new ValueToKeyMappingEstimator(ml, "Review", customizedColumnName, maxNumTerms: 10, sort: ValueToKeyMappingTransformer.SortOrder.Value));
42+
var customized_pipeline = mlContext.Transforms.Text.TokenizeWords("Review")
43+
.Append(mlContext.Transforms.Conversion.MapValueToKey(customizedColumnName, "Review", maxNumTerms: 10, sort: ValueToKeyMappingTransformer.SortOrder.Value));
4444

4545
// The transformed data.
4646
var transformedData_default = default_pipeline.Fit(trainData).Transform(trainData);
@@ -61,7 +61,7 @@ public static void KeyToValue_Term()
6161
};
6262

6363
// Preview of the DefaultKeys column obtained after processing the input.
64-
var defaultColumn = transformedData_default.GetColumn<VBuffer<uint>>(ml, defaultColumnName);
64+
var defaultColumn = transformedData_default.GetColumn<VBuffer<uint>>(mlContext, defaultColumnName);
6565
printHelper(defaultColumnName, defaultColumn);
6666

6767
// DefaultKeys column obtained post-transformation.
@@ -72,7 +72,7 @@ public static void KeyToValue_Term()
7272
// 9 10 11 12 13 6
7373

7474
// Previewing the CustomizedKeys column obtained after processing the input.
75-
var customizedColumn = transformedData_customized.GetColumn<VBuffer<uint>>(ml, customizedColumnName);
75+
var customizedColumn = transformedData_customized.GetColumn<VBuffer<uint>>(mlContext, customizedColumnName);
7676
printHelper(customizedColumnName, customizedColumn);
7777

7878
// CustomizedKeys column obtained post-transformation.
@@ -84,11 +84,11 @@ public static void KeyToValue_Term()
8484

8585
// Retrieve the original values, by appending the KeyToValue estimator to the existing pipelines
8686
// to convert the keys back to the strings.
87-
var pipeline = default_pipeline.Append(new KeyToValueMappingEstimator(ml, defaultColumnName));
87+
var pipeline = default_pipeline.Append(new KeyToValueMappingEstimator(mlContext, defaultColumnName));
8888
transformedData_default = pipeline.Fit(trainData).Transform(trainData);
8989

9090
// Preview of the DefaultColumnName column obtained.
91-
var originalColumnBack = transformedData_default.GetColumn<VBuffer<ReadOnlyMemory<char>>>(ml, defaultColumnName);
91+
var originalColumnBack = transformedData_default.GetColumn<VBuffer<ReadOnlyMemory<char>>>(mlContext, defaultColumnName);
9292

9393
foreach (var row in originalColumnBack)
9494
{

src/Microsoft.ML.Data/Commands/CrossValidationCommand.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -330,7 +330,7 @@ private string GetSplitColumn(IChannel ch, IDataView input, ref IDataView output
330330
int inc = 0;
331331
while (input.Schema.TryGetColumnIndex(stratificationColumn, out tmp))
332332
stratificationColumn = string.Format("{0}_{1:000}", origStratCol, ++inc);
333-
output = new HashingEstimator(Host, origStratCol, stratificationColumn, 30).Fit(input).Transform(input);
333+
output = new HashingEstimator(Host, stratificationColumn, origStratCol, 30).Fit(input).Transform(input);
334334
}
335335
}
336336

src/Microsoft.ML.Data/TrainContext.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,7 @@ private void EnsureStratificationColumn(ref IDataView data, ref string stratific
151151
// Generate a new column with the hashed stratification column.
152152
while (data.Schema.TryGetColumnIndex(stratificationColumn, out tmp))
153153
stratificationColumn = string.Format("{0}_{1:000}", origStratCol, ++inc);
154-
data = new HashingEstimator(Host, origStratCol, stratificationColumn, 30).Fit(data).Transform(data);
154+
data = new HashingEstimator(Host, stratificationColumn, origStratCol, 30).Fit(data).Transform(data);
155155
}
156156
}
157157
}

src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs

Lines changed: 15 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -20,16 +20,17 @@ public static class ConversionsExtensionsCatalog
2020
/// Hashes the values in the input column.
2121
/// </summary>
2222
/// <param name="catalog">The transform's catalog.</param>
23-
/// <param name="inputColumn">Name of the input column.</param>
2423
/// <param name="outputColumn">Name of the column to be transformed.</param>
24+
/// <param name="inputColumn">Name of the input column. If set to <see langword="null"/>, the value of the <paramref name="outputColumn"/>
25+
/// will be used as input.</param>
2526
/// <param name="hashBits">Number of bits to hash into. Must be between 1 and 31, inclusive.</param>
2627
/// <param name="invertHash">During hashing we construct mappings between original values and the produced hash values.
2728
/// Text representation of original values are stored in the slot names of the metadata for the new column. Hashing, as such, can map many initial values to one.
2829
/// <paramref name="invertHash"/> specifies the upper bound of the number of distinct input values mapping to a hash that should be retained.
2930
/// <value>0</value> does not retain any input values. <value>-1</value> retains all input values mapping to each hash.</param>
30-
public static HashingEstimator Hash(this TransformsCatalog.ConversionTransforms catalog, string outputColumn, string inputColumn,
31+
public static HashingEstimator Hash(this TransformsCatalog.ConversionTransforms catalog, string outputColumn, string inputColumn = null,
3132
int hashBits = HashDefaults.HashBits, int invertHash = HashDefaults.InvertHash)
32-
=> new HashingEstimator(CatalogUtils.GetEnvironment(catalog), inputColumn, outputColumn, hashBits, invertHash);
33+
=> new HashingEstimator(CatalogUtils.GetEnvironment(catalog), outputColumn, inputColumn ?? outputColumn, hashBits, invertHash);
3334

3435
/// <summary>
3536
/// Hashes the values in the input column.
@@ -43,10 +44,11 @@ public static HashingEstimator Hash(this TransformsCatalog.ConversionTransforms
4344
/// Changes column type of the input column.
4445
/// </summary>
4546
/// <param name="catalog">The transform's catalog.</param>
46-
/// <param name="inputColumn">Name of the input column.</param>
4747
/// <param name="outputColumn">Name of the column to be transformed.</param>
48+
/// <param name="inputColumn">Name of the input column. If set to <see langword="null"/>, the value of the <paramref name="outputColumn"/>
49+
/// will be used as input.</param>
4850
/// <param name="outputKind">The <see cref="DataKind"/> that the column values are converted to.</param>
49-
public static TypeConvertingEstimator ConvertType(this TransformsCatalog.ConversionTransforms catalog, string outputColumn, string inputColumn,
51+
public static TypeConvertingEstimator ConvertType(this TransformsCatalog.ConversionTransforms catalog, string outputColumn, string inputColumn = null,
5052
DataKind outputKind = ConvertDefaults.DefaultOutputKind)
5153
=> new TypeConvertingEstimator(CatalogUtils.GetEnvironment(catalog), outputColumn, inputColumn, outputKind);
5254

@@ -88,44 +90,31 @@ public static KeyToVectorMappingEstimator MapKeyToVector(this TransformsCatalog.
8890
/// Convert the key types back to their original vectors.
8991
/// </summary>
9092
/// <param name="catalog">The categorical transform's catalog.</param>
91-
/// <param name="inputColumn">The name of the input column.</param>
93+
/// <param name="inputColumn">The name of the input column. If set to <see langword="null"/>, the value of the <paramref name="outputColumn"/>
94+
/// will be used as input.</param>
9295
/// <param name="outputColumn">The name of the output column.</param>
9396
/// <param name="bag">Whether bagging is used for the conversion. </param>
9497
public static KeyToVectorMappingEstimator MapKeyToVector(this TransformsCatalog.ConversionTransforms catalog,
95-
string outputColumn, string inputColumn, bool bag = KeyToVectorMappingEstimator.Defaults.Bag)
96-
=> new KeyToVectorMappingEstimator(CatalogUtils.GetEnvironment(catalog), outputColumn, inputColumn, bag);
98+
string outputColumn, string inputColumn = null, bool bag = KeyToVectorMappingEstimator.Defaults.Bag)
99+
=> new KeyToVectorMappingEstimator(CatalogUtils.GetEnvironment(catalog), outputColumn, inputColumn ?? outputColumn, bag);
97100

98101
/// <summary>
99102
/// Converts value types into <see cref="KeyType"/>.
100103
/// </summary>
101104
/// <param name="catalog">The categorical transform's catalog.</param>
102-
/// <param name="inputColumn">Name of the column to be transformed.</param>
105+
/// <param name="inputColumn">Name of the column to be transformed. If set to <see langword="null"/>, the value of the <paramref name="outputColumn"/>
106+
/// will be used as input.</param>
103107
/// <param name="outputColumn">Name of the output column.</param>
104108
/// <param name="maxNumTerms">Maximum number of keys to keep per column when auto-training.</param>
105109
/// <param name="sort">How items should be ordered when vectorized. If <see cref="ValueToKeyMappingTransformer.SortOrder.Occurrence"/> is chosen, they will be in the order encountered.
106110
/// If <see cref="ValueToKeyMappingTransformer.SortOrder.Value"/>, items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a').</param>
107111
public static ValueToKeyMappingEstimator MapValueToKey(this TransformsCatalog.ConversionTransforms catalog,
108112
string outputColumn,
109-
string inputColumn,
113+
string inputColumn = null,
110114
int maxNumTerms = ValueToKeyMappingEstimator.Defaults.MaxNumTerms,
111115
ValueToKeyMappingTransformer.SortOrder sort = ValueToKeyMappingEstimator.Defaults.Sort)
112116
=> new ValueToKeyMappingEstimator(CatalogUtils.GetEnvironment(catalog), outputColumn, inputColumn, maxNumTerms, sort);
113117

114-
/// <summary>
115-
/// Converts value types into <see cref="KeyType"/> loading the keys to use from <paramref name="file"/>.
116-
/// </summary>
117-
/// <param name="catalog">The categorical transform's catalog.</param>
118-
/// <param name="columns">The data columns to map to keys.</param>
119-
/// <param name="file">The path of the file containing the terms.</param>
120-
/// <param name="termsColumn"></param>
121-
/// <param name="loaderFactory"></param>
122-
public static ValueToKeyMappingEstimator MapValueToKey(this TransformsCatalog.ConversionTransforms catalog,
123-
ValueToKeyMappingTransformer.ColumnInfo[] columns,
124-
string file = null,
125-
string termsColumn = null,
126-
IComponentFactory<IMultiStreamSource, IDataLoader> loaderFactory = null)
127-
=> new ValueToKeyMappingEstimator(CatalogUtils.GetEnvironment(catalog), columns, file, termsColumn, loaderFactory);
128-
129118
/// <summary>
130119
/// Maps specified keys to specified values
131120
/// </summary>
@@ -141,7 +130,7 @@ public static ValueMappingEstimator<TInputType, TOutputType> ValueMap<TInputType
141130
this TransformsCatalog.ConversionTransforms catalog,
142131
IEnumerable<TInputType> keys,
143132
IEnumerable<TOutputType> values,
144-
params (string source, string name)[] columns)
133+
params (string inputColumn, string outputColumn)[] columns)
145134
=> new ValueMappingEstimator<TInputType, TOutputType>(CatalogUtils.GetEnvironment(catalog), keys, values, columns);
146135
}
147136
}

src/Microsoft.ML.Data/Transforms/Hashing.cs

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -135,8 +135,8 @@ public sealed class ColumnInfo
135135
/// Text representation of original values are stored in the slot names of the metadata for the new column. Hashing, as such, can map many initial values to one.
136136
/// <paramref name="invertHash"/> specifies the upper bound of the number of distinct input values mapping to a hash that should be retained.
137137
/// <value>0</value> does not retain any input values. <value>-1</value> retains all input values mapping to each hash.</param>
138-
public ColumnInfo(string input,
139-
string output = null,
138+
public ColumnInfo(string output,
139+
string input = null,
140140
int hashBits = HashingEstimator.Defaults.HashBits,
141141
uint seed = HashingEstimator.Defaults.Seed,
142142
bool ordered = HashingEstimator.Defaults.Ordered,
@@ -146,7 +146,7 @@ public ColumnInfo(string input,
146146
throw Contracts.ExceptParam(nameof(invertHash), "Value too small, must be -1 or larger");
147147
if (invertHash != 0 && hashBits >= 31)
148148
throw Contracts.ExceptParam(nameof(hashBits), $"Cannot support invertHash for a {0} bit hash. 30 is the maximum possible.", hashBits);
149-
Contracts.CheckNonWhiteSpace(input, nameof(input));
149+
Contracts.CheckNonWhiteSpace(output, nameof(output));
150150
Input = input;
151151
Output = output ?? input;
152152
HashBits = hashBits;
@@ -157,6 +157,8 @@ public ColumnInfo(string input,
157157

158158
internal ColumnInfo(string input, string output, ModelLoadContext ctx)
159159
{
160+
Contracts.CheckNonWhiteSpace(output, nameof(output));
161+
160162
Input = input;
161163
Output = output;
162164
// *** Binary format ***
@@ -386,8 +388,8 @@ private static IDataTransform Create(IHostEnvironment env, Arguments args, IData
386388
{
387389
var item = args.Column[i];
388390
var kind = item.InvertHash ?? args.InvertHash;
389-
cols[i] = new ColumnInfo(item.Source ?? item.Name,
390-
item.Name,
391+
cols[i] = new ColumnInfo(item.Name,
392+
item.Source ?? item.Name,
391393
item.HashBits ?? args.HashBits,
392394
item.Seed ?? args.Seed,
393395
item.Ordered ?? args.Ordered,
@@ -1211,16 +1213,17 @@ internal static bool IsColumnTypeValid(ColumnType type)
12111213
/// Initializes a new instance of <see cref="HashingEstimator"/>.
12121214
/// </summary>
12131215
/// <param name="env">Host Environment.</param>
1214-
/// <param name="inputColumn">Name of the column to be transformed.</param>
1216+
/// <param name="inputColumn">Name of the column to be transformed.
1217+
/// If set to <see langword="null"/> the value specified for the <paramref name="outputColumn"/> will be used.</param>
12151218
/// <param name="outputColumn">Name of the output column. </param>
12161219
/// <param name="hashBits">Number of bits to hash into. Must be between 1 and 31, inclusive.</param>
12171220
/// <param name="invertHash">During hashing we construct mappings between original values and the produced hash values.
12181221
/// Text representation of original values are stored in the slot names of the metadata for the new column. Hashing, as such, can map many initial values to one.
12191222
/// <paramref name="invertHash"/> specifies the upper bound of the number of distinct input values mapping to a hash that should be retained.
12201223
/// <value>0</value> does not retain any input values. <value>-1</value> retains all input values mapping to each hash.</param>
1221-
internal HashingEstimator(IHostEnvironment env, string inputColumn, string outputColumn,
1224+
internal HashingEstimator(IHostEnvironment env, string outputColumn, string inputColumn = null,
12221225
int hashBits = Defaults.HashBits, int invertHash = Defaults.InvertHash)
1223-
: this(env, new HashingTransformer.ColumnInfo(inputColumn, outputColumn, hashBits: hashBits, invertHash: invertHash))
1226+
: this(env, new HashingTransformer.ColumnInfo(outputColumn, inputColumn ?? outputColumn, hashBits: hashBits, invertHash: invertHash))
12241227
{
12251228
}
12261229

0 commit comments

Comments
 (0)