Skip to content

Creation of components through MLContext and cleanup (KeyToValue, ValueToKey, OneHotEncoding) #2340

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Feb 2, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@

namespace Microsoft.ML.Samples.Dynamic
{
public class KeyToValue_TermExample
public class KeyToValueValueToKeyExample
{
public static void KeyToValue_Term()
public static void KeyToValueValueToKey()
{
// Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
// as well as the source of randomness.
Expand All @@ -32,15 +32,15 @@ public static void KeyToValue_Term()
string defaultColumnName = "DefaultKeys";
// REVIEW create through the catalog extension
var default_pipeline = new WordTokenizingEstimator(ml, "Review")
.Append(new ValueToKeyMappingEstimator(ml, defaultColumnName, "Review"));
.Append(ml.Transforms.Conversion.MapValueToKey(defaultColumnName, "Review"));

// Another pipeline, that customizes the advanced settings of the TermEstimator.
// Another pipeline, that customizes the advanced settings of the ValueToKeyMappingEstimator.
// We can change maxNumKeys to limit how many keys will get generated out of the set of words,
// and control the order in which they get evaluated by changing sort from the default Occurrence (order in which they get encountered)
// to Value (sorted alphabetically).
string customizedColumnName = "CustomizedKeys";
var customized_pipeline = new WordTokenizingEstimator(ml, "Review")
.Append(new ValueToKeyMappingEstimator(ml,customizedColumnName, "Review", maxNumTerms: 10, sort: ValueToKeyMappingTransformer.SortOrder.Value));
.Append(ml.Transforms.Conversion.MapValueToKey(customizedColumnName, "Review", maxNumKeys: 10, sort: ValueToKeyMappingEstimator.SortOrder.Value));

// The transformed data.
var transformedData_default = default_pipeline.Fit(trainData).Transform(trainData);
Expand Down Expand Up @@ -84,7 +84,7 @@ public static void KeyToValue_Term()

// Retrieve the original values, by appending the KeyToValue estimator to the existing pipelines
// to convert the keys back to the strings.
var pipeline = default_pipeline.Append(new KeyToValueMappingEstimator(ml, defaultColumnName));
var pipeline = default_pipeline.Append(ml.Transforms.Conversion.MapKeyToValue(defaultColumnName));
transformedData_default = pipeline.Fit(trainData).Transform(trainData);

// Preview of the DefaultColumnName column obtained.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ public static void Run()
// The KeyToValueMappingEstimator is added to provide a reverse lookup of the KeyType, converting the KeyType value back
// to the original value.
var pipeline = new ValueMappingEstimator<string, string>(mlContext, educationKeys, educationValues, true, ("EducationKeyType", "Education"))
.Append(new KeyToValueMappingEstimator(mlContext, ("EducationCategory", "EducationKeyType")));
.Append(mlContext.Transforms.Conversion.MapKeyToValue(("EducationCategory", "EducationKeyType")));

// Fits the ValueMappingEstimator and transforms the data adding the EducationKeyType column.
IDataView transformedData = pipeline.Fit(trainData).Transform(trainData);
Expand Down
2 changes: 1 addition & 1 deletion src/Microsoft.ML.Data/Transforms/ColumnBindingsBase.cs
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,7 @@ protected ColumnBindingsBase(Schema input, bool user, params string[] names)
// warning if we decide to rename this argument, and so know to change the below hard-coded
// standard column name.
const string standardColumnArgName = "Columns";
Contracts.Assert(nameof(ValueToKeyMappingTransformer.Arguments.Columns) == standardColumnArgName);
Contracts.Assert(nameof(ValueToKeyMappingTransformer.Options.Columns) == standardColumnArgName);
Contracts.Assert(nameof(ColumnConcatenatingTransformer.Arguments.Columns) == standardColumnArgName);

for (int iinfo = 0; iinfo < names.Length; iinfo++)
Expand Down
30 changes: 22 additions & 8 deletions src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs
Original file line number Diff line number Diff line change
Expand Up @@ -113,26 +113,40 @@ public static KeyToVectorMappingEstimator MapKeyToVector(this TransformsCatalog.
/// <param name="catalog">The categorical transform's catalog.</param>
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
/// <param name="maxNumTerms">Maximum number of keys to keep per column when auto-training.</param>
/// <param name="sort">How items should be ordered when vectorized. If <see cref="ValueToKeyMappingTransformer.SortOrder.Occurrence"/> choosen they will be in the order encountered.
/// If <see cref="ValueToKeyMappingTransformer.SortOrder.Value"/>, items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a').</param>
/// <param name="maxNumKeys">Maximum number of keys to keep per column when auto-training.</param>
/// <param name="sort">How items should be ordered when vectorized. If <see cref="ValueToKeyMappingEstimator.SortOrder.Occurrence"/> is chosen, they will be in the order encountered.
/// If <see cref="ValueToKeyMappingEstimator.SortOrder.Value"/>, items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a').</param>
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// [!code-csharp[ValueToKey](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValueValueToKey.cs)]
/// ]]>
/// </format>
/// </example>
public static ValueToKeyMappingEstimator MapValueToKey(this TransformsCatalog.ConversionTransforms catalog,
string outputColumnName,
string inputColumnName = null,
int maxNumTerms = ValueToKeyMappingEstimator.Defaults.MaxNumTerms,
ValueToKeyMappingTransformer.SortOrder sort = ValueToKeyMappingEstimator.Defaults.Sort)
=> new ValueToKeyMappingEstimator(CatalogUtils.GetEnvironment(catalog), outputColumnName, inputColumnName, maxNumTerms, sort);
int maxNumKeys = ValueToKeyMappingEstimator.Defaults.MaxNumKeys,
ValueToKeyMappingEstimator.SortOrder sort = ValueToKeyMappingEstimator.Defaults.Sort)
=> new ValueToKeyMappingEstimator(CatalogUtils.GetEnvironment(catalog), outputColumnName, inputColumnName, maxNumKeys, sort);

/// <summary>
/// Converts value types into <see cref="KeyType"/>, optionally loading the keys to use from <paramref name="keyData"/>.
/// </summary>
/// <param name="catalog">The categorical transform's catalog.</param>
/// <param name="columns">The data columns to map to keys.</param>
/// <param name="keyData">The data view containing the terms. If specified, this should be a single column data
/// view, and the key-values will be taken from taht column. If unspecified, the key-values will be determined
/// view, and the key-values will be taken from that column. If unspecified, the key-values will be determined
/// from the input data upon fitting.</param>
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// [!code-csharp[ValueToKey](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValueValueToKey.cs)]
/// ]]>
/// </format>
/// </example>
public static ValueToKeyMappingEstimator MapValueToKey(this TransformsCatalog.ConversionTransforms catalog,
ValueToKeyMappingTransformer.ColumnInfo[] columns, IDataView keyData = null)
ValueToKeyMappingEstimator.ColumnInfo[] columns, IDataView keyData = null)
Copy link
Contributor

@Ivanidzo4ka Ivanidzo4ka Feb 1, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ValueToKeyMappingEstimator.ColumnInfo[] columns, IDataView keyData = null [](start = 12, length = 73)

I don't understand why we have this constructor.
I can have a list of terms in each columnInfo, and I can also specify keyData; what will happen? Does keyData get affected by maxNum and sort?
It brings up so many questions. #Resolved

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree; we should clean this one up in a separate PR. Do you want to create an issue, or should I do it?


In reply to: 252895984 [](ancestors = 252895984)

=> new ValueToKeyMappingEstimator(CatalogUtils.GetEnvironment(catalog), columns, keyData);

/// <summary>
Expand Down
27 changes: 16 additions & 11 deletions src/Microsoft.ML.Data/Transforms/KeyToValue.cs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
using Microsoft.ML.Transforms.Conversions;
using Newtonsoft.Json.Linq;

[assembly: LoadableClass(typeof(IDataTransform), typeof(KeyToValueMappingTransformer), typeof(KeyToValueMappingTransformer.Arguments), typeof(SignatureDataTransform),
[assembly: LoadableClass(typeof(IDataTransform), typeof(KeyToValueMappingTransformer), typeof(KeyToValueMappingTransformer.Options), typeof(SignatureDataTransform),
KeyToValueMappingTransformer.UserName, KeyToValueMappingTransformer.LoaderSignature, "KeyToValue", "KeyToVal", "Unterm")]

[assembly: LoadableClass(typeof(IDataTransform), typeof(KeyToValueMappingTransformer), null, typeof(SignatureLoadDataTransform),
Expand All @@ -41,7 +41,7 @@ namespace Microsoft.ML.Transforms.Conversions
/// </summary>
public sealed class KeyToValueMappingTransformer : OneToOneTransformerBase
{
public sealed class Column : OneToOneColumn
internal sealed class Column : OneToOneColumn
{
internal static Column Parse(string str)
{
Expand All @@ -58,7 +58,8 @@ internal bool TryUnparse(StringBuilder sb)
}
}

public sealed class Arguments : TransformInputBase
[BestFriend]
internal sealed class Options : TransformInputBase
{
[Argument(ArgumentType.Multiple | ArgumentType.Required, HelpText = "New column definition(s) (optional form: name:src)",
Name = "Column", ShortName = "col", SortOrder = 1)]
Expand Down Expand Up @@ -86,15 +87,15 @@ private static VersionInfo GetVersionInfo()
/// <summary>
/// Create a <see cref="KeyToValueMappingTransformer"/> that takes and transforms one column.
/// </summary>
public KeyToValueMappingTransformer(IHostEnvironment env, string columnName)
internal KeyToValueMappingTransformer(IHostEnvironment env, string columnName)
: this(env, (columnName, columnName))
{
}

/// <summary>
/// Create a <see cref="KeyToValueMappingTransformer"/> that takes multiple pairs of columns.
/// </summary>
public KeyToValueMappingTransformer(IHostEnvironment env, params (string outputColumnName, string inputColumnName)[] columns)
internal KeyToValueMappingTransformer(IHostEnvironment env, params (string outputColumnName, string inputColumnName)[] columns)
: base(Contracts.CheckRef(env, nameof(env)).Register(nameof(KeyToValueMappingTransformer)), columns)
{
}
Expand All @@ -103,14 +104,14 @@ public KeyToValueMappingTransformer(IHostEnvironment env, params (string outputC
/// Factory method for SignatureDataTransform.
/// </summary>
[BestFriend]
internal static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input)
internal static IDataTransform Create(IHostEnvironment env, Options options, IDataView input)
{
Contracts.CheckValue(env, nameof(env));
env.CheckValue(args, nameof(args));
env.CheckValue(options, nameof(options));
env.CheckValue(input, nameof(input));
env.CheckNonEmpty(args.Columns, nameof(args.Columns));
env.CheckNonEmpty(options.Columns, nameof(options.Columns));

var transformer = new KeyToValueMappingTransformer(env, args.Columns.Select(c => (c.Name, c.Source ?? c.Name)).ToArray());
var transformer = new KeyToValueMappingTransformer(env, options.Columns.Select(c => (c.Name, c.Source ?? c.Name)).ToArray());
return transformer.MakeDataTransform(input);
}

Expand Down Expand Up @@ -506,16 +507,20 @@ public override JToken SavePfa(BoundPfaContext ctx, JToken srcToken)

public sealed class KeyToValueMappingEstimator : TrivialEstimator<KeyToValueMappingTransformer>
{
public KeyToValueMappingEstimator(IHostEnvironment env, string columnName)
internal KeyToValueMappingEstimator(IHostEnvironment env, string columnName)
: base(Contracts.CheckRef(env, nameof(env)).Register(nameof(KeyToValueMappingEstimator)), new KeyToValueMappingTransformer(env, columnName))
{
}

public KeyToValueMappingEstimator(IHostEnvironment env, params (string outputColumnName, string inputColumnName)[] columns)
internal KeyToValueMappingEstimator(IHostEnvironment env, params (string outputColumnName, string inputColumnName)[] columns)
: base(Contracts.CheckRef(env, nameof(env)).Register(nameof(KeyToValueMappingEstimator)), new KeyToValueMappingTransformer(env, columns))
{
}

/// <summary>
/// Returns the <see cref="SchemaShape"/> of the schema which will be produced by the transformer.
/// Used for schema propagation and verification in a pipeline.
/// </summary>
public override SchemaShape GetOutputSchema(SchemaShape inputSchema)
{
Host.CheckValue(inputSchema, nameof(inputSchema));
Expand Down
Loading