diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValue_Term.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValueValueToKey.cs similarity index 90% rename from docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValue_Term.cs rename to docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValueValueToKey.cs index 882000a773..ceca716c34 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValue_Term.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValueValueToKey.cs @@ -6,9 +6,9 @@ namespace Microsoft.ML.Samples.Dynamic { - public class KeyToValue_TermExample + public class KeyToValueValueToKeyExample { - public static void KeyToValue_Term() + public static void KeyToValueValueToKey() { // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, // as well as the source of randomness. @@ -32,15 +32,15 @@ public static void KeyToValue_Term() string defaultColumnName = "DefaultKeys"; // REVIEW create through the catalog extension var default_pipeline = new WordTokenizingEstimator(ml, "Review") - .Append(new ValueToKeyMappingEstimator(ml, defaultColumnName, "Review")); + .Append(ml.Transforms.Conversion.MapValueToKey(defaultColumnName, "Review")); - // Another pipeline, that customizes the advanced settings of the TermEstimator. + // Another pipeline, that customizes the advanced settings of the ValueToKeyMappingEstimator. // We can change the maxNumTerm to limit how many keys will get generated out of the set of words, // and condition the order in which they get evaluated by changing sort from the default Occurence (order in which they get encountered) // to value/alphabetically. 
string customizedColumnName = "CustomizedKeys"; var customized_pipeline = new WordTokenizingEstimator(ml, "Review") - .Append(new ValueToKeyMappingEstimator(ml,customizedColumnName, "Review", maxNumTerms: 10, sort: ValueToKeyMappingTransformer.SortOrder.Value)); + .Append(ml.Transforms.Conversion.MapValueToKey(customizedColumnName, "Review", maxNumKeys: 10, sort: ValueToKeyMappingEstimator.SortOrder.Value)); // The transformed data. var transformedData_default = default_pipeline.Fit(trainData).Transform(trainData); @@ -84,7 +84,7 @@ public static void KeyToValue_Term() // Retrieve the original values, by appending the KeyToValue etimator to the existing pipelines // to convert the keys back to the strings. - var pipeline = default_pipeline.Append(new KeyToValueMappingEstimator(ml, defaultColumnName)); + var pipeline = default_pipeline.Append(ml.Transforms.Conversion.MapKeyToValue(defaultColumnName)); transformedData_default = pipeline.Fit(trainData).Transform(trainData); // Preview of the DefaultColumnName column obtained. diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/ValueMappingStringToKeyType.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/ValueMappingStringToKeyType.cs index 39a14d2442..dc7575e67e 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/ValueMappingStringToKeyType.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/ValueMappingStringToKeyType.cs @@ -59,7 +59,7 @@ public static void Run() // The KeyToValueMappingEstimator is added to provide a reverse lookup of the KeyType, converting the KeyType value back // to the original value. 
var pipeline = new ValueMappingEstimator(mlContext, educationKeys, educationValues, true, ("EducationKeyType", "Education")) - .Append(new KeyToValueMappingEstimator(mlContext, ("EducationCategory", "EducationKeyType"))); + .Append(mlContext.Transforms.Conversion.MapKeyToValue(("EducationCategory", "EducationKeyType"))); // Fits the ValueMappingEstimator and transforms the data adding the EducationKeyType column. IDataView transformedData = pipeline.Fit(trainData).Transform(trainData); diff --git a/src/Microsoft.ML.Data/Transforms/ColumnBindingsBase.cs b/src/Microsoft.ML.Data/Transforms/ColumnBindingsBase.cs index 93f016be37..3c376d4145 100644 --- a/src/Microsoft.ML.Data/Transforms/ColumnBindingsBase.cs +++ b/src/Microsoft.ML.Data/Transforms/ColumnBindingsBase.cs @@ -312,7 +312,7 @@ protected ColumnBindingsBase(Schema input, bool user, params string[] names) // warning if we decide to rename this argument, and so know to change the below hard-coded // standard column name. const string standardColumnArgName = "Columns"; - Contracts.Assert(nameof(ValueToKeyMappingTransformer.Arguments.Columns) == standardColumnArgName); + Contracts.Assert(nameof(ValueToKeyMappingTransformer.Options.Columns) == standardColumnArgName); Contracts.Assert(nameof(ColumnConcatenatingTransformer.Arguments.Columns) == standardColumnArgName); for (int iinfo = 0; iinfo < names.Length; iinfo++) diff --git a/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs b/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs index f83e5fd95e..3261638c5e 100644 --- a/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs +++ b/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs @@ -113,15 +113,22 @@ public static KeyToVectorMappingEstimator MapKeyToVector(this TransformsCatalog. /// The categorical transform's catalog. /// Name of the column resulting from the transformation of . /// Name of the column to transform. 
If set to , the value of the will be used as source. - /// Maximum number of keys to keep per column when auto-training. - /// How items should be ordered when vectorized. If choosen they will be in the order encountered. - /// If , items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a'). + /// Maximum number of keys to keep per column when auto-training. + /// How items should be ordered when vectorized. If chosen they will be in the order encountered. + /// If , items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a'). + /// + /// + /// + /// + /// public static ValueToKeyMappingEstimator MapValueToKey(this TransformsCatalog.ConversionTransforms catalog, string outputColumnName, string inputColumnName = null, - int maxNumTerms = ValueToKeyMappingEstimator.Defaults.MaxNumTerms, - ValueToKeyMappingTransformer.SortOrder sort = ValueToKeyMappingEstimator.Defaults.Sort) - => new ValueToKeyMappingEstimator(CatalogUtils.GetEnvironment(catalog), outputColumnName, inputColumnName, maxNumTerms, sort); + int maxNumKeys = ValueToKeyMappingEstimator.Defaults.MaxNumKeys, + ValueToKeyMappingEstimator.SortOrder sort = ValueToKeyMappingEstimator.Defaults.Sort) + => new ValueToKeyMappingEstimator(CatalogUtils.GetEnvironment(catalog), outputColumnName, inputColumnName, maxNumKeys, sort); /// /// Converts value types into , optionally loading the keys to use from . @@ -129,10 +136,17 @@ public static ValueToKeyMappingEstimator MapValueToKey(this TransformsCatalog.Co /// The categorical transform's catalog. /// The data columns to map to keys. /// The data view containing the terms. If specified, this should be a single column data - /// view, and the key-values will be taken from taht column. If unspecified, the key-values will be determined /// from the input data upon fitting. + /// view, and the key-values will be taken from that column. 
If unspecified, the key-values will be determined /// from the input data upon fitting. + /// + /// + /// + /// + /// public static ValueToKeyMappingEstimator MapValueToKey(this TransformsCatalog.ConversionTransforms catalog, - ValueToKeyMappingTransformer.ColumnInfo[] columns, IDataView keyData = null) + ValueToKeyMappingEstimator.ColumnInfo[] columns, IDataView keyData = null) => new ValueToKeyMappingEstimator(CatalogUtils.GetEnvironment(catalog), columns, keyData); /// diff --git a/src/Microsoft.ML.Data/Transforms/KeyToValue.cs b/src/Microsoft.ML.Data/Transforms/KeyToValue.cs index af063bda07..d7b775431b 100644 --- a/src/Microsoft.ML.Data/Transforms/KeyToValue.cs +++ b/src/Microsoft.ML.Data/Transforms/KeyToValue.cs @@ -19,7 +19,7 @@ using Microsoft.ML.Transforms.Conversions; using Newtonsoft.Json.Linq; -[assembly: LoadableClass(typeof(IDataTransform), typeof(KeyToValueMappingTransformer), typeof(KeyToValueMappingTransformer.Arguments), typeof(SignatureDataTransform), +[assembly: LoadableClass(typeof(IDataTransform), typeof(KeyToValueMappingTransformer), typeof(KeyToValueMappingTransformer.Options), typeof(SignatureDataTransform), KeyToValueMappingTransformer.UserName, KeyToValueMappingTransformer.LoaderSignature, "KeyToValue", "KeyToVal", "Unterm")] [assembly: LoadableClass(typeof(IDataTransform), typeof(KeyToValueMappingTransformer), null, typeof(SignatureLoadDataTransform), @@ -41,7 +41,7 @@ namespace Microsoft.ML.Transforms.Conversions /// public sealed class KeyToValueMappingTransformer : OneToOneTransformerBase { - public sealed class Column : OneToOneColumn + internal sealed class Column : OneToOneColumn { internal static Column Parse(string str) { @@ -58,7 +58,8 @@ internal bool TryUnparse(StringBuilder sb) } } - public sealed class Arguments : TransformInputBase + [BestFriend] + internal sealed class Options : TransformInputBase { [Argument(ArgumentType.Multiple | ArgumentType.Required, HelpText = "New column definition(s) (optional form: name:src)", 
Name = "Column", ShortName = "col", SortOrder = 1)] @@ -86,7 +87,7 @@ private static VersionInfo GetVersionInfo() /// /// Create a that takes and transforms one column. /// - public KeyToValueMappingTransformer(IHostEnvironment env, string columnName) + internal KeyToValueMappingTransformer(IHostEnvironment env, string columnName) : this(env, (columnName, columnName)) { } @@ -94,7 +95,7 @@ public KeyToValueMappingTransformer(IHostEnvironment env, string columnName) /// /// Create a that takes multiple pairs of columns. /// - public KeyToValueMappingTransformer(IHostEnvironment env, params (string outputColumnName, string inputColumnName)[] columns) + internal KeyToValueMappingTransformer(IHostEnvironment env, params (string outputColumnName, string inputColumnName)[] columns) : base(Contracts.CheckRef(env, nameof(env)).Register(nameof(KeyToValueMappingTransformer)), columns) { } @@ -103,14 +104,14 @@ public KeyToValueMappingTransformer(IHostEnvironment env, params (string outputC /// Factory method for SignatureDataTransform. /// [BestFriend] - internal static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input) + internal static IDataTransform Create(IHostEnvironment env, Options options, IDataView input) { Contracts.CheckValue(env, nameof(env)); - env.CheckValue(args, nameof(args)); + env.CheckValue(options, nameof(options)); env.CheckValue(input, nameof(input)); - env.CheckNonEmpty(args.Columns, nameof(args.Columns)); + env.CheckNonEmpty(options.Columns, nameof(options.Columns)); - var transformer = new KeyToValueMappingTransformer(env, args.Columns.Select(c => (c.Name, c.Source ?? c.Name)).ToArray()); + var transformer = new KeyToValueMappingTransformer(env, options.Columns.Select(c => (c.Name, c.Source ?? 
c.Name)).ToArray()); return transformer.MakeDataTransform(input); } @@ -506,16 +507,20 @@ public override JToken SavePfa(BoundPfaContext ctx, JToken srcToken) public sealed class KeyToValueMappingEstimator : TrivialEstimator { - public KeyToValueMappingEstimator(IHostEnvironment env, string columnName) + internal KeyToValueMappingEstimator(IHostEnvironment env, string columnName) : base(Contracts.CheckRef(env, nameof(env)).Register(nameof(KeyToValueMappingEstimator)), new KeyToValueMappingTransformer(env, columnName)) { } - public KeyToValueMappingEstimator(IHostEnvironment env, params (string outputColumnName, string inputColumnName)[] columns) + internal KeyToValueMappingEstimator(IHostEnvironment env, params (string outputColumnName, string inputColumnName)[] columns) : base(Contracts.CheckRef(env, nameof(env)).Register(nameof(KeyToValueMappingEstimator)), new KeyToValueMappingTransformer(env, columns)) { } + /// + /// Returns the of the schema which will be produced by the transformer. + /// Used for schema propagation and verification in a pipeline. 
+ /// public override SchemaShape GetOutputSchema(SchemaShape inputSchema) { Host.CheckValue(inputSchema, nameof(inputSchema)); diff --git a/src/Microsoft.ML.Data/Transforms/ValueToKeyMappingEstimator.cs b/src/Microsoft.ML.Data/Transforms/ValueToKeyMappingEstimator.cs index 18718ae1f2..f0e52510e0 100644 --- a/src/Microsoft.ML.Data/Transforms/ValueToKeyMappingEstimator.cs +++ b/src/Microsoft.ML.Data/Transforms/ValueToKeyMappingEstimator.cs @@ -12,14 +12,67 @@ namespace Microsoft.ML.Transforms.Conversions /// public sealed class ValueToKeyMappingEstimator : IEstimator { - public static class Defaults + [BestFriend] + internal static class Defaults { - public const int MaxNumTerms = 1000000; - public const ValueToKeyMappingTransformer.SortOrder Sort = ValueToKeyMappingTransformer.SortOrder.Occurrence; + public const int MaxNumKeys = 1000000; + public const SortOrder Sort = SortOrder.Occurrence; + } + + /// + /// Controls the order of the output keys. + /// + public enum SortOrder : byte + { + Occurrence = 0, + Value = 1, + // REVIEW: We can think about having a frequency order option. What about + // other things, like case insensitive (where appropriate), culturally aware, etc.? + } + + /// + /// Describes how the transformer handles one column pair. + /// + public class ColumnInfo + { + public readonly string OutputColumnName; + public readonly string InputColumnName; + public readonly SortOrder Sort; + public readonly int MaxNumKeys; + public readonly string[] Term; + public readonly bool TextKeyValues; + + protected internal string Terms { get; set; } + + /// + /// Describes how the transformer handles one column pair. + /// + /// Name of the column resulting from the transformation of . + /// Name of the column to transform. If set to , the value of the will be used as source. + /// Maximum number of keys to keep per column when auto-training. + /// How items should be ordered when vectorized. If chosen they will be in the order encountered. 
+ /// If , items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a'). + /// List of terms. + /// Whether key value metadata should be text, regardless of the actual input type. + public ColumnInfo(string outputColumnName, string inputColumnName = null, + int maxNumKeys = Defaults.MaxNumKeys, + SortOrder sort = Defaults.Sort, + string[] term = null, + bool textKeyValues = false + ) + { + Contracts.CheckNonWhiteSpace(outputColumnName, nameof(outputColumnName)); + OutputColumnName = outputColumnName; + InputColumnName = inputColumnName ?? outputColumnName; + Sort = sort; + MaxNumKeys = maxNumKeys; + Term = term; + TextKeyValues = textKeyValues; + } } private readonly IHost _host; - private readonly ValueToKeyMappingTransformer.ColumnInfo[] _columns; + private readonly ColumnInfo[] _columns; private readonly IDataView _keyData; /// @@ -28,15 +81,15 @@ public static class Defaults /// Host Environment. /// Name of the column resulting from the transformation of . /// Name of the column to transform. If set to , the value of the will be used as source. - /// Maximum number of keys to keep per column when auto-training. - /// How items should be ordered when vectorized. If choosen they will be in the order encountered. - /// If , items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a'). - public ValueToKeyMappingEstimator(IHostEnvironment env, string outputColumnName, string inputColumnName = null, int maxNumTerms = Defaults.MaxNumTerms, ValueToKeyMappingTransformer.SortOrder sort = Defaults.Sort) : - this(env, new [] { new ValueToKeyMappingTransformer.ColumnInfo(outputColumnName, inputColumnName ?? outputColumnName, maxNumTerms, sort) }) + /// Maximum number of keys to keep per column when auto-training. + /// How items should be ordered when vectorized. If chosen they will be in the order encountered. 
+ /// If , items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a'). + internal ValueToKeyMappingEstimator(IHostEnvironment env, string outputColumnName, string inputColumnName = null, int maxNumKeys = Defaults.MaxNumKeys, SortOrder sort = Defaults.Sort) : + this(env, new [] { new ColumnInfo(outputColumnName, inputColumnName ?? outputColumnName, maxNumKeys, sort) }) { } - public ValueToKeyMappingEstimator(IHostEnvironment env, ValueToKeyMappingTransformer.ColumnInfo[] columns, IDataView keyData = null) + internal ValueToKeyMappingEstimator(IHostEnvironment env, ColumnInfo[] columns, IDataView keyData = null) { Contracts.CheckValue(env, nameof(env)); _host = env.Register(nameof(ValueToKeyMappingEstimator)); @@ -53,8 +106,15 @@ public ValueToKeyMappingEstimator(IHostEnvironment env, ValueToKeyMappingTransfo _keyData = keyData; } + /// + /// Trains and returns a . + /// public ValueToKeyMappingTransformer Fit(IDataView input) => new ValueToKeyMappingTransformer(_host, input, _columns, _keyData, false); + /// + /// Returns the of the schema which will be produced by the transformer. + /// Used for schema propagation and verification in a pipeline. + /// public SchemaShape GetOutputSchema(SchemaShape inputSchema) { _host.CheckValue(inputSchema, nameof(inputSchema)); @@ -81,7 +141,7 @@ public SchemaShape GetOutputSchema(SchemaShape inputSchema) metadata = new SchemaShape(new[] { slotMeta, kv }); else metadata = new SchemaShape(new[] { kv }); - result[colInfo.Name] = new SchemaShape.Column(colInfo.Name, col.Kind, NumberType.U4, true, metadata); + result[colInfo.OutputColumnName] = new SchemaShape.Column(colInfo.OutputColumnName, col.Kind, NumberType.U4, true, metadata); } return new SchemaShape(result.Values); @@ -93,12 +153,12 @@ public enum KeyValueOrder : byte /// /// Terms will be assigned ID in the order in which they appear. 
/// - Occurence = ValueToKeyMappingTransformer.SortOrder.Occurrence, + Occurence = ValueToKeyMappingEstimator.SortOrder.Occurrence, /// /// Terms will be assigned ID according to their sort via an ordinal comparison for the type. /// - Value = ValueToKeyMappingTransformer.SortOrder.Value + Value = ValueToKeyMappingEstimator.SortOrder.Value } /// diff --git a/src/Microsoft.ML.Data/Transforms/ValueToKeyMappingTransformer.cs b/src/Microsoft.ML.Data/Transforms/ValueToKeyMappingTransformer.cs index d6a8dfa0c5..011d491608 100644 --- a/src/Microsoft.ML.Data/Transforms/ValueToKeyMappingTransformer.cs +++ b/src/Microsoft.ML.Data/Transforms/ValueToKeyMappingTransformer.cs @@ -22,7 +22,7 @@ using Newtonsoft.Json.Linq; [assembly: LoadableClass(ValueToKeyMappingTransformer.Summary, typeof(IDataTransform), typeof(ValueToKeyMappingTransformer), - typeof(ValueToKeyMappingTransformer.Arguments), typeof(SignatureDataTransform), + typeof(ValueToKeyMappingTransformer.Options), typeof(SignatureDataTransform), ValueToKeyMappingTransformer.UserName, "Term", "AutoLabel", "TermTransform", "AutoLabelTransform", DocName = "transform/TermTransform.md")] [assembly: LoadableClass(ValueToKeyMappingTransformer.Summary, typeof(IDataTransform), typeof(ValueToKeyMappingTransformer), null, typeof(SignatureLoadDataTransform), @@ -45,7 +45,8 @@ namespace Microsoft.ML.Transforms.Conversions /// public sealed partial class ValueToKeyMappingTransformer : OneToOneTransformerBase { - public abstract class ColumnBase : OneToOneColumn + [BestFriend] + internal abstract class ColumnBase : OneToOneColumn { [Argument(ArgumentType.AtMostOnce, HelpText = "Maximum number of terms to keep when auto-training", ShortName = "max")] public int? MaxNumTerms; @@ -58,7 +59,7 @@ public abstract class ColumnBase : OneToOneColumn [Argument(ArgumentType.AtMostOnce, HelpText = "How items should be ordered when vectorized. By default, they will be in the order encountered. 
" + "If by value items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a').")] - public SortOrder? Sort; + public ValueToKeyMappingEstimator.SortOrder? Sort; [Argument(ArgumentType.AtMostOnce, HelpText = "Whether key value metadata should be text, regardless of the actual input type", ShortName = "textkv", Hide = true)] public bool? TextKeyValues; @@ -80,7 +81,8 @@ private protected override bool TryUnparseCore(StringBuilder sb) } } - public sealed class Column : ColumnBase + [BestFriend] + internal sealed class Column : ColumnBase { internal static Column Parse(string str) { @@ -97,21 +99,11 @@ internal bool TryUnparse(StringBuilder sb) } } - /// - /// Controls how the order of the output keys. - /// - public enum SortOrder : byte - { - Occurrence = 0, - Value = 1, - // REVIEW: We can think about having a frequency order option. What about - // other things, like case insensitive (where appropriate), culturally aware, etc.? - } - - public abstract class ArgumentsBase : TransformInputBase + [BestFriend] + internal abstract class ArgumentsBase : TransformInputBase { [Argument(ArgumentType.AtMostOnce, HelpText = "Maximum number of terms to keep per column when auto-training", ShortName = "max", SortOrder = 5)] - public int MaxNumTerms = ValueToKeyMappingEstimator.Defaults.MaxNumTerms; + public int MaxNumTerms = ValueToKeyMappingEstimator.Defaults.MaxNumKeys; [Argument(ArgumentType.AtMostOnce, HelpText = "Comma separated list of terms", Name = "Terms", SortOrder = 105, Visibility = ArgumentAttribute.VisibilityType.CmdLineOnly)] public string Term; @@ -136,7 +128,7 @@ public abstract class ArgumentsBase : TransformInputBase // REVIEW: Should we always sort? Opinions are mixed. See work item 7797429. [Argument(ArgumentType.AtMostOnce, HelpText = "How items should be ordered when vectorized. By default, they will be in the order encountered. 
" + "If by value items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a').", SortOrder = 113)] - public SortOrder Sort = ValueToKeyMappingEstimator.Defaults.Sort; + public ValueToKeyMappingEstimator.SortOrder Sort = ValueToKeyMappingEstimator.Defaults.Sort; // REVIEW: Should we do this here, or correct the various pieces of code here and in MRS etc. that // assume key-values will be string? Once we correct these things perhaps we can see about removing it. @@ -144,7 +136,8 @@ public abstract class ArgumentsBase : TransformInputBase public bool TextKeyValues; } - public sealed class Arguments : ArgumentsBase + [BestFriend] + internal sealed class Options : ArgumentsBase { [Argument(ArgumentType.Multiple, HelpText = "New column definition(s) (optional form: name:src)", Name = "Column", ShortName = "col", SortOrder = 1)] public Column[] Columns; @@ -164,46 +157,6 @@ public ColInfo(string name, string inputColumnName, ColumnType type) } } - /// - /// Describes how the transformer handles one column pair. - /// - public class ColumnInfo - { - public readonly string Name; - public readonly string InputColumnName; - public readonly SortOrder Sort; - public readonly int MaxNumTerms; - public readonly string[] Term; - public readonly bool TextKeyValues; - - protected internal string Terms { get; set; } - - /// - /// Describes how the transformer handles one column pair. - /// - /// Name of the column resulting from the transformation of . - /// Name of the column to transform. If set to , the value of the will be used as source. - /// Maximum number of terms to keep per column when auto-training. - /// How items should be ordered when vectorized. If choosen they will be in the order encountered. - /// If , items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a'). - /// List of terms. 
- /// Whether key value metadata should be text, regardless of the actual input type. - public ColumnInfo(string name, string inputColumnName = null, - int maxNumTerms = ValueToKeyMappingEstimator.Defaults.MaxNumTerms, - SortOrder sort = ValueToKeyMappingEstimator.Defaults.Sort, - string[] term = null, - bool textKeyValues = false - ) - { - Contracts.CheckNonWhiteSpace(name, nameof(name)); - Name = name; - InputColumnName = inputColumnName ?? name; - Sort = sort; - MaxNumTerms = maxNumTerms; - Term = term; - TextKeyValues = textKeyValues; - } - } [BestFriend] internal const string Summary = "Converts input values (words, numbers, etc.) to index in a dictionary."; [BestFriend] @@ -262,10 +215,10 @@ private static VersionInfo GetTermManagerVersionInfo() private readonly bool[] _textMetadata; private const string RegistrationName = "Term"; - private static (string outputColumnName, string inputColumnName)[] GetColumnPairs(ColumnInfo[] columns) + private static (string outputColumnName, string inputColumnName)[] GetColumnPairs(ValueToKeyMappingEstimator.ColumnInfo[] columns) { Contracts.CheckValue(columns, nameof(columns)); - return columns.Select(x => (x.Name, x.InputColumnName)).ToArray(); + return columns.Select(x => (x.OutputColumnName, x.InputColumnName)).ToArray(); } private string TestIsKnownDataKind(ColumnType type) @@ -296,12 +249,12 @@ private ColInfo[] CreateInfos(Schema inputSchema) } internal ValueToKeyMappingTransformer(IHostEnvironment env, IDataView input, - params ColumnInfo[] columns) : + params ValueToKeyMappingEstimator.ColumnInfo[] columns) : this(env, input, columns, null, false) { } internal ValueToKeyMappingTransformer(IHostEnvironment env, IDataView input, - ColumnInfo[] columns, IDataView keyData, bool autoConvert) + ValueToKeyMappingEstimator.ColumnInfo[] columns, IDataView keyData, bool autoConvert) : base(Contracts.CheckRef(env, nameof(env)).Register(RegistrationName), GetColumnPairs(columns)) { using (var ch = Host.Start("Training")) @@ 
-317,42 +270,42 @@ internal ValueToKeyMappingTransformer(IHostEnvironment env, IDataView input, [BestFriend] // Factory method for SignatureDataTransform. - internal static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input) + internal static IDataTransform Create(IHostEnvironment env, Options options, IDataView input) { Contracts.CheckValue(env, nameof(env)); - env.CheckValue(args, nameof(args)); + env.CheckValue(options, nameof(options)); env.CheckValue(input, nameof(input)); - env.CheckValue(args.Columns, nameof(args.Columns)); - var cols = new ColumnInfo[args.Columns.Length]; + env.CheckValue(options.Columns, nameof(options.Columns)); + var cols = new ValueToKeyMappingEstimator.ColumnInfo[options.Columns.Length]; using (var ch = env.Start("ValidateArgs")) { - if ((args.Terms != null || !string.IsNullOrEmpty(args.Term)) && - (!string.IsNullOrWhiteSpace(args.DataFile) || args.Loader != null || - !string.IsNullOrWhiteSpace(args.TermsColumn))) + if ((options.Terms != null || !string.IsNullOrEmpty(options.Term)) && + (!string.IsNullOrWhiteSpace(options.DataFile) || options.Loader != null || + !string.IsNullOrWhiteSpace(options.TermsColumn))) { ch.Warning("Explicit term list specified. Data file arguments will be ignored"); } - if (!Enum.IsDefined(typeof(SortOrder), args.Sort)) - throw ch.ExceptUserArg(nameof(args.Sort), "Undefined sorting criteria '{0}' detected", args.Sort); + if (!Enum.IsDefined(typeof(ValueToKeyMappingEstimator.SortOrder), options.Sort)) + throw ch.ExceptUserArg(nameof(options.Sort), "Undefined sorting criteria '{0}' detected", options.Sort); for (int i = 0; i < cols.Length; i++) { - var item = args.Columns[i]; - var sortOrder = item.Sort ?? args.Sort; - if (!Enum.IsDefined(typeof(SortOrder), sortOrder)) - throw env.ExceptUserArg(nameof(args.Sort), "Undefined sorting criteria '{0}' detected for column '{1}'", sortOrder, item.Name); + var item = options.Columns[i]; + var sortOrder = item.Sort ?? 
options.Sort; + if (!Enum.IsDefined(typeof(ValueToKeyMappingEstimator.SortOrder), sortOrder)) + throw env.ExceptUserArg(nameof(options.Sort), "Undefined sorting criteria '{0}' detected for column '{1}'", sortOrder, item.Name); - cols[i] = new ColumnInfo( + cols[i] = new ValueToKeyMappingEstimator.ColumnInfo( item.Name, item.Source ?? item.Name, - item.MaxNumTerms ?? args.MaxNumTerms, + item.MaxNumTerms ?? options.MaxNumTerms, sortOrder, item.Terms, - item.TextKeyValues ?? args.TextKeyValues); - cols[i].Terms = item.Term ?? args.Term; + item.TextKeyValues ?? options.TextKeyValues); + cols[i].Terms = item.Term ?? options.Term; }; - var keyData = GetKeyDataViewOrNull(env, ch, args.DataFile, args.TermsColumn, args.Loader, out bool autoLoaded); + var keyData = GetKeyDataViewOrNull(env, ch, options.DataFile, options.TermsColumn, options.Loader, out bool autoLoaded); return new ValueToKeyMappingTransformer(env, input, cols, keyData, autoLoaded).MakeDataTransform(input); } } @@ -421,7 +374,7 @@ private static IRowMapper Create(IHostEnvironment env, ModelLoadContext ctx, Sch => Create(env, ctx).MakeRowMapper(inputSchema); /// - /// Returns a single-column , based on values from , + /// Returns a single-column , based on values from , /// in the case where is set. If that is not set, this will /// return . /// @@ -484,7 +437,7 @@ internal static IDataView GetKeyDataViewOrNull(IHostEnvironment env, IChannel ch { ch.Warning( "{0} should not be specified when default loader is " + nameof(TextLoader) + ". Ignoring {0}={1}", - nameof(Arguments.TermsColumn), src); + nameof(Options.TermsColumn), src); } keyData = new TextLoader(env, columns: new[] { new TextLoader.Column("Term", DataKind.TX, 0) }, @@ -552,7 +505,7 @@ private static TermMap CreateTermMapFromData(IHostEnvironment env, IChannel ch, /// This builds the instances per column. 
/// private static TermMap[] Train(IHostEnvironment env, IChannel ch, ColInfo[] infos, - IDataView keyData, ColumnInfo[] columns, IDataView trainingData, bool autoConvert) + IDataView keyData, ValueToKeyMappingEstimator.ColumnInfo[] columns, IDataView trainingData, bool autoConvert) { Contracts.AssertValue(env); env.AssertValue(ch); @@ -608,7 +561,7 @@ private static TermMap[] Train(IHostEnvironment env, IChannel ch, ColInfo[] info else { // Auto train this column. Leave the term map null for now, but set the lim appropriately. - lims[iinfo] = columns[iinfo].MaxNumTerms; + lims[iinfo] = columns[iinfo].MaxNumKeys; ch.CheckUserArg(lims[iinfo] > 0, nameof(Column.MaxNumTerms), "Must be positive"); Contracts.Check(trainingData.Schema.TryGetColumnIndex(infos[iinfo].InputColumnName, out int colIndex)); Utils.Add(ref toTrain, colIndex); diff --git a/src/Microsoft.ML.Data/Transforms/ValueToKeyMappingTransformerImpl.cs b/src/Microsoft.ML.Data/Transforms/ValueToKeyMappingTransformerImpl.cs index 21ec0c3d08..fbea7bd7f0 100644 --- a/src/Microsoft.ML.Data/Transforms/ValueToKeyMappingTransformerImpl.cs +++ b/src/Microsoft.ML.Data/Transforms/ValueToKeyMappingTransformerImpl.cs @@ -37,7 +37,7 @@ protected Builder(PrimitiveType type) ItemType = type; } - public static Builder Create(ColumnType type, SortOrder sortOrder) + public static Builder Create(ColumnType type, ValueToKeyMappingEstimator.SortOrder sortOrder) { Contracts.AssertValue(type); Contracts.Assert(type is VectorType || type is PrimitiveType); @@ -45,8 +45,8 @@ public static Builder Create(ColumnType type, SortOrder sortOrder) // accept any value, but currently the internal implementations of Builder are split // along this being a purely binary option, for now (though this can easily change // with mot implementations of Builder). 
- Contracts.Assert(sortOrder == SortOrder.Occurrence || sortOrder == SortOrder.Value); - bool sorted = sortOrder == SortOrder.Value; + Contracts.Assert(sortOrder == ValueToKeyMappingEstimator.SortOrder.Occurrence || sortOrder == ValueToKeyMappingEstimator.SortOrder.Value); + bool sorted = sortOrder == ValueToKeyMappingEstimator.SortOrder.Value; PrimitiveType itemType = type.GetItemType() as PrimitiveType; Contracts.AssertValue(itemType); @@ -220,7 +220,7 @@ public override void ParseAddTermArg(ref ReadOnlyMemory terms, IChannel ch } if (Count == 0) - throw ch.ExceptUserArg(nameof(Arguments.Term), "Nothing parsed as '{0}'", ItemType); + throw ch.ExceptUserArg(nameof(Options.Term), "Nothing parsed as '{0}'", ItemType); } /// @@ -245,7 +245,7 @@ public override void ParseAddTermArg(string[] terms, IChannel ch) } if (Count == 0) - throw ch.ExceptUserArg(nameof(Arguments.Term), "Nothing parsed as '{0}'", ItemType); + throw ch.ExceptUserArg(nameof(Options.Term), "Nothing parsed as '{0}'", ItemType); } } diff --git a/src/Microsoft.ML.EntryPoints/FeatureCombiner.cs b/src/Microsoft.ML.EntryPoints/FeatureCombiner.cs index 19538888bd..da5b3dcb7a 100644 --- a/src/Microsoft.ML.EntryPoints/FeatureCombiner.cs +++ b/src/Microsoft.ML.EntryPoints/FeatureCombiner.cs @@ -99,7 +99,7 @@ private static IDataView ApplyKeyToVec(List new ValueToKeyMappingTransformer.Column() { Name = c.Name, Source = c.Name, Term = GetTerms(viewTrain, c.InputColumnName) }) @@ -243,7 +243,7 @@ public static CommonOutputs.TransformOutput PrepareClassificationLabel(IHostEnvi return new CommonOutputs.TransformOutput { Model = new TransformModelImpl(env, nop, input.Data), OutputData = nop }; } - var args = new ValueToKeyMappingTransformer.Arguments() + var args = new ValueToKeyMappingTransformer.Options() { Columns = new[] { @@ -252,7 +252,7 @@ public static CommonOutputs.TransformOutput PrepareClassificationLabel(IHostEnvi Name = input.LabelColumn, Source = input.LabelColumn, TextKeyValues = 
input.TextKeyValues, - Sort = ValueToKeyMappingTransformer.SortOrder.Value + Sort = ValueToKeyMappingEstimator.SortOrder.Value } } }; diff --git a/src/Microsoft.ML.StaticPipe/CategoricalStaticExtensions.cs b/src/Microsoft.ML.StaticPipe/CategoricalStaticExtensions.cs index 05e5d4f5e2..fc7f202ebf 100644 --- a/src/Microsoft.ML.StaticPipe/CategoricalStaticExtensions.cs +++ b/src/Microsoft.ML.StaticPipe/CategoricalStaticExtensions.cs @@ -45,7 +45,7 @@ public enum OneHotScalarOutputKind : byte } private const KeyValueOrder DefSort = (KeyValueOrder)ValueToKeyMappingEstimator.Defaults.Sort; - private const int DefMax = ValueToKeyMappingEstimator.Defaults.MaxNumTerms; + private const int DefMax = ValueToKeyMappingEstimator.Defaults.MaxNumKeys; private const OneHotVectorOutputKind DefOut = (OneHotVectorOutputKind)OneHotEncodingEstimator.Defaults.OutKind; private readonly struct Config @@ -115,7 +115,7 @@ public override IEstimator Reconcile(IHostEnvironment env, Pipelin { var tcol = (ICategoricalCol)toOutput[i]; infos[i] = new OneHotEncodingEstimator.ColumnInfo(outputNames[toOutput[i]], inputNames[tcol.Input], (OneHotEncodingTransformer.OutputKind)tcol.Config.OutputKind, - tcol.Config.Max, (ValueToKeyMappingTransformer.SortOrder)tcol.Config.Order); + tcol.Config.Max, (ValueToKeyMappingEstimator.SortOrder)tcol.Config.Order); if (tcol.Config.OnFit != null) { int ii = i; // Necessary because if we capture i that will change to toOutput.Length on call. diff --git a/src/Microsoft.ML.StaticPipe/TransformsStatic.cs b/src/Microsoft.ML.StaticPipe/TransformsStatic.cs index 4c02d2c098..47e2aef5f1 100644 --- a/src/Microsoft.ML.StaticPipe/TransformsStatic.cs +++ b/src/Microsoft.ML.StaticPipe/TransformsStatic.cs @@ -951,7 +951,7 @@ public static partial class TermStaticExtensions // class, and all the public facing extension methods for each possible type are in a T4 generated result. 
private const KeyValueOrder DefSort = (KeyValueOrder)ValueToKeyMappingEstimator.Defaults.Sort; - private const int DefMax = ValueToKeyMappingEstimator.Defaults.MaxNumTerms; + private const int DefMax = ValueToKeyMappingEstimator.Defaults.MaxNumKeys; private readonly struct Config { @@ -1025,13 +1025,13 @@ public override IEstimator Reconcile(IHostEnvironment env, Pipelin IReadOnlyDictionary outputNames, IReadOnlyCollection usedNames) { - var infos = new ValueToKeyMappingTransformer.ColumnInfo[toOutput.Length]; + var infos = new ValueToKeyMappingEstimator.ColumnInfo[toOutput.Length]; Action onFit = null; for (int i = 0; i < toOutput.Length; ++i) { var tcol = (ITermCol)toOutput[i]; - infos[i] = new ValueToKeyMappingTransformer.ColumnInfo(outputNames[toOutput[i]], inputNames[tcol.Input], - tcol.Config.Max, (ValueToKeyMappingTransformer.SortOrder)tcol.Config.Order); + infos[i] = new ValueToKeyMappingEstimator.ColumnInfo(outputNames[toOutput[i]], inputNames[tcol.Input], + tcol.Config.Max, (ValueToKeyMappingEstimator.SortOrder)tcol.Config.Order); if (tcol.Config.OnFit != null) { int ii = i; // Necessary because if we capture i that will change to toOutput.Length on call. diff --git a/src/Microsoft.ML.Transforms/CategoricalCatalog.cs b/src/Microsoft.ML.Transforms/CategoricalCatalog.cs index 68fb4a15ab..06ad7db7df 100644 --- a/src/Microsoft.ML.Transforms/CategoricalCatalog.cs +++ b/src/Microsoft.ML.Transforms/CategoricalCatalog.cs @@ -2,6 +2,7 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. +using Microsoft.Data.DataView; using Microsoft.ML.Data; using Microsoft.ML.Transforms.Categorical; @@ -19,7 +20,6 @@ public static class CategoricalCatalog /// Name of the column resulting from the transformation of . /// Name of column to transform. If set to , the value of the will be used as source. /// The conversion mode. 
- /// public static OneHotEncodingEstimator OneHotEncoding(this TransformsCatalog.CategoricalTransforms catalog, string outputColumnName, string inputColumnName = null, @@ -31,11 +31,22 @@ public static OneHotEncodingEstimator OneHotEncoding(this TransformsCatalog.Cate /// /// The transform catalog /// The column settings. - /// public static OneHotEncodingEstimator OneHotEncoding(this TransformsCatalog.CategoricalTransforms catalog, params OneHotEncodingEstimator.ColumnInfo[] columns) => new OneHotEncodingEstimator(CatalogUtils.GetEnvironment(catalog), columns); + /// + /// Convert several text columns into one-hot encoded vectors. + /// + /// The transform catalog + /// The column settings. + /// Specifies an ordering for the encoding. If specified, this should be a single column data view, + /// and the key-values will be taken from that column. If unspecified, the ordering will be determined from the input data upon fitting. + public static OneHotEncodingEstimator OneHotEncoding(this TransformsCatalog.CategoricalTransforms catalog, + OneHotEncodingEstimator.ColumnInfo[] columns, + IDataView keyData = null) + => new OneHotEncodingEstimator(CatalogUtils.GetEnvironment(catalog), columns, keyData); + /// /// Convert a text column into hash-based one-hot encoded vector. /// @@ -48,7 +59,6 @@ public static OneHotEncodingEstimator OneHotEncoding(this TransformsCatalog.Cate /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. /// 0 does not retain any input values. -1 retains all input values mapping to each hash. /// The conversion mode. - /// public static OneHotHashEncodingEstimator OneHotHashEncoding(this TransformsCatalog.CategoricalTransforms catalog, string outputColumnName, string inputColumnName = null, @@ -62,7 +72,6 @@ public static OneHotHashEncodingEstimator OneHotHashEncoding(this TransformsCata /// /// The transform catalog /// The column settings. 
- /// public static OneHotHashEncodingEstimator OneHotHashEncoding(this TransformsCatalog.CategoricalTransforms catalog, params OneHotHashEncodingEstimator.ColumnInfo[] columns) => new OneHotHashEncodingEstimator(CatalogUtils.GetEnvironment(catalog), columns); diff --git a/src/Microsoft.ML.Transforms/EntryPoints/TextAnalytics.cs b/src/Microsoft.ML.Transforms/EntryPoints/TextAnalytics.cs index 123a1fa96f..4f5caadc87 100644 --- a/src/Microsoft.ML.Transforms/EntryPoints/TextAnalytics.cs +++ b/src/Microsoft.ML.Transforms/EntryPoints/TextAnalytics.cs @@ -66,7 +66,7 @@ public static CommonOutputs.TransformOutput NGramTransform(IHostEnvironment env, Desc = ValueToKeyMappingTransformer.Summary, UserName = ValueToKeyMappingTransformer.UserName, ShortName = ValueToKeyMappingTransformer.LoaderSignature)] - public static CommonOutputs.TransformOutput TermTransform(IHostEnvironment env, ValueToKeyMappingTransformer.Arguments input) + public static CommonOutputs.TransformOutput TermTransform(IHostEnvironment env, ValueToKeyMappingTransformer.Options input) { var h = EntryPointUtils.CheckArgsAndCreateHost(env, "TermTransform", input); var xf = ValueToKeyMappingTransformer.Create(h, input, input.Data); diff --git a/src/Microsoft.ML.Transforms/OneHotEncoding.cs b/src/Microsoft.ML.Transforms/OneHotEncoding.cs index 5771589375..bcfdf0c942 100644 --- a/src/Microsoft.ML.Transforms/OneHotEncoding.cs +++ b/src/Microsoft.ML.Transforms/OneHotEncoding.cs @@ -18,7 +18,7 @@ using Microsoft.ML.Transforms.Categorical; using Microsoft.ML.Transforms.Conversions; -[assembly: LoadableClass(OneHotEncodingTransformer.Summary, typeof(IDataTransform), typeof(OneHotEncodingTransformer), typeof(OneHotEncodingTransformer.Arguments), typeof(SignatureDataTransform), +[assembly: LoadableClass(OneHotEncodingTransformer.Summary, typeof(IDataTransform), typeof(OneHotEncodingTransformer), typeof(OneHotEncodingTransformer.Options), typeof(SignatureDataTransform), OneHotEncodingTransformer.UserName, 
"CategoricalTransform", "CatTransform", "Categorical", "Cat")] [assembly: LoadableClass(typeof(void), typeof(Categorical), null, typeof(SignatureEntryPointModule), "Categorical")] @@ -55,7 +55,7 @@ public enum OutputKind : byte Bin = 4, } - public sealed class Column : ValueToKeyMappingTransformer.ColumnBase + internal sealed class Column : ValueToKeyMappingTransformer.ColumnBase { [Argument(ArgumentType.AtMostOnce, HelpText = "Output kind: Bag (multi-set vector), Ind (indicator vector), Key (index), or Binary encoded indicator vector", ShortName = "kind")] public OutputKind? OutputKind; @@ -97,7 +97,7 @@ internal bool TryUnparse(StringBuilder sb) } } - public sealed class Arguments : ValueToKeyMappingTransformer.ArgumentsBase + internal sealed class Options : ValueToKeyMappingTransformer.ArgumentsBase { [Argument(ArgumentType.Multiple | ArgumentType.Required, HelpText = "New column definition(s) (optional form: name:src)", Name = "Column", ShortName = "col", SortOrder = 1)] public Column[] Columns; @@ -106,7 +106,7 @@ public sealed class Arguments : ValueToKeyMappingTransformer.ArgumentsBase ShortName = "kind", SortOrder = 102)] public OutputKind OutputKind = OneHotEncodingEstimator.Defaults.OutKind; - public Arguments() + public Options() { // Unlike in the term transform, we want the text key values for the categorical transform // to default to true. 
@@ -119,32 +119,32 @@ public Arguments() internal const string UserName = "Categorical Transform"; - internal static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input) + internal static IDataTransform Create(IHostEnvironment env, Options options, IDataView input) { Contracts.CheckValue(env, nameof(env)); var h = env.Register("Categorical"); - h.CheckValue(args, nameof(args)); + h.CheckValue(options, nameof(options)); h.CheckValue(input, nameof(input)); - h.CheckUserArg(Utils.Size(args.Columns) > 0, nameof(args.Columns)); + h.CheckUserArg(Utils.Size(options.Columns) > 0, nameof(options.Columns)); var columns = new List(); - foreach (var column in args.Columns) + foreach (var column in options.Columns) { var col = new OneHotEncodingEstimator.ColumnInfo( column.Name, column.Source ?? column.Name, - column.OutputKind ?? args.OutputKind, - column.MaxNumTerms ?? args.MaxNumTerms, - column.Sort ?? args.Sort, - column.Terms ?? args.Terms); - col.SetTerms(column.Term ?? args.Term); + column.OutputKind ?? options.OutputKind, + column.MaxNumTerms ?? options.MaxNumTerms, + column.Sort ?? options.Sort, + column.Terms ?? options.Terms); + col.SetTerms(column.Term ?? options.Term); columns.Add(col); } IDataView keyData = null; - if (!string.IsNullOrEmpty(args.DataFile)) + if (!string.IsNullOrEmpty(options.DataFile)) { using (var ch = h.Start("Load term data")) - keyData = ValueToKeyMappingTransformer.GetKeyDataViewOrNull(env, ch, args.DataFile, args.TermsColumn, args.Loader, out bool autoLoaded); + keyData = ValueToKeyMappingTransformer.GetKeyDataViewOrNull(env, ch, options.DataFile, options.TermsColumn, options.Loader, out bool autoLoaded); h.AssertValue(keyData); } var transformed = new OneHotEncodingEstimator(env, columns.ToArray(), keyData).Fit(input).Transform(input); @@ -185,7 +185,7 @@ internal static class Defaults /// /// Describes how the transformer handles one column pair. 
/// - public class ColumnInfo : ValueToKeyMappingTransformer.ColumnInfo + public class ColumnInfo : ValueToKeyMappingEstimator.ColumnInfo { public readonly OneHotEncodingTransformer.OutputKind OutputKind; /// @@ -195,12 +195,12 @@ public class ColumnInfo : ValueToKeyMappingTransformer.ColumnInfo /// Name of the column to transform. If set to , the value of the will be used as source. /// Output kind: Bag (multi-set vector), Ind (indicator vector), Key (index), or Binary encoded indicator vector. /// Maximum number of terms to keep per column when auto-training. - /// How items should be ordered when vectorized. If <see cref="ValueToKeyMappingTransformer.SortOrder.Occurrence"/> choosen they will be in the order encountered. - /// If <see cref="ValueToKeyMappingTransformer.SortOrder.Value"/>, items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a'). + /// How items should be ordered when vectorized. If <see cref="ValueToKeyMappingEstimator.SortOrder.Occurrence"/> is chosen, they will be in the order encountered. + /// If <see cref="ValueToKeyMappingEstimator.SortOrder.Value"/>, items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a'). /// List of terms. public ColumnInfo(string name, string inputColumnName = null, OneHotEncodingTransformer.OutputKind outputKind = Defaults.OutKind, - int maxNumTerms = ValueToKeyMappingEstimator.Defaults.MaxNumTerms, ValueToKeyMappingTransformer.SortOrder sort = ValueToKeyMappingEstimator.Defaults.Sort, + int maxNumTerms = ValueToKeyMappingEstimator.Defaults.MaxNumKeys, ValueToKeyMappingEstimator.SortOrder sort = ValueToKeyMappingEstimator.Defaults.Sort, string[] term = null) : base(name, inputColumnName ?? name, maxNumTerms, sort, term, true) { @@ -223,13 +223,13 @@ internal void SetTerms(string terms) /// Name of the column resulting from the transformation of . /// Name of the column to transform. If set to , the value of the will be used as source. /// The type of output expected. 
- public OneHotEncodingEstimator(IHostEnvironment env, string outputColumnName, string inputColumnName = null, + internal OneHotEncodingEstimator(IHostEnvironment env, string outputColumnName, string inputColumnName = null, OneHotEncodingTransformer.OutputKind outputKind = Defaults.OutKind) : this(env, new[] { new ColumnInfo(outputColumnName, inputColumnName ?? outputColumnName, outputKind) }) { } - public OneHotEncodingEstimator(IHostEnvironment env, ColumnInfo[] columns, IDataView keyData = null) + internal OneHotEncodingEstimator(IHostEnvironment env, ColumnInfo[] columns, IDataView keyData = null) { Contracts.CheckValue(env, nameof(env)); _host = env.Register(nameof(OneHotEncodingEstimator)); @@ -247,13 +247,13 @@ public OneHotEncodingEstimator(IHostEnvironment env, ColumnInfo[] columns, IData case OneHotEncodingTransformer.OutputKind.Key: continue; case OneHotEncodingTransformer.OutputKind.Bin: - binaryCols.Add((column.Name, column.Name)); + binaryCols.Add((column.OutputColumnName, column.OutputColumnName)); break; case OneHotEncodingTransformer.OutputKind.Ind: - cols.Add((column.Name, column.Name, false)); + cols.Add((column.OutputColumnName, column.OutputColumnName, false)); break; case OneHotEncodingTransformer.OutputKind.Bag: - cols.Add((column.Name, column.Name, true)); + cols.Add((column.OutputColumnName, column.OutputColumnName, true)); break; } } @@ -275,6 +275,10 @@ public OneHotEncodingEstimator(IHostEnvironment env, ColumnInfo[] columns, IData } } + /// + /// Returns the of the schema which will be produced by the transformer. + /// Used for schema propagation and verification in a pipeline. + /// public SchemaShape GetOutputSchema(SchemaShape inputSchema) { if (_toSomething != null) @@ -283,6 +287,9 @@ public SchemaShape GetOutputSchema(SchemaShape inputSchema) return _term.GetOutputSchema(inputSchema); } + /// + /// Trains and returns a . 
+ /// public OneHotEncodingTransformer Fit(IDataView input) => new OneHotEncodingTransformer(_term, _toSomething, input); [BestFriend] @@ -297,7 +304,7 @@ internal static class Categorical [TlcModule.EntryPoint(Name = "Transforms.CategoricalOneHotVectorizer", Desc = OneHotEncodingTransformer.Summary, UserName = OneHotEncodingTransformer.UserName)] - public static CommonOutputs.TransformOutput CatTransformDict(IHostEnvironment env, OneHotEncodingTransformer.Arguments input) + public static CommonOutputs.TransformOutput CatTransformDict(IHostEnvironment env, OneHotEncodingTransformer.Options input) { Contracts.CheckValue(env, nameof(env)); var host = env.Register("CatTransformDict"); @@ -325,7 +332,7 @@ public static CommonOutputs.TransformOutput CatTransformHash(IHostEnvironment en [TlcModule.EntryPoint(Name = "Transforms.TextToKeyConverter", Desc = ValueToKeyMappingTransformer.Summary, UserName = ValueToKeyMappingTransformer.FriendlyName)] - public static CommonOutputs.TransformOutput TextToKey(IHostEnvironment env, ValueToKeyMappingTransformer.Arguments input) + public static CommonOutputs.TransformOutput TextToKey(IHostEnvironment env, ValueToKeyMappingTransformer.Options input) { Contracts.CheckValue(env, nameof(env)); var host = env.Register("Term"); @@ -339,7 +346,7 @@ public static CommonOutputs.TransformOutput TextToKey(IHostEnvironment env, Valu [TlcModule.EntryPoint(Name = "Transforms.KeyToTextConverter", Desc = "KeyToValueTransform utilizes KeyValues metadata to map key indices to the corresponding values in the KeyValues metadata.", UserName = KeyToValueMappingTransformer.UserName)] - public static CommonOutputs.TransformOutput KeyToText(IHostEnvironment env, KeyToValueMappingTransformer.Arguments input) + public static CommonOutputs.TransformOutput KeyToText(IHostEnvironment env, KeyToValueMappingTransformer.Options input) { Contracts.CheckValue(env, nameof(env)); var host = env.Register("KeyToValue"); diff --git 
a/src/Microsoft.ML.Transforms/Text/WordBagTransform.cs b/src/Microsoft.ML.Transforms/Text/WordBagTransform.cs index a3924dbf58..b90aa03371 100644 --- a/src/Microsoft.ML.Transforms/Text/WordBagTransform.cs +++ b/src/Microsoft.ML.Transforms/Text/WordBagTransform.cs @@ -219,7 +219,7 @@ internal bool TryUnparse(StringBuilder sb) } /// - /// This class is a merger of and + /// This class is a merger of and /// , with the allLength option removed. /// public abstract class ArgumentsBase @@ -300,12 +300,12 @@ public static IDataTransform Create(IHostEnvironment env, Arguments args, IDataV // of args.column are not text nor keys). if (termCols.Count > 0) { - ValueToKeyMappingTransformer.Arguments termArgs = null; + ValueToKeyMappingTransformer.Options termArgs = null; string[] missingDropColumns = null; if (termLoaderArgs != null) { termArgs = - new ValueToKeyMappingTransformer.Arguments() + new ValueToKeyMappingTransformer.Options() { MaxNumTerms = int.MaxValue, Term = termLoaderArgs.Term, @@ -322,7 +322,7 @@ public static IDataTransform Create(IHostEnvironment env, Arguments args, IDataV else { termArgs = - new ValueToKeyMappingTransformer.Arguments() + new ValueToKeyMappingTransformer.Options() { MaxNumTerms = Utils.Size(args.MaxNumTerms) > 0 ? args.MaxNumTerms[0] : NgramExtractingEstimator.Defaults.MaxNumTerms, Columns = new ValueToKeyMappingTransformer.Column[termCols.Count] @@ -432,7 +432,7 @@ public sealed class TermLoaderArguments [Argument(ArgumentType.AtMostOnce, HelpText = "How items should be ordered when vectorized. By default, they will be in the order encountered. 
" + "If by value, items are sorted according to their default comparison, for example, text sorting will be case sensitive (for example, 'A' then 'Z' then 'a').", SortOrder = 5)] - public ValueToKeyMappingTransformer.SortOrder Sort = ValueToKeyMappingTransformer.SortOrder.Occurrence; + public ValueToKeyMappingEstimator.SortOrder Sort = ValueToKeyMappingEstimator.SortOrder.Occurrence; [Argument(ArgumentType.AtMostOnce, HelpText = "Drop unknown terms instead of mapping them to NA term.", ShortName = "dropna", SortOrder = 6)] public bool DropUnknowns = false; diff --git a/src/Microsoft.ML.Transforms/Text/WordHashBagProducingTransform.cs b/src/Microsoft.ML.Transforms/Text/WordHashBagProducingTransform.cs index 27e627b900..49aae725d4 100644 --- a/src/Microsoft.ML.Transforms/Text/WordHashBagProducingTransform.cs +++ b/src/Microsoft.ML.Transforms/Text/WordHashBagProducingTransform.cs @@ -380,7 +380,7 @@ public static IDataTransform Create(IHostEnvironment env, Arguments args, IDataV { h.Assert(Utils.Size(termCols) == hashColumns.Count); var termArgs = - new ValueToKeyMappingTransformer.Arguments() + new ValueToKeyMappingTransformer.Options() { MaxNumTerms = int.MaxValue, Term = termLoaderArgs.Term, diff --git a/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv b/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv index 9df43c0e84..842e73dc7c 100644 --- a/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv +++ b/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv @@ -74,7 +74,7 @@ Transforms.ApproximateBootstrapSampler Approximate bootstrap sampling. Microsoft Transforms.BinaryPredictionScoreColumnsRenamer For binary prediction, it renames the PredictedLabel and Score columns to include the name of the positive class. 
Microsoft.ML.EntryPoints.ScoreModel RenameBinaryPredictionScoreColumns Microsoft.ML.EntryPoints.ScoreModel+RenameBinaryPredictionScoreColumnsInput Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput Transforms.BinNormalizer The values are assigned into equidensity bins and a value is mapped to its bin_number/number_of_bins. Microsoft.ML.Data.Normalize Bin Microsoft.ML.Transforms.Normalizers.NormalizeTransform+BinArguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput Transforms.CategoricalHashOneHotVectorizer Converts the categorical value into an indicator array by hashing the value and using the hash as an index in the bag. If the input column is a vector, a single indicator bag is returned for it. Microsoft.ML.Transforms.Categorical.Categorical CatTransformHash Microsoft.ML.Transforms.Categorical.OneHotHashEncoding+Arguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput -Transforms.CategoricalOneHotVectorizer Converts the categorical value into an indicator array by building a dictionary of categories based on the data and using the id in the dictionary as the index in the array. Microsoft.ML.Transforms.Categorical.Categorical CatTransformDict Microsoft.ML.Transforms.Categorical.OneHotEncodingTransformer+Arguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput +Transforms.CategoricalOneHotVectorizer Converts the categorical value into an indicator array by building a dictionary of categories based on the data and using the id in the dictionary as the index in the array. Microsoft.ML.Transforms.Categorical.Categorical CatTransformDict Microsoft.ML.Transforms.Categorical.OneHotEncodingTransformer+Options Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput Transforms.CharacterTokenizer Character-oriented tokenizer where text is considered a sequence of characters. 
Microsoft.ML.Transforms.Text.TextAnalytics CharTokenize Microsoft.ML.Transforms.Text.TokenizingByCharactersTransformer+Arguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput Transforms.ColumnConcatenator Concatenates one or more columns of the same item type. Microsoft.ML.EntryPoints.SchemaManipulation ConcatColumns Microsoft.ML.Data.ColumnConcatenatingTransformer+Arguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput Transforms.ColumnCopier Duplicates columns from the dataset Microsoft.ML.EntryPoints.SchemaManipulation CopyColumns Microsoft.ML.Transforms.ColumnCopyingTransformer+Arguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput @@ -85,7 +85,7 @@ Transforms.ConditionalNormalizer Normalize the columns only if needed Microsoft. Transforms.DataCache Caches using the specified cache option. Microsoft.ML.EntryPoints.Cache CacheData Microsoft.ML.EntryPoints.Cache+CacheInput Microsoft.ML.EntryPoints.Cache+CacheOutput Transforms.DatasetScorer Score a dataset with a predictor model Microsoft.ML.EntryPoints.ScoreModel Score Microsoft.ML.EntryPoints.ScoreModel+Input Microsoft.ML.EntryPoints.ScoreModel+Output Transforms.DatasetTransformScorer Score a dataset with a transform model Microsoft.ML.EntryPoints.ScoreModel ScoreUsingTransform Microsoft.ML.EntryPoints.ScoreModel+InputTransformScorer Microsoft.ML.EntryPoints.ScoreModel+Output -Transforms.Dictionarizer Converts input values (words, numbers, etc.) to index in a dictionary. Microsoft.ML.Transforms.Text.TextAnalytics TermTransform Microsoft.ML.Transforms.Conversions.ValueToKeyMappingTransformer+Arguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput +Transforms.Dictionarizer Converts input values (words, numbers, etc.) to index in a dictionary. 
Microsoft.ML.Transforms.Text.TextAnalytics TermTransform Microsoft.ML.Transforms.Conversions.ValueToKeyMappingTransformer+Options Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput Transforms.FeatureCombiner Combines all the features into one feature column. Microsoft.ML.EntryPoints.FeatureCombiner PrepareFeatures Microsoft.ML.EntryPoints.FeatureCombiner+FeatureCombinerInput Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput Transforms.FeatureContributionCalculationTransformer For each data point, calculates the contribution of individual features to the model prediction. Microsoft.ML.Data.FeatureContributionEntryPoint FeatureContributionCalculation Microsoft.ML.Data.FeatureContributionCalculatingTransformer+Arguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput Transforms.FeatureSelectorByCount Selects the slots for which the count of non-default values is greater than or equal to a threshold. Microsoft.ML.Transforms.SelectFeatures CountSelect Microsoft.ML.Transforms.FeatureSelection.CountFeatureSelectingEstimator+Arguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput @@ -96,7 +96,7 @@ Transforms.ImageGrayscale Convert image into grayscale. Microsoft.ML.ImageAnalyt Transforms.ImageLoader Load images from files. Microsoft.ML.ImageAnalytics.EntryPoints.ImageAnalytics ImageLoader Microsoft.ML.ImageAnalytics.ImageLoaderTransformer+Arguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput Transforms.ImagePixelExtractor Extract color plane(s) from an image. Options include scaling, offset and conversion to floating point. Microsoft.ML.ImageAnalytics.EntryPoints.ImageAnalytics ImagePixelExtractor Microsoft.ML.ImageAnalytics.ImagePixelExtractorTransformer+Arguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput Transforms.ImageResizer Scales an image to specified dimensions using one of the three scale types: isotropic with padding, isotropic with cropping or anisotropic. 
In case of isotropic padding, transparent color is used to pad resulting image. Microsoft.ML.ImageAnalytics.EntryPoints.ImageAnalytics ImageResizer Microsoft.ML.ImageAnalytics.ImageResizerTransformer+Arguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput -Transforms.KeyToTextConverter KeyToValueTransform utilizes KeyValues metadata to map key indices to the corresponding values in the KeyValues metadata. Microsoft.ML.Transforms.Categorical.Categorical KeyToText Microsoft.ML.Transforms.Conversions.KeyToValueMappingTransformer+Arguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput +Transforms.KeyToTextConverter KeyToValueTransform utilizes KeyValues metadata to map key indices to the corresponding values in the KeyValues metadata. Microsoft.ML.Transforms.Categorical.Categorical KeyToText Microsoft.ML.Transforms.Conversions.KeyToValueMappingTransformer+Options Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput Transforms.LabelColumnKeyBooleanConverter Transforms the label to either key or bool (if needed) to make it suitable for classification. Microsoft.ML.EntryPoints.FeatureCombiner PrepareClassificationLabel Microsoft.ML.EntryPoints.FeatureCombiner+ClassificationLabelInput Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput Transforms.LabelIndicator Label remapper used by OVA Microsoft.ML.Transforms.LabelIndicatorTransform LabelIndicator Microsoft.ML.Transforms.LabelIndicatorTransform+Arguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput Transforms.LabelToFloatConverter Transforms the label to float to make it suitable for regression. 
Microsoft.ML.EntryPoints.FeatureCombiner PrepareRegressionLabel Microsoft.ML.EntryPoints.FeatureCombiner+RegressionLabelInput Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput @@ -128,7 +128,7 @@ Transforms.Segregator Un-groups vector columns into sequences of rows, inverse o Transforms.SentimentAnalyzer Uses a pretrained sentiment model to score input strings Microsoft.ML.Transforms.Text.TextAnalytics AnalyzeSentiment Microsoft.ML.Transforms.Text.SentimentAnalyzingTransformer+Arguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput Transforms.TensorFlowScorer Transforms the data using the TensorFlow model. Microsoft.ML.Transforms.TensorFlowTransformer TensorFlowScorer Microsoft.ML.Transforms.TensorFlowTransformer+Arguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput Transforms.TextFeaturizer A transform that turns a collection of text documents into numerical feature vectors. The feature vectors are normalized counts of (word and/or character) ngrams in a given tokenized text. Microsoft.ML.Transforms.Text.TextAnalytics TextTransform Microsoft.ML.Transforms.Text.TextFeaturizingEstimator+Arguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput -Transforms.TextToKeyConverter Converts input values (words, numbers, etc.) to index in a dictionary. Microsoft.ML.Transforms.Categorical.Categorical TextToKey Microsoft.ML.Transforms.Conversions.ValueToKeyMappingTransformer+Arguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput +Transforms.TextToKeyConverter Converts input values (words, numbers, etc.) to index in a dictionary. 
Microsoft.ML.Transforms.Categorical.Categorical TextToKey Microsoft.ML.Transforms.Conversions.ValueToKeyMappingTransformer+Options Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput Transforms.TrainTestDatasetSplitter Split the dataset into train and test sets Microsoft.ML.EntryPoints.TrainTestSplit Split Microsoft.ML.EntryPoints.TrainTestSplit+Input Microsoft.ML.EntryPoints.TrainTestSplit+Output Transforms.TreeLeafFeaturizer Trains a tree ensemble, or loads it from a file, then maps a numeric feature vector to three outputs: 1. A vector containing the individual tree outputs of the tree ensemble. 2. A vector indicating the leaves that the feature vector falls on in the tree ensemble. 3. A vector indicating the paths that the feature vector falls on in the tree ensemble. If a both a model file and a trainer are specified - will use the model file. If neither are specified, will train a default FastTree model. This can handle key labels by training a regression model towards their optionally permuted indices. Microsoft.ML.Data.TreeFeaturize Featurizer Microsoft.ML.Data.TreeEnsembleFeaturizerTransform+ArgumentsForEntryPoint Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput Transforms.TwoHeterogeneousModelCombiner Combines a TransformModel and a PredictorModel into a single PredictorModel. 
Microsoft.ML.EntryPoints.ModelOperations CombineTwoModels Microsoft.ML.EntryPoints.ModelOperations+SimplePredictorModelInput Microsoft.ML.EntryPoints.ModelOperations+PredictorModelOutput diff --git a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs index 6b33ab0951..a75ac92051 100644 --- a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs +++ b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs @@ -751,7 +751,7 @@ public void EntryPointPipelineEnsemble() }).Fit(data).Transform(data); data = new ColumnConcatenatingTransformer(Env, "Features", new[] { "Features1", "Features2" }).Transform(data); - data = new ValueToKeyMappingEstimator(Env, "Label", "Label", sort: ValueToKeyMappingTransformer.SortOrder.Value).Fit(data).Transform(data); + data = new ValueToKeyMappingEstimator(Env, "Label", "Label", sort: ValueToKeyMappingEstimator.SortOrder.Value).Fit(data).Transform(data); var lrInput = new LogisticRegression.Options { @@ -3541,7 +3541,7 @@ public void EntryPointTreeLeafFeaturizer() #pragma warning disable 0618 var dataView = EntryPoints.ImportTextData.ImportText(Env, new EntryPoints.ImportTextData.Input { InputFile = inputFile }).Data; #pragma warning restore 0618 - var cat = Categorical.CatTransformDict(Env, new OneHotEncodingTransformer.Arguments() + var cat = Categorical.CatTransformDict(Env, new OneHotEncodingTransformer.Options() { Data = dataView, Columns = new[] { new OneHotEncodingTransformer.Column { Name = "Categories", Source = "Categories" } } diff --git a/test/Microsoft.ML.Predictor.Tests/TestPredictors.cs b/test/Microsoft.ML.Predictor.Tests/TestPredictors.cs index e1666d8d77..339bede3e5 100644 --- a/test/Microsoft.ML.Predictor.Tests/TestPredictors.cs +++ b/test/Microsoft.ML.Predictor.Tests/TestPredictors.cs @@ -631,7 +631,7 @@ public void TestTreeEnsembleCombinerWithCategoricalSplits() var dataPath = GetDataPath("adult.tiny.with-schema.txt"); var dataView = 
ML.Data.ReadFromTextFile(dataPath); - var cat = new OneHotEncodingEstimator(ML, "Features", "Categories").Fit(dataView).Transform(dataView); + var cat = ML.Transforms.Categorical.OneHotEncoding("Features", "Categories").Fit(dataView).Transform(dataView); var fastTrees = new PredictorModel[3]; for (int i = 0; i < 3; i++) { diff --git a/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs b/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs index bb46984a38..630c081f28 100644 --- a/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs +++ b/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs @@ -449,7 +449,7 @@ public void TensorFlowTransformMNISTLRTrainingTest() ReTrain = true })) .Append(mlContext.Transforms.Concatenate("Features", "Prediction")) - .Append(mlContext.Transforms.Conversion.MapValueToKey("KeyLabel","Label", maxNumTerms: 10)) + .Append(mlContext.Transforms.Conversion.MapValueToKey("KeyLabel","Label", maxNumKeys: 10)) .Append(mlContext.MulticlassClassification.Trainers.LightGbm("KeyLabel", "Features")); var trainedModel = pipe.Fit(trainData); diff --git a/test/Microsoft.ML.Tests/TermEstimatorTests.cs b/test/Microsoft.ML.Tests/TermEstimatorTests.cs index d0a806a60f..90e03dabd8 100644 --- a/test/Microsoft.ML.Tests/TermEstimatorTests.cs +++ b/test/Microsoft.ML.Tests/TermEstimatorTests.cs @@ -71,13 +71,13 @@ void TestDifferentTypes() }, new MultiFileSource(dataPath)); var pipe = new ValueToKeyMappingEstimator(Env, new[]{ - new ValueToKeyMappingTransformer.ColumnInfo("TermFloat1", "float1"), - new ValueToKeyMappingTransformer.ColumnInfo("TermFloat4", "float4"), - new ValueToKeyMappingTransformer.ColumnInfo("TermDouble1", "double1"), - new ValueToKeyMappingTransformer.ColumnInfo("TermDouble4", "double4"), - new ValueToKeyMappingTransformer.ColumnInfo("TermInt1", "int1"), - new ValueToKeyMappingTransformer.ColumnInfo("TermText1", "text1"), - new 
ValueToKeyMappingTransformer.ColumnInfo("TermText2", "text2") + new ValueToKeyMappingEstimator.ColumnInfo("TermFloat1", "float1"), + new ValueToKeyMappingEstimator.ColumnInfo("TermFloat4", "float4"), + new ValueToKeyMappingEstimator.ColumnInfo("TermDouble1", "double1"), + new ValueToKeyMappingEstimator.ColumnInfo("TermDouble4", "double4"), + new ValueToKeyMappingEstimator.ColumnInfo("TermInt1", "int1"), + new ValueToKeyMappingEstimator.ColumnInfo("TermText1", "text1"), + new ValueToKeyMappingEstimator.ColumnInfo("TermText2", "text2") }); var data = loader.Read(dataPath); data = TakeFilter.Create(Env, data, 10); @@ -102,9 +102,9 @@ void TestSimpleCase() var stringData = new[] { new TestClassDifferentTypes { A = "1", B = "c", C = "b" } }; var dataView = ML.Data.ReadFromEnumerable(data); var pipe = new ValueToKeyMappingEstimator(Env, new[]{ - new ValueToKeyMappingTransformer.ColumnInfo("TermA", "A"), - new ValueToKeyMappingTransformer.ColumnInfo("TermB", "B"), - new ValueToKeyMappingTransformer.ColumnInfo("TermC", "C") + new ValueToKeyMappingEstimator.ColumnInfo("TermA", "A"), + new ValueToKeyMappingEstimator.ColumnInfo("TermB", "B"), + new ValueToKeyMappingEstimator.ColumnInfo("TermC", "C") }); var invalidData = ML.Data.ReadFromEnumerable(xydata); var validFitNotValidTransformData = ML.Data.ReadFromEnumerable(stringData); @@ -117,9 +117,9 @@ void TestOldSavingAndLoading() var data = new[] { new TestClass() { A = 1, B = 2, C = 3, }, new TestClass() { A = 4, B = 5, C = 6 } }; var dataView = ML.Data.ReadFromEnumerable(data); var est = new ValueToKeyMappingEstimator(Env, new[]{ - new ValueToKeyMappingTransformer.ColumnInfo("TermA", "A"), - new ValueToKeyMappingTransformer.ColumnInfo("TermB", "B"), - new ValueToKeyMappingTransformer.ColumnInfo("TermC", "C") + new ValueToKeyMappingEstimator.ColumnInfo("TermA", "A"), + new ValueToKeyMappingEstimator.ColumnInfo("TermB", "B"), + new ValueToKeyMappingEstimator.ColumnInfo("TermC", "C") }); var transformer = est.Fit(dataView); 
var result = transformer.Transform(dataView); @@ -139,7 +139,7 @@ void TestMetadataCopy() var data = new[] { new TestMetaClass() { Term = "A", NotUsed = 1 }, new TestMetaClass() { Term = "B" }, new TestMetaClass() { Term = "C" } }; var dataView = ML.Data.ReadFromEnumerable(data); var termEst = new ValueToKeyMappingEstimator(Env, new[] { - new ValueToKeyMappingTransformer.ColumnInfo("T", "Term") }); + new ValueToKeyMappingEstimator.ColumnInfo("T", "Term") }); var termTransformer = termEst.Fit(dataView); var result = termTransformer.Transform(dataView); diff --git a/test/Microsoft.ML.Tests/TrainerEstimators/TrainerEstimators.cs b/test/Microsoft.ML.Tests/TrainerEstimators/TrainerEstimators.cs index d6ecf7853b..cc75f6cdeb 100644 --- a/test/Microsoft.ML.Tests/TrainerEstimators/TrainerEstimators.cs +++ b/test/Microsoft.ML.Tests/TrainerEstimators/TrainerEstimators.cs @@ -150,8 +150,8 @@ public void TestEstimatorMultiClassNaiveBayesTrainer() // Pipeline. var pipeline = new ValueToKeyMappingEstimator(Env, new[]{ - new ValueToKeyMappingTransformer.ColumnInfo("Group", "Workclass"), - new ValueToKeyMappingTransformer.ColumnInfo("Label0", "Label") }); + new ValueToKeyMappingEstimator.ColumnInfo("Group", "Workclass"), + new ValueToKeyMappingEstimator.ColumnInfo("Label0", "Label") }); return (pipeline, data); } diff --git a/test/Microsoft.ML.Tests/Transformers/CategoricalTests.cs b/test/Microsoft.ML.Tests/Transformers/CategoricalTests.cs index d252f3d1d9..ca09a02fc4 100644 --- a/test/Microsoft.ML.Tests/Transformers/CategoricalTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/CategoricalTests.cs @@ -59,7 +59,7 @@ public void CategoricalWorkout() var data = new[] { new TestClass() { A = 1, B = 2, C = 3, }, new TestClass() { A = 4, B = 5, C = 6 } }; var dataView = ML.Data.ReadFromEnumerable(data); - var pipe = new OneHotEncodingEstimator(Env, new[]{ + var pipe = ML.Transforms.Categorical.OneHotEncoding(new[]{ new OneHotEncodingEstimator.ColumnInfo("CatA", "A", 
OneHotEncodingTransformer.OutputKind.Bag), new OneHotEncodingEstimator.ColumnInfo("CatB", "A", OneHotEncodingTransformer.OutputKind.Bin), new OneHotEncodingEstimator.ColumnInfo("CatC", "A", OneHotEncodingTransformer.OutputKind.Ind), @@ -122,7 +122,7 @@ public void CategoricalOneHotEncodingFromSideData() var sideData = sideDataBuilder.GetDataView(); var ci = new OneHotEncodingEstimator.ColumnInfo("CatA", "A", OneHotEncodingTransformer.OutputKind.Bag); - var pipe = new OneHotEncodingEstimator(mlContext, new[] { ci }, sideData); + var pipe = mlContext.Transforms.Categorical.OneHotEncoding(new[] { ci }, sideData); var output = pipe.Fit(dataView).Transform(dataView); @@ -183,7 +183,7 @@ public void TestMetadataPropagation() var dataView = ML.Data.ReadFromEnumerable(data); - var pipe = new OneHotEncodingEstimator(Env, new[] { + var pipe = ML.Transforms.Categorical.OneHotEncoding(new[] { new OneHotEncodingEstimator.ColumnInfo("CatA", "A", OneHotEncodingTransformer.OutputKind.Bag), new OneHotEncodingEstimator.ColumnInfo("CatB", "B", OneHotEncodingTransformer.OutputKind.Bag), new OneHotEncodingEstimator.ColumnInfo("CatC", "C", OneHotEncodingTransformer.OutputKind.Bag), @@ -306,7 +306,7 @@ public void TestOldSavingAndLoading() { var data = new[] { new TestClass() { A = 1, B = 2, C = 3, }, new TestClass() { A = 4, B = 5, C = 6 } }; var dataView = ML.Data.ReadFromEnumerable(data); - var pipe = new OneHotEncodingEstimator(Env, new[]{ + var pipe = ML.Transforms.Categorical.OneHotEncoding(new[]{ new OneHotEncodingEstimator.ColumnInfo("TermA", "A"), new OneHotEncodingEstimator.ColumnInfo("TermB", "B"), new OneHotEncodingEstimator.ColumnInfo("TermC", "C") diff --git a/test/Microsoft.ML.Tests/Transformers/ConvertTests.cs b/test/Microsoft.ML.Tests/Transformers/ConvertTests.cs index ef48abfc43..29c7f8bf18 100644 --- a/test/Microsoft.ML.Tests/Transformers/ConvertTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/ConvertTests.cs @@ -163,7 +163,7 @@ public void ValueToKeyFromSideData() 
var sideData = sideDataBuilder.GetDataView(); // For some reason the column info is on the *transformer*, not the estimator. Already tracked as issue #1760. - var ci = new ValueToKeyMappingTransformer.ColumnInfo("CatA", "A"); + var ci = new ValueToKeyMappingEstimator.ColumnInfo("CatA", "A"); var pipe = mlContext.Transforms.Conversion.MapValueToKey(new[] { ci }, sideData); var output = pipe.Fit(dataView).Transform(dataView); @@ -210,7 +210,7 @@ public void TestMetadata() { var data = new[] { new MetaClass() { A = 1, B = "A" }, new MetaClass() { A = 2, B = "B" }}; - var pipe = new OneHotEncodingEstimator(Env, new[] { + var pipe = ML.Transforms.Categorical.OneHotEncoding(new[] { new OneHotEncodingEstimator.ColumnInfo("CatA", "A", OneHotEncodingTransformer.OutputKind.Ind), new OneHotEncodingEstimator.ColumnInfo("CatB", "B", OneHotEncodingTransformer.OutputKind.Key) }).Append(new TypeConvertingEstimator(Env, new[] { diff --git a/test/Microsoft.ML.Tests/Transformers/CopyColumnEstimatorTests.cs b/test/Microsoft.ML.Tests/Transformers/CopyColumnEstimatorTests.cs index bd325e4c17..5847d36148 100644 --- a/test/Microsoft.ML.Tests/Transformers/CopyColumnEstimatorTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/CopyColumnEstimatorTests.cs @@ -127,7 +127,7 @@ void TestMetadataCopy() var data = new[] { new TestMetaClass() { Term = "A", NotUsed = 1 }, new TestMetaClass() { Term = "B" }, new TestMetaClass() { Term = "C" } }; var env = new MLContext(); var dataView = env.Data.ReadFromEnumerable(data); - var term = ValueToKeyMappingTransformer.Create(env, new ValueToKeyMappingTransformer.Arguments() + var term = ValueToKeyMappingTransformer.Create(env, new ValueToKeyMappingTransformer.Options() { Columns = new[] { new ValueToKeyMappingTransformer.Column() { Source = "Term", Name = "T" } } }, dataView); diff --git a/test/Microsoft.ML.Tests/Transformers/KeyToBinaryVectorEstimatorTest.cs b/test/Microsoft.ML.Tests/Transformers/KeyToBinaryVectorEstimatorTest.cs index 
6099a1955f..9bb78cbcac 100644 --- a/test/Microsoft.ML.Tests/Transformers/KeyToBinaryVectorEstimatorTest.cs +++ b/test/Microsoft.ML.Tests/Transformers/KeyToBinaryVectorEstimatorTest.cs @@ -47,9 +47,9 @@ public void KeyToBinaryVectorWorkout() var dataView = ML.Data.ReadFromEnumerable(data); dataView = new ValueToKeyMappingEstimator(Env, new[]{ - new ValueToKeyMappingTransformer.ColumnInfo("TermA", "A"), - new ValueToKeyMappingTransformer.ColumnInfo("TermB", "B"), - new ValueToKeyMappingTransformer.ColumnInfo("TermC", "C", textKeyValues:true) + new ValueToKeyMappingEstimator.ColumnInfo("TermA", "A"), + new ValueToKeyMappingEstimator.ColumnInfo("TermB", "B"), + new ValueToKeyMappingEstimator.ColumnInfo("TermC", "C", textKeyValues:true) }).Fit(dataView).Transform(dataView); var pipe = new KeyToBinaryVectorMappingEstimator(Env, new KeyToBinaryVectorMappingTransformer.ColumnInfo("CatA", "TermA"), @@ -71,8 +71,8 @@ public void KeyToBinaryVectorStatic() // Non-pigsty Term. var dynamicData = new ValueToKeyMappingEstimator(Env, new[] { - new ValueToKeyMappingTransformer.ColumnInfo("A", "ScalarString"), - new ValueToKeyMappingTransformer.ColumnInfo("B", "VectorString") }) + new ValueToKeyMappingEstimator.ColumnInfo("A", "ScalarString"), + new ValueToKeyMappingEstimator.ColumnInfo("B", "VectorString") }) .Fit(data.AsDynamic).Transform(data.AsDynamic); var data2 = dynamicData.AssertStatic(Env, ctx => ( @@ -100,10 +100,10 @@ public void TestMetadataPropagation() var dataView = ML.Data.ReadFromEnumerable(data); var termEst = new ValueToKeyMappingEstimator(Env, new[] { - new ValueToKeyMappingTransformer.ColumnInfo("TA", "A", textKeyValues: true), - new ValueToKeyMappingTransformer.ColumnInfo("TB", "B", textKeyValues: true), - new ValueToKeyMappingTransformer.ColumnInfo("TC", "C"), - new ValueToKeyMappingTransformer.ColumnInfo("TD", "D") }); + new ValueToKeyMappingEstimator.ColumnInfo("TA", "A", textKeyValues: true), + new ValueToKeyMappingEstimator.ColumnInfo("TB", "B", 
textKeyValues: true), + new ValueToKeyMappingEstimator.ColumnInfo("TC", "C"), + new ValueToKeyMappingEstimator.ColumnInfo("TD", "D") }); var termTransformer = termEst.Fit(dataView); dataView = termTransformer.Transform(dataView); @@ -155,9 +155,9 @@ public void TestOldSavingAndLoading() var data = new[] { new TestClass() { A = 1, B = 2, C = 3, }, new TestClass() { A = 4, B = 5, C = 6 } }; var dataView = ML.Data.ReadFromEnumerable(data); var est = new ValueToKeyMappingEstimator(Env, new[]{ - new ValueToKeyMappingTransformer.ColumnInfo("TermA", "A"), - new ValueToKeyMappingTransformer.ColumnInfo("TermB", "B", textKeyValues:true), - new ValueToKeyMappingTransformer.ColumnInfo("TermC", "C") + new ValueToKeyMappingEstimator.ColumnInfo("TermA", "A"), + new ValueToKeyMappingEstimator.ColumnInfo("TermB", "B", textKeyValues:true), + new ValueToKeyMappingEstimator.ColumnInfo("TermC", "C") }); var transformer = est.Fit(dataView); dataView = transformer.Transform(dataView); diff --git a/test/Microsoft.ML.Tests/Transformers/KeyToValueTests.cs b/test/Microsoft.ML.Tests/Transformers/KeyToValueTests.cs index 3138b9d68a..7c41b2df62 100644 --- a/test/Microsoft.ML.Tests/Transformers/KeyToValueTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/KeyToValueTests.cs @@ -45,8 +45,8 @@ public void KeyToValueWorkout() var data = reader.Read(dataPath); data = new ValueToKeyMappingEstimator(Env, new[] { - new ValueToKeyMappingTransformer.ColumnInfo("A", "ScalarString"), - new ValueToKeyMappingTransformer.ColumnInfo("B", "VectorString") }).Fit(data).Transform(data); + new ValueToKeyMappingEstimator.ColumnInfo("A", "ScalarString"), + new ValueToKeyMappingEstimator.ColumnInfo("B", "VectorString") }).Fit(data).Transform(data); var badData1 = new ColumnCopyingTransformer(Env, ("A", "BareKey")).Transform(data); var badData2 = new ColumnCopyingTransformer(Env, ("B", "VectorString")).Transform(data); @@ -82,8 +82,8 @@ public void KeyToValuePigsty() // Non-pigsty Term. 
var dynamicData = new ValueToKeyMappingEstimator(Env, new[] { - new ValueToKeyMappingTransformer.ColumnInfo("A", "ScalarString"), - new ValueToKeyMappingTransformer.ColumnInfo("B", "VectorString") }) + new ValueToKeyMappingEstimator.ColumnInfo("A", "ScalarString"), + new ValueToKeyMappingEstimator.ColumnInfo("B", "VectorString") }) .Fit(data.AsDynamic).Transform(data.AsDynamic); var data2 = dynamicData.AssertStatic(Env, ctx => ( diff --git a/test/Microsoft.ML.Tests/Transformers/KeyToVectorEstimatorTests.cs b/test/Microsoft.ML.Tests/Transformers/KeyToVectorEstimatorTests.cs index 9c33ec87b7..ab9dc36206 100644 --- a/test/Microsoft.ML.Tests/Transformers/KeyToVectorEstimatorTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/KeyToVectorEstimatorTests.cs @@ -53,9 +53,9 @@ public void KeyToVectorWorkout() var dataView = ML.Data.ReadFromEnumerable(data); dataView = new ValueToKeyMappingEstimator(Env, new[]{ - new ValueToKeyMappingTransformer.ColumnInfo("TermA", "A"), - new ValueToKeyMappingTransformer.ColumnInfo("TermB", "B"), - new ValueToKeyMappingTransformer.ColumnInfo("TermC", "C", textKeyValues:true) + new ValueToKeyMappingEstimator.ColumnInfo("TermA", "A"), + new ValueToKeyMappingEstimator.ColumnInfo("TermB", "B"), + new ValueToKeyMappingEstimator.ColumnInfo("TermC", "C", textKeyValues:true) }).Fit(dataView).Transform(dataView); var pipe = new KeyToVectorMappingEstimator(Env, new KeyToVectorMappingTransformer.ColumnInfo("CatA", "TermA", false), @@ -79,8 +79,8 @@ public void KeyToVectorStatic() // Non-pigsty Term. 
var dynamicData = new ValueToKeyMappingEstimator(Env, new[] { - new ValueToKeyMappingTransformer.ColumnInfo("A", "ScalarString"), - new ValueToKeyMappingTransformer.ColumnInfo("B", "VectorString") }) + new ValueToKeyMappingEstimator.ColumnInfo("A", "ScalarString"), + new ValueToKeyMappingEstimator.ColumnInfo("B", "VectorString") }) .Fit(data.AsDynamic).Transform(data.AsDynamic); var data2 = dynamicData.AssertStatic(Env, ctx => ( @@ -110,14 +110,14 @@ public void TestMetadataPropagation() var dataView = ML.Data.ReadFromEnumerable(data); var termEst = new ValueToKeyMappingEstimator(Env, new[] { - new ValueToKeyMappingTransformer.ColumnInfo("TA", "A", textKeyValues: true), - new ValueToKeyMappingTransformer.ColumnInfo("TB", "B"), - new ValueToKeyMappingTransformer.ColumnInfo("TC", "C", textKeyValues: true), - new ValueToKeyMappingTransformer.ColumnInfo("TD", "D", textKeyValues: true), - new ValueToKeyMappingTransformer.ColumnInfo("TE", "E"), - new ValueToKeyMappingTransformer.ColumnInfo("TF", "F"), - new ValueToKeyMappingTransformer.ColumnInfo("TG", "G"), - new ValueToKeyMappingTransformer.ColumnInfo("TH", "H", textKeyValues: true) }); + new ValueToKeyMappingEstimator.ColumnInfo("TA", "A", textKeyValues: true), + new ValueToKeyMappingEstimator.ColumnInfo("TB", "B"), + new ValueToKeyMappingEstimator.ColumnInfo("TC", "C", textKeyValues: true), + new ValueToKeyMappingEstimator.ColumnInfo("TD", "D", textKeyValues: true), + new ValueToKeyMappingEstimator.ColumnInfo("TE", "E"), + new ValueToKeyMappingEstimator.ColumnInfo("TF", "F"), + new ValueToKeyMappingEstimator.ColumnInfo("TG", "G"), + new ValueToKeyMappingEstimator.ColumnInfo("TH", "H", textKeyValues: true) }); var termTransformer = termEst.Fit(dataView); dataView = termTransformer.Transform(dataView); @@ -215,9 +215,9 @@ public void TestOldSavingAndLoading() var data = new[] { new TestClass() { A = 1, B = 2, C = 3, }, new TestClass() { A = 4, B = 5, C = 6 } }; var dataView = ML.Data.ReadFromEnumerable(data); var est = 
new ValueToKeyMappingEstimator(Env, new[]{ - new ValueToKeyMappingTransformer.ColumnInfo("TermA", "A"), - new ValueToKeyMappingTransformer.ColumnInfo("TermB", "B"), - new ValueToKeyMappingTransformer.ColumnInfo("TermC", "C") + new ValueToKeyMappingEstimator.ColumnInfo("TermA", "A"), + new ValueToKeyMappingEstimator.ColumnInfo("TermB", "B"), + new ValueToKeyMappingEstimator.ColumnInfo("TermC", "C") }); var transformer = est.Fit(dataView); dataView = transformer.Transform(dataView); diff --git a/test/Microsoft.ML.Tests/Transformers/NAIndicatorTests.cs b/test/Microsoft.ML.Tests/Transformers/NAIndicatorTests.cs index e30b43eff6..1642c5a6ad 100644 --- a/test/Microsoft.ML.Tests/Transformers/NAIndicatorTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/NAIndicatorTests.cs @@ -125,7 +125,7 @@ public void NAIndicatorMetadataTest() }; var dataView = ML.Data.ReadFromEnumerable(data); - var pipe = new OneHotEncodingEstimator(Env, "CatA", "A"); + var pipe = ML.Transforms.Categorical.OneHotEncoding("CatA", "A"); var newpipe = pipe.Append(new MissingValueIndicatorEstimator(Env, new (string name, string source)[] { ("NAA", "CatA") })); var result = newpipe.Fit(dataView).Transform(dataView); Assert.True(result.Schema.TryGetColumnIndex("NAA", out var col));