diff --git a/src/Microsoft.ML.Data/EntryPoints/SchemaManipulation.cs b/src/Microsoft.ML.Data/EntryPoints/SchemaManipulation.cs index 63409f78d7..6296c2aa55 100644 --- a/src/Microsoft.ML.Data/EntryPoints/SchemaManipulation.cs +++ b/src/Microsoft.ML.Data/EntryPoints/SchemaManipulation.cs @@ -38,7 +38,7 @@ public static CommonOutputs.TransformOutput SelectColumns(IHostEnvironment env, } [TlcModule.EntryPoint(Name = "Transforms.ColumnCopier", Desc = "Duplicates columns from the dataset", UserName = ColumnCopyingTransformer.UserName, ShortName = ColumnCopyingTransformer.ShortName)] - public static CommonOutputs.TransformOutput CopyColumns(IHostEnvironment env, ColumnCopyingTransformer.Arguments input) + public static CommonOutputs.TransformOutput CopyColumns(IHostEnvironment env, ColumnCopyingTransformer.Options input) { Contracts.CheckValue(env, nameof(env)); var host = env.Register("CopyColumns"); diff --git a/src/Microsoft.ML.Data/Properties/AssemblyInfo.cs b/src/Microsoft.ML.Data/Properties/AssemblyInfo.cs index 235c709af1..40729e6894 100644 --- a/src/Microsoft.ML.Data/Properties/AssemblyInfo.cs +++ b/src/Microsoft.ML.Data/Properties/AssemblyInfo.cs @@ -39,6 +39,10 @@ [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.TensorFlow" + PublicKey.Value)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.TimeSeries" + PublicKey.Value)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Transforms" + PublicKey.Value)] +[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.DnnImageFeaturizer.AlexNet" + PublicKey.Value)] +[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.DnnImageFeaturizer.ResNet101" + PublicKey.Value)] +[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.DnnImageFeaturizer.ResNet18" + PublicKey.Value)] +[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.DnnImageFeaturizer.ResNet50" + PublicKey.Value)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.StaticPipe" + PublicKey.Value)] diff --git 
a/src/Microsoft.ML.Data/TrainCatalog.cs b/src/Microsoft.ML.Data/TrainCatalog.cs index ee8e4ac548..f10d5dda6e 100644 --- a/src/Microsoft.ML.Data/TrainCatalog.cs +++ b/src/Microsoft.ML.Data/TrainCatalog.cs @@ -152,11 +152,11 @@ private void EnsureStratificationColumn(ref IDataView data, ref string stratific // Generate a new column with the hashed stratification column. while (data.Schema.TryGetColumnIndex(stratificationColumn, out tmp)) stratificationColumn = string.Format("{0}_{1:000}", origStratCol, ++inc); - HashingTransformer.ColumnInfo columnInfo; + HashingEstimator.ColumnInfo columnInfo; if (seed.HasValue) - columnInfo = new HashingTransformer.ColumnInfo(stratificationColumn, origStratCol, 30, seed.Value); + columnInfo = new HashingEstimator.ColumnInfo(stratificationColumn, origStratCol, 30, seed.Value); else - columnInfo = new HashingTransformer.ColumnInfo(stratificationColumn, origStratCol, 30); + columnInfo = new HashingEstimator.ColumnInfo(stratificationColumn, origStratCol, 30); data = new HashingEstimator(Host, columnInfo).Fit(data).Transform(data); } } diff --git a/src/Microsoft.ML.Data/Transforms/ColumnCopying.cs b/src/Microsoft.ML.Data/Transforms/ColumnCopying.cs index a880496d60..7dfdf17d23 100644 --- a/src/Microsoft.ML.Data/Transforms/ColumnCopying.cs +++ b/src/Microsoft.ML.Data/Transforms/ColumnCopying.cs @@ -18,7 +18,7 @@ using Microsoft.ML.Transforms; [assembly: LoadableClass(ColumnCopyingTransformer.Summary, typeof(IDataTransform), typeof(ColumnCopyingTransformer), - typeof(ColumnCopyingTransformer.Arguments), typeof(SignatureDataTransform), + typeof(ColumnCopyingTransformer.Options), typeof(SignatureDataTransform), ColumnCopyingTransformer.UserName, "CopyColumns", "CopyColumnsTransform", ColumnCopyingTransformer.ShortName, DocName = "transform/CopyColumnsTransformer.md")] @@ -33,18 +33,27 @@ namespace Microsoft.ML.Transforms { + /// <summary> + /// <see cref="ColumnCopyingEstimator"/> copies the input column to another column named as specified 
in the parameters of the transformation. + /// </summary> public sealed class ColumnCopyingEstimator : TrivialEstimator<ColumnCopyingTransformer> { - public ColumnCopyingEstimator(IHostEnvironment env, string outputColumnName, string inputColumnName) : + [BestFriend] + internal ColumnCopyingEstimator(IHostEnvironment env, string outputColumnName, string inputColumnName) : this(env, (outputColumnName, inputColumnName)) { } - public ColumnCopyingEstimator(IHostEnvironment env, params (string outputColumnName, string inputColumnName)[] columns) + [BestFriend] + internal ColumnCopyingEstimator(IHostEnvironment env, params (string outputColumnName, string inputColumnName)[] columns) : base(Contracts.CheckRef(env, nameof(env)).Register(nameof(ColumnCopyingEstimator)), new ColumnCopyingTransformer(env, columns)) { } + /// <summary> + /// Returns the <see cref="SchemaShape"/> of the schema which will be produced by the transformer. + /// Used for schema propagation and verification in a pipeline. + /// </summary> public override SchemaShape GetOutputSchema(SchemaShape inputSchema) { Host.CheckValue(inputSchema, nameof(inputSchema)); @@ -69,6 +78,9 @@ public sealed class ColumnCopyingTransformer : OneToOneTransformerBase internal const string UserName = "Copy Columns Transform"; internal const string ShortName = "Copy"; + /// <summary> + /// Names of output and input column pairs on which the transformation is applied. 
+ /// </summary> public IReadOnlyCollection<(string outputColumnName, string inputColumnName)> Columns => ColumnPairs.AsReadOnly(); private static VersionInfo GetVersionInfo() @@ -82,12 +94,12 @@ private static VersionInfo GetVersionInfo() loaderAssemblyName: typeof(ColumnCopyingTransformer).Assembly.FullName); } - public ColumnCopyingTransformer(IHostEnvironment env, params (string outputColumnName, string inputColumnName)[] columns) + internal ColumnCopyingTransformer(IHostEnvironment env, params (string outputColumnName, string inputColumnName)[] columns) : base(Contracts.CheckRef(env, nameof(env)).Register(nameof(ColumnCopyingTransformer)), columns) { } - public sealed class Column : OneToOneColumn + internal sealed class Column : OneToOneColumn { internal static Column Parse(string str) { @@ -106,7 +118,7 @@ internal bool TryUnparse(StringBuilder sb) } } - public sealed class Arguments : TransformInputBase + internal sealed class Options : TransformInputBase { [Argument(ArgumentType.Multiple | ArgumentType.Required, HelpText = "New column definition(s) (optional form: name:src)", Name = "Column", ShortName = "col", SortOrder = 1)] @@ -114,12 +126,12 @@ public sealed class Arguments : TransformInputBase } // Factory method corresponding to SignatureDataTransform. 
- internal static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input) + internal static IDataTransform Create(IHostEnvironment env, Options options, IDataView input) { Contracts.CheckValue(env, nameof(env)); - env.CheckValue(args, nameof(args)); + env.CheckValue(options, nameof(options)); - var transformer = new ColumnCopyingTransformer(env, args.Columns.Select(x => (x.Name, x.Source)).ToArray()); + var transformer = new ColumnCopyingTransformer(env, options.Columns.Select(x => (x.Name, x.Source)).ToArray()); return transformer.MakeDataTransform(input); } diff --git a/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs b/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs index 3261638c5e..6afc031fef 100644 --- a/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs +++ b/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs @@ -36,7 +36,7 @@ public static HashingEstimator Hash(this TransformsCatalog.ConversionTransforms /// </summary> /// <param name="catalog">The transform's catalog.</param> /// <param name="columns">Description of dataset columns and how to process them.</param> - public static HashingEstimator Hash(this TransformsCatalog.ConversionTransforms catalog, params HashingTransformer.ColumnInfo[] columns) + public static HashingEstimator Hash(this TransformsCatalog.ConversionTransforms catalog, params HashingEstimator.ColumnInfo[] columns) => new HashingEstimator(CatalogUtils.GetEnvironment(catalog), columns); /// <summary> @@ -93,7 +93,7 @@ public static KeyToValueMappingEstimator MapKeyToValue(this TransformsCatalog.Co /// <param name="catalog">The categorical transform's catalog.</param> /// <param name="columns">The input column to map back to vectors.</param> public static KeyToVectorMappingEstimator MapKeyToVector(this TransformsCatalog.ConversionTransforms catalog, - params KeyToVectorMappingTransformer.ColumnInfo[] columns) + params KeyToVectorMappingEstimator.ColumnInfo[] 
columns) => new KeyToVectorMappingEstimator(CatalogUtils.GetEnvironment(catalog), columns); /// <summary> diff --git a/src/Microsoft.ML.Data/Transforms/Hashing.cs b/src/Microsoft.ML.Data/Transforms/Hashing.cs index be3d937c66..f98f4baa47 100644 --- a/src/Microsoft.ML.Data/Transforms/Hashing.cs +++ b/src/Microsoft.ML.Data/Transforms/Hashing.cs @@ -16,7 +16,7 @@ using Microsoft.ML.Model; using Microsoft.ML.Transforms.Conversions; -[assembly: LoadableClass(HashingTransformer.Summary, typeof(IDataTransform), typeof(HashingTransformer), typeof(HashingTransformer.Arguments), typeof(SignatureDataTransform), +[assembly: LoadableClass(HashingTransformer.Summary, typeof(IDataTransform), typeof(HashingTransformer), typeof(HashingTransformer.Options), typeof(SignatureDataTransform), "Hash Transform", "HashTransform", "Hash", DocName = "transform/HashTransform.md")] [assembly: LoadableClass(HashingTransformer.Summary, typeof(IDataTransform), typeof(HashingTransformer), null, typeof(SignatureLoadDataTransform), @@ -37,7 +37,7 @@ namespace Microsoft.ML.Transforms.Conversions /// </summary> public sealed class HashingTransformer : OneToOneTransformerBase { - public sealed class Arguments + internal sealed class Options { [Argument(ArgumentType.Multiple, HelpText = "New column definition(s) (optional form: name:src)", Name = "Column", ShortName = "col", SortOrder = 1)] @@ -59,7 +59,7 @@ public sealed class Arguments public int InvertHash = HashingEstimator.Defaults.InvertHash; } - public sealed class Column : OneToOneColumn + internal sealed class Column : OneToOneColumn { [Argument(ArgumentType.AtMostOnce, HelpText = "Number of bits to hash into. Must be between 1 and 31, inclusive", ShortName = "bits")] public int? 
HashBits; @@ -115,76 +115,6 @@ internal bool TryUnparse(StringBuilder sb) } } - public sealed class ColumnInfo - { - public readonly string Name; - public readonly string InputColumnName; - public readonly int HashBits; - public readonly uint Seed; - public readonly bool Ordered; - public readonly int InvertHash; - - /// <summary> - /// Describes how the transformer handles one column pair. - /// </summary> - /// <param name="name">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param> - /// <param name="inputColumnName">Name of column to transform. If set to <see langword="null"/>, the value of the <paramref name="name"/> will be used as source.</param> - /// <param name="hashBits">Number of bits to hash into. Must be between 1 and 31, inclusive.</param> - /// <param name="seed">Hashing seed.</param> - /// <param name="ordered">Whether the position of each term should be included in the hash.</param> - /// <param name="invertHash">During hashing we constuct mappings between original values and the produced hash values. - /// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. - /// <paramref name="invertHash"/> specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. - /// <value>0</value> does not retain any input values. 
<value>-1</value> retains all input values mapping to each hash.</param> - public ColumnInfo(string name, - string inputColumnName = null, - int hashBits = HashingEstimator.Defaults.HashBits, - uint seed = HashingEstimator.Defaults.Seed, - bool ordered = HashingEstimator.Defaults.Ordered, - int invertHash = HashingEstimator.Defaults.InvertHash) - { - if (invertHash < -1) - throw Contracts.ExceptParam(nameof(invertHash), "Value too small, must be -1 or larger"); - if (invertHash != 0 && hashBits >= 31) - throw Contracts.ExceptParam(nameof(hashBits), $"Cannot support invertHash for a {0} bit hash. 30 is the maximum possible.", hashBits); - Contracts.CheckNonWhiteSpace(name, nameof(name)); - Name = name; - InputColumnName = inputColumnName ?? name; - HashBits = hashBits; - Seed = seed; - Ordered = ordered; - InvertHash = invertHash; - } - - internal ColumnInfo(string name, string inputColumnName, ModelLoadContext ctx) - { - Name = name; - InputColumnName = inputColumnName; - // *** Binary format *** - // int: HashBits - // uint: HashSeed - // byte: Ordered - HashBits = ctx.Reader.ReadInt32(); - Contracts.CheckDecode(HashingEstimator.NumBitsMin <= HashBits && HashBits < HashingEstimator.NumBitsLim); - Seed = ctx.Reader.ReadUInt32(); - Ordered = ctx.Reader.ReadBoolByte(); - } - - internal void Save(ModelSaveContext ctx) - { - // *** Binary format *** - // int: HashBits - // uint: HashSeed - // byte: Ordered - - Contracts.Assert(HashingEstimator.NumBitsMin <= HashBits && HashBits < HashingEstimator.NumBitsLim); - ctx.Writer.Write(HashBits); - - ctx.Writer.Write(Seed); - ctx.Writer.WriteBoolByte(Ordered); - } - } - private const string RegistrationName = "Hash"; internal const string Summary = "Converts column values into hashes. This transform accepts text and keys as inputs. 
It works on single- and vector-valued columns, " @@ -203,7 +133,7 @@ private static VersionInfo GetVersionInfo() loaderAssemblyName: typeof(HashingTransformer).Assembly.FullName); } - private readonly ColumnInfo[] _columns; + private readonly HashingEstimator.ColumnInfo[] _columns; private readonly VBuffer<ReadOnlyMemory<char>>[] _keyValues; private readonly VectorType[] _kvTypes; @@ -214,13 +144,13 @@ protected override void CheckInputColumn(Schema inputSchema, int col, int srcCol throw Host.ExceptParam(nameof(inputSchema), HashingEstimator.ExpectedColumnType); } - private static (string outputColumnName, string inputColumnName)[] GetColumnPairs(ColumnInfo[] columns) + private static (string outputColumnName, string inputColumnName)[] GetColumnPairs(HashingEstimator.ColumnInfo[] columns) { Contracts.CheckNonEmpty(columns, nameof(columns)); return columns.Select(x => (x.Name, x.InputColumnName)).ToArray(); } - private ColumnType GetOutputType(Schema inputSchema, ColumnInfo column) + private ColumnType GetOutputType(Schema inputSchema, HashingEstimator.ColumnInfo column) { var keyCount = (ulong)1 << column.HashBits; inputSchema.TryGetColumnIndex(column.InputColumnName, out int srcCol); @@ -237,7 +167,7 @@ private ColumnType GetOutputType(Schema inputSchema, ColumnInfo column) /// </summary> /// <param name="env">Host Environment.</param> /// <param name="columns">Description of dataset columns and how to process them.</param> - public HashingTransformer(IHostEnvironment env, params ColumnInfo[] columns) : + internal HashingTransformer(IHostEnvironment env, params HashingEstimator.ColumnInfo[] columns) : base(Contracts.CheckRef(env, nameof(env)).Register(RegistrationName), GetColumnPairs(columns)) { _columns = columns.ToArray(); @@ -248,7 +178,7 @@ public HashingTransformer(IHostEnvironment env, params ColumnInfo[] columns) : } } - internal HashingTransformer(IHostEnvironment env, IDataView input, params ColumnInfo[] columns) : + internal 
HashingTransformer(IHostEnvironment env, IDataView input, params HashingEstimator.ColumnInfo[] columns) : base(Contracts.CheckRef(env, nameof(env)).Register(RegistrationName), GetColumnPairs(columns)) { _columns = columns.ToArray(); @@ -342,9 +272,9 @@ private HashingTransformer(IHost host, ModelLoadContext ctx) : base(host, ctx) { var columnsLength = ColumnPairs.Length; - _columns = new ColumnInfo[columnsLength]; + _columns = new HashingEstimator.ColumnInfo[columnsLength]; for (int i = 0; i < columnsLength; i++) - _columns[i] = new ColumnInfo(ColumnPairs[i].outputColumnName, ColumnPairs[i].inputColumnName, ctx); + _columns[i] = new HashingEstimator.ColumnInfo(ColumnPairs[i].outputColumnName, ColumnPairs[i].inputColumnName, ctx); TextModelHelper.LoadAll(Host, ctx, columnsLength, out _keyValues, out _kvTypes); } @@ -376,25 +306,25 @@ private static IRowMapper Create(IHostEnvironment env, ModelLoadContext ctx, Sch => Create(env, ctx).MakeRowMapper(inputSchema); // Factory method for SignatureDataTransform. - private static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input) + private static IDataTransform Create(IHostEnvironment env, Options options, IDataView input) { Contracts.CheckValue(env, nameof(env)); - env.CheckValue(args, nameof(args)); + env.CheckValue(options, nameof(options)); env.CheckValue(input, nameof(input)); - env.CheckValue(args.Columns, nameof(args.Columns)); - var cols = new ColumnInfo[args.Columns.Length]; + env.CheckValue(options.Columns, nameof(options.Columns)); + var cols = new HashingEstimator.ColumnInfo[options.Columns.Length]; for (int i = 0; i < cols.Length; i++) { - var item = args.Columns[i]; - var kind = item.InvertHash ?? args.InvertHash; - cols[i] = new ColumnInfo( + var item = options.Columns[i]; + var kind = item.InvertHash ?? options.InvertHash; + cols[i] = new HashingEstimator.ColumnInfo( item.Name, item.Source ?? item.Name, - item.HashBits ?? args.HashBits, - item.Seed ?? args.Seed, - item.Ordered ?? 
args.Ordered, - item.InvertHash ?? args.InvertHash); + item.HashBits ?? options.HashBits, + item.Seed ?? options.Seed, + item.Ordered ?? options.Ordered, + item.InvertHash ?? options.InvertHash); }; return new HashingTransformer(env, input, cols).MakeDataTransform(input); } @@ -914,11 +844,11 @@ private abstract class InvertHashHelper { protected readonly Row Row; private readonly bool _includeSlot; - private readonly ColumnInfo _ex; + private readonly HashingEstimator.ColumnInfo _ex; private readonly ColumnType _srcType; private readonly int _srcCol; - private InvertHashHelper(Row row, ColumnInfo ex) + private InvertHashHelper(Row row, HashingEstimator.ColumnInfo ex) { Contracts.AssertValue(row); Row = row; @@ -939,7 +869,7 @@ private InvertHashHelper(Row row, ColumnInfo ex) /// <param name="ex">The extra column info</param> /// <param name="invertHashMaxCount">The number of input hashed valuPres to accumulate per output hash value</param> /// <param name="dstGetter">A hash getter, built on top of <paramref name="row"/>.</param> - public static InvertHashHelper Create(Row row, ColumnInfo ex, int invertHashMaxCount, Delegate dstGetter) + public static InvertHashHelper Create(Row row, HashingEstimator.ColumnInfo ex, int invertHashMaxCount, Delegate dstGetter) { row.Schema.TryGetColumnIndex(ex.InputColumnName, out int srcCol); ColumnType typeSrc = row.Schema[srcCol].Type; @@ -950,7 +880,7 @@ public static InvertHashHelper Create(Row row, ColumnInfo ex, int invertHashMaxC t = t.MakeGenericType(itemType.RawType); - var consTypes = new Type[] { typeof(Row), typeof(ColumnInfo), typeof(int), typeof(Delegate) }; + var consTypes = new Type[] { typeof(Row), typeof(HashingEstimator.ColumnInfo), typeof(int), typeof(Delegate) }; var constructorInfo = t.GetConstructor(consTypes); return (InvertHashHelper)constructorInfo.Invoke(new object[] { row, ex, invertHashMaxCount, dstGetter }); } @@ -1027,7 +957,7 @@ private abstract class Impl<T> : InvertHashHelper { protected readonly 
InvertHashCollector<T> Collector; - protected Impl(Row row, ColumnInfo ex, int invertHashMaxCount) + protected Impl(Row row, HashingEstimator.ColumnInfo ex, int invertHashMaxCount) : base(row, ex) { Contracts.AssertValue(row); @@ -1060,7 +990,7 @@ private sealed class ImplOne<T> : Impl<T> private T _value; private uint _hash; - public ImplOne(Row row, ColumnInfo ex, int invertHashMaxCount, Delegate dstGetter) + public ImplOne(Row row, HashingEstimator.ColumnInfo ex, int invertHashMaxCount, Delegate dstGetter) : base(row, ex, invertHashMaxCount) { _srcGetter = Row.GetGetter<T>(_srcCol); @@ -1094,7 +1024,7 @@ private sealed class ImplVec<T> : Impl<T> private VBuffer<T> _value; private VBuffer<uint> _hash; - public ImplVec(Row row, ColumnInfo ex, int invertHashMaxCount, Delegate dstGetter) + public ImplVec(Row row, HashingEstimator.ColumnInfo ex, int invertHashMaxCount, Delegate dstGetter) : base(row, ex, invertHashMaxCount) { _srcGetter = Row.GetGetter<VBuffer<T>>(_srcCol); @@ -1128,7 +1058,7 @@ private sealed class ImplVecOrdered<T> : Impl<KeyValuePair<int, T>> private VBuffer<T> _value; private VBuffer<uint> _hash; - public ImplVecOrdered(Row row, ColumnInfo ex, int invertHashMaxCount, Delegate dstGetter) + public ImplVecOrdered(Row row, HashingEstimator.ColumnInfo ex, int invertHashMaxCount, Delegate dstGetter) : base(row, ex, invertHashMaxCount) { _srcGetter = Row.GetGetter<VBuffer<T>>(_srcCol); @@ -1176,7 +1106,8 @@ public override void Process() } /// <summary> - /// Estimator for <see cref="HashingTransformer"/> + /// Estimator for <see cref="HashingTransformer"/> which can hash either single valued columns or vector columns. For vector columns, + /// it hashes each slot separately. It can hash either text values or key values. 
/// </summary> public sealed class HashingEstimator : IEstimator<HashingTransformer> { @@ -1191,8 +1122,94 @@ internal static class Defaults public const int InvertHash = 0; } + /// <summary> + /// Describes how the transformer handles one column pair. + /// </summary> + public sealed class ColumnInfo + { + /// <summary> + /// Name of the column resulting from the transformation of <see cref="InputColumnName"/>. + /// </summary> + public readonly string Name; + /// <summary> Name of column to transform.</summary> + public readonly string InputColumnName; + /// <summary> Number of bits to hash into. Must be between 1 and 31, inclusive.</summary> + public readonly int HashBits; + /// <summary> Hashing seed.</summary> + public readonly uint Seed; + /// <summary> Whether the position of each term should be included in the hash.</summary> + public readonly bool Ordered; + /// <summary> + /// During hashing we construct mappings between original values and the produced hash values. + /// Text representations of original values are stored in the slot names of the metadata for the new column. Hashing, as such, can map many initial values to one. + /// <see cref="InvertHash"/> specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. + /// <value>0</value> does not retain any input values. <value>-1</value> retains all input values mapping to each hash. + /// </summary> + public readonly int InvertHash; + + /// <summary> + /// Describes how the transformer handles one column pair. + /// </summary> + /// <param name="name">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param> + /// <param name="inputColumnName">Name of column to transform. If set to <see langword="null"/>, the value of the <paramref name="name"/> will be used as source.</param> + /// <param name="hashBits">Number of bits to hash into. 
Must be between 1 and 31, inclusive.</param> + /// <param name="seed">Hashing seed.</param> + /// <param name="ordered">Whether the position of each term should be included in the hash.</param> + /// <param name="invertHash">During hashing we construct mappings between original values and the produced hash values. + /// Text representations of original values are stored in the slot names of the metadata for the new column. Hashing, as such, can map many initial values to one. + /// <paramref name="invertHash"/> specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. + /// <value>0</value> does not retain any input values. <value>-1</value> retains all input values mapping to each hash.</param> + public ColumnInfo(string name, + string inputColumnName = null, + int hashBits = Defaults.HashBits, + uint seed = Defaults.Seed, + bool ordered = Defaults.Ordered, + int invertHash = Defaults.InvertHash) + { + if (invertHash < -1) + throw Contracts.ExceptParam(nameof(invertHash), "Value too small, must be -1 or larger"); + if (invertHash != 0 && hashBits >= 31) + throw Contracts.ExceptParam(nameof(hashBits), $"Cannot support invertHash for a {0} bit hash. 30 is the maximum possible.", hashBits); + Contracts.CheckNonWhiteSpace(name, nameof(name)); + Name = name; + InputColumnName = inputColumnName ?? 
name; + HashBits = hashBits; + Seed = seed; + Ordered = ordered; + InvertHash = invertHash; + } + + internal ColumnInfo(string name, string inputColumnName, ModelLoadContext ctx) + { + Name = name; + InputColumnName = inputColumnName; + // *** Binary format *** + // int: HashBits + // uint: HashSeed + // byte: Ordered + HashBits = ctx.Reader.ReadInt32(); + Contracts.CheckDecode(HashingEstimator.NumBitsMin <= HashBits && HashBits < HashingEstimator.NumBitsLim); + Seed = ctx.Reader.ReadUInt32(); + Ordered = ctx.Reader.ReadBoolByte(); + } + + internal void Save(ModelSaveContext ctx) + { + // *** Binary format *** + // int: HashBits + // uint: HashSeed + // byte: Ordered + + Contracts.Assert(HashingEstimator.NumBitsMin <= HashBits && HashBits < HashingEstimator.NumBitsLim); + ctx.Writer.Write(HashBits); + + ctx.Writer.Write(Seed); + ctx.Writer.WriteBoolByte(Ordered); + } + } + private readonly IHost _host; - private readonly HashingTransformer.ColumnInfo[] _columns; + private readonly ColumnInfo[] _columns; internal static bool IsColumnTypeValid(ColumnType type) { @@ -1214,9 +1231,9 @@ internal static bool IsColumnTypeValid(ColumnType type) /// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. /// <paramref name="invertHash"/> specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. /// <value>0</value> does not retain any input values. <value>-1</value> retains all input values mapping to each hash.</param> - public HashingEstimator(IHostEnvironment env, string outputColumnName, string inputColumnName = null, + internal HashingEstimator(IHostEnvironment env, string outputColumnName, string inputColumnName = null, int hashBits = Defaults.HashBits, int invertHash = Defaults.InvertHash) - : this(env, new HashingTransformer.ColumnInfo(outputColumnName, inputColumnName ?? 
outputColumnName, hashBits: hashBits, invertHash: invertHash)) + : this(env, new ColumnInfo(outputColumnName, inputColumnName ?? outputColumnName, hashBits: hashBits, invertHash: invertHash)) { } @@ -1225,15 +1242,23 @@ public HashingEstimator(IHostEnvironment env, string outputColumnName, string in /// </summary> /// <param name="env">Host Environment.</param> /// <param name="columns">Description of dataset columns and how to process them.</param> - public HashingEstimator(IHostEnvironment env, params HashingTransformer.ColumnInfo[] columns) + [BestFriend] + internal HashingEstimator(IHostEnvironment env, params ColumnInfo[] columns) { Contracts.CheckValue(env, nameof(env)); _host = env.Register(nameof(HashingEstimator)); _columns = columns.ToArray(); } + /// <summary> + /// Trains and returns a <see cref="HashingTransformer"/>. + /// </summary> public HashingTransformer Fit(IDataView input) => new HashingTransformer(_host, input, _columns); + /// <summary> + /// Returns the <see cref="SchemaShape"/> of the schema which will be produced by the transformer. + /// Used for schema propagation and verification in a pipeline. 
+ /// </summary> public SchemaShape GetOutputSchema(SchemaShape inputSchema) { _host.CheckValue(inputSchema, nameof(inputSchema)); diff --git a/src/Microsoft.ML.Data/Transforms/KeyToVector.cs b/src/Microsoft.ML.Data/Transforms/KeyToVector.cs index 75b0017f1f..d2c7a5e260 100644 --- a/src/Microsoft.ML.Data/Transforms/KeyToVector.cs +++ b/src/Microsoft.ML.Data/Transforms/KeyToVector.cs @@ -18,7 +18,7 @@ using Microsoft.ML.Transforms.Conversions; using Newtonsoft.Json.Linq; -[assembly: LoadableClass(KeyToVectorMappingTransformer.Summary, typeof(IDataTransform), typeof(KeyToVectorMappingTransformer), typeof(KeyToVectorMappingTransformer.Arguments), typeof(SignatureDataTransform), +[assembly: LoadableClass(KeyToVectorMappingTransformer.Summary, typeof(IDataTransform), typeof(KeyToVectorMappingTransformer), typeof(KeyToVectorMappingTransformer.Options), typeof(SignatureDataTransform), "Key To Vector Transform", KeyToVectorMappingTransformer.UserName, "KeyToVector", "ToVector", DocName = "transform/KeyToVectorTransform.md")] [assembly: LoadableClass(KeyToVectorMappingTransformer.Summary, typeof(IDataTransform), typeof(KeyToVectorMappingTransformer), null, typeof(SignatureLoadDataTransform), @@ -32,9 +32,12 @@ namespace Microsoft.ML.Transforms.Conversions { + /// <summary> + /// Converts the key types back to their original vectors. + /// </summary> public sealed class KeyToVectorMappingTransformer : OneToOneTransformerBase { - public abstract class ColumnBase : OneToOneColumn + internal abstract class ColumnBase : OneToOneColumn { [Argument(ArgumentType.AtMostOnce, HelpText = "Whether to combine multiple indicator vectors into a single bag vector instead of concatenating them. 
This is only relevant when the input is a vector.")] @@ -62,7 +65,8 @@ private protected override bool TryUnparseCore(StringBuilder sb, string extra) } } - public sealed class Column : ColumnBase + [BestFriend] + internal sealed class Column : ColumnBase { internal static Column Parse(string str) { @@ -80,7 +84,7 @@ internal bool TryUnparse(StringBuilder sb) return TryUnparseCore(sb); } } - public sealed class Arguments + internal sealed class Options { [Argument(ArgumentType.Multiple, HelpText = "New column definition(s) (optional form: name:src)", Name = "Column", ShortName = "col", SortOrder = 1)] @@ -91,36 +95,12 @@ public sealed class Arguments public bool Bag = KeyToVectorMappingEstimator.Defaults.Bag; } - /// <summary> - /// Describes how the transformer handles one column pair. - /// </summary> - public sealed class ColumnInfo - { - public readonly string Name; - public readonly string InputColumnName; - public readonly bool Bag; - - /// <summary> - /// Describes how the transformer handles one column pair. - /// </summary> - /// <param name="name">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param> - /// <param name="inputColumnName">Name of column to transform. If set to <see langword="null"/>, the value of the <paramref name="name"/> will be used as source.</param> - /// <param name="bag">Whether to combine multiple indicator vectors into a single bag vector instead of concatenating them. This is only relevant when the input column is a vector.</param> - public ColumnInfo(string name, string inputColumnName = null, bool bag = KeyToVectorMappingEstimator.Defaults.Bag) - { - Contracts.CheckNonWhiteSpace(name, nameof(name)); - Name = name; - InputColumnName = inputColumnName ?? 
name; - Bag = bag; - } - } - private const string RegistrationName = "KeyToVector"; - public IReadOnlyCollection<ColumnInfo> Columns => _columns.AsReadOnly(); - private readonly ColumnInfo[] _columns; + public IReadOnlyCollection<KeyToVectorMappingEstimator.ColumnInfo> Columns => _columns.AsReadOnly(); + private readonly KeyToVectorMappingEstimator.ColumnInfo[] _columns; - private static (string outputColumnName, string inputColumnName)[] GetColumnPairs(ColumnInfo[] columns) + private static (string outputColumnName, string inputColumnName)[] GetColumnPairs(KeyToVectorMappingEstimator.ColumnInfo[] columns) { Contracts.CheckValue(columns, nameof(columns)); return columns.Select(x => (x.Name, x.InputColumnName)).ToArray(); @@ -141,7 +121,7 @@ protected override void CheckInputColumn(Schema inputSchema, int col, int srcCol throw Host.ExceptSchemaMismatch(nameof(inputSchema), "input", ColumnPairs[col].inputColumnName, reason, type.ToString()); } - public KeyToVectorMappingTransformer(IHostEnvironment env, params ColumnInfo[] columns) : + internal KeyToVectorMappingTransformer(IHostEnvironment env, params KeyToVectorMappingEstimator.ColumnInfo[] columns) : base(Contracts.CheckRef(env, nameof(env)).Register(RegistrationName), GetColumnPairs(columns)) { _columns = columns.ToArray(); @@ -206,28 +186,28 @@ private KeyToVectorMappingTransformer(IHost host, ModelLoadContext ctx) var bags = new bool[columnsLength]; bags = ctx.Reader.ReadBoolArray(columnsLength); - _columns = new ColumnInfo[columnsLength]; + _columns = new KeyToVectorMappingEstimator.ColumnInfo[columnsLength]; for (int i = 0; i < columnsLength; i++) - _columns[i] = new ColumnInfo(ColumnPairs[i].outputColumnName, ColumnPairs[i].inputColumnName, bags[i]); + _columns[i] = new KeyToVectorMappingEstimator.ColumnInfo(ColumnPairs[i].outputColumnName, ColumnPairs[i].inputColumnName, bags[i]); } // Factory method for SignatureDataTransform. 
- private static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input) + private static IDataTransform Create(IHostEnvironment env, Options options, IDataView input) { Contracts.CheckValue(env, nameof(env)); - env.CheckValue(args, nameof(args)); + env.CheckValue(options, nameof(options)); env.CheckValue(input, nameof(input)); - env.CheckValue(args.Columns, nameof(args.Columns)); - var cols = new ColumnInfo[args.Columns.Length]; + env.CheckValue(options.Columns, nameof(options.Columns)); + var cols = new KeyToVectorMappingEstimator.ColumnInfo[options.Columns.Length]; for (int i = 0; i < cols.Length; i++) { - var item = args.Columns[i]; + var item = options.Columns[i]; - cols[i] = new ColumnInfo( + cols[i] = new KeyToVectorMappingEstimator.ColumnInfo( item.Name, item.Source ?? item.Name, - item.Bag ?? args.Bag); + item.Bag ?? options.Bag); }; return new KeyToVectorMappingTransformer(env, cols).MakeDataTransform(input); } @@ -743,6 +723,9 @@ private bool SaveAsOnnxCore(OnnxContext ctx, int iinfo, ColInfo info, string src } } + /// <summary> + /// Estimator for <see cref="KeyToVectorMappingTransformer"/>. Converts key-typed columns into indicator vectors. + /// </summary> public sealed class KeyToVectorMappingEstimator : TrivialEstimator<KeyToVectorMappingTransformer> { internal static class Defaults @@ -750,13 +733,43 @@ internal static class Defaults public const bool Bag = false; } - public KeyToVectorMappingEstimator(IHostEnvironment env, params KeyToVectorMappingTransformer.ColumnInfo[] columns) + /// <summary> + /// Describes how the transformer handles one column pair.
+ /// </summary> + public sealed class ColumnInfo + { + /// <summary> Name of the column resulting from the transformation of <see cref="InputColumnName"/>.</summary> + public readonly string Name; + /// <summary> Name of column to transform.</summary> + public readonly string InputColumnName; + /// <summary> + /// Whether to combine multiple indicator vectors into a single bag vector instead of concatenating them. + /// This is only relevant when the input column is a vector. + /// </summary> + public readonly bool Bag; + + /// <summary> + /// Describes how the transformer handles one column pair. + /// </summary> + /// <param name="name">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param> + /// <param name="inputColumnName">Name of column to transform. If set to <see langword="null"/>, the value of the <paramref name="name"/> will be used as source.</param> + /// <param name="bag">Whether to combine multiple indicator vectors into a single bag vector instead of concatenating them. This is only relevant when the input column is a vector.</param> + public ColumnInfo(string name, string inputColumnName = null, bool bag = Defaults.Bag) + { + Contracts.CheckNonWhiteSpace(name, nameof(name)); + Name = name; + InputColumnName = inputColumnName ?? name; + Bag = bag; + } + } + + internal KeyToVectorMappingEstimator(IHostEnvironment env, params ColumnInfo[] columns) : this(env, new KeyToVectorMappingTransformer(env, columns)) { } - public KeyToVectorMappingEstimator(IHostEnvironment env, string outputColumnName, string inputColumnName = null, bool bag = Defaults.Bag) - : this(env, new KeyToVectorMappingTransformer(env, new KeyToVectorMappingTransformer.ColumnInfo(outputColumnName, inputColumnName ??
outputColumnName, bag))) + internal KeyToVectorMappingEstimator(IHostEnvironment env, string outputColumnName, string inputColumnName = null, bool bag = Defaults.Bag) + : this(env, new KeyToVectorMappingTransformer(env, new ColumnInfo(outputColumnName, inputColumnName ?? outputColumnName, bag))) { } @@ -765,6 +778,10 @@ private KeyToVectorMappingEstimator(IHostEnvironment env, KeyToVectorMappingTran { } + /// <summary> + /// Returns the <see cref="SchemaShape"/> of the schema which will be produced by the transformer. + /// Used for schema propagation and verification in a pipeline. + /// </summary> public override SchemaShape GetOutputSchema(SchemaShape inputSchema) { Host.CheckValue(inputSchema, nameof(inputSchema)); diff --git a/src/Microsoft.ML.EntryPoints/FeatureCombiner.cs b/src/Microsoft.ML.EntryPoints/FeatureCombiner.cs index da5b3dcb7a..1844de7347 100644 --- a/src/Microsoft.ML.EntryPoints/FeatureCombiner.cs +++ b/src/Microsoft.ML.EntryPoints/FeatureCombiner.cs @@ -84,7 +84,7 @@ public static CommonOutputs.TransformOutput PrepareFeatures(IHostEnvironment env } } - private static IDataView ApplyKeyToVec(List<KeyToVectorMappingTransformer.ColumnInfo> ktv, IDataView viewTrain, IHost host) + private static IDataView ApplyKeyToVec(List<KeyToVectorMappingEstimator.ColumnInfo> ktv, IDataView viewTrain, IHost host) { Contracts.AssertValueOrNull(ktv); Contracts.AssertValue(viewTrain); @@ -107,7 +107,7 @@ private static IDataView ApplyKeyToVec(List<KeyToVectorMappingTransformer.Column TextKeyValues = true }, viewTrain); - viewTrain = new KeyToVectorMappingTransformer(host, ktv.Select(c => new KeyToVectorMappingTransformer.ColumnInfo(c.Name, c.Name)).ToArray()).Transform(viewTrain); + viewTrain = new KeyToVectorMappingTransformer(host, ktv.Select(c => new KeyToVectorMappingEstimator.ColumnInfo(c.Name, c.Name)).ToArray()).Transform(viewTrain); } return viewTrain; } @@ -149,14 +149,14 @@ private static IDataView ApplyConvert(List<TypeConvertingTransformer.ColumnInfo> 
return viewTrain; } - private static List<KeyToVectorMappingTransformer.ColumnInfo> ConvertFeatures(IEnumerable<Schema.Column> feats, HashSet<string> featNames, List<KeyValuePair<string, string>> concatNames, IChannel ch, + private static List<KeyToVectorMappingEstimator.ColumnInfo> ConvertFeatures(IEnumerable<Schema.Column> feats, HashSet<string> featNames, List<KeyValuePair<string, string>> concatNames, IChannel ch, out List<TypeConvertingTransformer.ColumnInfo> cvt, out int errCount) { Contracts.AssertValue(feats); Contracts.AssertValue(featNames); Contracts.AssertValue(concatNames); Contracts.AssertValue(ch); - List<KeyToVectorMappingTransformer.ColumnInfo> ktv = null; + List<KeyToVectorMappingEstimator.ColumnInfo> ktv = null; cvt = null; errCount = 0; foreach (var col in feats) @@ -174,7 +174,7 @@ private static IDataView ApplyConvert(List<TypeConvertingTransformer.ColumnInfo> { var colName = GetUniqueName(); concatNames.Add(new KeyValuePair<string, string>(col.Name, colName)); - Utils.Add(ref ktv, new KeyToVectorMappingTransformer.ColumnInfo(colName, col.Name)); + Utils.Add(ref ktv, new KeyToVectorMappingEstimator.ColumnInfo(colName, col.Name)); continue; } } diff --git a/src/Microsoft.ML.StaticPipe/TransformsStatic.cs b/src/Microsoft.ML.StaticPipe/TransformsStatic.cs index 47e2aef5f1..b575777ba4 100644 --- a/src/Microsoft.ML.StaticPipe/TransformsStatic.cs +++ b/src/Microsoft.ML.StaticPipe/TransformsStatic.cs @@ -578,11 +578,11 @@ public override IEstimator<ITransformer> Reconcile(IHostEnvironment env, IReadOnlyDictionary<PipelineColumn, string> outputNames, IReadOnlyCollection<string> usedNames) { - var infos = new KeyToVectorMappingTransformer.ColumnInfo[toOutput.Length]; + var infos = new KeyToVectorMappingEstimator.ColumnInfo[toOutput.Length]; for (int i = 0; i < toOutput.Length; ++i) { var col = (IColInput)toOutput[i]; - infos[i] = new KeyToVectorMappingTransformer.ColumnInfo(outputNames[toOutput[i]], inputNames[col.Input], col.Bag); + infos[i] = new 
KeyToVectorMappingEstimator.ColumnInfo(outputNames[toOutput[i]], inputNames[col.Input], col.Bag); } return new KeyToVectorMappingEstimator(env, infos); } diff --git a/src/Microsoft.ML.Transforms/KeyToVectorMapping.cs b/src/Microsoft.ML.Transforms/KeyToVectorMapping.cs index 11166bafb3..9fab277205 100644 --- a/src/Microsoft.ML.Transforms/KeyToVectorMapping.cs +++ b/src/Microsoft.ML.Transforms/KeyToVectorMapping.cs @@ -31,7 +31,7 @@ namespace Microsoft.ML.Transforms.Conversions { public sealed class KeyToBinaryVectorMappingTransformer : OneToOneTransformerBase { - public sealed class Arguments + internal sealed class Arguments { [Argument(ArgumentType.Multiple | ArgumentType.Required, HelpText = "New column definition(s) (optional form: name:src)", Name = "Column", ShortName = "col", SortOrder = 1)] diff --git a/src/Microsoft.ML.Transforms/OneHotEncoding.cs b/src/Microsoft.ML.Transforms/OneHotEncoding.cs index bcfdf0c942..c67c9991bf 100644 --- a/src/Microsoft.ML.Transforms/OneHotEncoding.cs +++ b/src/Microsoft.ML.Transforms/OneHotEncoding.cs @@ -262,7 +262,7 @@ internal OneHotEncodingEstimator(IHostEnvironment env, ColumnInfo[] columns, IDa if (binaryCols.Count > 0) toBinVector = new KeyToBinaryVectorMappingEstimator(_host, binaryCols.Select(x => new KeyToBinaryVectorMappingTransformer.ColumnInfo(x.outputColumnName, x.inputColumnName)).ToArray()); if (cols.Count > 0) - toVector = new KeyToVectorMappingEstimator(_host, cols.Select(x => new KeyToVectorMappingTransformer.ColumnInfo(x.outputColumnName, x.inputColumnName, x.bag)).ToArray()); + toVector = new KeyToVectorMappingEstimator(_host, cols.Select(x => new KeyToVectorMappingEstimator.ColumnInfo(x.outputColumnName, x.inputColumnName, x.bag)).ToArray()); if (toBinVector != null && toVector != null) _toSomething = toVector.Append(toBinVector); @@ -316,16 +316,16 @@ public static CommonOutputs.TransformOutput CatTransformDict(IHostEnvironment en } [TlcModule.EntryPoint(Name = 
"Transforms.CategoricalHashOneHotVectorizer", - Desc = OneHotHashEncoding.Summary, - UserName = OneHotHashEncoding.UserName)] - public static CommonOutputs.TransformOutput CatTransformHash(IHostEnvironment env, OneHotHashEncoding.Arguments input) + Desc = OneHotHashEncodingTransformer.Summary, + UserName = OneHotHashEncodingTransformer.UserName)] + public static CommonOutputs.TransformOutput CatTransformHash(IHostEnvironment env, OneHotHashEncodingTransformer.Options input) { Contracts.CheckValue(env, nameof(env)); var host = env.Register("CatTransformDict"); host.CheckValue(input, nameof(input)); EntryPointUtils.CheckInputArgs(host, input); - var xf = OneHotHashEncoding.Create(host, input, input.Data); + var xf = OneHotHashEncodingTransformer.Create(host, input, input.Data); return new CommonOutputs.TransformOutput { Model = new TransformModelImpl(env, xf, input.Data), OutputData = xf }; } diff --git a/src/Microsoft.ML.Transforms/OneHotHashEncoding.cs b/src/Microsoft.ML.Transforms/OneHotHashEncoding.cs index c8ebd854d4..d9c5d880ca 100644 --- a/src/Microsoft.ML.Transforms/OneHotHashEncoding.cs +++ b/src/Microsoft.ML.Transforms/OneHotHashEncoding.cs @@ -16,14 +16,17 @@ using Microsoft.ML.Transforms.Categorical; using Microsoft.ML.Transforms.Conversions; -[assembly: LoadableClass(OneHotHashEncoding.Summary, typeof(IDataTransform), typeof(OneHotHashEncoding), typeof(OneHotHashEncoding.Arguments), typeof(SignatureDataTransform), - OneHotHashEncoding.UserName, "CategoricalHashTransform", "CatHashTransform", "CategoricalHash", "CatHash")] +[assembly: LoadableClass(OneHotHashEncodingTransformer.Summary, typeof(IDataTransform), typeof(OneHotHashEncodingTransformer), typeof(OneHotHashEncodingTransformer.Options), typeof(SignatureDataTransform), + OneHotHashEncodingTransformer.UserName, "CategoricalHashTransform", "CatHashTransform", "CategoricalHash", "CatHash")] namespace Microsoft.ML.Transforms.Categorical { - public sealed class OneHotHashEncoding : ITransformer, 
ICanSaveModel + /// <summary> + /// Produces a column of indicator vectors. The mapping between a value and a corresponding index is done through hashing. + /// </summary> + public sealed class OneHotHashEncodingTransformer : ITransformer, ICanSaveModel { - public sealed class Column : OneToOneColumn + internal sealed class Column : OneToOneColumn { [Argument(ArgumentType.AtMostOnce, HelpText = "The number of bits to hash into. Must be between 1 and 30, inclusive.", @@ -83,42 +86,33 @@ internal bool TryUnparse(StringBuilder sb) } } - private static class Defaults - { - public const int HashBits = 16; - public const uint Seed = 314489979; - public const bool Ordered = true; - public const int InvertHash = 0; - public const OneHotEncodingTransformer.OutputKind OutputKind = OneHotEncodingTransformer.OutputKind.Bag; - } - /// <summary> - /// This class is a merger of <see cref="HashingTransformer.Arguments"/> and <see cref="KeyToVectorMappingTransformer.Arguments"/> + /// This class is a merger of <see cref="HashingTransformer.Options"/> and <see cref="KeyToVectorMappingTransformer.Options"/> /// with join option removed /// </summary> - public sealed class Arguments : TransformInputBase + internal sealed class Options : TransformInputBase { [Argument(ArgumentType.Multiple | ArgumentType.Required, HelpText = "New column definition(s) (optional form: name:hashBits:src)", Name = "Column", ShortName = "col", SortOrder = 1)] public Column[] Columns; [Argument(ArgumentType.AtMostOnce, HelpText = "Number of bits to hash into. 
Must be between 1 and 30, inclusive.", ShortName = "bits", SortOrder = 2)] - public int HashBits = Defaults.HashBits; + public int HashBits = OneHotHashEncodingEstimator.Defaults.HashBits; [Argument(ArgumentType.AtMostOnce, HelpText = "Hashing seed")] - public uint Seed = Defaults.Seed; + public uint Seed = OneHotHashEncodingEstimator.Defaults.Seed; [Argument(ArgumentType.AtMostOnce, HelpText = "Whether the position of each term should be included in the hash", ShortName = "ord")] - public bool Ordered = Defaults.Ordered; + public bool Ordered = OneHotHashEncodingEstimator.Defaults.Ordered; [Argument(ArgumentType.AtMostOnce, HelpText = "Limit the number of keys used to generate the slot name to this many. 0 means no invert hashing, -1 means no limit.", ShortName = "ih")] - public int InvertHash = Defaults.InvertHash; + public int InvertHash = OneHotHashEncodingEstimator.Defaults.InvertHash; [Argument(ArgumentType.AtMostOnce, HelpText = "Output kind: Bag (multi-set vector), Ind (indicator vector), or Key (index)", ShortName = "kind", SortOrder = 102)] - public OneHotEncodingTransformer.OutputKind OutputKind = Defaults.OutputKind; + public OneHotEncodingTransformer.OutputKind OutputKind = OneHotHashEncodingEstimator.Defaults.OutputKind; } internal const string Summary = "Converts the categorical value into an indicator array by hashing the value and using the hash as an index in the " @@ -127,7 +121,7 @@ public sealed class Arguments : TransformInputBase internal const string UserName = "Categorical Hash Transform"; /// <summary> - /// A helper method to create <see cref="OneHotHashEncoding"/>. + /// A helper method to create <see cref="OneHotHashEncodingTransformer"/>. /// </summary> /// <param name="env">Host Environment.</param> /// <param name="input">Input <see cref="IDataView"/>. 
This is the output from previous transform or loader.</param> @@ -139,7 +133,7 @@ public sealed class Arguments : TransformInputBase /// <paramref name="invertHash"/> specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. /// <value>0</value> does not retain any input values. <value>-1</value> retains all input values mapping to each hash.</param> /// <param name="outputKind">The type of output expected.</param> - public static IDataView Create(IHostEnvironment env, + private static IDataView Create(IHostEnvironment env, IDataView input, string name, string source = null, @@ -150,25 +144,25 @@ public static IDataView Create(IHostEnvironment env, return new OneHotHashEncodingEstimator(env, name, source, hashBits, invertHash, outputKind).Fit(input).Transform(input) as IDataView; } - internal static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input) + internal static IDataTransform Create(IHostEnvironment env, Options options, IDataView input) { Contracts.CheckValue(env, nameof(env)); var h = env.Register("Categorical"); - h.CheckValue(args, nameof(args)); + h.CheckValue(options, nameof(options)); h.CheckValue(input, nameof(input)); - h.CheckUserArg(Utils.Size(args.Columns) > 0, nameof(args.Columns)); + h.CheckUserArg(Utils.Size(options.Columns) > 0, nameof(options.Columns)); var columns = new List<OneHotHashEncodingEstimator.ColumnInfo>(); - foreach (var column in args.Columns) + foreach (var column in options.Columns) { var col = new OneHotHashEncodingEstimator.ColumnInfo( column.Name, column.Source ?? column.Name, - column.OutputKind ?? args.OutputKind, - column.HashBits ?? args.HashBits, - column.Seed ?? args.Seed, - column.Ordered ?? args.Ordered, - column.InvertHash ?? args.InvertHash); + column.OutputKind ?? options.OutputKind, + column.HashBits ?? options.HashBits, + column.Seed ?? options.Seed, + column.Ordered ?? options.Ordered, + column.InvertHash ?? 
options.InvertHash); columns.Add(col); } return new OneHotHashEncodingEstimator(env, columns.ToArray()).Fit(input).Transform(input) as IDataTransform; @@ -176,29 +170,42 @@ internal static IDataTransform Create(IHostEnvironment env, Arguments args, IDat private readonly TransformerChain<ITransformer> _transformer; - internal OneHotHashEncoding(HashingEstimator hash, IEstimator<ITransformer> keyToVector, IDataView input) + internal OneHotHashEncodingTransformer(HashingEstimator hash, IEstimator<ITransformer> keyToVector, IDataView input) { if (keyToVector != null) _transformer = hash.Append(keyToVector).Fit(input); else _transformer = new TransformerChain<ITransformer>(hash.Fit(input)); } - + /// <summary> + /// Schema propagation for transformers. Returns the output schema of the data, if + /// the input schema is like the one provided. + /// </summary> public Schema GetOutputSchema(Schema inputSchema) => _transformer.GetOutputSchema(inputSchema); + /// <summary> + /// Take the data in, make transformations, output the data. Note that <see cref="IDataView"/> + /// are lazy, so no actual transformations happen here, just schema validation. + /// </summary> public IDataView Transform(IDataView input) => _transformer.Transform(input); public void Save(ModelSaveContext ctx) => _transformer.Save(ctx); + /// <summary> + /// Whether a call to <see cref="GetRowToRowMapper"/> should succeed, on an appropriate schema. + /// </summary> public bool IsRowToRowMapper => _transformer.IsRowToRowMapper; + /// <summary> + /// Constructs a row-to-row mapper based on an input schema. + /// </summary> public IRowToRowMapper GetRowToRowMapper(Schema inputSchema) => _transformer.GetRowToRowMapper(inputSchema); } /// <summary> - /// Estimator which takes set of columns and produce for each column indicator array. Use hashing to determine indicator position. + /// Estimator that produces a column of indicator vectors. 
The mapping between a value and a corresponding index is done through hashing. /// </summary> - public sealed class OneHotHashEncodingEstimator : IEstimator<OneHotHashEncoding> + public sealed class OneHotHashEncodingEstimator : IEstimator<OneHotHashEncodingTransformer> { [BestFriend] internal static class Defaults @@ -210,9 +217,12 @@ internal static class Defaults public const OneHotEncodingTransformer.OutputKind OutputKind = OneHotEncodingTransformer.OutputKind.Bag; } + /// <summary> + /// Describes how the transformer handles one column pair. + /// </summary> public sealed class ColumnInfo { - public readonly HashingTransformer.ColumnInfo HashInfo; + public readonly HashingEstimator.ColumnInfo HashInfo; public readonly OneHotEncodingTransformer.OutputKind OutputKind; /// <summary> @@ -235,7 +245,7 @@ public ColumnInfo(string name, string inputColumnName = null, bool ordered = Defaults.Ordered, int invertHash = Defaults.InvertHash) { - HashInfo = new HashingTransformer.ColumnInfo(name, inputColumnName ?? name, hashBits, seed, ordered, invertHash); + HashInfo = new HashingEstimator.ColumnInfo(name, inputColumnName ?? name, hashBits, seed, ordered, invertHash); OutputKind = outputKind; } } @@ -245,7 +255,7 @@ public ColumnInfo(string name, string inputColumnName = null, private HashingEstimator _hash; /// <summary> - /// A helper method to create <see cref="OneHotHashEncodingEstimator"/> for public facing API. + /// Instantiates a new instance of <see cref="OneHotHashEncodingEstimator"/>. /// </summary> /// <param name="env">Host Environment.</param> /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param> @@ -257,17 +267,17 @@ public ColumnInfo(string name, string inputColumnName = null, /// <paramref name="invertHash"/> specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. /// <value>0</value> does not retain any input values. 
<value>-1</value> retains all input values mapping to each hash.</param> /// <param name="outputKind">The type of output expected.</param> - public OneHotHashEncodingEstimator(IHostEnvironment env, + internal OneHotHashEncodingEstimator(IHostEnvironment env, string outputColumnName, string inputColumnName = null, - int hashBits = OneHotHashEncodingEstimator.Defaults.HashBits, - int invertHash = OneHotHashEncodingEstimator.Defaults.InvertHash, + int hashBits = Defaults.HashBits, + int invertHash = Defaults.InvertHash, OneHotEncodingTransformer.OutputKind outputKind = Defaults.OutputKind) : this(env, new ColumnInfo(outputColumnName, inputColumnName ?? outputColumnName, outputKind, hashBits, invertHash: invertHash)) { } - public OneHotHashEncodingEstimator(IHostEnvironment env, params ColumnInfo[] columns) + internal OneHotHashEncodingEstimator(IHostEnvironment env, params ColumnInfo[] columns) { Contracts.CheckValue(env, nameof(env)); _host = env.Register(nameof(ValueToKeyMappingEstimator)); @@ -304,7 +314,7 @@ public OneHotHashEncodingEstimator(IHostEnvironment env, params ColumnInfo[] col if (binaryCols.Count > 0) toBinVector = new KeyToBinaryVectorMappingEstimator(_host, binaryCols.Select(x => new KeyToBinaryVectorMappingTransformer.ColumnInfo(x.outputColumnName, x.inputColumnName)).ToArray()); if (cols.Count > 0) - toVector = new KeyToVectorMappingEstimator(_host, cols.Select(x => new KeyToVectorMappingTransformer.ColumnInfo(x.outputColumnName, x.inputColumnName, x.bag)).ToArray()); + toVector = new KeyToVectorMappingEstimator(_host, cols.Select(x => new KeyToVectorMappingEstimator.ColumnInfo(x.outputColumnName, x.inputColumnName, x.bag)).ToArray()); if (toBinVector != null && toVector != null) _toSomething = toVector.Append(toBinVector); @@ -318,6 +328,10 @@ public OneHotHashEncodingEstimator(IHostEnvironment env, params ColumnInfo[] col } } + /// <summary> + /// Returns the <see cref="SchemaShape"/> of the schema which will be produced by the transformer. 
+ /// Used for schema propagation and verification in a pipeline. + /// </summary> public SchemaShape GetOutputSchema(SchemaShape inputSchema) { if (_toSomething != null) @@ -326,6 +340,9 @@ public SchemaShape GetOutputSchema(SchemaShape inputSchema) return _hash.GetOutputSchema(inputSchema); } - public OneHotHashEncoding Fit(IDataView input) => new OneHotHashEncoding(_hash, _toSomething, input); + /// <summary> + /// Trains and returns a <see cref="OneHotHashEncodingTransformer"/>. + /// </summary> + public OneHotHashEncodingTransformer Fit(IDataView input) => new OneHotHashEncodingTransformer(_hash, _toSomething, input); } } diff --git a/src/Microsoft.ML.Transforms/Text/WordHashBagProducingTransform.cs b/src/Microsoft.ML.Transforms/Text/WordHashBagProducingTransform.cs index 49aae725d4..e2ffc70609 100644 --- a/src/Microsoft.ML.Transforms/Text/WordHashBagProducingTransform.cs +++ b/src/Microsoft.ML.Transforms/Text/WordHashBagProducingTransform.cs @@ -245,7 +245,7 @@ internal bool TryUnparse(StringBuilder sb) } /// <summary> - /// This class is a merger of <see cref="HashingTransformer.Arguments"/> and + /// This class is a merger of <see cref="HashingTransformer.Options"/> and /// <see cref="NgramHashingTransformer.Arguments"/>, with the ordered option, /// the rehashUnigrams option and the allLength option removed. 
/// </summary> @@ -330,7 +330,7 @@ public static IDataTransform Create(IHostEnvironment env, Arguments args, IDataV List<ValueToKeyMappingTransformer.Column> termCols = null; if (termLoaderArgs != null) termCols = new List<ValueToKeyMappingTransformer.Column>(); - var hashColumns = new List<HashingTransformer.ColumnInfo>(); + var hashColumns = new List<HashingEstimator.ColumnInfo>(); var ngramHashColumns = new NgramHashingTransformer.ColumnInfo[args.Columns.Length]; var colCount = args.Columns.Length; @@ -360,7 +360,7 @@ public static IDataTransform Create(IHostEnvironment env, Arguments args, IDataV }); } - hashColumns.Add(new HashingTransformer.ColumnInfo(tmpName, termLoaderArgs == null ? column.Source[isrc] : tmpName, + hashColumns.Add(new HashingEstimator.ColumnInfo(tmpName, termLoaderArgs == null ? column.Source[isrc] : tmpName, 30, column.Seed ?? args.Seed, false, column.InvertHash ?? args.InvertHash)); } diff --git a/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv b/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv index 8525c932d4..3618d6ea16 100644 --- a/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv +++ b/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv @@ -73,11 +73,11 @@ Trainers.SymSgdBinaryClassifier Train a symbolic SGD. Microsoft.ML.Trainers.SymS Transforms.ApproximateBootstrapSampler Approximate bootstrap sampling. Microsoft.ML.Transforms.BootstrapSample GetSample Microsoft.ML.Transforms.BootstrapSamplingTransformer+Arguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput Transforms.BinaryPredictionScoreColumnsRenamer For binary prediction, it renames the PredictedLabel and Score columns to include the name of the positive class. 
Microsoft.ML.EntryPoints.ScoreModel RenameBinaryPredictionScoreColumns Microsoft.ML.EntryPoints.ScoreModel+RenameBinaryPredictionScoreColumnsInput Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput Transforms.BinNormalizer The values are assigned into equidensity bins and a value is mapped to its bin_number/number_of_bins. Microsoft.ML.Data.Normalize Bin Microsoft.ML.Transforms.Normalizers.NormalizeTransform+BinArguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput -Transforms.CategoricalHashOneHotVectorizer Converts the categorical value into an indicator array by hashing the value and using the hash as an index in the bag. If the input column is a vector, a single indicator bag is returned for it. Microsoft.ML.Transforms.Categorical.Categorical CatTransformHash Microsoft.ML.Transforms.Categorical.OneHotHashEncoding+Arguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput +Transforms.CategoricalHashOneHotVectorizer Converts the categorical value into an indicator array by hashing the value and using the hash as an index in the bag. If the input column is a vector, a single indicator bag is returned for it. Microsoft.ML.Transforms.Categorical.Categorical CatTransformHash Microsoft.ML.Transforms.Categorical.OneHotHashEncodingTransformer+Options Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput Transforms.CategoricalOneHotVectorizer Converts the categorical value into an indicator array by building a dictionary of categories based on the data and using the id in the dictionary as the index in the array. Microsoft.ML.Transforms.Categorical.Categorical CatTransformDict Microsoft.ML.Transforms.Categorical.OneHotEncodingTransformer+Options Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput Transforms.CharacterTokenizer Character-oriented tokenizer where text is considered a sequence of characters. 
Microsoft.ML.Transforms.Text.TextAnalytics CharTokenize Microsoft.ML.Transforms.Text.TokenizingByCharactersTransformer+Arguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput Transforms.ColumnConcatenator Concatenates one or more columns of the same item type. Microsoft.ML.EntryPoints.SchemaManipulation ConcatColumns Microsoft.ML.Data.ColumnConcatenatingTransformer+Arguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput -Transforms.ColumnCopier Duplicates columns from the dataset Microsoft.ML.EntryPoints.SchemaManipulation CopyColumns Microsoft.ML.Transforms.ColumnCopyingTransformer+Arguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput +Transforms.ColumnCopier Duplicates columns from the dataset Microsoft.ML.EntryPoints.SchemaManipulation CopyColumns Microsoft.ML.Transforms.ColumnCopyingTransformer+Options Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput Transforms.ColumnSelector Selects a set of columns, dropping all others Microsoft.ML.EntryPoints.SchemaManipulation SelectColumns Microsoft.ML.Transforms.ColumnSelectingTransformer+Arguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput Transforms.ColumnTypeConverter Converts a column to a different type, using standard conversions. 
Microsoft.ML.Transforms.Conversions.TypeConversion Convert Microsoft.ML.Transforms.Conversions.TypeConvertingTransformer+Arguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput Transforms.CombinerByContiguousGroupId Groups values of a scalar column into a vector, by a contiguous group ID Microsoft.ML.Transforms.GroupingOperations Group Microsoft.ML.Transforms.GroupTransform+Arguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput diff --git a/test/Microsoft.ML.Benchmarks/HashBench.cs b/test/Microsoft.ML.Benchmarks/HashBench.cs index 06461276fd..429e9d7201 100644 --- a/test/Microsoft.ML.Benchmarks/HashBench.cs +++ b/test/Microsoft.ML.Benchmarks/HashBench.cs @@ -73,7 +73,7 @@ private void InitMap<T>(T val, ColumnType type, int hashBits = 20, ValueGetter<T getter = (ref T dst) => dst = val; _inRow = RowImpl.Create(type, getter); // One million features is a nice, typical number. - var info = new HashingTransformer.ColumnInfo("Bar", "Foo", hashBits: hashBits); + var info = new HashingEstimator.ColumnInfo("Bar", "Foo", hashBits: hashBits); var xf = new HashingTransformer(_env, new[] { info }); var mapper = xf.GetRowToRowMapper(_inRow.Schema); var column = mapper.OutputSchema["Bar"]; diff --git a/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipe.cs b/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipe.cs index 95af6fbd73..d089153764 100644 --- a/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipe.cs +++ b/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipe.cs @@ -1104,7 +1104,7 @@ private void TestHashTransformHelper<T>(T[] data, uint[] results, NumberType typ builder.AddColumn("F1", type, data); var srcView = builder.GetDataView(); - var hashTransform = new HashingTransformer(Env, new HashingTransformer.ColumnInfo("F1", "F1", 5, 42)).Transform(srcView); + var hashTransform = new HashingTransformer(Env, new HashingEstimator.ColumnInfo("F1", "F1", 5, 42)).Transform(srcView); using (var cursor = hashTransform.GetRowCursorForAllColumns()) { var 
resultGetter = cursor.GetGetter<uint>(1); @@ -1135,7 +1135,7 @@ private void TestHashTransformVectorHelper<T>(VBuffer<T> data, uint[][] results, private void TestHashTransformVectorHelper(ArrayDataViewBuilder builder, uint[][] results) { var srcView = builder.GetDataView(); - var hashTransform = new HashingTransformer(Env, new HashingTransformer.ColumnInfo("F1V", "F1V", 5, 42)).Transform(srcView); + var hashTransform = new HashingTransformer(Env, new HashingEstimator.ColumnInfo("F1V", "F1V", 5, 42)).Transform(srcView); using (var cursor = hashTransform.GetRowCursorForAllColumns()) { var resultGetter = cursor.GetGetter<VBuffer<uint>>(1); diff --git a/test/Microsoft.ML.Tests/Transformers/CategoricalHashTests.cs b/test/Microsoft.ML.Tests/Transformers/CategoricalHashTests.cs index b41c12313d..4e96f9e519 100644 --- a/test/Microsoft.ML.Tests/Transformers/CategoricalHashTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/CategoricalHashTests.cs @@ -51,7 +51,7 @@ public void CategoricalHashWorkout() var data = new[] { new TestClass() { A = "1", B = "2", C = "3", }, new TestClass() { A = "4", B = "5", C = "6" } }; var dataView = ML.Data.ReadFromEnumerable(data); - var pipe = new OneHotHashEncodingEstimator(Env, new[]{ + var pipe = ML.Transforms.Categorical.OneHotHashEncoding(new[]{ new OneHotHashEncodingEstimator.ColumnInfo("CatA", "A", OneHotEncodingTransformer.OutputKind.Bag), new OneHotHashEncodingEstimator.ColumnInfo("CatB", "A", OneHotEncodingTransformer.OutputKind.Bin), new OneHotHashEncodingEstimator.ColumnInfo("CatC", "A", OneHotEncodingTransformer.OutputKind.Ind), @@ -113,7 +113,7 @@ public void TestMetadataPropagation() new TestMeta() { A = new string[2] { "A", "B"}, B = "C", C =new float[2] { 5.0f,6.0f}, D = 1.0f , E= new string[2]{"D","E"}, F="D"} }; var dataView = ML.Data.ReadFromEnumerable(data); - var bagPipe = new OneHotHashEncodingEstimator(Env, + var bagPipe = ML.Transforms.Categorical.OneHotHashEncoding( new OneHotHashEncodingEstimator.ColumnInfo("CatA", 
"A", OneHotEncodingTransformer.OutputKind.Bag, invertHash: -1), new OneHotHashEncodingEstimator.ColumnInfo("CatB", "B", OneHotEncodingTransformer.OutputKind.Bag, invertHash: -1), new OneHotHashEncodingEstimator.ColumnInfo("CatC", "C", OneHotEncodingTransformer.OutputKind.Bag, invertHash: -1), @@ -217,7 +217,7 @@ public void TestOldSavingAndLoading() { var data = new[] { new TestClass() { A = "1", B = "2", C = "3", }, new TestClass() { A = "4", B = "5", C = "6" } }; var dataView = ML.Data.ReadFromEnumerable(data); - var pipe = new OneHotHashEncodingEstimator(Env, new[]{ + var pipe = ML.Transforms.Categorical.OneHotHashEncoding(new[]{ new OneHotHashEncodingEstimator.ColumnInfo("CatHashA", "A"), new OneHotHashEncodingEstimator.ColumnInfo("CatHashB", "B"), new OneHotHashEncodingEstimator.ColumnInfo("CatHashC", "C") diff --git a/test/Microsoft.ML.Tests/Transformers/HashTests.cs b/test/Microsoft.ML.Tests/Transformers/HashTests.cs index edb2e858f9..1a1688fd84 100644 --- a/test/Microsoft.ML.Tests/Transformers/HashTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/HashTests.cs @@ -46,11 +46,11 @@ public void HashWorkout() var data = new[] { new TestClass() { A = 1, B = 2, C = 3, }, new TestClass() { A = 4, B = 5, C = 6 } }; var dataView = ML.Data.ReadFromEnumerable(data); - var pipe = new HashingEstimator(Env, new[]{ - new HashingTransformer.ColumnInfo("HashA", "A", hashBits:4, invertHash:-1), - new HashingTransformer.ColumnInfo("HashB", "B", hashBits:3, ordered:true), - new HashingTransformer.ColumnInfo("HashC", "C", seed:42), - new HashingTransformer.ColumnInfo("HashD", "A"), + var pipe = ML.Transforms.Conversion.Hash(new[]{ + new HashingEstimator.ColumnInfo("HashA", "A", hashBits:4, invertHash:-1), + new HashingEstimator.ColumnInfo("HashB", "B", hashBits:3, ordered:true), + new HashingEstimator.ColumnInfo("HashC", "C", seed:42), + new HashingEstimator.ColumnInfo("HashD", "A"), }); TestEstimatorCore(pipe, dataView); @@ -68,10 +68,10 @@ public void TestMetadata() var 
dataView = ML.Data.ReadFromEnumerable(data); - var pipe = new HashingEstimator(Env, new[] { - new HashingTransformer.ColumnInfo("HashA", "A", invertHash:1, hashBits:10), - new HashingTransformer.ColumnInfo("HashAUnlim", "A", invertHash:-1, hashBits:10), - new HashingTransformer.ColumnInfo("HashAUnlimOrdered", "A", invertHash:-1, hashBits:10, ordered:true) + var pipe = ML.Transforms.Conversion.Hash(new[] { + new HashingEstimator.ColumnInfo("HashA", "A", invertHash:1, hashBits:10), + new HashingEstimator.ColumnInfo("HashAUnlim", "A", invertHash:-1, hashBits:10), + new HashingEstimator.ColumnInfo("HashAUnlimOrdered", "A", invertHash:-1, hashBits:10, ordered:true) }); var result = pipe.Fit(dataView).Transform(dataView); ValidateMetadata(result); @@ -108,11 +108,11 @@ public void TestOldSavingAndLoading() { var data = new[] { new TestClass() { A = 1, B = 2, C = 3, }, new TestClass() { A = 4, B = 5, C = 6 } }; var dataView = ML.Data.ReadFromEnumerable(data); - var pipe = new HashingEstimator(Env, new[]{ - new HashingTransformer.ColumnInfo("HashA", "A", hashBits:4, invertHash:-1), - new HashingTransformer.ColumnInfo("HashB", "B", hashBits:3, ordered:true), - new HashingTransformer.ColumnInfo("HashC", "C", seed:42), - new HashingTransformer.ColumnInfo("HashD" ,"A"), + var pipe = ML.Transforms.Conversion.Hash(new[]{ + new HashingEstimator.ColumnInfo("HashA", "A", hashBits:4, invertHash:-1), + new HashingEstimator.ColumnInfo("HashB", "B", hashBits:3, ordered:true), + new HashingEstimator.ColumnInfo("HashC", "C", seed:42), + new HashingEstimator.ColumnInfo("HashD" ,"A"), }); var result = pipe.Fit(dataView).Transform(dataView); var resultRoles = new RoleMappedData(result); @@ -133,7 +133,7 @@ private void HashTestCore<T>(T val, PrimitiveType type, uint expected, uint expe var inRow = MetadataUtils.MetadataAsRow(builder.GetMetadata()); // First do an unordered hash. 
- var info = new HashingTransformer.ColumnInfo("Bar", "Foo", hashBits: bits); + var info = new HashingEstimator.ColumnInfo("Bar", "Foo", hashBits: bits); var xf = new HashingTransformer(Env, new[] { info }); var mapper = xf.GetRowToRowMapper(inRow.Schema); mapper.OutputSchema.TryGetColumnIndex("Bar", out int outCol); @@ -145,7 +145,7 @@ private void HashTestCore<T>(T val, PrimitiveType type, uint expected, uint expe Assert.Equal(expected, result); // Next do an ordered hash. - info = new HashingTransformer.ColumnInfo("Bar", "Foo", hashBits: bits, ordered: true); + info = new HashingEstimator.ColumnInfo("Bar", "Foo", hashBits: bits, ordered: true); xf = new HashingTransformer(Env, new[] { info }); mapper = xf.GetRowToRowMapper(inRow.Schema); mapper.OutputSchema.TryGetColumnIndex("Bar", out outCol); @@ -163,7 +163,7 @@ private void HashTestCore<T>(T val, PrimitiveType type, uint expected, uint expe builder.Add("Foo", new VectorType(type, vecLen), (ref VBuffer<T> dst) => denseVec.CopyTo(ref dst)); inRow = MetadataUtils.MetadataAsRow(builder.GetMetadata()); - info = new HashingTransformer.ColumnInfo("Bar", "Foo", hashBits: bits, ordered: false); + info = new HashingEstimator.ColumnInfo("Bar", "Foo", hashBits: bits, ordered: false); xf = new HashingTransformer(Env, new[] { info }); mapper = xf.GetRowToRowMapper(inRow.Schema); mapper.OutputSchema.TryGetColumnIndex("Bar", out outCol); @@ -178,7 +178,7 @@ private void HashTestCore<T>(T val, PrimitiveType type, uint expected, uint expe Assert.All(vecResult.DenseValues(), v => Assert.Equal(expected, v)); // Now do ordered with the dense vector. 
- info = new HashingTransformer.ColumnInfo("Bar", "Foo", hashBits: bits, ordered: true); + info = new HashingEstimator.ColumnInfo("Bar", "Foo", hashBits: bits, ordered: true); xf = new HashingTransformer(Env, new[] { info }); mapper = xf.GetRowToRowMapper(inRow.Schema); mapper.OutputSchema.TryGetColumnIndex("Bar", out outCol); @@ -197,7 +197,7 @@ private void HashTestCore<T>(T val, PrimitiveType type, uint expected, uint expe builder.Add("Foo", new VectorType(type, vecLen), (ref VBuffer<T> dst) => sparseVec.CopyTo(ref dst)); inRow = MetadataUtils.MetadataAsRow(builder.GetMetadata()); - info = new HashingTransformer.ColumnInfo("Bar", "Foo", hashBits: bits, ordered: false); + info = new HashingEstimator.ColumnInfo("Bar", "Foo", hashBits: bits, ordered: false); xf = new HashingTransformer(Env, new[] { info }); mapper = xf.GetRowToRowMapper(inRow.Schema); mapper.OutputSchema.TryGetColumnIndex("Bar", out outCol); @@ -210,7 +210,7 @@ private void HashTestCore<T>(T val, PrimitiveType type, uint expected, uint expe Assert.Equal(expected, vecResult.GetItemOrDefault(3)); Assert.Equal(expected, vecResult.GetItemOrDefault(7)); - info = new HashingTransformer.ColumnInfo("Bar", "Foo", hashBits: bits, ordered: true); + info = new HashingEstimator.ColumnInfo("Bar", "Foo", hashBits: bits, ordered: true); xf = new HashingTransformer(Env, new[] { info }); mapper = xf.GetRowToRowMapper(inRow.Schema); mapper.OutputSchema.TryGetColumnIndex("Bar", out outCol); diff --git a/test/Microsoft.ML.Tests/Transformers/KeyToVectorEstimatorTests.cs b/test/Microsoft.ML.Tests/Transformers/KeyToVectorEstimatorTests.cs index ab9dc36206..998d2606b0 100644 --- a/test/Microsoft.ML.Tests/Transformers/KeyToVectorEstimatorTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/KeyToVectorEstimatorTests.cs @@ -58,10 +58,10 @@ public void KeyToVectorWorkout() new ValueToKeyMappingEstimator.ColumnInfo("TermC", "C", textKeyValues:true) }).Fit(dataView).Transform(dataView); - var pipe = new 
KeyToVectorMappingEstimator(Env, new KeyToVectorMappingTransformer.ColumnInfo("CatA", "TermA", false), - new KeyToVectorMappingTransformer.ColumnInfo("CatB", "TermB", true), - new KeyToVectorMappingTransformer.ColumnInfo("CatC", "TermC", true), - new KeyToVectorMappingTransformer.ColumnInfo("CatCNonBag", "TermC", false)); + var pipe = ML.Transforms.Conversion.MapKeyToVector(new KeyToVectorMappingEstimator.ColumnInfo("CatA", "TermA", false), + new KeyToVectorMappingEstimator.ColumnInfo("CatB", "TermB", true), + new KeyToVectorMappingEstimator.ColumnInfo("CatC", "TermC", true), + new KeyToVectorMappingEstimator.ColumnInfo("CatCNonBag", "TermC", false)); TestEstimatorCore(pipe, dataView); Done(); } @@ -121,15 +121,15 @@ public void TestMetadataPropagation() var termTransformer = termEst.Fit(dataView); dataView = termTransformer.Transform(dataView); - var pipe = new KeyToVectorMappingEstimator(Env, - new KeyToVectorMappingTransformer.ColumnInfo("CatA", "TA", true), - new KeyToVectorMappingTransformer.ColumnInfo("CatB", "TB", false), - new KeyToVectorMappingTransformer.ColumnInfo("CatC", "TC", false), - new KeyToVectorMappingTransformer.ColumnInfo("CatD", "TD", true), - new KeyToVectorMappingTransformer.ColumnInfo("CatE", "TE", false), - new KeyToVectorMappingTransformer.ColumnInfo("CatF", "TF", true), - new KeyToVectorMappingTransformer.ColumnInfo("CatG", "TG", true), - new KeyToVectorMappingTransformer.ColumnInfo("CatH", "TH", false) + var pipe = ML.Transforms.Conversion.MapKeyToVector( + new KeyToVectorMappingEstimator.ColumnInfo("CatA", "TA", true), + new KeyToVectorMappingEstimator.ColumnInfo("CatB", "TB", false), + new KeyToVectorMappingEstimator.ColumnInfo("CatC", "TC", false), + new KeyToVectorMappingEstimator.ColumnInfo("CatD", "TD", true), + new KeyToVectorMappingEstimator.ColumnInfo("CatE", "TE", false), + new KeyToVectorMappingEstimator.ColumnInfo("CatF", "TF", true), + new KeyToVectorMappingEstimator.ColumnInfo("CatG", "TG", true), + new 
KeyToVectorMappingEstimator.ColumnInfo("CatH", "TH", false) ); var result = pipe.Fit(dataView).Transform(dataView); @@ -221,9 +221,9 @@ public void TestOldSavingAndLoading() }); var transformer = est.Fit(dataView); dataView = transformer.Transform(dataView); - var pipe = new KeyToVectorMappingEstimator(Env, - new KeyToVectorMappingTransformer.ColumnInfo("CatA", "TermA",false), - new KeyToVectorMappingTransformer.ColumnInfo("CatB", "TermB", true) + var pipe = ML.Transforms.Conversion.MapKeyToVector( + new KeyToVectorMappingEstimator.ColumnInfo("CatA", "TermA",false), + new KeyToVectorMappingEstimator.ColumnInfo("CatB", "TermB", true) ); var result = pipe.Fit(dataView).Transform(dataView); var resultRoles = new RoleMappedData(result);