From c428ae4b3ecb0554591d63a7459ceeb783c77bec Mon Sep 17 00:00:00 2001 From: Artidoro Pagnoni Date: Fri, 1 Feb 2019 00:00:42 -0800 Subject: [PATCH 1/5] onehothash, hash, copy col, key to vector --- .../EntryPoints/SchemaManipulation.cs | 2 +- src/Microsoft.ML.Data/TrainCatalog.cs | 6 +- .../Transforms/ColumnCopying.cs | 23 +- .../ConversionsExtensionsCatalog.cs | 4 +- src/Microsoft.ML.Data/Transforms/Hashing.cs | 216 +++++++++--------- .../Transforms/KeyToVector.cs | 95 ++++---- .../FeatureCombiner.cs | 10 +- .../TransformsStatic.cs | 4 +- .../KeyToVectorMapping.cs | 2 +- src/Microsoft.ML.Transforms/OneHotEncoding.cs | 4 +- .../OneHotHashEncoding.cs | 67 +++--- .../Text/WordHashBagProducingTransform.cs | 6 +- test/Microsoft.ML.Benchmarks/HashBench.cs | 2 +- .../Transformers/CategoricalHashTests.cs | 6 +- .../Transformers/HashTests.cs | 34 +-- .../Transformers/KeyToVectorEstimatorTests.cs | 28 +-- 16 files changed, 262 insertions(+), 247 deletions(-) diff --git a/src/Microsoft.ML.Data/EntryPoints/SchemaManipulation.cs b/src/Microsoft.ML.Data/EntryPoints/SchemaManipulation.cs index 63409f78d7..6296c2aa55 100644 --- a/src/Microsoft.ML.Data/EntryPoints/SchemaManipulation.cs +++ b/src/Microsoft.ML.Data/EntryPoints/SchemaManipulation.cs @@ -38,7 +38,7 @@ public static CommonOutputs.TransformOutput SelectColumns(IHostEnvironment env, } [TlcModule.EntryPoint(Name = "Transforms.ColumnCopier", Desc = "Duplicates columns from the dataset", UserName = ColumnCopyingTransformer.UserName, ShortName = ColumnCopyingTransformer.ShortName)] - public static CommonOutputs.TransformOutput CopyColumns(IHostEnvironment env, ColumnCopyingTransformer.Arguments input) + public static CommonOutputs.TransformOutput CopyColumns(IHostEnvironment env, ColumnCopyingTransformer.Options input) { Contracts.CheckValue(env, nameof(env)); var host = env.Register("CopyColumns"); diff --git a/src/Microsoft.ML.Data/TrainCatalog.cs b/src/Microsoft.ML.Data/TrainCatalog.cs index ee8e4ac548..f10d5dda6e 100644 
--- a/src/Microsoft.ML.Data/TrainCatalog.cs +++ b/src/Microsoft.ML.Data/TrainCatalog.cs @@ -152,11 +152,11 @@ private void EnsureStratificationColumn(ref IDataView data, ref string stratific // Generate a new column with the hashed stratification column. while (data.Schema.TryGetColumnIndex(stratificationColumn, out tmp)) stratificationColumn = string.Format("{0}_{1:000}", origStratCol, ++inc); - HashingTransformer.ColumnInfo columnInfo; + HashingEstimator.ColumnInfo columnInfo; if (seed.HasValue) - columnInfo = new HashingTransformer.ColumnInfo(stratificationColumn, origStratCol, 30, seed.Value); + columnInfo = new HashingEstimator.ColumnInfo(stratificationColumn, origStratCol, 30, seed.Value); else - columnInfo = new HashingTransformer.ColumnInfo(stratificationColumn, origStratCol, 30); + columnInfo = new HashingEstimator.ColumnInfo(stratificationColumn, origStratCol, 30); data = new HashingEstimator(Host, columnInfo).Fit(data).Transform(data); } } diff --git a/src/Microsoft.ML.Data/Transforms/ColumnCopying.cs b/src/Microsoft.ML.Data/Transforms/ColumnCopying.cs index a880496d60..b8fdc64089 100644 --- a/src/Microsoft.ML.Data/Transforms/ColumnCopying.cs +++ b/src/Microsoft.ML.Data/Transforms/ColumnCopying.cs @@ -18,7 +18,7 @@ using Microsoft.ML.Transforms; [assembly: LoadableClass(ColumnCopyingTransformer.Summary, typeof(IDataTransform), typeof(ColumnCopyingTransformer), - typeof(ColumnCopyingTransformer.Arguments), typeof(SignatureDataTransform), + typeof(ColumnCopyingTransformer.Options), typeof(SignatureDataTransform), ColumnCopyingTransformer.UserName, "CopyColumns", "CopyColumnsTransform", ColumnCopyingTransformer.ShortName, DocName = "transform/CopyColumnsTransformer.md")] @@ -35,16 +35,21 @@ namespace Microsoft.ML.Transforms { public sealed class ColumnCopyingEstimator : TrivialEstimator { - public ColumnCopyingEstimator(IHostEnvironment env, string outputColumnName, string inputColumnName) : + [BestFriend] + internal ColumnCopyingEstimator(IHostEnvironment 
env, string outputColumnName, string inputColumnName) : this(env, (outputColumnName, inputColumnName)) { } - public ColumnCopyingEstimator(IHostEnvironment env, params (string outputColumnName, string inputColumnName)[] columns) + internal ColumnCopyingEstimator(IHostEnvironment env, params (string outputColumnName, string inputColumnName)[] columns) : base(Contracts.CheckRef(env, nameof(env)).Register(nameof(ColumnCopyingEstimator)), new ColumnCopyingTransformer(env, columns)) { } + /// + /// Returns the of the schema which will be produced by the transformer. + /// Used for schema propagation and verification in a pipeline. + /// public override SchemaShape GetOutputSchema(SchemaShape inputSchema) { Host.CheckValue(inputSchema, nameof(inputSchema)); @@ -82,12 +87,12 @@ private static VersionInfo GetVersionInfo() loaderAssemblyName: typeof(ColumnCopyingTransformer).Assembly.FullName); } - public ColumnCopyingTransformer(IHostEnvironment env, params (string outputColumnName, string inputColumnName)[] columns) + internal ColumnCopyingTransformer(IHostEnvironment env, params (string outputColumnName, string inputColumnName)[] columns) : base(Contracts.CheckRef(env, nameof(env)).Register(nameof(ColumnCopyingTransformer)), columns) { } - public sealed class Column : OneToOneColumn + internal sealed class Column : OneToOneColumn { internal static Column Parse(string str) { @@ -106,7 +111,7 @@ internal bool TryUnparse(StringBuilder sb) } } - public sealed class Arguments : TransformInputBase + internal sealed class Options : TransformInputBase { [Argument(ArgumentType.Multiple | ArgumentType.Required, HelpText = "New column definition(s) (optional form: name:src)", Name = "Column", ShortName = "col", SortOrder = 1)] @@ -114,12 +119,12 @@ public sealed class Arguments : TransformInputBase } // Factory method corresponding to SignatureDataTransform. 
- internal static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input) + internal static IDataTransform Create(IHostEnvironment env, Options options, IDataView input) { Contracts.CheckValue(env, nameof(env)); - env.CheckValue(args, nameof(args)); + env.CheckValue(options, nameof(options)); - var transformer = new ColumnCopyingTransformer(env, args.Columns.Select(x => (x.Name, x.Source)).ToArray()); + var transformer = new ColumnCopyingTransformer(env, options.Columns.Select(x => (x.Name, x.Source)).ToArray()); return transformer.MakeDataTransform(input); } diff --git a/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs b/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs index 3261638c5e..6afc031fef 100644 --- a/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs +++ b/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs @@ -36,7 +36,7 @@ public static HashingEstimator Hash(this TransformsCatalog.ConversionTransforms /// /// The transform's catalog. /// Description of dataset columns and how to process them. - public static HashingEstimator Hash(this TransformsCatalog.ConversionTransforms catalog, params HashingTransformer.ColumnInfo[] columns) + public static HashingEstimator Hash(this TransformsCatalog.ConversionTransforms catalog, params HashingEstimator.ColumnInfo[] columns) => new HashingEstimator(CatalogUtils.GetEnvironment(catalog), columns); /// @@ -93,7 +93,7 @@ public static KeyToValueMappingEstimator MapKeyToValue(this TransformsCatalog.Co /// The categorical transform's catalog. /// The input column to map back to vectors. 
public static KeyToVectorMappingEstimator MapKeyToVector(this TransformsCatalog.ConversionTransforms catalog, - params KeyToVectorMappingTransformer.ColumnInfo[] columns) + params KeyToVectorMappingEstimator.ColumnInfo[] columns) => new KeyToVectorMappingEstimator(CatalogUtils.GetEnvironment(catalog), columns); /// diff --git a/src/Microsoft.ML.Data/Transforms/Hashing.cs b/src/Microsoft.ML.Data/Transforms/Hashing.cs index be3d937c66..3259f4d298 100644 --- a/src/Microsoft.ML.Data/Transforms/Hashing.cs +++ b/src/Microsoft.ML.Data/Transforms/Hashing.cs @@ -16,7 +16,7 @@ using Microsoft.ML.Model; using Microsoft.ML.Transforms.Conversions; -[assembly: LoadableClass(HashingTransformer.Summary, typeof(IDataTransform), typeof(HashingTransformer), typeof(HashingTransformer.Arguments), typeof(SignatureDataTransform), +[assembly: LoadableClass(HashingTransformer.Summary, typeof(IDataTransform), typeof(HashingTransformer), typeof(HashingTransformer.Options), typeof(SignatureDataTransform), "Hash Transform", "HashTransform", "Hash", DocName = "transform/HashTransform.md")] [assembly: LoadableClass(HashingTransformer.Summary, typeof(IDataTransform), typeof(HashingTransformer), null, typeof(SignatureLoadDataTransform), @@ -37,7 +37,7 @@ namespace Microsoft.ML.Transforms.Conversions /// public sealed class HashingTransformer : OneToOneTransformerBase { - public sealed class Arguments + internal sealed class Options { [Argument(ArgumentType.Multiple, HelpText = "New column definition(s) (optional form: name:src)", Name = "Column", ShortName = "col", SortOrder = 1)] @@ -59,7 +59,7 @@ public sealed class Arguments public int InvertHash = HashingEstimator.Defaults.InvertHash; } - public sealed class Column : OneToOneColumn + internal sealed class Column : OneToOneColumn { [Argument(ArgumentType.AtMostOnce, HelpText = "Number of bits to hash into. Must be between 1 and 31, inclusive", ShortName = "bits")] public int? 
HashBits; @@ -115,76 +115,6 @@ internal bool TryUnparse(StringBuilder sb) } } - public sealed class ColumnInfo - { - public readonly string Name; - public readonly string InputColumnName; - public readonly int HashBits; - public readonly uint Seed; - public readonly bool Ordered; - public readonly int InvertHash; - - /// - /// Describes how the transformer handles one column pair. - /// - /// Name of the column resulting from the transformation of . - /// Name of column to transform. If set to , the value of the will be used as source. - /// Number of bits to hash into. Must be between 1 and 31, inclusive. - /// Hashing seed. - /// Whether the position of each term should be included in the hash. - /// During hashing we constuct mappings between original values and the produced hash values. - /// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. - /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. - /// 0 does not retain any input values. -1 retains all input values mapping to each hash. - public ColumnInfo(string name, - string inputColumnName = null, - int hashBits = HashingEstimator.Defaults.HashBits, - uint seed = HashingEstimator.Defaults.Seed, - bool ordered = HashingEstimator.Defaults.Ordered, - int invertHash = HashingEstimator.Defaults.InvertHash) - { - if (invertHash < -1) - throw Contracts.ExceptParam(nameof(invertHash), "Value too small, must be -1 or larger"); - if (invertHash != 0 && hashBits >= 31) - throw Contracts.ExceptParam(nameof(hashBits), $"Cannot support invertHash for a {0} bit hash. 30 is the maximum possible.", hashBits); - Contracts.CheckNonWhiteSpace(name, nameof(name)); - Name = name; - InputColumnName = inputColumnName ?? 
name; - HashBits = hashBits; - Seed = seed; - Ordered = ordered; - InvertHash = invertHash; - } - - internal ColumnInfo(string name, string inputColumnName, ModelLoadContext ctx) - { - Name = name; - InputColumnName = inputColumnName; - // *** Binary format *** - // int: HashBits - // uint: HashSeed - // byte: Ordered - HashBits = ctx.Reader.ReadInt32(); - Contracts.CheckDecode(HashingEstimator.NumBitsMin <= HashBits && HashBits < HashingEstimator.NumBitsLim); - Seed = ctx.Reader.ReadUInt32(); - Ordered = ctx.Reader.ReadBoolByte(); - } - - internal void Save(ModelSaveContext ctx) - { - // *** Binary format *** - // int: HashBits - // uint: HashSeed - // byte: Ordered - - Contracts.Assert(HashingEstimator.NumBitsMin <= HashBits && HashBits < HashingEstimator.NumBitsLim); - ctx.Writer.Write(HashBits); - - ctx.Writer.Write(Seed); - ctx.Writer.WriteBoolByte(Ordered); - } - } - private const string RegistrationName = "Hash"; internal const string Summary = "Converts column values into hashes. This transform accepts text and keys as inputs. 
It works on single- and vector-valued columns, " @@ -203,7 +133,7 @@ private static VersionInfo GetVersionInfo() loaderAssemblyName: typeof(HashingTransformer).Assembly.FullName); } - private readonly ColumnInfo[] _columns; + private readonly HashingEstimator.ColumnInfo[] _columns; private readonly VBuffer>[] _keyValues; private readonly VectorType[] _kvTypes; @@ -214,13 +144,13 @@ protected override void CheckInputColumn(Schema inputSchema, int col, int srcCol throw Host.ExceptParam(nameof(inputSchema), HashingEstimator.ExpectedColumnType); } - private static (string outputColumnName, string inputColumnName)[] GetColumnPairs(ColumnInfo[] columns) + private static (string outputColumnName, string inputColumnName)[] GetColumnPairs(HashingEstimator.ColumnInfo[] columns) { Contracts.CheckNonEmpty(columns, nameof(columns)); return columns.Select(x => (x.Name, x.InputColumnName)).ToArray(); } - private ColumnType GetOutputType(Schema inputSchema, ColumnInfo column) + private ColumnType GetOutputType(Schema inputSchema, HashingEstimator.ColumnInfo column) { var keyCount = (ulong)1 << column.HashBits; inputSchema.TryGetColumnIndex(column.InputColumnName, out int srcCol); @@ -237,7 +167,7 @@ private ColumnType GetOutputType(Schema inputSchema, ColumnInfo column) /// /// Host Environment. /// Description of dataset columns and how to process them. 
- public HashingTransformer(IHostEnvironment env, params ColumnInfo[] columns) : + internal HashingTransformer(IHostEnvironment env, params HashingEstimator.ColumnInfo[] columns) : base(Contracts.CheckRef(env, nameof(env)).Register(RegistrationName), GetColumnPairs(columns)) { _columns = columns.ToArray(); @@ -248,7 +178,7 @@ public HashingTransformer(IHostEnvironment env, params ColumnInfo[] columns) : } } - internal HashingTransformer(IHostEnvironment env, IDataView input, params ColumnInfo[] columns) : + internal HashingTransformer(IHostEnvironment env, IDataView input, params HashingEstimator.ColumnInfo[] columns) : base(Contracts.CheckRef(env, nameof(env)).Register(RegistrationName), GetColumnPairs(columns)) { _columns = columns.ToArray(); @@ -342,9 +272,9 @@ private HashingTransformer(IHost host, ModelLoadContext ctx) : base(host, ctx) { var columnsLength = ColumnPairs.Length; - _columns = new ColumnInfo[columnsLength]; + _columns = new HashingEstimator.ColumnInfo[columnsLength]; for (int i = 0; i < columnsLength; i++) - _columns[i] = new ColumnInfo(ColumnPairs[i].outputColumnName, ColumnPairs[i].inputColumnName, ctx); + _columns[i] = new HashingEstimator.ColumnInfo(ColumnPairs[i].outputColumnName, ColumnPairs[i].inputColumnName, ctx); TextModelHelper.LoadAll(Host, ctx, columnsLength, out _keyValues, out _kvTypes); } @@ -376,25 +306,25 @@ private static IRowMapper Create(IHostEnvironment env, ModelLoadContext ctx, Sch => Create(env, ctx).MakeRowMapper(inputSchema); // Factory method for SignatureDataTransform. 
- private static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input) + private static IDataTransform Create(IHostEnvironment env, Options options, IDataView input) { Contracts.CheckValue(env, nameof(env)); - env.CheckValue(args, nameof(args)); + env.CheckValue(options, nameof(options)); env.CheckValue(input, nameof(input)); - env.CheckValue(args.Columns, nameof(args.Columns)); - var cols = new ColumnInfo[args.Columns.Length]; + env.CheckValue(options.Columns, nameof(options.Columns)); + var cols = new HashingEstimator.ColumnInfo[options.Columns.Length]; for (int i = 0; i < cols.Length; i++) { - var item = args.Columns[i]; - var kind = item.InvertHash ?? args.InvertHash; - cols[i] = new ColumnInfo( + var item = options.Columns[i]; + var kind = item.InvertHash ?? options.InvertHash; + cols[i] = new HashingEstimator.ColumnInfo( item.Name, item.Source ?? item.Name, - item.HashBits ?? args.HashBits, - item.Seed ?? args.Seed, - item.Ordered ?? args.Ordered, - item.InvertHash ?? args.InvertHash); + item.HashBits ?? options.HashBits, + item.Seed ?? options.Seed, + item.Ordered ?? options.Ordered, + item.InvertHash ?? options.InvertHash); }; return new HashingTransformer(env, input, cols).MakeDataTransform(input); } @@ -914,11 +844,11 @@ private abstract class InvertHashHelper { protected readonly Row Row; private readonly bool _includeSlot; - private readonly ColumnInfo _ex; + private readonly HashingEstimator.ColumnInfo _ex; private readonly ColumnType _srcType; private readonly int _srcCol; - private InvertHashHelper(Row row, ColumnInfo ex) + private InvertHashHelper(Row row, HashingEstimator.ColumnInfo ex) { Contracts.AssertValue(row); Row = row; @@ -939,7 +869,7 @@ private InvertHashHelper(Row row, ColumnInfo ex) /// The extra column info /// The number of input hashed valuPres to accumulate per output hash value /// A hash getter, built on top of . 
- public static InvertHashHelper Create(Row row, ColumnInfo ex, int invertHashMaxCount, Delegate dstGetter) + public static InvertHashHelper Create(Row row, HashingEstimator.ColumnInfo ex, int invertHashMaxCount, Delegate dstGetter) { row.Schema.TryGetColumnIndex(ex.InputColumnName, out int srcCol); ColumnType typeSrc = row.Schema[srcCol].Type; @@ -950,7 +880,7 @@ public static InvertHashHelper Create(Row row, ColumnInfo ex, int invertHashMaxC t = t.MakeGenericType(itemType.RawType); - var consTypes = new Type[] { typeof(Row), typeof(ColumnInfo), typeof(int), typeof(Delegate) }; + var consTypes = new Type[] { typeof(Row), typeof(HashingEstimator.ColumnInfo), typeof(int), typeof(Delegate) }; var constructorInfo = t.GetConstructor(consTypes); return (InvertHashHelper)constructorInfo.Invoke(new object[] { row, ex, invertHashMaxCount, dstGetter }); } @@ -1027,7 +957,7 @@ private abstract class Impl : InvertHashHelper { protected readonly InvertHashCollector Collector; - protected Impl(Row row, ColumnInfo ex, int invertHashMaxCount) + protected Impl(Row row, HashingEstimator.ColumnInfo ex, int invertHashMaxCount) : base(row, ex) { Contracts.AssertValue(row); @@ -1060,7 +990,7 @@ private sealed class ImplOne : Impl private T _value; private uint _hash; - public ImplOne(Row row, ColumnInfo ex, int invertHashMaxCount, Delegate dstGetter) + public ImplOne(Row row, HashingEstimator.ColumnInfo ex, int invertHashMaxCount, Delegate dstGetter) : base(row, ex, invertHashMaxCount) { _srcGetter = Row.GetGetter(_srcCol); @@ -1094,7 +1024,7 @@ private sealed class ImplVec : Impl private VBuffer _value; private VBuffer _hash; - public ImplVec(Row row, ColumnInfo ex, int invertHashMaxCount, Delegate dstGetter) + public ImplVec(Row row, HashingEstimator.ColumnInfo ex, int invertHashMaxCount, Delegate dstGetter) : base(row, ex, invertHashMaxCount) { _srcGetter = Row.GetGetter>(_srcCol); @@ -1128,7 +1058,7 @@ private sealed class ImplVecOrdered : Impl> private VBuffer _value; private 
VBuffer _hash; - public ImplVecOrdered(Row row, ColumnInfo ex, int invertHashMaxCount, Delegate dstGetter) + public ImplVecOrdered(Row row, HashingEstimator.ColumnInfo ex, int invertHashMaxCount, Delegate dstGetter) : base(row, ex, invertHashMaxCount) { _srcGetter = Row.GetGetter>(_srcCol); @@ -1183,7 +1113,77 @@ public sealed class HashingEstimator : IEstimator internal const int NumBitsMin = 1; internal const int NumBitsLim = 32; - internal static class Defaults + public sealed class ColumnInfo + { + public readonly string Name; + public readonly string InputColumnName; + public readonly int HashBits; + public readonly uint Seed; + public readonly bool Ordered; + public readonly int InvertHash; + + /// + /// Describes how the transformer handles one column pair. + /// + /// Name of the column resulting from the transformation of . + /// Name of column to transform. If set to , the value of the will be used as source. + /// Number of bits to hash into. Must be between 1 and 31, inclusive. + /// Hashing seed. + /// Whether the position of each term should be included in the hash. + /// During hashing we construct mappings between original values and the produced hash values. + /// Text representations of original values are stored in the slot names of the metadata for the new column. Hashing, as such, can map many initial values to one. + /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. + /// 0 does not retain any input values. -1 retains all input values mapping to each hash. 
+ public ColumnInfo(string name, + string inputColumnName = null, + int hashBits = HashingEstimator.Defaults.HashBits, + uint seed = HashingEstimator.Defaults.Seed, + bool ordered = HashingEstimator.Defaults.Ordered, + int invertHash = HashingEstimator.Defaults.InvertHash) + { + if (invertHash < -1) + throw Contracts.ExceptParam(nameof(invertHash), "Value too small, must be -1 or larger"); + if (invertHash != 0 && hashBits >= 31) + throw Contracts.ExceptParam(nameof(hashBits), "Cannot support invertHash for a {0} bit hash. 30 is the maximum possible.", hashBits); + Contracts.CheckNonWhiteSpace(name, nameof(name)); + Name = name; + InputColumnName = inputColumnName ?? name; + HashBits = hashBits; + Seed = seed; + Ordered = ordered; + InvertHash = invertHash; + } + + internal ColumnInfo(string name, string inputColumnName, ModelLoadContext ctx) + { + Name = name; + InputColumnName = inputColumnName; + // *** Binary format *** + // int: HashBits + // uint: HashSeed + // byte: Ordered + HashBits = ctx.Reader.ReadInt32(); + Contracts.CheckDecode(HashingEstimator.NumBitsMin <= HashBits && HashBits < HashingEstimator.NumBitsLim); + Seed = ctx.Reader.ReadUInt32(); + Ordered = ctx.Reader.ReadBoolByte(); + } + + internal void Save(ModelSaveContext ctx) + { + // *** Binary format *** + // int: HashBits + // uint: HashSeed + // byte: Ordered + + Contracts.Assert(HashingEstimator.NumBitsMin <= HashBits && HashBits < HashingEstimator.NumBitsLim); + ctx.Writer.Write(HashBits); + + ctx.Writer.Write(Seed); + ctx.Writer.WriteBoolByte(Ordered); + } + } + + public static class Defaults { public const int HashBits = NumBitsLim - 1; public const uint Seed = 314489979; @@ -1192,7 +1192,7 @@ internal static class Defaults } private readonly IHost _host; - private readonly HashingTransformer.ColumnInfo[] _columns; + private readonly ColumnInfo[] _columns; internal static bool IsColumnTypeValid(ColumnType type) { @@ -1214,9 +1214,9 @@ internal static bool IsColumnTypeValid(ColumnType type) 
/// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. /// 0 does not retain any input values. -1 retains all input values mapping to each hash. - public HashingEstimator(IHostEnvironment env, string outputColumnName, string inputColumnName = null, + internal HashingEstimator(IHostEnvironment env, string outputColumnName, string inputColumnName = null, int hashBits = Defaults.HashBits, int invertHash = Defaults.InvertHash) - : this(env, new HashingTransformer.ColumnInfo(outputColumnName, inputColumnName ?? outputColumnName, hashBits: hashBits, invertHash: invertHash)) + : this(env, new ColumnInfo(outputColumnName, inputColumnName ?? outputColumnName, hashBits: hashBits, invertHash: invertHash)) { } @@ -1225,15 +1225,23 @@ public HashingEstimator(IHostEnvironment env, string outputColumnName, string in /// /// Host Environment. /// Description of dataset columns and how to process them. - public HashingEstimator(IHostEnvironment env, params HashingTransformer.ColumnInfo[] columns) + [BestFriend] + internal HashingEstimator(IHostEnvironment env, params ColumnInfo[] columns) { Contracts.CheckValue(env, nameof(env)); _host = env.Register(nameof(HashingEstimator)); _columns = columns.ToArray(); } + /// + /// Train and return a transformer. + /// public HashingTransformer Fit(IDataView input) => new HashingTransformer(_host, input, _columns); + /// + /// Returns the of the schema which will be produced by the transformer. + /// Used for schema propagation and verification in a pipeline. 
+ /// public SchemaShape GetOutputSchema(SchemaShape inputSchema) { _host.CheckValue(inputSchema, nameof(inputSchema)); diff --git a/src/Microsoft.ML.Data/Transforms/KeyToVector.cs b/src/Microsoft.ML.Data/Transforms/KeyToVector.cs index 75b0017f1f..5a4098f1e3 100644 --- a/src/Microsoft.ML.Data/Transforms/KeyToVector.cs +++ b/src/Microsoft.ML.Data/Transforms/KeyToVector.cs @@ -18,7 +18,7 @@ using Microsoft.ML.Transforms.Conversions; using Newtonsoft.Json.Linq; -[assembly: LoadableClass(KeyToVectorMappingTransformer.Summary, typeof(IDataTransform), typeof(KeyToVectorMappingTransformer), typeof(KeyToVectorMappingTransformer.Arguments), typeof(SignatureDataTransform), +[assembly: LoadableClass(KeyToVectorMappingTransformer.Summary, typeof(IDataTransform), typeof(KeyToVectorMappingTransformer), typeof(KeyToVectorMappingTransformer.Options), typeof(SignatureDataTransform), "Key To Vector Transform", KeyToVectorMappingTransformer.UserName, "KeyToVector", "ToVector", DocName = "transform/KeyToVectorTransform.md")] [assembly: LoadableClass(KeyToVectorMappingTransformer.Summary, typeof(IDataTransform), typeof(KeyToVectorMappingTransformer), null, typeof(SignatureLoadDataTransform), @@ -34,7 +34,7 @@ namespace Microsoft.ML.Transforms.Conversions { public sealed class KeyToVectorMappingTransformer : OneToOneTransformerBase { - public abstract class ColumnBase : OneToOneColumn + internal abstract class ColumnBase : OneToOneColumn { [Argument(ArgumentType.AtMostOnce, HelpText = "Whether to combine multiple indicator vectors into a single bag vector instead of concatenating them. 
This is only relevant when the input is a vector.")] @@ -62,7 +62,8 @@ private protected override bool TryUnparseCore(StringBuilder sb, string extra) } } - public sealed class Column : ColumnBase + [BestFriend] + internal sealed class Column : ColumnBase { internal static Column Parse(string str) { @@ -80,7 +81,7 @@ internal bool TryUnparse(StringBuilder sb) return TryUnparseCore(sb); } } - public sealed class Arguments + internal sealed class Options { [Argument(ArgumentType.Multiple, HelpText = "New column definition(s) (optional form: name:src)", Name = "Column", ShortName = "col", SortOrder = 1)] @@ -91,36 +92,12 @@ public sealed class Arguments public bool Bag = KeyToVectorMappingEstimator.Defaults.Bag; } - /// - /// Describes how the transformer handles one column pair. - /// - public sealed class ColumnInfo - { - public readonly string Name; - public readonly string InputColumnName; - public readonly bool Bag; - - /// - /// Describes how the transformer handles one column pair. - /// - /// Name of the column resulting from the transformation of . - /// Name of column to transform. If set to , the value of the will be used as source. - /// Whether to combine multiple indicator vectors into a single bag vector instead of concatenating them. This is only relevant when the input column is a vector. - public ColumnInfo(string name, string inputColumnName = null, bool bag = KeyToVectorMappingEstimator.Defaults.Bag) - { - Contracts.CheckNonWhiteSpace(name, nameof(name)); - Name = name; - InputColumnName = inputColumnName ?? 
name; - Bag = bag; - } - } - private const string RegistrationName = "KeyToVector"; - public IReadOnlyCollection Columns => _columns.AsReadOnly(); - private readonly ColumnInfo[] _columns; + public IReadOnlyCollection Columns => _columns.AsReadOnly(); + private readonly KeyToVectorMappingEstimator.ColumnInfo[] _columns; - private static (string outputColumnName, string inputColumnName)[] GetColumnPairs(ColumnInfo[] columns) + private static (string outputColumnName, string inputColumnName)[] GetColumnPairs(KeyToVectorMappingEstimator.ColumnInfo[] columns) { Contracts.CheckValue(columns, nameof(columns)); return columns.Select(x => (x.Name, x.InputColumnName)).ToArray(); @@ -141,7 +118,7 @@ protected override void CheckInputColumn(Schema inputSchema, int col, int srcCol throw Host.ExceptSchemaMismatch(nameof(inputSchema), "input", ColumnPairs[col].inputColumnName, reason, type.ToString()); } - public KeyToVectorMappingTransformer(IHostEnvironment env, params ColumnInfo[] columns) : + internal KeyToVectorMappingTransformer(IHostEnvironment env, params KeyToVectorMappingEstimator.ColumnInfo[] columns) : base(Contracts.CheckRef(env, nameof(env)).Register(RegistrationName), GetColumnPairs(columns)) { _columns = columns.ToArray(); @@ -206,28 +183,28 @@ private KeyToVectorMappingTransformer(IHost host, ModelLoadContext ctx) var bags = new bool[columnsLength]; bags = ctx.Reader.ReadBoolArray(columnsLength); - _columns = new ColumnInfo[columnsLength]; + _columns = new KeyToVectorMappingEstimator.ColumnInfo[columnsLength]; for (int i = 0; i < columnsLength; i++) - _columns[i] = new ColumnInfo(ColumnPairs[i].outputColumnName, ColumnPairs[i].inputColumnName, bags[i]); + _columns[i] = new KeyToVectorMappingEstimator.ColumnInfo(ColumnPairs[i].outputColumnName, ColumnPairs[i].inputColumnName, bags[i]); } // Factory method for SignatureDataTransform. 
- private static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input) + private static IDataTransform Create(IHostEnvironment env, Options options, IDataView input) { Contracts.CheckValue(env, nameof(env)); - env.CheckValue(args, nameof(args)); + env.CheckValue(options, nameof(options)); env.CheckValue(input, nameof(input)); - env.CheckValue(args.Columns, nameof(args.Columns)); - var cols = new ColumnInfo[args.Columns.Length]; + env.CheckValue(options.Columns, nameof(options.Columns)); + var cols = new KeyToVectorMappingEstimator.ColumnInfo[options.Columns.Length]; for (int i = 0; i < cols.Length; i++) { - var item = args.Columns[i]; + var item = options.Columns[i]; - cols[i] = new ColumnInfo( + cols[i] = new KeyToVectorMappingEstimator.ColumnInfo( item.Name, item.Source ?? item.Name, - item.Bag ?? args.Bag); + item.Bag ?? options.Bag); }; return new KeyToVectorMappingTransformer(env, cols).MakeDataTransform(input); } @@ -745,18 +722,42 @@ private bool SaveAsOnnxCore(OnnxContext ctx, int iinfo, ColInfo info, string src public sealed class KeyToVectorMappingEstimator : TrivialEstimator { - internal static class Defaults + public static class Defaults { public const bool Bag = false; } - public KeyToVectorMappingEstimator(IHostEnvironment env, params KeyToVectorMappingTransformer.ColumnInfo[] columns) + /// + /// Describes how the transformer handles one column pair. + /// + public sealed class ColumnInfo + { + public readonly string Name; + public readonly string InputColumnName; + public readonly bool Bag; + + /// + /// Describes how the transformer handles one column pair. + /// + /// Name of the column resulting from the transformation of . + /// Name of column to transform. If set to , the value of the will be used as source. + /// Whether to combine multiple indicator vectors into a single bag vector instead of concatenating them. This is only relevant when the input column is a vector. 
+ public ColumnInfo(string name, string inputColumnName = null, bool bag = Defaults.Bag) + { + Contracts.CheckNonWhiteSpace(name, nameof(name)); + Name = name; + InputColumnName = inputColumnName ?? name; + Bag = bag; + } + } + + internal KeyToVectorMappingEstimator(IHostEnvironment env, params ColumnInfo[] columns) : this(env, new KeyToVectorMappingTransformer(env, columns)) { } - public KeyToVectorMappingEstimator(IHostEnvironment env, string outputColumnName, string inputColumnName = null, bool bag = Defaults.Bag) - : this(env, new KeyToVectorMappingTransformer(env, new KeyToVectorMappingTransformer.ColumnInfo(outputColumnName, inputColumnName ?? outputColumnName, bag))) + internal KeyToVectorMappingEstimator(IHostEnvironment env, string outputColumnName, string inputColumnName = null, bool bag = Defaults.Bag) + : this(env, new KeyToVectorMappingTransformer(env, new ColumnInfo(outputColumnName, inputColumnName ?? outputColumnName, bag))) { } @@ -765,6 +766,10 @@ private KeyToVectorMappingEstimator(IHostEnvironment env, KeyToVectorMappingTran { } + /// + /// Returns the of the schema which will be produced by the transformer. + /// Used for schema propagation and verification in a pipeline. 
+ /// public override SchemaShape GetOutputSchema(SchemaShape inputSchema) { Host.CheckValue(inputSchema, nameof(inputSchema)); diff --git a/src/Microsoft.ML.EntryPoints/FeatureCombiner.cs b/src/Microsoft.ML.EntryPoints/FeatureCombiner.cs index da5b3dcb7a..1844de7347 100644 --- a/src/Microsoft.ML.EntryPoints/FeatureCombiner.cs +++ b/src/Microsoft.ML.EntryPoints/FeatureCombiner.cs @@ -84,7 +84,7 @@ public static CommonOutputs.TransformOutput PrepareFeatures(IHostEnvironment env } } - private static IDataView ApplyKeyToVec(List ktv, IDataView viewTrain, IHost host) + private static IDataView ApplyKeyToVec(List ktv, IDataView viewTrain, IHost host) { Contracts.AssertValueOrNull(ktv); Contracts.AssertValue(viewTrain); @@ -107,7 +107,7 @@ private static IDataView ApplyKeyToVec(List new KeyToVectorMappingTransformer.ColumnInfo(c.Name, c.Name)).ToArray()).Transform(viewTrain); + viewTrain = new KeyToVectorMappingTransformer(host, ktv.Select(c => new KeyToVectorMappingEstimator.ColumnInfo(c.Name, c.Name)).ToArray()).Transform(viewTrain); } return viewTrain; } @@ -149,14 +149,14 @@ private static IDataView ApplyConvert(List return viewTrain; } - private static List ConvertFeatures(IEnumerable feats, HashSet featNames, List> concatNames, IChannel ch, + private static List ConvertFeatures(IEnumerable feats, HashSet featNames, List> concatNames, IChannel ch, out List cvt, out int errCount) { Contracts.AssertValue(feats); Contracts.AssertValue(featNames); Contracts.AssertValue(concatNames); Contracts.AssertValue(ch); - List ktv = null; + List ktv = null; cvt = null; errCount = 0; foreach (var col in feats) @@ -174,7 +174,7 @@ private static IDataView ApplyConvert(List { var colName = GetUniqueName(); concatNames.Add(new KeyValuePair(col.Name, colName)); - Utils.Add(ref ktv, new KeyToVectorMappingTransformer.ColumnInfo(colName, col.Name)); + Utils.Add(ref ktv, new KeyToVectorMappingEstimator.ColumnInfo(colName, col.Name)); continue; } } diff --git 
a/src/Microsoft.ML.StaticPipe/TransformsStatic.cs b/src/Microsoft.ML.StaticPipe/TransformsStatic.cs index 47e2aef5f1..b575777ba4 100644 --- a/src/Microsoft.ML.StaticPipe/TransformsStatic.cs +++ b/src/Microsoft.ML.StaticPipe/TransformsStatic.cs @@ -578,11 +578,11 @@ public override IEstimator Reconcile(IHostEnvironment env, IReadOnlyDictionary outputNames, IReadOnlyCollection usedNames) { - var infos = new KeyToVectorMappingTransformer.ColumnInfo[toOutput.Length]; + var infos = new KeyToVectorMappingEstimator.ColumnInfo[toOutput.Length]; for (int i = 0; i < toOutput.Length; ++i) { var col = (IColInput)toOutput[i]; - infos[i] = new KeyToVectorMappingTransformer.ColumnInfo(outputNames[toOutput[i]], inputNames[col.Input], col.Bag); + infos[i] = new KeyToVectorMappingEstimator.ColumnInfo(outputNames[toOutput[i]], inputNames[col.Input], col.Bag); } return new KeyToVectorMappingEstimator(env, infos); } diff --git a/src/Microsoft.ML.Transforms/KeyToVectorMapping.cs b/src/Microsoft.ML.Transforms/KeyToVectorMapping.cs index 11166bafb3..9fab277205 100644 --- a/src/Microsoft.ML.Transforms/KeyToVectorMapping.cs +++ b/src/Microsoft.ML.Transforms/KeyToVectorMapping.cs @@ -31,7 +31,7 @@ namespace Microsoft.ML.Transforms.Conversions { public sealed class KeyToBinaryVectorMappingTransformer : OneToOneTransformerBase { - public sealed class Arguments + internal sealed class Arguments { [Argument(ArgumentType.Multiple | ArgumentType.Required, HelpText = "New column definition(s) (optional form: name:src)", Name = "Column", ShortName = "col", SortOrder = 1)] diff --git a/src/Microsoft.ML.Transforms/OneHotEncoding.cs b/src/Microsoft.ML.Transforms/OneHotEncoding.cs index bcfdf0c942..3d4431538f 100644 --- a/src/Microsoft.ML.Transforms/OneHotEncoding.cs +++ b/src/Microsoft.ML.Transforms/OneHotEncoding.cs @@ -262,7 +262,7 @@ internal OneHotEncodingEstimator(IHostEnvironment env, ColumnInfo[] columns, IDa if (binaryCols.Count > 0) toBinVector = new KeyToBinaryVectorMappingEstimator(_host, 
binaryCols.Select(x => new KeyToBinaryVectorMappingTransformer.ColumnInfo(x.outputColumnName, x.inputColumnName)).ToArray()); if (cols.Count > 0) - toVector = new KeyToVectorMappingEstimator(_host, cols.Select(x => new KeyToVectorMappingTransformer.ColumnInfo(x.outputColumnName, x.inputColumnName, x.bag)).ToArray()); + toVector = new KeyToVectorMappingEstimator(_host, cols.Select(x => new KeyToVectorMappingEstimator.ColumnInfo(x.outputColumnName, x.inputColumnName, x.bag)).ToArray()); if (toBinVector != null && toVector != null) _toSomething = toVector.Append(toBinVector); @@ -318,7 +318,7 @@ public static CommonOutputs.TransformOutput CatTransformDict(IHostEnvironment en [TlcModule.EntryPoint(Name = "Transforms.CategoricalHashOneHotVectorizer", Desc = OneHotHashEncoding.Summary, UserName = OneHotHashEncoding.UserName)] - public static CommonOutputs.TransformOutput CatTransformHash(IHostEnvironment env, OneHotHashEncoding.Arguments input) + public static CommonOutputs.TransformOutput CatTransformHash(IHostEnvironment env, OneHotHashEncoding.Options input) { Contracts.CheckValue(env, nameof(env)); var host = env.Register("CatTransformDict"); diff --git a/src/Microsoft.ML.Transforms/OneHotHashEncoding.cs b/src/Microsoft.ML.Transforms/OneHotHashEncoding.cs index c8ebd854d4..8412df8896 100644 --- a/src/Microsoft.ML.Transforms/OneHotHashEncoding.cs +++ b/src/Microsoft.ML.Transforms/OneHotHashEncoding.cs @@ -16,14 +16,14 @@ using Microsoft.ML.Transforms.Categorical; using Microsoft.ML.Transforms.Conversions; -[assembly: LoadableClass(OneHotHashEncoding.Summary, typeof(IDataTransform), typeof(OneHotHashEncoding), typeof(OneHotHashEncoding.Arguments), typeof(SignatureDataTransform), +[assembly: LoadableClass(OneHotHashEncoding.Summary, typeof(IDataTransform), typeof(OneHotHashEncoding), typeof(OneHotHashEncoding.Options), typeof(SignatureDataTransform), OneHotHashEncoding.UserName, "CategoricalHashTransform", "CatHashTransform", "CategoricalHash", "CatHash")] namespace 
Microsoft.ML.Transforms.Categorical { public sealed class OneHotHashEncoding : ITransformer, ICanSaveModel { - public sealed class Column : OneToOneColumn + internal sealed class Column : OneToOneColumn { [Argument(ArgumentType.AtMostOnce, HelpText = "The number of bits to hash into. Must be between 1 and 30, inclusive.", @@ -83,42 +83,33 @@ internal bool TryUnparse(StringBuilder sb) } } - private static class Defaults - { - public const int HashBits = 16; - public const uint Seed = 314489979; - public const bool Ordered = true; - public const int InvertHash = 0; - public const OneHotEncodingTransformer.OutputKind OutputKind = OneHotEncodingTransformer.OutputKind.Bag; - } - /// - /// This class is a merger of and + /// This class is a merger of and /// with join option removed /// - public sealed class Arguments : TransformInputBase + internal sealed class Options : TransformInputBase { [Argument(ArgumentType.Multiple | ArgumentType.Required, HelpText = "New column definition(s) (optional form: name:hashBits:src)", Name = "Column", ShortName = "col", SortOrder = 1)] public Column[] Columns; [Argument(ArgumentType.AtMostOnce, HelpText = "Number of bits to hash into. Must be between 1 and 30, inclusive.", ShortName = "bits", SortOrder = 2)] - public int HashBits = Defaults.HashBits; + public int HashBits = OneHotHashEncodingEstimator.Defaults.HashBits; [Argument(ArgumentType.AtMostOnce, HelpText = "Hashing seed")] - public uint Seed = Defaults.Seed; + public uint Seed = OneHotHashEncodingEstimator.Defaults.Seed; [Argument(ArgumentType.AtMostOnce, HelpText = "Whether the position of each term should be included in the hash", ShortName = "ord")] - public bool Ordered = Defaults.Ordered; + public bool Ordered = OneHotHashEncodingEstimator.Defaults.Ordered; [Argument(ArgumentType.AtMostOnce, HelpText = "Limit the number of keys used to generate the slot name to this many. 
0 means no invert hashing, -1 means no limit.", ShortName = "ih")] - public int InvertHash = Defaults.InvertHash; + public int InvertHash = OneHotHashEncodingEstimator.Defaults.InvertHash; [Argument(ArgumentType.AtMostOnce, HelpText = "Output kind: Bag (multi-set vector), Ind (indicator vector), or Key (index)", ShortName = "kind", SortOrder = 102)] - public OneHotEncodingTransformer.OutputKind OutputKind = Defaults.OutputKind; + public OneHotEncodingTransformer.OutputKind OutputKind = OneHotHashEncodingEstimator.Defaults.OutputKind; } internal const string Summary = "Converts the categorical value into an indicator array by hashing the value and using the hash as an index in the " @@ -139,7 +130,7 @@ public sealed class Arguments : TransformInputBase /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. /// 0 does not retain any input values. -1 retains all input values mapping to each hash. /// The type of output expected. - public static IDataView Create(IHostEnvironment env, + private static IDataView Create(IHostEnvironment env, IDataView input, string name, string source = null, @@ -150,25 +141,25 @@ public static IDataView Create(IHostEnvironment env, return new OneHotHashEncodingEstimator(env, name, source, hashBits, invertHash, outputKind).Fit(input).Transform(input) as IDataView; } - internal static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input) + internal static IDataTransform Create(IHostEnvironment env, Options options, IDataView input) { Contracts.CheckValue(env, nameof(env)); var h = env.Register("Categorical"); - h.CheckValue(args, nameof(args)); + h.CheckValue(options, nameof(options)); h.CheckValue(input, nameof(input)); - h.CheckUserArg(Utils.Size(args.Columns) > 0, nameof(args.Columns)); + h.CheckUserArg(Utils.Size(options.Columns) > 0, nameof(options.Columns)); var columns = new List(); - foreach (var column in args.Columns) + foreach (var column in 
options.Columns) { var col = new OneHotHashEncodingEstimator.ColumnInfo( column.Name, column.Source ?? column.Name, - column.OutputKind ?? args.OutputKind, - column.HashBits ?? args.HashBits, - column.Seed ?? args.Seed, - column.Ordered ?? args.Ordered, - column.InvertHash ?? args.InvertHash); + column.OutputKind ?? options.OutputKind, + column.HashBits ?? options.HashBits, + column.Seed ?? options.Seed, + column.Ordered ?? options.Ordered, + column.InvertHash ?? options.InvertHash); columns.Add(col); } return new OneHotHashEncodingEstimator(env, columns.ToArray()).Fit(input).Transform(input) as IDataTransform; @@ -200,8 +191,7 @@ internal OneHotHashEncoding(HashingEstimator hash, IEstimator keyT /// public sealed class OneHotHashEncodingEstimator : IEstimator { - [BestFriend] - internal static class Defaults + public static class Defaults { public const int HashBits = 16; public const uint Seed = 314489979; @@ -212,7 +202,7 @@ internal static class Defaults public sealed class ColumnInfo { - public readonly HashingTransformer.ColumnInfo HashInfo; + public readonly HashingEstimator.ColumnInfo HashInfo; public readonly OneHotEncodingTransformer.OutputKind OutputKind; /// @@ -235,7 +225,7 @@ public ColumnInfo(string name, string inputColumnName = null, bool ordered = Defaults.Ordered, int invertHash = Defaults.InvertHash) { - HashInfo = new HashingTransformer.ColumnInfo(name, inputColumnName ?? name, hashBits, seed, ordered, invertHash); + HashInfo = new HashingEstimator.ColumnInfo(name, inputColumnName ?? name, hashBits, seed, ordered, invertHash); OutputKind = outputKind; } } @@ -257,7 +247,7 @@ public ColumnInfo(string name, string inputColumnName = null, /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. /// 0 does not retain any input values. -1 retains all input values mapping to each hash. /// The type of output expected. 
- public OneHotHashEncodingEstimator(IHostEnvironment env, + internal OneHotHashEncodingEstimator(IHostEnvironment env, string outputColumnName, string inputColumnName = null, int hashBits = OneHotHashEncodingEstimator.Defaults.HashBits, @@ -267,7 +257,7 @@ public OneHotHashEncodingEstimator(IHostEnvironment env, { } - public OneHotHashEncodingEstimator(IHostEnvironment env, params ColumnInfo[] columns) + internal OneHotHashEncodingEstimator(IHostEnvironment env, params ColumnInfo[] columns) { Contracts.CheckValue(env, nameof(env)); _host = env.Register(nameof(ValueToKeyMappingEstimator)); @@ -304,7 +294,7 @@ public OneHotHashEncodingEstimator(IHostEnvironment env, params ColumnInfo[] col if (binaryCols.Count > 0) toBinVector = new KeyToBinaryVectorMappingEstimator(_host, binaryCols.Select(x => new KeyToBinaryVectorMappingTransformer.ColumnInfo(x.outputColumnName, x.inputColumnName)).ToArray()); if (cols.Count > 0) - toVector = new KeyToVectorMappingEstimator(_host, cols.Select(x => new KeyToVectorMappingTransformer.ColumnInfo(x.outputColumnName, x.inputColumnName, x.bag)).ToArray()); + toVector = new KeyToVectorMappingEstimator(_host, cols.Select(x => new KeyToVectorMappingEstimator.ColumnInfo(x.outputColumnName, x.inputColumnName, x.bag)).ToArray()); if (toBinVector != null && toVector != null) _toSomething = toVector.Append(toBinVector); @@ -318,6 +308,10 @@ public OneHotHashEncodingEstimator(IHostEnvironment env, params ColumnInfo[] col } } + /// + /// Returns the of the schema which will be produced by the transformer. + /// Used for schema propagation and verification in a pipeline. + /// public SchemaShape GetOutputSchema(SchemaShape inputSchema) { if (_toSomething != null) @@ -326,6 +320,9 @@ public SchemaShape GetOutputSchema(SchemaShape inputSchema) return _hash.GetOutputSchema(inputSchema); } + /// + /// Train and return a transformer. 
+ /// public OneHotHashEncoding Fit(IDataView input) => new OneHotHashEncoding(_hash, _toSomething, input); } } diff --git a/src/Microsoft.ML.Transforms/Text/WordHashBagProducingTransform.cs b/src/Microsoft.ML.Transforms/Text/WordHashBagProducingTransform.cs index 49aae725d4..e2ffc70609 100644 --- a/src/Microsoft.ML.Transforms/Text/WordHashBagProducingTransform.cs +++ b/src/Microsoft.ML.Transforms/Text/WordHashBagProducingTransform.cs @@ -245,7 +245,7 @@ internal bool TryUnparse(StringBuilder sb) } /// - /// This class is a merger of and + /// This class is a merger of and /// , with the ordered option, /// the rehashUnigrams option and the allLength option removed. /// @@ -330,7 +330,7 @@ public static IDataTransform Create(IHostEnvironment env, Arguments args, IDataV List termCols = null; if (termLoaderArgs != null) termCols = new List(); - var hashColumns = new List(); + var hashColumns = new List(); var ngramHashColumns = new NgramHashingTransformer.ColumnInfo[args.Columns.Length]; var colCount = args.Columns.Length; @@ -360,7 +360,7 @@ public static IDataTransform Create(IHostEnvironment env, Arguments args, IDataV }); } - hashColumns.Add(new HashingTransformer.ColumnInfo(tmpName, termLoaderArgs == null ? column.Source[isrc] : tmpName, + hashColumns.Add(new HashingEstimator.ColumnInfo(tmpName, termLoaderArgs == null ? column.Source[isrc] : tmpName, 30, column.Seed ?? args.Seed, false, column.InvertHash ?? args.InvertHash)); } diff --git a/test/Microsoft.ML.Benchmarks/HashBench.cs b/test/Microsoft.ML.Benchmarks/HashBench.cs index 06461276fd..429e9d7201 100644 --- a/test/Microsoft.ML.Benchmarks/HashBench.cs +++ b/test/Microsoft.ML.Benchmarks/HashBench.cs @@ -73,7 +73,7 @@ private void InitMap(T val, ColumnType type, int hashBits = 20, ValueGetter dst = val; _inRow = RowImpl.Create(type, getter); // One million features is a nice, typical number. 
- var info = new HashingTransformer.ColumnInfo("Bar", "Foo", hashBits: hashBits); + var info = new HashingEstimator.ColumnInfo("Bar", "Foo", hashBits: hashBits); var xf = new HashingTransformer(_env, new[] { info }); var mapper = xf.GetRowToRowMapper(_inRow.Schema); var column = mapper.OutputSchema["Bar"]; diff --git a/test/Microsoft.ML.Tests/Transformers/CategoricalHashTests.cs b/test/Microsoft.ML.Tests/Transformers/CategoricalHashTests.cs index b41c12313d..4e96f9e519 100644 --- a/test/Microsoft.ML.Tests/Transformers/CategoricalHashTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/CategoricalHashTests.cs @@ -51,7 +51,7 @@ public void CategoricalHashWorkout() var data = new[] { new TestClass() { A = "1", B = "2", C = "3", }, new TestClass() { A = "4", B = "5", C = "6" } }; var dataView = ML.Data.ReadFromEnumerable(data); - var pipe = new OneHotHashEncodingEstimator(Env, new[]{ + var pipe = ML.Transforms.Categorical.OneHotHashEncoding(new[]{ new OneHotHashEncodingEstimator.ColumnInfo("CatA", "A", OneHotEncodingTransformer.OutputKind.Bag), new OneHotHashEncodingEstimator.ColumnInfo("CatB", "A", OneHotEncodingTransformer.OutputKind.Bin), new OneHotHashEncodingEstimator.ColumnInfo("CatC", "A", OneHotEncodingTransformer.OutputKind.Ind), @@ -113,7 +113,7 @@ public void TestMetadataPropagation() new TestMeta() { A = new string[2] { "A", "B"}, B = "C", C =new float[2] { 5.0f,6.0f}, D = 1.0f , E= new string[2]{"D","E"}, F="D"} }; var dataView = ML.Data.ReadFromEnumerable(data); - var bagPipe = new OneHotHashEncodingEstimator(Env, + var bagPipe = ML.Transforms.Categorical.OneHotHashEncoding( new OneHotHashEncodingEstimator.ColumnInfo("CatA", "A", OneHotEncodingTransformer.OutputKind.Bag, invertHash: -1), new OneHotHashEncodingEstimator.ColumnInfo("CatB", "B", OneHotEncodingTransformer.OutputKind.Bag, invertHash: -1), new OneHotHashEncodingEstimator.ColumnInfo("CatC", "C", OneHotEncodingTransformer.OutputKind.Bag, invertHash: -1), @@ -217,7 +217,7 @@ public void 
TestOldSavingAndLoading() { var data = new[] { new TestClass() { A = "1", B = "2", C = "3", }, new TestClass() { A = "4", B = "5", C = "6" } }; var dataView = ML.Data.ReadFromEnumerable(data); - var pipe = new OneHotHashEncodingEstimator(Env, new[]{ + var pipe = ML.Transforms.Categorical.OneHotHashEncoding(new[]{ new OneHotHashEncodingEstimator.ColumnInfo("CatHashA", "A"), new OneHotHashEncodingEstimator.ColumnInfo("CatHashB", "B"), new OneHotHashEncodingEstimator.ColumnInfo("CatHashC", "C") diff --git a/test/Microsoft.ML.Tests/Transformers/HashTests.cs b/test/Microsoft.ML.Tests/Transformers/HashTests.cs index edb2e858f9..0f0de4cd3f 100644 --- a/test/Microsoft.ML.Tests/Transformers/HashTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/HashTests.cs @@ -47,10 +47,10 @@ public void HashWorkout() var dataView = ML.Data.ReadFromEnumerable(data); var pipe = new HashingEstimator(Env, new[]{ - new HashingTransformer.ColumnInfo("HashA", "A", hashBits:4, invertHash:-1), - new HashingTransformer.ColumnInfo("HashB", "B", hashBits:3, ordered:true), - new HashingTransformer.ColumnInfo("HashC", "C", seed:42), - new HashingTransformer.ColumnInfo("HashD", "A"), + new HashingEstimator.ColumnInfo("HashA", "A", hashBits:4, invertHash:-1), + new HashingEstimator.ColumnInfo("HashB", "B", hashBits:3, ordered:true), + new HashingEstimator.ColumnInfo("HashC", "C", seed:42), + new HashingEstimator.ColumnInfo("HashD", "A"), }); TestEstimatorCore(pipe, dataView); @@ -69,9 +69,9 @@ public void TestMetadata() var dataView = ML.Data.ReadFromEnumerable(data); var pipe = new HashingEstimator(Env, new[] { - new HashingTransformer.ColumnInfo("HashA", "A", invertHash:1, hashBits:10), - new HashingTransformer.ColumnInfo("HashAUnlim", "A", invertHash:-1, hashBits:10), - new HashingTransformer.ColumnInfo("HashAUnlimOrdered", "A", invertHash:-1, hashBits:10, ordered:true) + new HashingEstimator.ColumnInfo("HashA", "A", invertHash:1, hashBits:10), + new HashingEstimator.ColumnInfo("HashAUnlim", "A", 
invertHash:-1, hashBits:10), + new HashingEstimator.ColumnInfo("HashAUnlimOrdered", "A", invertHash:-1, hashBits:10, ordered:true) }); var result = pipe.Fit(dataView).Transform(dataView); ValidateMetadata(result); @@ -109,10 +109,10 @@ public void TestOldSavingAndLoading() var data = new[] { new TestClass() { A = 1, B = 2, C = 3, }, new TestClass() { A = 4, B = 5, C = 6 } }; var dataView = ML.Data.ReadFromEnumerable(data); var pipe = new HashingEstimator(Env, new[]{ - new HashingTransformer.ColumnInfo("HashA", "A", hashBits:4, invertHash:-1), - new HashingTransformer.ColumnInfo("HashB", "B", hashBits:3, ordered:true), - new HashingTransformer.ColumnInfo("HashC", "C", seed:42), - new HashingTransformer.ColumnInfo("HashD" ,"A"), + new HashingEstimator.ColumnInfo("HashA", "A", hashBits:4, invertHash:-1), + new HashingEstimator.ColumnInfo("HashB", "B", hashBits:3, ordered:true), + new HashingEstimator.ColumnInfo("HashC", "C", seed:42), + new HashingEstimator.ColumnInfo("HashD" ,"A"), }); var result = pipe.Fit(dataView).Transform(dataView); var resultRoles = new RoleMappedData(result); @@ -133,7 +133,7 @@ private void HashTestCore(T val, PrimitiveType type, uint expected, uint expe var inRow = MetadataUtils.MetadataAsRow(builder.GetMetadata()); // First do an unordered hash. - var info = new HashingTransformer.ColumnInfo("Bar", "Foo", hashBits: bits); + var info = new HashingEstimator.ColumnInfo("Bar", "Foo", hashBits: bits); var xf = new HashingTransformer(Env, new[] { info }); var mapper = xf.GetRowToRowMapper(inRow.Schema); mapper.OutputSchema.TryGetColumnIndex("Bar", out int outCol); @@ -145,7 +145,7 @@ private void HashTestCore(T val, PrimitiveType type, uint expected, uint expe Assert.Equal(expected, result); // Next do an ordered hash. 
- info = new HashingTransformer.ColumnInfo("Bar", "Foo", hashBits: bits, ordered: true); + info = new HashingEstimator.ColumnInfo("Bar", "Foo", hashBits: bits, ordered: true); xf = new HashingTransformer(Env, new[] { info }); mapper = xf.GetRowToRowMapper(inRow.Schema); mapper.OutputSchema.TryGetColumnIndex("Bar", out outCol); @@ -163,7 +163,7 @@ private void HashTestCore(T val, PrimitiveType type, uint expected, uint expe builder.Add("Foo", new VectorType(type, vecLen), (ref VBuffer dst) => denseVec.CopyTo(ref dst)); inRow = MetadataUtils.MetadataAsRow(builder.GetMetadata()); - info = new HashingTransformer.ColumnInfo("Bar", "Foo", hashBits: bits, ordered: false); + info = new HashingEstimator.ColumnInfo("Bar", "Foo", hashBits: bits, ordered: false); xf = new HashingTransformer(Env, new[] { info }); mapper = xf.GetRowToRowMapper(inRow.Schema); mapper.OutputSchema.TryGetColumnIndex("Bar", out outCol); @@ -178,7 +178,7 @@ private void HashTestCore(T val, PrimitiveType type, uint expected, uint expe Assert.All(vecResult.DenseValues(), v => Assert.Equal(expected, v)); // Now do ordered with the dense vector. 
- info = new HashingTransformer.ColumnInfo("Bar", "Foo", hashBits: bits, ordered: true); + info = new HashingEstimator.ColumnInfo("Bar", "Foo", hashBits: bits, ordered: true); xf = new HashingTransformer(Env, new[] { info }); mapper = xf.GetRowToRowMapper(inRow.Schema); mapper.OutputSchema.TryGetColumnIndex("Bar", out outCol); @@ -197,7 +197,7 @@ private void HashTestCore(T val, PrimitiveType type, uint expected, uint expe builder.Add("Foo", new VectorType(type, vecLen), (ref VBuffer dst) => sparseVec.CopyTo(ref dst)); inRow = MetadataUtils.MetadataAsRow(builder.GetMetadata()); - info = new HashingTransformer.ColumnInfo("Bar", "Foo", hashBits: bits, ordered: false); + info = new HashingEstimator.ColumnInfo("Bar", "Foo", hashBits: bits, ordered: false); xf = new HashingTransformer(Env, new[] { info }); mapper = xf.GetRowToRowMapper(inRow.Schema); mapper.OutputSchema.TryGetColumnIndex("Bar", out outCol); @@ -210,7 +210,7 @@ private void HashTestCore(T val, PrimitiveType type, uint expected, uint expe Assert.Equal(expected, vecResult.GetItemOrDefault(3)); Assert.Equal(expected, vecResult.GetItemOrDefault(7)); - info = new HashingTransformer.ColumnInfo("Bar", "Foo", hashBits: bits, ordered: true); + info = new HashingEstimator.ColumnInfo("Bar", "Foo", hashBits: bits, ordered: true); xf = new HashingTransformer(Env, new[] { info }); mapper = xf.GetRowToRowMapper(inRow.Schema); mapper.OutputSchema.TryGetColumnIndex("Bar", out outCol); diff --git a/test/Microsoft.ML.Tests/Transformers/KeyToVectorEstimatorTests.cs b/test/Microsoft.ML.Tests/Transformers/KeyToVectorEstimatorTests.cs index ab9dc36206..7dc65e2265 100644 --- a/test/Microsoft.ML.Tests/Transformers/KeyToVectorEstimatorTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/KeyToVectorEstimatorTests.cs @@ -58,10 +58,10 @@ public void KeyToVectorWorkout() new ValueToKeyMappingEstimator.ColumnInfo("TermC", "C", textKeyValues:true) }).Fit(dataView).Transform(dataView); - var pipe = new KeyToVectorMappingEstimator(Env, 
new KeyToVectorMappingTransformer.ColumnInfo("CatA", "TermA", false), - new KeyToVectorMappingTransformer.ColumnInfo("CatB", "TermB", true), - new KeyToVectorMappingTransformer.ColumnInfo("CatC", "TermC", true), - new KeyToVectorMappingTransformer.ColumnInfo("CatCNonBag", "TermC", false)); + var pipe = new KeyToVectorMappingEstimator(Env, new KeyToVectorMappingEstimator.ColumnInfo("CatA", "TermA", false), + new KeyToVectorMappingEstimator.ColumnInfo("CatB", "TermB", true), + new KeyToVectorMappingEstimator.ColumnInfo("CatC", "TermC", true), + new KeyToVectorMappingEstimator.ColumnInfo("CatCNonBag", "TermC", false)); TestEstimatorCore(pipe, dataView); Done(); } @@ -122,14 +122,14 @@ public void TestMetadataPropagation() dataView = termTransformer.Transform(dataView); var pipe = new KeyToVectorMappingEstimator(Env, - new KeyToVectorMappingTransformer.ColumnInfo("CatA", "TA", true), - new KeyToVectorMappingTransformer.ColumnInfo("CatB", "TB", false), - new KeyToVectorMappingTransformer.ColumnInfo("CatC", "TC", false), - new KeyToVectorMappingTransformer.ColumnInfo("CatD", "TD", true), - new KeyToVectorMappingTransformer.ColumnInfo("CatE", "TE", false), - new KeyToVectorMappingTransformer.ColumnInfo("CatF", "TF", true), - new KeyToVectorMappingTransformer.ColumnInfo("CatG", "TG", true), - new KeyToVectorMappingTransformer.ColumnInfo("CatH", "TH", false) + new KeyToVectorMappingEstimator.ColumnInfo("CatA", "TA", true), + new KeyToVectorMappingEstimator.ColumnInfo("CatB", "TB", false), + new KeyToVectorMappingEstimator.ColumnInfo("CatC", "TC", false), + new KeyToVectorMappingEstimator.ColumnInfo("CatD", "TD", true), + new KeyToVectorMappingEstimator.ColumnInfo("CatE", "TE", false), + new KeyToVectorMappingEstimator.ColumnInfo("CatF", "TF", true), + new KeyToVectorMappingEstimator.ColumnInfo("CatG", "TG", true), + new KeyToVectorMappingEstimator.ColumnInfo("CatH", "TH", false) ); var result = pipe.Fit(dataView).Transform(dataView); @@ -222,8 +222,8 @@ public void 
TestOldSavingAndLoading() var transformer = est.Fit(dataView); dataView = transformer.Transform(dataView); var pipe = new KeyToVectorMappingEstimator(Env, - new KeyToVectorMappingTransformer.ColumnInfo("CatA", "TermA",false), - new KeyToVectorMappingTransformer.ColumnInfo("CatB", "TermB", true) + new KeyToVectorMappingEstimator.ColumnInfo("CatA", "TermA",false), + new KeyToVectorMappingEstimator.ColumnInfo("CatB", "TermB", true) ); var result = pipe.Fit(dataView).Transform(dataView); var resultRoles = new RoleMappedData(result); From 42fc7808dd9887b3827848c3e3a9f6e678f2a85d Mon Sep 17 00:00:00 2001 From: Artidoro Pagnoni Date: Fri, 1 Feb 2019 18:03:04 -0800 Subject: [PATCH 2/5] further cleaning and fix build with bestfriends assemblies --- .../Properties/AssemblyInfo.cs | 4 ++ .../Transforms/ColumnCopying.cs | 7 +++ src/Microsoft.ML.Data/Transforms/Hashing.cs | 53 ++++++++++++++----- .../Transforms/KeyToVector.cs | 15 ++++++ src/Microsoft.ML.Transforms/OneHotEncoding.cs | 8 +-- .../OneHotHashEncoding.cs | 42 +++++++++++---- .../DataPipe/TestDataPipe.cs | 4 +- 7 files changed, 102 insertions(+), 31 deletions(-) diff --git a/src/Microsoft.ML.Data/Properties/AssemblyInfo.cs b/src/Microsoft.ML.Data/Properties/AssemblyInfo.cs index 235c709af1..40729e6894 100644 --- a/src/Microsoft.ML.Data/Properties/AssemblyInfo.cs +++ b/src/Microsoft.ML.Data/Properties/AssemblyInfo.cs @@ -39,6 +39,10 @@ [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.TensorFlow" + PublicKey.Value)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.TimeSeries" + PublicKey.Value)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.Transforms" + PublicKey.Value)] +[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.DnnImageFeaturizer.AlexNet" + PublicKey.Value)] +[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.DnnImageFeaturizer.ResNet101" + PublicKey.Value)] +[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.DnnImageFeaturizer.ResNet18" + PublicKey.Value)] 
+[assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.DnnImageFeaturizer.ResNet50" + PublicKey.Value)] [assembly: InternalsVisibleTo(assemblyName: "Microsoft.ML.StaticPipe" + PublicKey.Value)] diff --git a/src/Microsoft.ML.Data/Transforms/ColumnCopying.cs b/src/Microsoft.ML.Data/Transforms/ColumnCopying.cs index b8fdc64089..2928922f06 100644 --- a/src/Microsoft.ML.Data/Transforms/ColumnCopying.cs +++ b/src/Microsoft.ML.Data/Transforms/ColumnCopying.cs @@ -33,6 +33,9 @@ namespace Microsoft.ML.Transforms { + /// + /// copies the input column to another column named as specified in the parameters of the transformation. + /// public sealed class ColumnCopyingEstimator : TrivialEstimator { [BestFriend] @@ -41,6 +44,7 @@ internal ColumnCopyingEstimator(IHostEnvironment env, string outputColumnName, s { } + [BestFriend] internal ColumnCopyingEstimator(IHostEnvironment env, params (string outputColumnName, string inputColumnName)[] columns) : base(Contracts.CheckRef(env, nameof(env)).Register(nameof(ColumnCopyingEstimator)), new ColumnCopyingTransformer(env, columns)) { @@ -74,6 +78,9 @@ public sealed class ColumnCopyingTransformer : OneToOneTransformerBase internal const string UserName = "Copy Columns Transform"; internal const string ShortName = "Copy"; + /// + /// Names of input and output column pairs on which the transformation is applied. + /// public IReadOnlyCollection<(string outputColumnName, string inputColumnName)> Columns => ColumnPairs.AsReadOnly(); private static VersionInfo GetVersionInfo() diff --git a/src/Microsoft.ML.Data/Transforms/Hashing.cs b/src/Microsoft.ML.Data/Transforms/Hashing.cs index 3259f4d298..8c42042b3f 100644 --- a/src/Microsoft.ML.Data/Transforms/Hashing.cs +++ b/src/Microsoft.ML.Data/Transforms/Hashing.cs @@ -1106,20 +1106,53 @@ public override void Process() } /// - /// Estimator for + /// Estimator for which can hash either single valued columns or vector columns. For vector columns, + /// it hashes each slot separately. 
It can hash either text values or key values. /// public sealed class HashingEstimator : IEstimator { internal const int NumBitsMin = 1; internal const int NumBitsLim = 32; + internal static class Defaults + { + public const int HashBits = NumBitsLim - 1; + public const uint Seed = 314489979; + public const bool Ordered = false; + public const int InvertHash = 0; + } + + /// + /// Describes how the transformer handles one column pair. + /// public sealed class ColumnInfo { + /// + /// Name of the column resulting from the transformation of . + /// public readonly string Name; + /// + /// Name of column to transform. If set to , the value of the will be used as source. + /// public readonly string InputColumnName; + /// + /// Number of bits to hash into. Must be between 1 and 31, inclusive. + /// public readonly int HashBits; + /// + /// Hashing seed. + /// public readonly uint Seed; + /// + /// Whether the position of each term should be included in the hash. + /// public readonly bool Ordered; + /// + /// During hashing we construct mappings between original values and the produced hash values. + /// Text representations of original values are stored in the slot names of the metadata for the new column. Hashing, as such, can map many initial values to one. + /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. + /// 0 does not retain any input values. -1 retains all input values mapping to each hash. + /// public readonly int InvertHash; /// @@ -1136,10 +1169,10 @@ public sealed class ColumnInfo /// 0 does not retain any input values. -1 retains all input values mapping to each hash. 
public ColumnInfo(string name, string inputColumnName = null, - int hashBits = HashingEstimator.Defaults.HashBits, - uint seed = HashingEstimator.Defaults.Seed, - bool ordered = HashingEstimator.Defaults.Ordered, - int invertHash = HashingEstimator.Defaults.InvertHash) + int hashBits = Defaults.HashBits, + uint seed = Defaults.Seed, + bool ordered = Defaults.Ordered, + int invertHash = Defaults.InvertHash) { if (invertHash < -1) throw Contracts.ExceptParam(nameof(invertHash), "Value too small, must be -1 or larger"); @@ -1183,14 +1216,6 @@ internal void Save(ModelSaveContext ctx) } } - public static class Defaults - { - public const int HashBits = NumBitsLim - 1; - public const uint Seed = 314489979; - public const bool Ordered = false; - public const int InvertHash = 0; - } - private readonly IHost _host; private readonly ColumnInfo[] _columns; @@ -1234,7 +1259,7 @@ internal HashingEstimator(IHostEnvironment env, params ColumnInfo[] columns) } /// - /// Train and return a transformer. + /// Trains and returns a . /// public HashingTransformer Fit(IDataView input) => new HashingTransformer(_host, input, _columns); diff --git a/src/Microsoft.ML.Data/Transforms/KeyToVector.cs b/src/Microsoft.ML.Data/Transforms/KeyToVector.cs index 5a4098f1e3..ab4f7a1ed6 100644 --- a/src/Microsoft.ML.Data/Transforms/KeyToVector.cs +++ b/src/Microsoft.ML.Data/Transforms/KeyToVector.cs @@ -32,6 +32,9 @@ namespace Microsoft.ML.Transforms.Conversions { + /// + /// Converts the key types back to their original vectors. + /// public sealed class KeyToVectorMappingTransformer : OneToOneTransformerBase { internal abstract class ColumnBase : OneToOneColumn @@ -720,6 +723,9 @@ private bool SaveAsOnnxCore(OnnxContext ctx, int iinfo, ColInfo info, string src } } + /// + /// Estimator for . Converts the key types back to their original vectors. 
+ /// public sealed class KeyToVectorMappingEstimator : TrivialEstimator { public static class Defaults @@ -732,8 +738,17 @@ public static class Defaults /// public sealed class ColumnInfo { + /// + /// Name of the column resulting from the transformation of . + /// public readonly string Name; + /// + /// Name of column to transform. If set to , the value of the will be used as source. + /// public readonly string InputColumnName; + /// + /// Whether to combine multiple indicator vectors into a single bag vector instead of concatenating them. This is only relevant when the input column is a vector. + /// public readonly bool Bag; /// diff --git a/src/Microsoft.ML.Transforms/OneHotEncoding.cs b/src/Microsoft.ML.Transforms/OneHotEncoding.cs index 3d4431538f..c67c9991bf 100644 --- a/src/Microsoft.ML.Transforms/OneHotEncoding.cs +++ b/src/Microsoft.ML.Transforms/OneHotEncoding.cs @@ -316,16 +316,16 @@ public static CommonOutputs.TransformOutput CatTransformDict(IHostEnvironment en } [TlcModule.EntryPoint(Name = "Transforms.CategoricalHashOneHotVectorizer", - Desc = OneHotHashEncoding.Summary, - UserName = OneHotHashEncoding.UserName)] - public static CommonOutputs.TransformOutput CatTransformHash(IHostEnvironment env, OneHotHashEncoding.Options input) + Desc = OneHotHashEncodingTransformer.Summary, + UserName = OneHotHashEncodingTransformer.UserName)] + public static CommonOutputs.TransformOutput CatTransformHash(IHostEnvironment env, OneHotHashEncodingTransformer.Options input) { Contracts.CheckValue(env, nameof(env)); var host = env.Register("CatTransformDict"); host.CheckValue(input, nameof(input)); EntryPointUtils.CheckInputArgs(host, input); - var xf = OneHotHashEncoding.Create(host, input, input.Data); + var xf = OneHotHashEncodingTransformer.Create(host, input, input.Data); return new CommonOutputs.TransformOutput { Model = new TransformModelImpl(env, xf, input.Data), OutputData = xf }; } diff --git a/src/Microsoft.ML.Transforms/OneHotHashEncoding.cs 
b/src/Microsoft.ML.Transforms/OneHotHashEncoding.cs index 8412df8896..760a041934 100644 --- a/src/Microsoft.ML.Transforms/OneHotHashEncoding.cs +++ b/src/Microsoft.ML.Transforms/OneHotHashEncoding.cs @@ -16,12 +16,15 @@ using Microsoft.ML.Transforms.Categorical; using Microsoft.ML.Transforms.Conversions; -[assembly: LoadableClass(OneHotHashEncoding.Summary, typeof(IDataTransform), typeof(OneHotHashEncoding), typeof(OneHotHashEncoding.Options), typeof(SignatureDataTransform), - OneHotHashEncoding.UserName, "CategoricalHashTransform", "CatHashTransform", "CategoricalHash", "CatHash")] +[assembly: LoadableClass(OneHotHashEncodingTransformer.Summary, typeof(IDataTransform), typeof(OneHotHashEncodingTransformer), typeof(OneHotHashEncodingTransformer.Options), typeof(SignatureDataTransform), + OneHotHashEncodingTransformer.UserName, "CategoricalHashTransform", "CatHashTransform", "CategoricalHash", "CatHash")] namespace Microsoft.ML.Transforms.Categorical { - public sealed class OneHotHashEncoding : ITransformer, ICanSaveModel + /// + /// Produces a column of indicator vectors. The mapping between a value and a corresponding index is done through hashing. + /// + public sealed class OneHotHashEncodingTransformer : ITransformer, ICanSaveModel { internal sealed class Column : OneToOneColumn { @@ -118,7 +121,7 @@ internal sealed class Options : TransformInputBase internal const string UserName = "Categorical Hash Transform"; /// - /// A helper method to create . + /// A helper method to create . /// /// Host Environment. /// Input . This is the output from previous transform or loader. 
@@ -167,31 +170,45 @@ internal static IDataTransform Create(IHostEnvironment env, Options options, IDa private readonly TransformerChain _transformer; - internal OneHotHashEncoding(HashingEstimator hash, IEstimator keyToVector, IDataView input) + internal OneHotHashEncodingTransformer(HashingEstimator hash, IEstimator keyToVector, IDataView input) { if (keyToVector != null) _transformer = hash.Append(keyToVector).Fit(input); else _transformer = new TransformerChain(hash.Fit(input)); } - + /// + /// Schema propagation for transformers. Returns the output schema of the data, if + /// the input schema is like the one provided. + /// public Schema GetOutputSchema(Schema inputSchema) => _transformer.GetOutputSchema(inputSchema); + /// + /// Take the data in, make transformations, output the data. Note that + /// are lazy, so no actual transformations happen here, just schema validation. + /// public IDataView Transform(IDataView input) => _transformer.Transform(input); public void Save(ModelSaveContext ctx) => _transformer.Save(ctx); + /// + /// Whether a call to should succeed, on an appropriate schema. + /// public bool IsRowToRowMapper => _transformer.IsRowToRowMapper; + /// + /// Constructs a row-to-row mapper based on an input schema. + /// public IRowToRowMapper GetRowToRowMapper(Schema inputSchema) => _transformer.GetRowToRowMapper(inputSchema); } /// - /// Estimator which takes set of columns and produce for each column indicator array. Use hashing to determine indicator position. + /// Estimator that produces a column of indicator vectors. The mapping between a value and a corresponding index is done through hashing. 
/// - public sealed class OneHotHashEncodingEstimator : IEstimator + public sealed class OneHotHashEncodingEstimator : IEstimator { - public static class Defaults + [BestFriend] + internal static class Defaults { public const int HashBits = 16; public const uint Seed = 314489979; @@ -200,6 +217,9 @@ public static class Defaults public const OneHotEncodingTransformer.OutputKind OutputKind = OneHotEncodingTransformer.OutputKind.Bag; } + /// + /// Describes how the transformer handles one column pair. + /// public sealed class ColumnInfo { public readonly HashingEstimator.ColumnInfo HashInfo; @@ -321,8 +341,8 @@ public SchemaShape GetOutputSchema(SchemaShape inputSchema) } /// - /// Train and return a transformer. + /// Trains and returns a . /// - public OneHotHashEncoding Fit(IDataView input) => new OneHotHashEncoding(_hash, _toSomething, input); + public OneHotHashEncodingTransformer Fit(IDataView input) => new OneHotHashEncodingTransformer(_hash, _toSomething, input); } } diff --git a/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipe.cs b/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipe.cs index 95af6fbd73..d089153764 100644 --- a/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipe.cs +++ b/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipe.cs @@ -1104,7 +1104,7 @@ private void TestHashTransformHelper(T[] data, uint[] results, NumberType typ builder.AddColumn("F1", type, data); var srcView = builder.GetDataView(); - var hashTransform = new HashingTransformer(Env, new HashingTransformer.ColumnInfo("F1", "F1", 5, 42)).Transform(srcView); + var hashTransform = new HashingTransformer(Env, new HashingEstimator.ColumnInfo("F1", "F1", 5, 42)).Transform(srcView); using (var cursor = hashTransform.GetRowCursorForAllColumns()) { var resultGetter = cursor.GetGetter(1); @@ -1135,7 +1135,7 @@ private void TestHashTransformVectorHelper(VBuffer data, uint[][] results, private void TestHashTransformVectorHelper(ArrayDataViewBuilder builder, uint[][] results) { var 
srcView = builder.GetDataView(); - var hashTransform = new HashingTransformer(Env, new HashingTransformer.ColumnInfo("F1V", "F1V", 5, 42)).Transform(srcView); + var hashTransform = new HashingTransformer(Env, new HashingEstimator.ColumnInfo("F1V", "F1V", 5, 42)).Transform(srcView); using (var cursor = hashTransform.GetRowCursorForAllColumns()) { var resultGetter = cursor.GetGetter>(1); From 80c51adc4eb9fdad51a515dbdabd8ad0205c2996 Mon Sep 17 00:00:00 2001 From: Artidoro Pagnoni Date: Sat, 2 Feb 2019 16:57:25 -0800 Subject: [PATCH 3/5] entrypoint catalog --- test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv b/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv index 8525c932d4..3618d6ea16 100644 --- a/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv +++ b/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv @@ -73,11 +73,11 @@ Trainers.SymSgdBinaryClassifier Train a symbolic SGD. Microsoft.ML.Trainers.SymS Transforms.ApproximateBootstrapSampler Approximate bootstrap sampling. Microsoft.ML.Transforms.BootstrapSample GetSample Microsoft.ML.Transforms.BootstrapSamplingTransformer+Arguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput Transforms.BinaryPredictionScoreColumnsRenamer For binary prediction, it renames the PredictedLabel and Score columns to include the name of the positive class. Microsoft.ML.EntryPoints.ScoreModel RenameBinaryPredictionScoreColumns Microsoft.ML.EntryPoints.ScoreModel+RenameBinaryPredictionScoreColumnsInput Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput Transforms.BinNormalizer The values are assigned into equidensity bins and a value is mapped to its bin_number/number_of_bins. 
Microsoft.ML.Data.Normalize Bin Microsoft.ML.Transforms.Normalizers.NormalizeTransform+BinArguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput -Transforms.CategoricalHashOneHotVectorizer Converts the categorical value into an indicator array by hashing the value and using the hash as an index in the bag. If the input column is a vector, a single indicator bag is returned for it. Microsoft.ML.Transforms.Categorical.Categorical CatTransformHash Microsoft.ML.Transforms.Categorical.OneHotHashEncoding+Arguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput +Transforms.CategoricalHashOneHotVectorizer Converts the categorical value into an indicator array by hashing the value and using the hash as an index in the bag. If the input column is a vector, a single indicator bag is returned for it. Microsoft.ML.Transforms.Categorical.Categorical CatTransformHash Microsoft.ML.Transforms.Categorical.OneHotHashEncodingTransformer+Options Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput Transforms.CategoricalOneHotVectorizer Converts the categorical value into an indicator array by building a dictionary of categories based on the data and using the id in the dictionary as the index in the array. Microsoft.ML.Transforms.Categorical.Categorical CatTransformDict Microsoft.ML.Transforms.Categorical.OneHotEncodingTransformer+Options Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput Transforms.CharacterTokenizer Character-oriented tokenizer where text is considered a sequence of characters. Microsoft.ML.Transforms.Text.TextAnalytics CharTokenize Microsoft.ML.Transforms.Text.TokenizingByCharactersTransformer+Arguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput Transforms.ColumnConcatenator Concatenates one or more columns of the same item type. 
Microsoft.ML.EntryPoints.SchemaManipulation ConcatColumns Microsoft.ML.Data.ColumnConcatenatingTransformer+Arguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput -Transforms.ColumnCopier Duplicates columns from the dataset Microsoft.ML.EntryPoints.SchemaManipulation CopyColumns Microsoft.ML.Transforms.ColumnCopyingTransformer+Arguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput +Transforms.ColumnCopier Duplicates columns from the dataset Microsoft.ML.EntryPoints.SchemaManipulation CopyColumns Microsoft.ML.Transforms.ColumnCopyingTransformer+Options Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput Transforms.ColumnSelector Selects a set of columns, dropping all others Microsoft.ML.EntryPoints.SchemaManipulation SelectColumns Microsoft.ML.Transforms.ColumnSelectingTransformer+Arguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput Transforms.ColumnTypeConverter Converts a column to a different type, using standard conversions. Microsoft.ML.Transforms.Conversions.TypeConversion Convert Microsoft.ML.Transforms.Conversions.TypeConvertingTransformer+Arguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput Transforms.CombinerByContiguousGroupId Groups values of a scalar column into a vector, by a contiguous group ID Microsoft.ML.Transforms.GroupingOperations Group Microsoft.ML.Transforms.GroupTransform+Arguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput From 3a752e3a737812600d70e7bf27fb7148df080955 Mon Sep 17 00:00:00 2001 From: Artidoro Pagnoni Date: Mon, 4 Feb 2019 16:00:11 -0800 Subject: [PATCH 4/5] review comments --- src/Microsoft.ML.Data/Transforms/ColumnCopying.cs | 2 +- src/Microsoft.ML.Data/Transforms/KeyToVector.cs | 2 +- test/Microsoft.ML.Tests/Transformers/HashTests.cs | 6 +++--- .../Transformers/KeyToVectorEstimatorTests.cs | 6 +++--- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/Microsoft.ML.Data/Transforms/ColumnCopying.cs b/src/Microsoft.ML.Data/Transforms/ColumnCopying.cs index 
2928922f06..7dfdf17d23 100644 --- a/src/Microsoft.ML.Data/Transforms/ColumnCopying.cs +++ b/src/Microsoft.ML.Data/Transforms/ColumnCopying.cs @@ -79,7 +79,7 @@ public sealed class ColumnCopyingTransformer : OneToOneTransformerBase internal const string ShortName = "Copy"; /// - /// Names of input and ouput column pairs on which the transformation is applied. + /// Names of output and input column pairs on which the transformation is applied. /// public IReadOnlyCollection<(string outputColumnName, string inputColumnName)> Columns => ColumnPairs.AsReadOnly(); diff --git a/src/Microsoft.ML.Data/Transforms/KeyToVector.cs b/src/Microsoft.ML.Data/Transforms/KeyToVector.cs index ab4f7a1ed6..1da1a8a07c 100644 --- a/src/Microsoft.ML.Data/Transforms/KeyToVector.cs +++ b/src/Microsoft.ML.Data/Transforms/KeyToVector.cs @@ -728,7 +728,7 @@ private bool SaveAsOnnxCore(OnnxContext ctx, int iinfo, ColInfo info, string src /// public sealed class KeyToVectorMappingEstimator : TrivialEstimator { - public static class Defaults + internal static class Defaults { public const bool Bag = false; } diff --git a/test/Microsoft.ML.Tests/Transformers/HashTests.cs b/test/Microsoft.ML.Tests/Transformers/HashTests.cs index 0f0de4cd3f..1a1688fd84 100644 --- a/test/Microsoft.ML.Tests/Transformers/HashTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/HashTests.cs @@ -46,7 +46,7 @@ public void HashWorkout() var data = new[] { new TestClass() { A = 1, B = 2, C = 3, }, new TestClass() { A = 4, B = 5, C = 6 } }; var dataView = ML.Data.ReadFromEnumerable(data); - var pipe = new HashingEstimator(Env, new[]{ + var pipe = ML.Transforms.Conversion.Hash(new[]{ new HashingEstimator.ColumnInfo("HashA", "A", hashBits:4, invertHash:-1), new HashingEstimator.ColumnInfo("HashB", "B", hashBits:3, ordered:true), new HashingEstimator.ColumnInfo("HashC", "C", seed:42), @@ -68,7 +68,7 @@ public void TestMetadata() var dataView = ML.Data.ReadFromEnumerable(data); - var pipe = new HashingEstimator(Env, new[] { + var 
pipe = ML.Transforms.Conversion.Hash(new[] { new HashingEstimator.ColumnInfo("HashA", "A", invertHash:1, hashBits:10), new HashingEstimator.ColumnInfo("HashAUnlim", "A", invertHash:-1, hashBits:10), new HashingEstimator.ColumnInfo("HashAUnlimOrdered", "A", invertHash:-1, hashBits:10, ordered:true) @@ -108,7 +108,7 @@ public void TestOldSavingAndLoading() { var data = new[] { new TestClass() { A = 1, B = 2, C = 3, }, new TestClass() { A = 4, B = 5, C = 6 } }; var dataView = ML.Data.ReadFromEnumerable(data); - var pipe = new HashingEstimator(Env, new[]{ + var pipe = ML.Transforms.Conversion.Hash(new[]{ new HashingEstimator.ColumnInfo("HashA", "A", hashBits:4, invertHash:-1), new HashingEstimator.ColumnInfo("HashB", "B", hashBits:3, ordered:true), new HashingEstimator.ColumnInfo("HashC", "C", seed:42), diff --git a/test/Microsoft.ML.Tests/Transformers/KeyToVectorEstimatorTests.cs b/test/Microsoft.ML.Tests/Transformers/KeyToVectorEstimatorTests.cs index 7dc65e2265..998d2606b0 100644 --- a/test/Microsoft.ML.Tests/Transformers/KeyToVectorEstimatorTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/KeyToVectorEstimatorTests.cs @@ -58,7 +58,7 @@ public void KeyToVectorWorkout() new ValueToKeyMappingEstimator.ColumnInfo("TermC", "C", textKeyValues:true) }).Fit(dataView).Transform(dataView); - var pipe = new KeyToVectorMappingEstimator(Env, new KeyToVectorMappingEstimator.ColumnInfo("CatA", "TermA", false), + var pipe = ML.Transforms.Conversion.MapKeyToVector(new KeyToVectorMappingEstimator.ColumnInfo("CatA", "TermA", false), new KeyToVectorMappingEstimator.ColumnInfo("CatB", "TermB", true), new KeyToVectorMappingEstimator.ColumnInfo("CatC", "TermC", true), new KeyToVectorMappingEstimator.ColumnInfo("CatCNonBag", "TermC", false)); @@ -121,7 +121,7 @@ public void TestMetadataPropagation() var termTransformer = termEst.Fit(dataView); dataView = termTransformer.Transform(dataView); - var pipe = new KeyToVectorMappingEstimator(Env, + var pipe = 
ML.Transforms.Conversion.MapKeyToVector( new KeyToVectorMappingEstimator.ColumnInfo("CatA", "TA", true), new KeyToVectorMappingEstimator.ColumnInfo("CatB", "TB", false), new KeyToVectorMappingEstimator.ColumnInfo("CatC", "TC", false), @@ -221,7 +221,7 @@ public void TestOldSavingAndLoading() }); var transformer = est.Fit(dataView); dataView = transformer.Transform(dataView); - var pipe = new KeyToVectorMappingEstimator(Env, + var pipe = ML.Transforms.Conversion.MapKeyToVector( new KeyToVectorMappingEstimator.ColumnInfo("CatA", "TermA",false), new KeyToVectorMappingEstimator.ColumnInfo("CatB", "TermB", true) ); From ca2c603ccb9ba00f645b56a486d2e9c2dc4fd273 Mon Sep 17 00:00:00 2001 From: Artidoro Pagnoni Date: Tue, 5 Feb 2019 13:37:09 -0800 Subject: [PATCH 5/5] review comments --- src/Microsoft.ML.Data/Transforms/Hashing.cs | 16 ++++------------ src/Microsoft.ML.Data/Transforms/KeyToVector.cs | 11 ++++------- .../OneHotHashEncoding.cs | 6 +++--- 3 files changed, 11 insertions(+), 22 deletions(-) diff --git a/src/Microsoft.ML.Data/Transforms/Hashing.cs b/src/Microsoft.ML.Data/Transforms/Hashing.cs index 8c42042b3f..f98f4baa47 100644 --- a/src/Microsoft.ML.Data/Transforms/Hashing.cs +++ b/src/Microsoft.ML.Data/Transforms/Hashing.cs @@ -1131,21 +1131,13 @@ public sealed class ColumnInfo /// Name of the column resulting from the transformation of . /// public readonly string Name; - /// - /// Name of column to transform. If set to , the value of the will be used as source. - /// + /// Name of column to transform. public readonly string InputColumnName; - /// - /// Number of bits to hash into. Must be between 1 and 31, inclusive. - /// + /// Number of bits to hash into. Must be between 1 and 31, inclusive. public readonly int HashBits; - /// - /// Hashing seed. - /// + /// Hashing seed. public readonly uint Seed; - /// - /// Whether the position of each term should be included in the hash. - /// + /// Whether the position of each term should be included in the hash. 
public readonly bool Ordered; /// /// During hashing we constuct mappings between original values and the produced hash values. diff --git a/src/Microsoft.ML.Data/Transforms/KeyToVector.cs b/src/Microsoft.ML.Data/Transforms/KeyToVector.cs index 1da1a8a07c..d2c7a5e260 100644 --- a/src/Microsoft.ML.Data/Transforms/KeyToVector.cs +++ b/src/Microsoft.ML.Data/Transforms/KeyToVector.cs @@ -738,16 +738,13 @@ internal static class Defaults /// public sealed class ColumnInfo { - /// - /// Name of the column resulting from the transformation of . - /// + /// Name of the column resulting from the transformation of . public readonly string Name; - /// - /// Name of column to transform. If set to , the value of the will be used as source. - /// + /// Name of column to transform. public readonly string InputColumnName; /// - /// Whether to combine multiple indicator vectors into a single bag vector instead of concatenating them. This is only relevant when the input column is a vector. + /// Whether to combine multiple indicator vectors into a single bag vector instead of concatenating them. + /// This is only relevant when the input column is a vector. /// public readonly bool Bag; diff --git a/src/Microsoft.ML.Transforms/OneHotHashEncoding.cs b/src/Microsoft.ML.Transforms/OneHotHashEncoding.cs index 760a041934..d9c5d880ca 100644 --- a/src/Microsoft.ML.Transforms/OneHotHashEncoding.cs +++ b/src/Microsoft.ML.Transforms/OneHotHashEncoding.cs @@ -255,7 +255,7 @@ public ColumnInfo(string name, string inputColumnName = null, private HashingEstimator _hash; /// - /// A helper method to create for public facing API. + /// Instantiates a new instance of . /// /// Host Environment. /// Name of the column resulting from the transformation of . 
@@ -270,8 +270,8 @@ public ColumnInfo(string name, string inputColumnName = null, internal OneHotHashEncodingEstimator(IHostEnvironment env, string outputColumnName, string inputColumnName = null, - int hashBits = OneHotHashEncodingEstimator.Defaults.HashBits, - int invertHash = OneHotHashEncodingEstimator.Defaults.InvertHash, + int hashBits = Defaults.HashBits, + int invertHash = Defaults.InvertHash, OneHotEncodingTransformer.OutputKind outputKind = Defaults.OutputKind) : this(env, new ColumnInfo(outputColumnName, inputColumnName ?? outputColumnName, outputKind, hashBits, invertHash: invertHash)) {