From 4f581077340c57e34e09e7ed33cde7d959e0fc06 Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Fri, 8 Mar 2019 17:00:56 -0800 Subject: [PATCH 01/12] Scrub n-gram hashing --- .../Text/NgramHashingTransformer.cs | 125 ++++-------------- .../Text/TextCatalog.cs | 85 +++--------- 2 files changed, 46 insertions(+), 164 deletions(-) diff --git a/src/Microsoft.ML.Transforms/Text/NgramHashingTransformer.cs b/src/Microsoft.ML.Transforms/Text/NgramHashingTransformer.cs index eb2e63d85d..86bfdc3e87 100644 --- a/src/Microsoft.ML.Transforms/Text/NgramHashingTransformer.cs +++ b/src/Microsoft.ML.Transforms/Text/NgramHashingTransformer.cs @@ -52,8 +52,8 @@ internal sealed class Column : ManyToOneColumn [Argument(ArgumentType.AtMostOnce, HelpText = "Number of bits to hash into. Must be between 1 and 30, inclusive.", - ShortName = "bits")] - public int? HashBits; + Name = "HashBits", ShortName = "bits")] + public int? NumberOfBits; [Argument(ArgumentType.AtMostOnce, HelpText = "Hashing seed")] public uint? Seed; @@ -91,7 +91,7 @@ private protected override bool TryParse(string str) if (!int.TryParse(extra, out int bits)) return false; - HashBits = bits; + NumberOfBits = bits; return true; } @@ -103,10 +103,10 @@ internal bool TryUnparse(StringBuilder sb) { return false; } - if (HashBits == null) + if (NumberOfBits == null) return TryUnparseCore(sb); - string extra = HashBits.Value.ToString(); + string extra = NumberOfBits.Value.ToString(); return TryUnparseCore(sb, extra); } } @@ -133,8 +133,8 @@ internal sealed class Options [Argument(ArgumentType.AtMostOnce, HelpText = "Number of bits to hash into. Must be between 1 and 30, inclusive.", - ShortName = "bits", SortOrder = 2)] - public int HashBits = NgramHashingEstimator.Defaults.HashBits; + Name = "HashBits", ShortName = "bits", SortOrder = 2)] + public int NumberOfBits = NgramHashingEstimator.Defaults.NumberOfBits; [Argument(ArgumentType.AtMostOnce, HelpText = "Hashing seed")] public uint Seed = NgramHashingEstimator.Defaults.Seed; @@ -353,7 +353,7 @@ private static IDataTransform Create(IHostEnvironment env, Options options, IDat item.NgramLength ?? options.NgramLength, item.SkipLength ?? options.SkipLength, item.AllLengths ?? options.AllLengths, - item.HashBits ?? options.HashBits, + item.NumberOfBits ?? options.NumberOfBits, item.Seed ?? options.Seed, item.Ordered ?? options.Ordered, item.InvertHash ?? 
options.InvertHash, @@ -408,13 +408,13 @@ public Mapper(NgramHashingTransformer parent, DataViewSchema inputSchema, Finder _srcTypes[i][j] = srcType; } - _types[i] = new VectorType(NumberDataViewType.Single, 1 << _parent._columns[i].HashBits); + _types[i] = new VectorType(NumberDataViewType.Single, 1 << _parent._columns[i].NumberOfBits); } } private NgramIdFinder GetNgramIdFinder(int iinfo) { - uint mask = (1U << _parent._columns[iinfo].HashBits) - 1; + uint mask = (1U << _parent._columns[iinfo].NumberOfBits) - 1; int ngramLength = _parent._columns[iinfo].NgramLength; bool rehash = _parent._columns[iinfo].RehashUnigrams; bool ordered = _parent._columns[iinfo].Ordered; @@ -819,7 +819,7 @@ public NgramIdFinder Decorate(int iinfo, NgramIdFinder finder) } var collector = _iinfoToCollector[iinfo] = new InvertHashCollector( - 1 << _parent._columns[iinfo].HashBits, _invertHashMaxCounts[iinfo], + 1 << _parent._columns[iinfo].NumberOfBits, _invertHashMaxCounts[iinfo], stringMapper, EqualityComparer.Default, (in NGram src, ref NGram dst) => dst = src.Clone()); return @@ -852,7 +852,7 @@ public VBuffer>[] SlotNamesMetadata(out VectorType[] types) if (_iinfoToCollector[iinfo] != null) { var vec = values[iinfo] = _iinfoToCollector[iinfo].GetMetadata(); - Contracts.Assert(vec.Length == 1 << _parent._columns[iinfo].HashBits); + Contracts.Assert(vec.Length == 1 << _parent._columns[iinfo].NumberOfBits); types[iinfo] = new VectorType(TextDataViewType.Instance, vec.Length); } } @@ -887,7 +887,7 @@ public sealed class ColumnOptions /// Whether to store all ngram lengths up to , or only . public readonly bool AllLengths; /// Number of bits to hash into. Must be between 1 and 31, inclusive. - public readonly int HashBits; + public readonly int NumberOfBits; /// Hashing seed. public readonly uint Seed; /// Whether the position of each term should be included in the hash. @@ -907,14 +907,14 @@ public sealed class ColumnOptions internal string[] FriendlyNames; /// - /// Describes how the transformer handles one column pair. + /// Describes how the transformer maps several input columns, , to a output column, . /// /// Name of the column resulting from the transformation of . /// Names of the columns to transform. /// Maximum ngram length. /// Maximum number of tokens to skip when constructing an ngram. /// Whether to store all ngram lengths up to , or only . - /// Number of bits to hash into. Must be between 1 and 31, inclusive. + /// Number of bits to hash into. Must be between 1 and 31, inclusive. /// Hashing seed. /// Whether the position of each term should be included in the hash. /// During hashing we constuct mappings between original values and the produced hash values. @@ -928,7 +928,7 @@ public ColumnOptions(string name, int ngramLength = NgramHashingEstimator.Defaults.NgramLength, int skipLength = NgramHashingEstimator.Defaults.SkipLength, bool allLengths = NgramHashingEstimator.Defaults.AllLengths, - int hashBits = NgramHashingEstimator.Defaults.HashBits, + int numberOfBits = NgramHashingEstimator.Defaults.NumberOfBits, uint seed = NgramHashingEstimator.Defaults.Seed, bool ordered = NgramHashingEstimator.Defaults.Ordered, int invertHash = NgramHashingEstimator.Defaults.InvertHash, @@ -942,8 +942,8 @@ public ColumnOptions(string name, throw Contracts.ExceptParam(nameof(invertHash), "Value too small, must be -1 or larger"); // If the bits is 31 or higher, we can't declare a KeyValues of the appropriate length, // this requiring a VBuffer of length 1u << 31 which exceeds int.MaxValue. 
- if (invertHash != 0 && hashBits >= 31) - throw Contracts.ExceptParam(nameof(hashBits), $"Cannot support invertHash for a {0} bit hash. 30 is the maximum possible.", hashBits); + if (invertHash != 0 && numberOfBits >= 31) + throw Contracts.ExceptParam(nameof(numberOfBits), $"Cannot support invertHash for a {0} bit hash. 30 is the maximum possible.", numberOfBits); if (NgramLength + SkipLength > NgramBufferBuilder.MaxSkipNgramLength) { @@ -956,7 +956,7 @@ public ColumnOptions(string name, NgramLength = ngramLength; SkipLength = skipLength; AllLengths = allLengths; - HashBits = hashBits; + NumberOfBits = numberOfBits; Seed = seed; Ordered = ordered; InvertHash = invertHash; @@ -988,8 +988,8 @@ internal ColumnOptions(ModelLoadContext ctx) SkipLength = ctx.Reader.ReadInt32(); Contracts.CheckDecode(0 <= SkipLength && SkipLength <= NgramBufferBuilder.MaxSkipNgramLength); Contracts.CheckDecode(SkipLength <= NgramBufferBuilder.MaxSkipNgramLength - NgramLength); - HashBits = ctx.Reader.ReadInt32(); - Contracts.CheckDecode(1 <= HashBits && HashBits <= 30); + NumberOfBits = ctx.Reader.ReadInt32(); + Contracts.CheckDecode(1 <= NumberOfBits && NumberOfBits <= 30); Seed = ctx.Reader.ReadUInt32(); RehashUnigrams = ctx.Reader.ReadBoolByte(); Ordered = ctx.Reader.ReadBoolByte(); @@ -1018,8 +1018,8 @@ internal ColumnOptions(ModelLoadContext ctx, string name, string[] inputColumnNa SkipLength = ctx.Reader.ReadInt32(); Contracts.CheckDecode(0 <= SkipLength && SkipLength <= NgramBufferBuilder.MaxSkipNgramLength); Contracts.CheckDecode(SkipLength <= NgramBufferBuilder.MaxSkipNgramLength - NgramLength); - HashBits = ctx.Reader.ReadInt32(); - Contracts.CheckDecode(1 <= HashBits && HashBits <= 30); + NumberOfBits = ctx.Reader.ReadInt32(); + Contracts.CheckDecode(1 <= NumberOfBits && NumberOfBits <= 30); Seed = ctx.Reader.ReadUInt32(); RehashUnigrams = ctx.Reader.ReadBoolByte(); Ordered = ctx.Reader.ReadBoolByte(); @@ -1052,8 +1052,8 @@ internal void Save(ModelSaveContext ctx) Contracts.Assert(0 <= SkipLength && SkipLength <= NgramBufferBuilder.MaxSkipNgramLength); Contracts.Assert(NgramLength + SkipLength <= NgramBufferBuilder.MaxSkipNgramLength); ctx.Writer.Write(SkipLength); - Contracts.Assert(1 <= HashBits && HashBits <= 30); - ctx.Writer.Write(HashBits); + Contracts.Assert(1 <= NumberOfBits && NumberOfBits <= 30); + ctx.Writer.Write(NumberOfBits); ctx.Writer.Write(Seed); ctx.Writer.WriteBoolByte(RehashUnigrams); ctx.Writer.WriteBoolByte(Ordered); @@ -1066,7 +1066,7 @@ internal static class Defaults internal const int NgramLength = 2; internal const bool AllLengths = true; internal const int SkipLength = 0; - internal const int HashBits = 16; + internal const int NumberOfBits = 16; internal const uint Seed = 314489979; internal const bool RehashUnigrams = false; internal const bool Ordered = true; @@ -1086,7 +1086,7 @@ internal static class Defaults /// The environment. /// Name of output column, will contain the ngram vector. Null means is replaced. /// Name of input column containing tokenized text. - /// Number of bits to hash into. Must be between 1 and 30, inclusive. + /// Number of bits to hash into. Must be between 1 and 30, inclusive. /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. /// Whether to include all ngram lengths up to or only . 
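For orientation, a minimal usage sketch of the ColumnOptions surface changed above; the column names, the MLContext instance, and the assumption that the input is a key-typed token vector are illustrative and not taken from the patch:

// "Tokens" is assumed to be a vector of keys, e.g. produced by word tokenization
// followed by a value-to-key mapping, which is what the n-gram hashing transform expects.
var ngramHashColumn = new NgramHashingEstimator.ColumnOptions(
    "NgramFeatures",            // output column
    new[] { "Tokens" },         // input column(s)
    ngramLength: 2,
    skipLength: 0,
    allLengths: true,
    numberOfBits: 16,           // output vector has 1 << numberOfBits slots
    invertHash: 0);
var hashedNgrams = mlContext.Transforms.Text.ProduceHashedNgrams(new[] { ngramHashColumn });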
@@ -1099,84 +1099,17 @@ internal static class Defaults internal NgramHashingEstimator(IHostEnvironment env, string outputColumnName, string inputColumnName = null, - int hashBits = 16, + int numberOfBits = 16, int ngramLength = 2, int skipLength = 0, bool allLengths = true, uint seed = 314489979, bool ordered = true, int invertHash = 0) - : this(env, new[] { (outputColumnName, new[] { inputColumnName ?? outputColumnName }) }, hashBits, ngramLength, skipLength, allLengths, seed, ordered, invertHash) + : this(env, new ColumnOptions(outputColumnName, new[] { inputColumnName ?? outputColumnName }, ngramLength, skipLength, allLengths, numberOfBits, seed, ordered, invertHash)) { } - /// - /// Produces a bag of counts of hashed ngrams in - /// and outputs ngram vector as - /// - /// is different from in a way that - /// takes tokenized text as input while tokenizes text internally. - /// - /// The environment. - /// Name of output column, will contain the ngram vector. - /// Name of input columns containing tokenized text. - /// Number of bits to hash into. Must be between 1 and 30, inclusive. - /// Ngram length. - /// Maximum number of tokens to skip when constructing an ngram. - /// Whether to include all ngram lengths up to or only . - /// Hashing seed. - /// Whether the position of each source column should be included in the hash (when there are multiple source columns). - /// During hashing we constuct mappings between original values and the produced hash values. - /// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. - /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. - /// 0 does not retain any input values. -1 retains all input values mapping to each hash. - internal NgramHashingEstimator(IHostEnvironment env, - string outputColumnName, - string[] inputColumnNames, - int hashBits = 16, - int ngramLength = 2, - int skipLength = 0, - bool allLengths = true, - uint seed = 314489979, - bool ordered = true, - int invertHash = 0) - : this(env, new[] { (outputColumnName, inputColumnNames) }, hashBits, ngramLength, skipLength, allLengths, seed, ordered, invertHash) - { - } - - /// - /// Produces a bag of counts of hashed ngrams in - /// and outputs ngram vector for each output in - /// - /// is different from in a way that - /// takes tokenized text as input while tokenizes text internally. - /// - /// The environment. - /// Pairs of input columns to output column mappings on which to compute ngram vector. - /// Number of bits to hash into. Must be between 1 and 30, inclusive. - /// Ngram length. - /// Maximum number of tokens to skip when constructing an ngram. - /// Whether to include all ngram lengths up to or only . - /// Hashing seed. - /// Whether the position of each source column should be included in the hash (when there are multiple source columns). - /// During hashing we constuct mappings between original values and the produced hash values. - /// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. - /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. - /// 0 does not retain any input values. -1 retains all input values mapping to each hash. 
- internal NgramHashingEstimator(IHostEnvironment env, - (string outputColumnName, string[] inputColumnName)[] columns, - int hashBits = 16, - int ngramLength = 2, - int skipLength = 0, - bool allLengths = true, - uint seed = 314489979, - bool ordered = true, - int invertHash = 0) - : this(env, columns.Select(x => new ColumnOptions(x.outputColumnName, x.inputColumnName, ngramLength, skipLength, allLengths, hashBits, seed, ordered, invertHash)).ToArray()) - { - - } - /// /// Produces a bag of counts of hashed ngrams in /// and outputs ngram vector for each output in diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs index bcfa9801e3..79f9c3c1ca 100644 --- a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs +++ b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs @@ -394,7 +394,7 @@ public static WordBagEstimator ProduceWordBags(this TransformsCatalog.TextTransf /// The text-related transform's catalog. /// Name of the column resulting from the transformation of . /// Name of the column to transform. If set to , the value of the will be used as source. - /// Number of bits to hash into. Must be between 1 and 30, inclusive. + /// Number of bits to hash into. Must be between 1 and 30, inclusive. /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. /// Whether to include all ngram lengths up to or only . @@ -407,7 +407,7 @@ public static WordBagEstimator ProduceWordBags(this TransformsCatalog.TextTransf public static WordHashBagEstimator ProduceHashedWordBags(this TransformsCatalog.TextTransforms catalog, string outputColumnName, string inputColumnName = null, - int hashBits = NgramHashExtractingTransformer.DefaultArguments.HashBits, + int numberOfBits = NgramHashExtractingTransformer.DefaultArguments.HashBits, int ngramLength = NgramHashExtractingTransformer.DefaultArguments.NgramLength, int skipLength = NgramHashExtractingTransformer.DefaultArguments.SkipLength, bool allLengths = NgramHashExtractingTransformer.DefaultArguments.AllLengths, @@ -415,7 +415,7 @@ public static WordHashBagEstimator ProduceHashedWordBags(this TransformsCatalog. bool ordered = NgramHashExtractingTransformer.DefaultArguments.Ordered, int invertHash = NgramHashExtractingTransformer.DefaultArguments.InvertHash) => new WordHashBagEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), - outputColumnName, inputColumnName, hashBits, ngramLength, skipLength, allLengths, seed, ordered, invertHash); + outputColumnName, inputColumnName, numberOfBits, ngramLength, skipLength, allLengths, seed, ordered, invertHash); /// /// Produces a bag of counts of hashed ngrams in @@ -424,7 +424,7 @@ public static WordHashBagEstimator ProduceHashedWordBags(this TransformsCatalog. /// The text-related transform's catalog. /// Name of the column resulting from the transformation of . /// Name of the columns to transform. If set to , the value of the will be used as source. - /// Number of bits to hash into. Must be between 1 and 30, inclusive. + /// Number of bits to hash into. Must be between 1 and 30, inclusive. /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. /// Whether to include all ngram lengths up to or only . @@ -437,7 +437,7 @@ public static WordHashBagEstimator ProduceHashedWordBags(this TransformsCatalog. 
public static WordHashBagEstimator ProduceHashedWordBags(this TransformsCatalog.TextTransforms catalog, string outputColumnName, string[] inputColumnNames, - int hashBits = NgramHashExtractingTransformer.DefaultArguments.HashBits, + int numberOfBits = NgramHashExtractingTransformer.DefaultArguments.HashBits, int ngramLength = NgramHashExtractingTransformer.DefaultArguments.NgramLength, int skipLength = NgramHashExtractingTransformer.DefaultArguments.SkipLength, bool allLengths = NgramHashExtractingTransformer.DefaultArguments.AllLengths, @@ -445,7 +445,7 @@ public static WordHashBagEstimator ProduceHashedWordBags(this TransformsCatalog. bool ordered = NgramHashExtractingTransformer.DefaultArguments.Ordered, int invertHash = NgramHashExtractingTransformer.DefaultArguments.InvertHash) => new WordHashBagEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), - outputColumnName, inputColumnNames, hashBits, ngramLength, skipLength, allLengths, seed, ordered, invertHash); + outputColumnName, inputColumnNames, numberOfBits, ngramLength, skipLength, allLengths, seed, ordered, invertHash); /// /// Produces a bag of counts of hashed ngrams in @@ -453,7 +453,7 @@ public static WordHashBagEstimator ProduceHashedWordBags(this TransformsCatalog. /// /// The text-related transform's catalog. /// Pairs of columns to compute bag of word vector. - /// Number of bits to hash into. Must be between 1 and 30, inclusive. + /// Number of bits to hash into. Must be between 1 and 30, inclusive. /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. /// Whether to include all ngram lengths up to or only . @@ -465,7 +465,7 @@ public static WordHashBagEstimator ProduceHashedWordBags(this TransformsCatalog. /// 0 does not retain any input values. -1 retains all input values mapping to each hash. public static WordHashBagEstimator ProduceHashedWordBags(this TransformsCatalog.TextTransforms catalog, (string outputColumnName, string[] inputColumnNames)[] columns, - int hashBits = NgramHashExtractingTransformer.DefaultArguments.HashBits, + int numberOfBits = NgramHashExtractingTransformer.DefaultArguments.HashBits, int ngramLength = NgramHashExtractingTransformer.DefaultArguments.NgramLength, int skipLength = NgramHashExtractingTransformer.DefaultArguments.SkipLength, bool allLengths = NgramHashExtractingTransformer.DefaultArguments.AllLengths, @@ -473,7 +473,7 @@ public static WordHashBagEstimator ProduceHashedWordBags(this TransformsCatalog. bool ordered = NgramHashExtractingTransformer.DefaultArguments.Ordered, int invertHash = NgramHashExtractingTransformer.DefaultArguments.InvertHash) => new WordHashBagEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), - columns, hashBits, ngramLength, skipLength, allLengths, seed, ordered, invertHash); + columns, numberOfBits, ngramLength, skipLength, allLengths, seed, ordered, invertHash); /// /// Produces a bag of counts of hashed ngrams in @@ -485,7 +485,7 @@ public static WordHashBagEstimator ProduceHashedWordBags(this TransformsCatalog. /// The text-related transform's catalog. /// Name of the column resulting from the transformation of . /// Name of the column to transform. If set to , the value of the will be used as source. - /// Number of bits to hash into. Must be between 1 and 30, inclusive. + /// Number of bits to hash into. Must be between 1 and 30, inclusive. /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. /// Whether to include all ngram lengths up to or only . 
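As a usage sketch for the ProduceHashedWordBags extension documented above; the column names and the MLContext instance are assumed for illustration:

var mlContext = new MLContext();
// "Text" is a free-text input column; ProduceHashedWordBags tokenizes it internally
// and emits a bag-of-hashed-ngrams vector of dimension 1 << numberOfBits in "BagFeatures".
var wordBags = mlContext.Transforms.Text.ProduceHashedWordBags(
    "BagFeatures", "Text",
    numberOfBits: 16,
    ngramLength: 2,
    skipLength: 0,
    allLengths: true);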
@@ -498,7 +498,7 @@ public static WordHashBagEstimator ProduceHashedWordBags(this TransformsCatalog. public static NgramHashingEstimator ProduceHashedNgrams(this TransformsCatalog.TextTransforms catalog, string outputColumnName, string inputColumnName = null, - int hashBits = NgramHashingEstimator.Defaults.HashBits, + int numberOfBits = NgramHashingEstimator.Defaults.NumberOfBits, int ngramLength = NgramHashingEstimator.Defaults.NgramLength, int skipLength = NgramHashingEstimator.Defaults.SkipLength, bool allLengths = NgramHashingEstimator.Defaults.AllLengths, @@ -506,71 +506,20 @@ public static NgramHashingEstimator ProduceHashedNgrams(this TransformsCatalog.T bool ordered = NgramHashingEstimator.Defaults.Ordered, int invertHash = NgramHashingEstimator.Defaults.InvertHash) => new NgramHashingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), - outputColumnName, inputColumnName, hashBits, ngramLength, skipLength, allLengths, seed, ordered, invertHash); + outputColumnName, inputColumnName, numberOfBits, ngramLength, skipLength, allLengths, seed, ordered, invertHash); /// - /// Produces a bag of counts of hashed ngrams in - /// and outputs ngram vector as + /// Produces a bag of counts of hashed ngrams for each . For each column, + /// are the input columns of the output column named as . /// /// is different from in a way that /// takes tokenized text as input while tokenizes text internally. /// /// The text-related transform's catalog. - /// Name of the column resulting from the transformation of . - /// Name of the columns to transform. - /// Number of bits to hash into. Must be between 1 and 30, inclusive. - /// Ngram length. - /// Maximum number of tokens to skip when constructing an ngram. - /// Whether to include all ngram lengths up to or only . - /// Hashing seed. - /// Whether the position of each source column should be included in the hash (when there are multiple source columns). - /// During hashing we constuct mappings between original values and the produced hash values. - /// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. - /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. - /// 0 does not retain any input values. -1 retains all input values mapping to each hash. + /// Pairs of columns to compute n-grams. Note that gram indices are generated by hashing. public static NgramHashingEstimator ProduceHashedNgrams(this TransformsCatalog.TextTransforms catalog, - string outputColumnName, - string[] inputColumnNames, - int hashBits = NgramHashingEstimator.Defaults.HashBits, - int ngramLength = NgramHashingEstimator.Defaults.NgramLength, - int skipLength = NgramHashingEstimator.Defaults.SkipLength, - bool allLengths = NgramHashingEstimator.Defaults.AllLengths, - uint seed = NgramHashingEstimator.Defaults.Seed, - bool ordered = NgramHashingEstimator.Defaults.Ordered, - int invertHash = NgramHashingEstimator.Defaults.InvertHash) - => new NgramHashingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), - outputColumnName, inputColumnNames, hashBits, ngramLength, skipLength, allLengths, seed, ordered, invertHash); - - /// - /// Produces a bag of counts of hashed ngrams in - /// and outputs ngram vector for each output in - /// - /// is different from in a way that - /// takes tokenized text as input while tokenizes text internally. 
- /// - /// The text-related transform's catalog. - /// Pairs of columns to compute bag of word vector. - /// Number of bits to hash into. Must be between 1 and 30, inclusive. - /// Ngram length. - /// Maximum number of tokens to skip when constructing an ngram. - /// Whether to include all ngram lengths up to or only . - /// Hashing seed. - /// Whether the position of each source column should be included in the hash (when there are multiple source columns). - /// During hashing we constuct mappings between original values and the produced hash values. - /// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. - /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. - /// 0 does not retain any input values. -1 retains all input values mapping to each hash. - public static NgramHashingEstimator ProduceHashedNgrams(this TransformsCatalog.TextTransforms catalog, - (string outputColumnName, string[] inputColumnNames)[] columns, - int hashBits = NgramHashingEstimator.Defaults.HashBits, - int ngramLength = NgramHashingEstimator.Defaults.NgramLength, - int skipLength = NgramHashingEstimator.Defaults.SkipLength, - bool allLengths = NgramHashingEstimator.Defaults.AllLengths, - uint seed = NgramHashingEstimator.Defaults.Seed, - bool ordered = NgramHashingEstimator.Defaults.Ordered, - int invertHash = NgramHashingEstimator.Defaults.InvertHash) - => new NgramHashingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), - columns, hashBits, ngramLength, skipLength, allLengths, seed, ordered, invertHash); + NgramHashingEstimator.ColumnOptions[] columns) + => new NgramHashingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), columns); /// /// Uses LightLDA to transform a document (represented as a vector of floats) From 6cfe2fc6765882d34b4e2fd1e239dd367ae5725a Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Fri, 8 Mar 2019 17:07:38 -0800 Subject: [PATCH 02/12] Handle static part --- .../TextStaticExtensions.cs | 26 +++++++++---------- .../Text/TextCatalog.cs | 4 +-- .../StaticPipeTests.cs | 2 +- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs b/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs index 3bf6647d70..39944d6924 100644 --- a/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs +++ b/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs @@ -512,8 +512,8 @@ private sealed class OutPipelineColumn : Vector { public readonly VarVector> Input; - public OutPipelineColumn(VarVector> input, int hashBits, int ngramLength, int skipLength, bool allLengths, uint seed, bool ordered, int invertHash) - : base(new Reconciler(hashBits, ngramLength, skipLength, allLengths, seed, ordered, invertHash), input) + public OutPipelineColumn(VarVector> input, int numberOfBits, int ngramLength, int skipLength, bool allLengths, uint seed, bool ordered, int invertHash) + : base(new Reconciler(numberOfBits, ngramLength, skipLength, allLengths, seed, ordered, invertHash), input) { Input = input; } @@ -521,7 +521,7 @@ public OutPipelineColumn(VarVector> input, int hashBits, int n private sealed class Reconciler : EstimatorReconciler, IEquatable { - private readonly int _hashBits; + private readonly int _numberOfBits; private readonly int _ngramLength; private readonly int _skipLength; private readonly bool _allLengths; @@ -529,9 +529,9 @@ private sealed class 
Reconciler : EstimatorReconciler, IEquatable private readonly bool _ordered; private readonly int _invertHash; - public Reconciler(int hashBits, int ngramLength, int skipLength, bool allLengths, uint seed, bool ordered, int invertHash) + public Reconciler(int numberOfBits, int ngramLength, int skipLength, bool allLengths, uint seed, bool ordered, int invertHash) { - _hashBits = hashBits; + _numberOfBits = numberOfBits; _ngramLength = ngramLength; _skipLength = skipLength; _allLengths = allLengths; @@ -542,7 +542,7 @@ public Reconciler(int hashBits, int ngramLength, int skipLength, bool allLengths public bool Equals(Reconciler other) { - return _hashBits == other._hashBits && + return _numberOfBits == other._numberOfBits && _ngramLength == other._ngramLength && _skipLength == other._skipLength && _allLengths == other._allLengths && @@ -561,7 +561,7 @@ public override IEstimator Reconcile(IHostEnvironment env, var columns = new List(); foreach (var outCol in toOutput) columns.Add(new NgramHashingEstimator.ColumnOptions(outputNames[outCol], new[] { inputNames[((OutPipelineColumn)outCol).Input] }, - _ngramLength, _skipLength, _allLengths, _hashBits, _seed, _ordered, _invertHash)); + _ngramLength, _skipLength, _allLengths, _numberOfBits, _seed, _ordered, _invertHash)); return new NgramHashingEstimator(env, columns.ToArray()); } @@ -571,11 +571,11 @@ public override IEstimator Reconcile(IHostEnvironment env, /// Produces a bag of counts of ngrams (sequences of consecutive words of length 1-n) in a given tokenized text. /// It does so by hashing each ngram and using the hash value as the index in the bag. /// - /// is different from - /// in a way that takes tokenized text as input while tokenizes text internally. + /// is different from + /// in a way that takes tokenized text as input while tokenizes text internally. /// /// The column to apply to. - /// Number of bits to hash into. Must be between 1 and 30, inclusive. + /// Number of bits to hash into. Must be between 1 and 30, inclusive. /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. /// Whether to include all ngram lengths up to or only . @@ -585,13 +585,13 @@ public override IEstimator Reconcile(IHostEnvironment env, /// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. /// 0 does not retain any input values. -1 retains all input values mapping to each hash. - public static Vector ToNgramsHash(this VarVector> input, - int hashBits = 16, + public static Vector ApplyNgramHashing(this VarVector> input, + int numberOfBits = 16, int ngramLength = 2, int skipLength = 0, bool allLengths = true, uint seed = 314489979, bool ordered = true, - int invertHash = 0) => new OutPipelineColumn(input, hashBits, ngramLength, skipLength, allLengths, seed, ordered, invertHash); + int invertHash = 0) => new OutPipelineColumn(input, numberOfBits, ngramLength, skipLength, allLengths, seed, ordered, invertHash); } } diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs index 79f9c3c1ca..44fb6c45b9 100644 --- a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs +++ b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs @@ -495,7 +495,7 @@ public static WordHashBagEstimator ProduceHashedWordBags(this TransformsCatalog. 
/// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. /// 0 does not retain any input values. -1 retains all input values mapping to each hash. - public static NgramHashingEstimator ProduceHashedNgrams(this TransformsCatalog.TextTransforms catalog, + public static NgramHashingEstimator ApplyNgramHashing(this TransformsCatalog.TextTransforms catalog, string outputColumnName, string inputColumnName = null, int numberOfBits = NgramHashingEstimator.Defaults.NumberOfBits, @@ -517,7 +517,7 @@ public static NgramHashingEstimator ProduceHashedNgrams(this TransformsCatalog.T /// /// The text-related transform's catalog. /// Pairs of columns to compute n-grams. Note that gram indices are generated by hashing. - public static NgramHashingEstimator ProduceHashedNgrams(this TransformsCatalog.TextTransforms catalog, + public static NgramHashingEstimator ApplyNgramHashing(this TransformsCatalog.TextTransforms catalog, NgramHashingEstimator.ColumnOptions[] columns) => new NgramHashingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), columns); diff --git a/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs b/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs index f18a707999..8a1f392533 100644 --- a/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs +++ b/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs @@ -605,7 +605,7 @@ public void Ngrams() .Append(r => ( r.label, ngrams: r.text.TokenizeText().ToKey().ToNgrams(), - ngramshash: r.text.TokenizeText().ToKey().ToNgramsHash())); + ngramshash: r.text.TokenizeText().ToKey().ApplyNgramHashing())); var tdata = est.Fit(data).Transform(data); var schema = tdata.AsDynamic.Schema; From 698b941cf601bb8d4b46b4a95fc0bb31cb02dafe Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Fri, 8 Mar 2019 17:23:41 -0800 Subject: [PATCH 03/12] Handle Ngram Shift from Array to ReadOnlyList --- .../TextStaticExtensions.cs | 6 +- .../Text/NgramTransform.cs | 66 ++++++++++--------- .../Text/TextCatalog.cs | 18 ++--- .../Text/WordBagTransform.cs | 4 +- 4 files changed, 49 insertions(+), 45 deletions(-) diff --git a/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs b/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs index 39944d6924..f2d5678f79 100644 --- a/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs +++ b/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs @@ -492,15 +492,15 @@ public override IEstimator Reconcile(IHostEnvironment env, /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. /// Whether to include all ngram lengths up to or only . - /// Maximum number of ngrams to store in the dictionary. + /// Maximum number of ngrams to store in the dictionary. /// Statistical measure used to evaluate how important a word is to a document in a corpus. 
public static Vector ToNgrams(this VarVector> input, int ngramLength = 1, int skipLength = 0, bool allLengths = true, - int maxNumTerms = 10000000, + int maximumTermCount = 10000000, NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.WeightingCriteria.Tf) - => new OutPipelineColumn(input, ngramLength, skipLength, allLengths, maxNumTerms, weighting); + => new OutPipelineColumn(input, ngramLength, skipLength, allLengths, maximumTermCount, weighting); } /// diff --git a/src/Microsoft.ML.Transforms/Text/NgramTransform.cs b/src/Microsoft.ML.Transforms/Text/NgramTransform.cs index ce5fb31119..c1df26fdac 100644 --- a/src/Microsoft.ML.Transforms/Text/NgramTransform.cs +++ b/src/Microsoft.ML.Transforms/Text/NgramTransform.cs @@ -93,7 +93,7 @@ internal sealed class Options : TransformInputBase public int SkipLength = NgramExtractingEstimator.Defaults.SkipLength; [Argument(ArgumentType.Multiple, HelpText = "Maximum number of ngrams to store in the dictionary", ShortName = "max")] - public int[] MaxNumTerms = new int[] { NgramExtractingEstimator.Defaults.MaxNumTerms }; + public int[] MaxNumTerms = new int[] { NgramExtractingEstimator.Defaults.MaximumTermCount }; [Argument(ArgumentType.AtMostOnce, HelpText = "The weighting criteria")] public NgramExtractingEstimator.WeightingCriteria Weighting = NgramExtractingEstimator.Defaults.Weighting; @@ -253,7 +253,7 @@ private static SequencePool[] Train(IHostEnvironment env, NgramExtractingEstimat // Note: GetNgramIdFinderAdd will control how many ngrams of a specific length will // be added (using lims[iinfo]), therefore we set slotLim to the maximum helpers[iinfo] = new NgramBufferBuilder(ngramLength, skipLength, Utils.ArrayMaxSize, - GetNgramIdFinderAdd(env, counts[iinfo], columns[iinfo].Limits, ngramMaps[iinfo], transformInfos[iinfo].RequireIdf)); + GetNgramIdFinderAdd(env, counts[iinfo], columns[iinfo].MaximumTermCounts, ngramMaps[iinfo], transformInfos[iinfo].RequireIdf)); } int cInfoFull = 0; @@ -293,7 +293,7 @@ private static SequencePool[] Train(IHostEnvironment env, NgramExtractingEstimat } } } - AssertValid(env, counts[iinfo], columns[iinfo].Limits, ngramMaps[iinfo]); + AssertValid(env, counts[iinfo], columns[iinfo].MaximumTermCounts, ngramMaps[iinfo]); } } @@ -307,7 +307,7 @@ private static SequencePool[] Train(IHostEnvironment env, NgramExtractingEstimat for (int iinfo = 0; iinfo < columns.Length; iinfo++) { - AssertValid(env, counts[iinfo], columns[iinfo].Limits, ngramMaps[iinfo]); + AssertValid(env, counts[iinfo], columns[iinfo].MaximumTermCounts, ngramMaps[iinfo]); int ngramLength = transformInfos[iinfo].NgramLength; for (int i = 0; i < ngramLength; i++) @@ -319,11 +319,11 @@ private static SequencePool[] Train(IHostEnvironment env, NgramExtractingEstimat } [Conditional("DEBUG")] - private static void AssertValid(IHostEnvironment env, int[] counts, ImmutableArray lims, SequencePool pool) + private static void AssertValid(IHostEnvironment env, int[] counts, IReadOnlyList lims, SequencePool pool) { int count = 0; int countFull = 0; - for (int i = 0; i < lims.Length; i++) + for (int i = 0; i < lims.Count; i++) { env.Assert(counts[i] >= 0); env.Assert(counts[i] <= lims[i]); @@ -334,20 +334,20 @@ private static void AssertValid(IHostEnvironment env, int[] counts, ImmutableArr env.Assert(count == pool.Count); } - private static NgramIdFinder GetNgramIdFinderAdd(IHostEnvironment env, int[] counts, ImmutableArray lims, SequencePool pool, bool requireIdf) + private static NgramIdFinder GetNgramIdFinderAdd(IHostEnvironment 
env, int[] counts, IReadOnlyList lims, SequencePool pool, bool requireIdf) { Contracts.AssertValue(env); - env.Assert(lims.Length > 0); - env.Assert(lims.Length == Utils.Size(counts)); + env.Assert(lims.Count > 0); + env.Assert(lims.Count == Utils.Size(counts)); int numFull = lims.Count(l => l <= 0); - int ngramLength = lims.Length; + int ngramLength = lims.Count; return (uint[] ngram, int lim, int icol, ref bool more) => { env.Assert(0 < lim && lim <= Utils.Size(ngram)); env.Assert(lim <= Utils.Size(counts)); - env.Assert(lim <= lims.Length); + env.Assert(lim <= lims.Count); env.Assert(icol == 0); var max = lim - 1; @@ -695,7 +695,7 @@ internal static class Defaults public const int NgramLength = 2; public const bool AllLengths = true; public const int SkipLength = 0; - public const int MaxNumTerms = 10000000; + public const int MaximumTermCount = 10000000; public const WeightingCriteria Weighting = WeightingCriteria.Tf; } @@ -712,16 +712,16 @@ internal static class Defaults /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. /// Whether to include all ngram lengths up to or only . - /// Maximum number of ngrams to store in the dictionary. + /// Maximum number of ngrams to store in the dictionary. /// Statistical measure used to evaluate how important a word is to a document in a corpus. internal NgramExtractingEstimator(IHostEnvironment env, string outputColumnName, string inputColumnName = null, int ngramLength = Defaults.NgramLength, int skipLength = Defaults.SkipLength, bool allLengths = Defaults.AllLengths, - int maxNumTerms = Defaults.MaxNumTerms, + int maximumTermCount = Defaults.MaximumTermCount, WeightingCriteria weighting = Defaults.Weighting) - : this(env, new[] { (outputColumnName, inputColumnName ?? outputColumnName) }, ngramLength, skipLength, allLengths, maxNumTerms, weighting) + : this(env, new[] { (outputColumnName, inputColumnName ?? outputColumnName) }, ngramLength, skipLength, allLengths, maximumTermCount, weighting) { } @@ -734,16 +734,16 @@ internal NgramExtractingEstimator(IHostEnvironment env, /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. /// Whether to include all ngram lengths up to or only . - /// Maximum number of ngrams to store in the dictionary. + /// Maximum number of ngrams to store in the dictionary. /// Statistical measure used to evaluate how important a word is to a document in a corpus. internal NgramExtractingEstimator(IHostEnvironment env, (string outputColumnName, string inputColumnName)[] columns, int ngramLength = Defaults.NgramLength, int skipLength = Defaults.SkipLength, bool allLengths = Defaults.AllLengths, - int maxNumTerms = Defaults.MaxNumTerms, + int maximumTermCount = Defaults.MaximumTermCount, WeightingCriteria weighting = Defaults.Weighting) - : this(env, columns.Select(x => new ColumnOptions(x.outputColumnName, x.inputColumnName, ngramLength, skipLength, allLengths, weighting, maxNumTerms)).ToArray()) + : this(env, columns.Select(x => new ColumnOptions(x.outputColumnName, x.inputColumnName, ngramLength, skipLength, allLengths, weighting, maximumTermCount)).ToArray()) { } @@ -809,10 +809,14 @@ public sealed class ColumnOptions /// The weighting criteria. public readonly WeightingCriteria Weighting; /// + /// Underlying state of . 
+ /// + private readonly ImmutableArray _maximumTermCounts; + /// /// Contains the maximum number of grams to store in the dictionary, for each level of ngrams, /// from 1 (in position 0) up to ngramLength (in position ngramLength-1) /// - public readonly ImmutableArray Limits; + public IReadOnlyList MaximumTermCounts => _maximumTermCounts; /// /// Describes how the transformer handles one Gcn column pair. @@ -823,14 +827,14 @@ public sealed class ColumnOptions /// Maximum number of tokens to skip when constructing an ngram. /// Whether to store all ngram lengths up to ngramLength, or only ngramLength. /// The weighting criteria. - /// Maximum number of ngrams to store in the dictionary. + /// Maximum number of ngrams to store in the dictionary. public ColumnOptions(string name, string inputColumnName = null, int ngramLength = Defaults.NgramLength, int skipLength = Defaults.SkipLength, bool allLengths = Defaults.AllLengths, WeightingCriteria weighting = Defaults.Weighting, - int maxNumTerms = Defaults.MaxNumTerms) - : this(name, ngramLength, skipLength, allLengths, weighting, new int[] { maxNumTerms }, inputColumnName ?? name) + int maximumTermCount = Defaults.MaximumTermCount) + : this(name, ngramLength, skipLength, allLengths, weighting, new int[] { maximumTermCount }, inputColumnName ?? name) { } @@ -839,7 +843,7 @@ internal ColumnOptions(string name, int skipLength, bool allLengths, WeightingCriteria weighting, - int[] maxNumTerms, + int[] maximumTermCounts, string inputColumnName = null) { Name = name; @@ -857,18 +861,18 @@ internal ColumnOptions(string name, var limits = new int[ngramLength]; if (!AllLengths) { - Contracts.CheckUserArg(Utils.Size(maxNumTerms) == 0 || - Utils.Size(maxNumTerms) == 1 && maxNumTerms[0] > 0, nameof(maxNumTerms)); - limits[ngramLength - 1] = Utils.Size(maxNumTerms) == 0 ? Defaults.MaxNumTerms : maxNumTerms[0]; + Contracts.CheckUserArg(Utils.Size(maximumTermCounts) == 0 || + Utils.Size(maximumTermCounts) == 1 && maximumTermCounts[0] > 0, nameof(maximumTermCounts)); + limits[ngramLength - 1] = Utils.Size(maximumTermCounts) == 0 ? Defaults.MaximumTermCount : maximumTermCounts[0]; } else { - Contracts.CheckUserArg(Utils.Size(maxNumTerms) <= ngramLength, nameof(maxNumTerms)); - Contracts.CheckUserArg(Utils.Size(maxNumTerms) == 0 || maxNumTerms.All(i => i >= 0) && maxNumTerms[maxNumTerms.Length - 1] > 0, nameof(maxNumTerms)); - var extend = Utils.Size(maxNumTerms) == 0 ? Defaults.MaxNumTerms : maxNumTerms[maxNumTerms.Length - 1]; - limits = Utils.BuildArray(ngramLength, i => i < Utils.Size(maxNumTerms) ? maxNumTerms[i] : extend); + Contracts.CheckUserArg(Utils.Size(maximumTermCounts) <= ngramLength, nameof(maximumTermCounts)); + Contracts.CheckUserArg(Utils.Size(maximumTermCounts) == 0 || maximumTermCounts.All(i => i >= 0) && maximumTermCounts[maximumTermCounts.Length - 1] > 0, nameof(maximumTermCounts)); + var extend = Utils.Size(maximumTermCounts) == 0 ? Defaults.MaximumTermCount : maximumTermCounts[maximumTermCounts.Length - 1]; + limits = Utils.BuildArray(ngramLength, i => i < Utils.Size(maximumTermCounts) ? 
maximumTermCounts[i] : extend); } - Limits = ImmutableArray.Create(limits); + _maximumTermCounts = ImmutableArray.Create(limits); } } diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs index 44fb6c45b9..8d3856aa78 100644 --- a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs +++ b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs @@ -194,7 +194,7 @@ public static WordTokenizingEstimator TokenizeWords(this TransformsCatalog.TextT /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. /// Whether to include all ngram lengths up to or only . - /// Maximum number of ngrams to store in the dictionary. + /// Maximum number of ngrams to store in the dictionary. /// Statistical measure used to evaluate how important a word is to a document in a corpus. /// /// @@ -209,10 +209,10 @@ public static NgramExtractingEstimator ProduceNgrams(this TransformsCatalog.Text int ngramLength = NgramExtractingEstimator.Defaults.NgramLength, int skipLength = NgramExtractingEstimator.Defaults.SkipLength, bool allLengths = NgramExtractingEstimator.Defaults.AllLengths, - int maxNumTerms = NgramExtractingEstimator.Defaults.MaxNumTerms, + int maximumTermCount = NgramExtractingEstimator.Defaults.MaximumTermCount, NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.Defaults.Weighting) => new NgramExtractingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, inputColumnName, - ngramLength, skipLength, allLengths, maxNumTerms, weighting); + ngramLength, skipLength, allLengths, maximumTermCount, weighting); /// /// Produces a bag of counts of ngrams (sequences of consecutive words) in @@ -223,17 +223,17 @@ public static NgramExtractingEstimator ProduceNgrams(this TransformsCatalog.Text /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. /// Whether to include all ngram lengths up to or only . - /// Maximum number of ngrams to store in the dictionary. + /// Maximum number of ngrams to store in the dictionary. /// Statistical measure used to evaluate how important a word is to a document in a corpus. 
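A similar sketch for the dictionary-based ProduceNgrams path described below; the tokenization and key-mapping steps, the column names, and the MLContext instance are assumptions for illustration, and parameter names follow this patch:

// NgramExtractingEstimator expects a vector of keys, so the text is first tokenized
// and mapped to keys before n-grams are counted against a learned dictionary.
var pipeline = mlContext.Transforms.Text.TokenizeWords("Tokens", "Text")
    .Append(mlContext.Transforms.Conversion.MapValueToKey("Tokens"))
    .Append(mlContext.Transforms.Text.ProduceNgrams(
        "NgramFeatures", "Tokens",
        ngramLength: 2,
        maximumTermCount: 10_000_000,
        weighting: NgramExtractingEstimator.WeightingCriteria.TfIdf));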
public static NgramExtractingEstimator ProduceNgrams(this TransformsCatalog.TextTransforms catalog, (string outputColumnName, string inputColumnName)[] columns, int ngramLength = NgramExtractingEstimator.Defaults.NgramLength, int skipLength = NgramExtractingEstimator.Defaults.SkipLength, bool allLengths = NgramExtractingEstimator.Defaults.AllLengths, - int maxNumTerms = NgramExtractingEstimator.Defaults.MaxNumTerms, + int maximumTermCount = NgramExtractingEstimator.Defaults.MaximumTermCount, NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.Defaults.Weighting) => new NgramExtractingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), columns, - ngramLength, skipLength, allLengths, maxNumTerms, weighting); + ngramLength, skipLength, allLengths, maximumTermCount, weighting); /// /// Produces a bag of counts of ngrams (sequences of consecutive words) in @@ -339,7 +339,7 @@ public static WordBagEstimator ProduceWordBags(this TransformsCatalog.TextTransf int ngramLength = NgramExtractingEstimator.Defaults.NgramLength, int skipLength = NgramExtractingEstimator.Defaults.SkipLength, bool allLengths = NgramExtractingEstimator.Defaults.AllLengths, - int maxNumTerms = NgramExtractingEstimator.Defaults.MaxNumTerms, + int maxNumTerms = NgramExtractingEstimator.Defaults.MaximumTermCount, NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.WeightingCriteria.Tf) => new WordBagEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, inputColumnName, ngramLength, skipLength, allLengths, maxNumTerms); @@ -362,7 +362,7 @@ public static WordBagEstimator ProduceWordBags(this TransformsCatalog.TextTransf int ngramLength = NgramExtractingEstimator.Defaults.NgramLength, int skipLength = NgramExtractingEstimator.Defaults.SkipLength, bool allLengths = NgramExtractingEstimator.Defaults.AllLengths, - int maxNumTerms = NgramExtractingEstimator.Defaults.MaxNumTerms, + int maxNumTerms = NgramExtractingEstimator.Defaults.MaximumTermCount, NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.WeightingCriteria.Tf) => new WordBagEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, inputColumnNames, ngramLength, skipLength, allLengths, maxNumTerms, weighting); @@ -383,7 +383,7 @@ public static WordBagEstimator ProduceWordBags(this TransformsCatalog.TextTransf int ngramLength = NgramExtractingEstimator.Defaults.NgramLength, int skipLength = NgramExtractingEstimator.Defaults.SkipLength, bool allLengths = NgramExtractingEstimator.Defaults.AllLengths, - int maxNumTerms = NgramExtractingEstimator.Defaults.MaxNumTerms, + int maxNumTerms = NgramExtractingEstimator.Defaults.MaximumTermCount, NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.WeightingCriteria.Tf) => new WordBagEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), columns, ngramLength, skipLength, allLengths, maxNumTerms, weighting); diff --git a/src/Microsoft.ML.Transforms/Text/WordBagTransform.cs b/src/Microsoft.ML.Transforms/Text/WordBagTransform.cs index a66b4512be..1ccd1d38fe 100644 --- a/src/Microsoft.ML.Transforms/Text/WordBagTransform.cs +++ b/src/Microsoft.ML.Transforms/Text/WordBagTransform.cs @@ -229,7 +229,7 @@ internal abstract class ArgumentsBase public bool AllLengths = NgramExtractingEstimator.Defaults.AllLengths; [Argument(ArgumentType.Multiple, HelpText = "Maximum number of ngrams to store in the dictionary", ShortName = "max")] 
- public int[] MaxNumTerms = new int[] { NgramExtractingEstimator.Defaults.MaxNumTerms }; + public int[] MaxNumTerms = new int[] { NgramExtractingEstimator.Defaults.MaximumTermCount }; [Argument(ArgumentType.AtMostOnce, HelpText = "The weighting criteria")] public NgramExtractingEstimator.WeightingCriteria Weighting = NgramExtractingEstimator.Defaults.Weighting; @@ -315,7 +315,7 @@ internal static IDataTransform Create(IHostEnvironment env, Options options, IDa termArgs = new ValueToKeyMappingTransformer.Options() { - MaxNumTerms = Utils.Size(options.MaxNumTerms) > 0 ? options.MaxNumTerms[0] : NgramExtractingEstimator.Defaults.MaxNumTerms, + MaxNumTerms = Utils.Size(options.MaxNumTerms) > 0 ? options.MaxNumTerms[0] : NgramExtractingEstimator.Defaults.MaximumTermCount, Columns = new ValueToKeyMappingTransformer.Column[termCols.Count] }; } From 03f6025d2784d63f284a819ae07a069d828f7e84 Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Mon, 11 Mar 2019 10:02:28 -0700 Subject: [PATCH 04/12] Address comments --- .../TextStaticExtensions.cs | 14 ++--- .../Text/NgramTransform.cs | 56 +++++++++---------- .../Text/TextCatalog.cs | 33 ++--------- .../Text/WordBagTransform.cs | 4 +- 4 files changed, 43 insertions(+), 64 deletions(-) diff --git a/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs b/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs index f2d5678f79..bc542fb485 100644 --- a/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs +++ b/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs @@ -443,7 +443,7 @@ private sealed class Reconciler : EstimatorReconciler, IEquatable private readonly int _ngramLength; private readonly int _skipLength; private readonly bool _allLengths; - private readonly int _maxNumTerms; + private readonly int _maxNgramsCount; private readonly NgramExtractingEstimator.WeightingCriteria _weighting; public Reconciler(int ngramLength, int skipLength, bool allLengths, int maxNumTerms, NgramExtractingEstimator.WeightingCriteria weighting) @@ -451,7 +451,7 @@ public Reconciler(int ngramLength, int skipLength, bool allLengths, int maxNumTe _ngramLength = ngramLength; _skipLength = skipLength; _allLengths = allLengths; - _maxNumTerms = maxNumTerms; + _maxNgramsCount = maxNumTerms; _weighting = weighting; } @@ -461,7 +461,7 @@ public bool Equals(Reconciler other) return _ngramLength == other._ngramLength && _skipLength == other._skipLength && _allLengths == other._allLengths && - _maxNumTerms == other._maxNumTerms && + _maxNgramsCount == other._maxNgramsCount && _weighting == other._weighting; } @@ -477,7 +477,7 @@ public override IEstimator Reconcile(IHostEnvironment env, foreach (var outCol in toOutput) pairs.Add((outputNames[outCol], inputNames[((OutPipelineColumn)outCol).Input])); - return new NgramExtractingEstimator(env, pairs.ToArray(), _ngramLength, _skipLength, _allLengths, _maxNumTerms, _weighting); + return new NgramExtractingEstimator(env, pairs.ToArray(), _ngramLength, _skipLength, _allLengths, _maxNgramsCount, _weighting); } } @@ -492,15 +492,15 @@ public override IEstimator Reconcile(IHostEnvironment env, /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. /// Whether to include all ngram lengths up to or only . - /// Maximum number of ngrams to store in the dictionary. + /// Maximum number of n-grams to store in the dictionary. /// Statistical measure used to evaluate how important a word is to a document in a corpus. 
public static Vector ToNgrams(this VarVector> input, int ngramLength = 1, int skipLength = 0, bool allLengths = true, - int maximumTermCount = 10000000, + int maximumNgramsCount = 10000000, NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.WeightingCriteria.Tf) - => new OutPipelineColumn(input, ngramLength, skipLength, allLengths, maximumTermCount, weighting); + => new OutPipelineColumn(input, ngramLength, skipLength, allLengths, maximumNgramsCount, weighting); } /// diff --git a/src/Microsoft.ML.Transforms/Text/NgramTransform.cs b/src/Microsoft.ML.Transforms/Text/NgramTransform.cs index c1df26fdac..8e360e7123 100644 --- a/src/Microsoft.ML.Transforms/Text/NgramTransform.cs +++ b/src/Microsoft.ML.Transforms/Text/NgramTransform.cs @@ -93,7 +93,7 @@ internal sealed class Options : TransformInputBase public int SkipLength = NgramExtractingEstimator.Defaults.SkipLength; [Argument(ArgumentType.Multiple, HelpText = "Maximum number of ngrams to store in the dictionary", ShortName = "max")] - public int[] MaxNumTerms = new int[] { NgramExtractingEstimator.Defaults.MaximumTermCount }; + public int[] MaxNumTerms = new int[] { NgramExtractingEstimator.Defaults.MaximumNgramsCount }; [Argument(ArgumentType.AtMostOnce, HelpText = "The weighting criteria")] public NgramExtractingEstimator.WeightingCriteria Weighting = NgramExtractingEstimator.Defaults.Weighting; @@ -253,7 +253,7 @@ private static SequencePool[] Train(IHostEnvironment env, NgramExtractingEstimat // Note: GetNgramIdFinderAdd will control how many ngrams of a specific length will // be added (using lims[iinfo]), therefore we set slotLim to the maximum helpers[iinfo] = new NgramBufferBuilder(ngramLength, skipLength, Utils.ArrayMaxSize, - GetNgramIdFinderAdd(env, counts[iinfo], columns[iinfo].MaximumTermCounts, ngramMaps[iinfo], transformInfos[iinfo].RequireIdf)); + GetNgramIdFinderAdd(env, counts[iinfo], columns[iinfo].MaximumNgramsCounts, ngramMaps[iinfo], transformInfos[iinfo].RequireIdf)); } int cInfoFull = 0; @@ -293,7 +293,7 @@ private static SequencePool[] Train(IHostEnvironment env, NgramExtractingEstimat } } } - AssertValid(env, counts[iinfo], columns[iinfo].MaximumTermCounts, ngramMaps[iinfo]); + AssertValid(env, counts[iinfo], columns[iinfo].MaximumNgramsCounts, ngramMaps[iinfo]); } } @@ -307,7 +307,7 @@ private static SequencePool[] Train(IHostEnvironment env, NgramExtractingEstimat for (int iinfo = 0; iinfo < columns.Length; iinfo++) { - AssertValid(env, counts[iinfo], columns[iinfo].MaximumTermCounts, ngramMaps[iinfo]); + AssertValid(env, counts[iinfo], columns[iinfo].MaximumNgramsCounts, ngramMaps[iinfo]); int ngramLength = transformInfos[iinfo].NgramLength; for (int i = 0; i < ngramLength; i++) @@ -695,7 +695,7 @@ internal static class Defaults public const int NgramLength = 2; public const bool AllLengths = true; public const int SkipLength = 0; - public const int MaximumTermCount = 10000000; + public const int MaximumNgramsCount = 10000000; public const WeightingCriteria Weighting = WeightingCriteria.Tf; } @@ -712,16 +712,16 @@ internal static class Defaults /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. /// Whether to include all ngram lengths up to or only . - /// Maximum number of ngrams to store in the dictionary. + /// Maximum number of n-grams to store in the dictionary. /// Statistical measure used to evaluate how important a word is to a document in a corpus. 
internal NgramExtractingEstimator(IHostEnvironment env, string outputColumnName, string inputColumnName = null, int ngramLength = Defaults.NgramLength, int skipLength = Defaults.SkipLength, bool allLengths = Defaults.AllLengths, - int maximumTermCount = Defaults.MaximumTermCount, + int maximumNgramsCount = Defaults.MaximumNgramsCount, WeightingCriteria weighting = Defaults.Weighting) - : this(env, new[] { (outputColumnName, inputColumnName ?? outputColumnName) }, ngramLength, skipLength, allLengths, maximumTermCount, weighting) + : this(env, new[] { (outputColumnName, inputColumnName ?? outputColumnName) }, ngramLength, skipLength, allLengths, maximumNgramsCount, weighting) { } @@ -734,16 +734,16 @@ internal NgramExtractingEstimator(IHostEnvironment env, /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. /// Whether to include all ngram lengths up to or only . - /// Maximum number of ngrams to store in the dictionary. + /// Maximum number of n-grams to store in the dictionary. /// Statistical measure used to evaluate how important a word is to a document in a corpus. internal NgramExtractingEstimator(IHostEnvironment env, (string outputColumnName, string inputColumnName)[] columns, int ngramLength = Defaults.NgramLength, int skipLength = Defaults.SkipLength, bool allLengths = Defaults.AllLengths, - int maximumTermCount = Defaults.MaximumTermCount, + int maximumNgramsCount = Defaults.MaximumNgramsCount, WeightingCriteria weighting = Defaults.Weighting) - : this(env, columns.Select(x => new ColumnOptions(x.outputColumnName, x.inputColumnName, ngramLength, skipLength, allLengths, weighting, maximumTermCount)).ToArray()) + : this(env, columns.Select(x => new ColumnOptions(x.outputColumnName, x.inputColumnName, ngramLength, skipLength, allLengths, weighting, maximumNgramsCount)).ToArray()) { } @@ -809,14 +809,14 @@ public sealed class ColumnOptions /// The weighting criteria. public readonly WeightingCriteria Weighting; /// - /// Underlying state of . + /// Underlying state of . /// - private readonly ImmutableArray _maximumTermCounts; + private readonly ImmutableArray _maximumNgramsCounts; /// - /// Contains the maximum number of grams to store in the dictionary, for each level of ngrams, - /// from 1 (in position 0) up to ngramLength (in position ngramLength-1) + /// Contains the maximum number of terms (that is, n-grams) to store in the dictionary, for each level of n-grams, + /// from n=1 (in position 0) up to n= (in position -1) /// - public IReadOnlyList MaximumTermCounts => _maximumTermCounts; + public IReadOnlyList MaximumNgramsCounts => _maximumNgramsCounts; /// /// Describes how the transformer handles one Gcn column pair. @@ -827,14 +827,14 @@ public sealed class ColumnOptions /// Maximum number of tokens to skip when constructing an ngram. /// Whether to store all ngram lengths up to ngramLength, or only ngramLength. /// The weighting criteria. - /// Maximum number of ngrams to store in the dictionary. + /// Maximum number of n-grams to store in the dictionary. public ColumnOptions(string name, string inputColumnName = null, int ngramLength = Defaults.NgramLength, int skipLength = Defaults.SkipLength, bool allLengths = Defaults.AllLengths, WeightingCriteria weighting = Defaults.Weighting, - int maximumTermCount = Defaults.MaximumTermCount) - : this(name, ngramLength, skipLength, allLengths, weighting, new int[] { maximumTermCount }, inputColumnName ?? 
name) + int maximumNgramsCount = Defaults.MaximumNgramsCount) + : this(name, ngramLength, skipLength, allLengths, weighting, new int[] { maximumNgramsCount }, inputColumnName ?? name) { } @@ -843,7 +843,7 @@ internal ColumnOptions(string name, int skipLength, bool allLengths, WeightingCriteria weighting, - int[] maximumTermCounts, + int[] maximumNgramsCounts, string inputColumnName = null) { Name = name; @@ -861,18 +861,18 @@ internal ColumnOptions(string name, var limits = new int[ngramLength]; if (!AllLengths) { - Contracts.CheckUserArg(Utils.Size(maximumTermCounts) == 0 || - Utils.Size(maximumTermCounts) == 1 && maximumTermCounts[0] > 0, nameof(maximumTermCounts)); - limits[ngramLength - 1] = Utils.Size(maximumTermCounts) == 0 ? Defaults.MaximumTermCount : maximumTermCounts[0]; + Contracts.CheckUserArg(Utils.Size(maximumNgramsCounts) == 0 || + Utils.Size(maximumNgramsCounts) == 1 && maximumNgramsCounts[0] > 0, nameof(maximumNgramsCounts)); + limits[ngramLength - 1] = Utils.Size(maximumNgramsCounts) == 0 ? Defaults.MaximumNgramsCount : maximumNgramsCounts[0]; } else { - Contracts.CheckUserArg(Utils.Size(maximumTermCounts) <= ngramLength, nameof(maximumTermCounts)); - Contracts.CheckUserArg(Utils.Size(maximumTermCounts) == 0 || maximumTermCounts.All(i => i >= 0) && maximumTermCounts[maximumTermCounts.Length - 1] > 0, nameof(maximumTermCounts)); - var extend = Utils.Size(maximumTermCounts) == 0 ? Defaults.MaximumTermCount : maximumTermCounts[maximumTermCounts.Length - 1]; - limits = Utils.BuildArray(ngramLength, i => i < Utils.Size(maximumTermCounts) ? maximumTermCounts[i] : extend); + Contracts.CheckUserArg(Utils.Size(maximumNgramsCounts) <= ngramLength, nameof(maximumNgramsCounts)); + Contracts.CheckUserArg(Utils.Size(maximumNgramsCounts) == 0 || maximumNgramsCounts.All(i => i >= 0) && maximumNgramsCounts[maximumNgramsCounts.Length - 1] > 0, nameof(maximumNgramsCounts)); + var extend = Utils.Size(maximumNgramsCounts) == 0 ? Defaults.MaximumNgramsCount : maximumNgramsCounts[maximumNgramsCounts.Length - 1]; + limits = Utils.BuildArray(ngramLength, i => i < Utils.Size(maximumNgramsCounts) ? maximumNgramsCounts[i] : extend); } - _maximumTermCounts = ImmutableArray.Create(limits); + _maximumNgramsCounts = ImmutableArray.Create(limits); } } diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs index 8d3856aa78..3a370c8739 100644 --- a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs +++ b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs @@ -194,7 +194,7 @@ public static WordTokenizingEstimator TokenizeWords(this TransformsCatalog.TextT /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. /// Whether to include all ngram lengths up to or only . - /// Maximum number of ngrams to store in the dictionary. + /// Maximum number of n-grams to store in the dictionary. /// Statistical measure used to evaluate how important a word is to a document in a corpus. 
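A short illustrative sketch of the renamed column-option surface above; the variable and column names are assumptions and the snippet is not taken from the patch:

    // requires: using Microsoft.ML.Transforms.Text;
    // With allLengths: true and a single maximumNgramsCount, the per-length limits exposed by
    // MaximumNgramsCounts all share that budget, i.e. { 50000, 50000, 50000 } for ngramLength: 3.
    var ngramColumn = new NgramExtractingEstimator.ColumnOptions(
        name: "Ngrams",
        inputColumnName: "Tokens",   // assumed to be a key-typed token column
        ngramLength: 3,
        skipLength: 0,
        allLengths: true,
        weighting: NgramExtractingEstimator.WeightingCriteria.TfIdf,
        maximumNgramsCount: 50000);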
/// /// @@ -209,31 +209,10 @@ public static NgramExtractingEstimator ProduceNgrams(this TransformsCatalog.Text int ngramLength = NgramExtractingEstimator.Defaults.NgramLength, int skipLength = NgramExtractingEstimator.Defaults.SkipLength, bool allLengths = NgramExtractingEstimator.Defaults.AllLengths, - int maximumTermCount = NgramExtractingEstimator.Defaults.MaximumTermCount, + int maximumNgramsCounts = NgramExtractingEstimator.Defaults.MaximumNgramsCount, NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.Defaults.Weighting) => new NgramExtractingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, inputColumnName, - ngramLength, skipLength, allLengths, maximumTermCount, weighting); - - /// - /// Produces a bag of counts of ngrams (sequences of consecutive words) in - /// and outputs bag of word vector for each output in - /// - /// The text-related transform's catalog. - /// Pairs of columns to compute bag of word vector. - /// Ngram length. - /// Maximum number of tokens to skip when constructing an ngram. - /// Whether to include all ngram lengths up to or only . - /// Maximum number of ngrams to store in the dictionary. - /// Statistical measure used to evaluate how important a word is to a document in a corpus. - public static NgramExtractingEstimator ProduceNgrams(this TransformsCatalog.TextTransforms catalog, - (string outputColumnName, string inputColumnName)[] columns, - int ngramLength = NgramExtractingEstimator.Defaults.NgramLength, - int skipLength = NgramExtractingEstimator.Defaults.SkipLength, - bool allLengths = NgramExtractingEstimator.Defaults.AllLengths, - int maximumTermCount = NgramExtractingEstimator.Defaults.MaximumTermCount, - NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.Defaults.Weighting) - => new NgramExtractingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), columns, - ngramLength, skipLength, allLengths, maximumTermCount, weighting); + ngramLength, skipLength, allLengths, maximumNgramsCounts, weighting); /// /// Produces a bag of counts of ngrams (sequences of consecutive words) in @@ -339,7 +318,7 @@ public static WordBagEstimator ProduceWordBags(this TransformsCatalog.TextTransf int ngramLength = NgramExtractingEstimator.Defaults.NgramLength, int skipLength = NgramExtractingEstimator.Defaults.SkipLength, bool allLengths = NgramExtractingEstimator.Defaults.AllLengths, - int maxNumTerms = NgramExtractingEstimator.Defaults.MaximumTermCount, + int maxNumTerms = NgramExtractingEstimator.Defaults.MaximumNgramsCount, NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.WeightingCriteria.Tf) => new WordBagEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, inputColumnName, ngramLength, skipLength, allLengths, maxNumTerms); @@ -362,7 +341,7 @@ public static WordBagEstimator ProduceWordBags(this TransformsCatalog.TextTransf int ngramLength = NgramExtractingEstimator.Defaults.NgramLength, int skipLength = NgramExtractingEstimator.Defaults.SkipLength, bool allLengths = NgramExtractingEstimator.Defaults.AllLengths, - int maxNumTerms = NgramExtractingEstimator.Defaults.MaximumTermCount, + int maxNumTerms = NgramExtractingEstimator.Defaults.MaximumNgramsCount, NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.WeightingCriteria.Tf) => new WordBagEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, inputColumnNames, 
ngramLength, skipLength, allLengths, maxNumTerms, weighting); @@ -383,7 +362,7 @@ public static WordBagEstimator ProduceWordBags(this TransformsCatalog.TextTransf int ngramLength = NgramExtractingEstimator.Defaults.NgramLength, int skipLength = NgramExtractingEstimator.Defaults.SkipLength, bool allLengths = NgramExtractingEstimator.Defaults.AllLengths, - int maxNumTerms = NgramExtractingEstimator.Defaults.MaximumTermCount, + int maxNumTerms = NgramExtractingEstimator.Defaults.MaximumNgramsCount, NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.WeightingCriteria.Tf) => new WordBagEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), columns, ngramLength, skipLength, allLengths, maxNumTerms, weighting); diff --git a/src/Microsoft.ML.Transforms/Text/WordBagTransform.cs b/src/Microsoft.ML.Transforms/Text/WordBagTransform.cs index 1ccd1d38fe..30a1a2fae1 100644 --- a/src/Microsoft.ML.Transforms/Text/WordBagTransform.cs +++ b/src/Microsoft.ML.Transforms/Text/WordBagTransform.cs @@ -229,7 +229,7 @@ internal abstract class ArgumentsBase public bool AllLengths = NgramExtractingEstimator.Defaults.AllLengths; [Argument(ArgumentType.Multiple, HelpText = "Maximum number of ngrams to store in the dictionary", ShortName = "max")] - public int[] MaxNumTerms = new int[] { NgramExtractingEstimator.Defaults.MaximumTermCount }; + public int[] MaxNumTerms = new int[] { NgramExtractingEstimator.Defaults.MaximumNgramsCount }; [Argument(ArgumentType.AtMostOnce, HelpText = "The weighting criteria")] public NgramExtractingEstimator.WeightingCriteria Weighting = NgramExtractingEstimator.Defaults.Weighting; @@ -315,7 +315,7 @@ internal static IDataTransform Create(IHostEnvironment env, Options options, IDa termArgs = new ValueToKeyMappingTransformer.Options() { - MaxNumTerms = Utils.Size(options.MaxNumTerms) > 0 ? options.MaxNumTerms[0] : NgramExtractingEstimator.Defaults.MaximumTermCount, + MaxNumTerms = Utils.Size(options.MaxNumTerms) > 0 ? options.MaxNumTerms[0] : NgramExtractingEstimator.Defaults.MaximumNgramsCount, Columns = new ValueToKeyMappingTransformer.Column[termCols.Count] }; } From 86048700849c235537cd38bcbca10f17431210d1 Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Mon, 11 Mar 2019 10:04:21 -0700 Subject: [PATCH 05/12] Address one more comment --- src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs | 6 +++--- test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs | 2 +- .../Scenarios/Api/CookbookSamples/CookbookSamples.cs | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs b/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs index bc542fb485..5aea9f53f9 100644 --- a/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs +++ b/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs @@ -485,8 +485,8 @@ public override IEstimator Reconcile(IHostEnvironment env, /// Produces a bag of counts of ngrams (sequences of consecutive words ) in a given tokenized text. /// It does so by building a dictionary of ngrams and using the id in the dictionary as the index in the bag. /// - /// /// is different from - /// in a way that takes tokenized text as input while tokenizes text internally. + /// /// is different from + /// in a way that takes tokenized text as input while tokenizes text internally. /// /// The column to apply to. /// Ngram length. @@ -494,7 +494,7 @@ public override IEstimator Reconcile(IHostEnvironment env, /// Whether to include all ngram lengths up to or only . 
/// Maximum number of n-grams to store in the dictionary. /// Statistical measure used to evaluate how important a word is to a document in a corpus. - public static Vector ToNgrams(this VarVector> input, + public static Vector ProduceNgrams(this VarVector> input, int ngramLength = 1, int skipLength = 0, bool allLengths = true, diff --git a/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs b/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs index 8a1f392533..9f18956af4 100644 --- a/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs +++ b/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs @@ -604,7 +604,7 @@ public void Ngrams() var est = data.MakeNewEstimator() .Append(r => ( r.label, - ngrams: r.text.TokenizeText().ToKey().ToNgrams(), + ngrams: r.text.TokenizeText().ToKey().ProduceNgrams(), ngramshash: r.text.TokenizeText().ToKey().ApplyNgramHashing())); var tdata = est.Fit(data).Transform(data); diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs index b60afc07f2..64d81361d4 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs @@ -467,7 +467,7 @@ private void TextFeaturizationOn(string dataPath) BagOfBigrams: r.Message.NormalizeText().ToBagofHashedWords(ngramLength: 2, allLengths: false), // NLP pipeline 3: bag of tri-character sequences with TF-IDF weighting. - BagOfTrichar: r.Message.TokenizeIntoCharacters().ToNgrams(ngramLength: 3, weighting: NgramExtractingEstimator.WeightingCriteria.TfIdf), + BagOfTrichar: r.Message.TokenizeIntoCharacters().ProduceNgrams(ngramLength: 3, weighting: NgramExtractingEstimator.WeightingCriteria.TfIdf), // NLP pipeline 4: word embeddings. // PretrainedModelKind.Sswe is used here for performance of the test. In a real From 47b3aacc5c653c231d2f0eb46aeff7bd29b911eb Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Mon, 11 Mar 2019 10:47:56 -0700 Subject: [PATCH 06/12] Rename NgramHashing's static API again --- src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs | 6 +++--- test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs b/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs index 5aea9f53f9..e722c4f7d6 100644 --- a/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs +++ b/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs @@ -571,8 +571,8 @@ public override IEstimator Reconcile(IHostEnvironment env, /// Produces a bag of counts of ngrams (sequences of consecutive words of length 1-n) in a given tokenized text. /// It does so by hashing each ngram and using the hash value as the index in the bag. /// - /// is different from - /// in a way that takes tokenized text as input while tokenizes text internally. + /// is different from + /// in a way that takes tokenized text as input while tokenizes text internally. /// /// The column to apply to. /// Number of bits to hash into. Must be between 1 and 30, inclusive. @@ -585,7 +585,7 @@ public override IEstimator Reconcile(IHostEnvironment env, /// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. 
/// 0 does not retain any input values. -1 retains all input values mapping to each hash. - public static Vector ApplyNgramHashing(this VarVector> input, + public static Vector ProduceHashedNgrams(this VarVector> input, int numberOfBits = 16, int ngramLength = 2, int skipLength = 0, diff --git a/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs b/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs index 9f18956af4..5cd6dbfb85 100644 --- a/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs +++ b/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs @@ -605,7 +605,7 @@ public void Ngrams() .Append(r => ( r.label, ngrams: r.text.TokenizeText().ToKey().ProduceNgrams(), - ngramshash: r.text.TokenizeText().ToKey().ApplyNgramHashing())); + ngramshash: r.text.TokenizeText().ToKey().ProduceHashedNgrams())); var tdata = est.Fit(data).Transform(data); var schema = tdata.AsDynamic.Schema; From 390aa147f04797eab8c9fbd2a52f8938a2f47b7e Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Mon, 11 Mar 2019 15:18:04 -0700 Subject: [PATCH 07/12] Address comments and handle WordBags and HashedWordBags --- .../Dynamic/NgramExtraction.cs | 2 +- .../TextStaticExtensions.cs | 24 +++---- .../Text/NgramHashingTransformer.cs | 5 +- .../Text/NgramTransform.cs | 24 +++---- .../Text/TextCatalog.cs | 64 +++---------------- .../Text/WrappedTextTransformers.cs | 18 +++--- .../StaticPipeTests.cs | 10 +-- .../Api/CookbookSamples/CookbookSamples.cs | 4 +- 8 files changed, 54 insertions(+), 97 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/NgramExtraction.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/NgramExtraction.cs index fa3c6317bf..d1f36d3731 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/NgramExtraction.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/NgramExtraction.cs @@ -61,7 +61,7 @@ public static void NgramTransform() // 'e' - 1 '' - 2 'd' - 1 '=' - 4 'R' - 1 'U' - 1 'D' - 2 'E' - 1 'u' - 1 ',' - 1 '2' - 1 // 'B' - 0 'e' - 6 's' - 3 't' - 6 '' - 9 'g' - 2 'a' - 2 'm' - 2 'I' - 0 ''' - 0 'v' - 0 ... // Preview of the CharsTwoGrams column obtained after processing the input. - var charsTwoGramColumn = transformedData_twochars.GetColumn>(transformedData_onechars.Schema["CharsUnigrams"]); + var charsTwoGramColumn = transformedData_twochars.GetColumn>(transformedData_twochars.Schema["CharsTwograms"]); transformedData_twochars.Schema["CharsTwograms"].GetSlotNames(ref slotNames); printHelper("CharsTwograms", charsTwoGramColumn, slotNames); diff --git a/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs b/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs index e722c4f7d6..4d8644afc9 100644 --- a/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs +++ b/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs @@ -310,15 +310,15 @@ public override IEstimator Reconcile(IHostEnvironment env, /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. /// Whether to include all ngram lengths up to or only . - /// Maximum number of ngrams to store in the dictionary. + /// Maximum number of ngrams to store in the dictionary. /// Statistical measure used to evaluate how important a word is to a document in a corpus. 
- public static Vector ToBagofWords(this Scalar input, + public static Vector ProduceWordBags(this Scalar input, int ngramLength = 1, int skipLength = 0, bool allLengths = true, - int maxNumTerms = 10000000, + int maximumNgramsCount = 10000000, NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.WeightingCriteria.Tf) - => new OutPipelineColumn(input, ngramLength, skipLength, allLengths, maxNumTerms, weighting); + => new OutPipelineColumn(input, ngramLength, skipLength, allLengths, maximumNgramsCount, weighting); } /// @@ -397,7 +397,7 @@ public override IEstimator Reconcile(IHostEnvironment env, /// It does so by hashing each ngram and using the hash value as the index in the bag. /// /// The column to apply to. - /// Number of bits to hash into. Must be between 1 and 30, inclusive. + /// Number of bits to hash into. Must be between 1 and 30, inclusive. /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. /// Whether to include all ngram lengths up to or only . @@ -407,14 +407,14 @@ public override IEstimator Reconcile(IHostEnvironment env, /// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. /// 0 does not retain any input values. -1 retains all input values mapping to each hash. - public static Vector ToBagofHashedWords(this Scalar input, - int hashBits = 16, + public static Vector ProduceHashedWordBags(this Scalar input, + int numberOfBits = 16, int ngramLength = 1, int skipLength = 0, bool allLengths = true, uint seed = 314489979, bool ordered = true, - int invertHash = 0) => new OutPipelineColumn(input, hashBits, ngramLength, skipLength, allLengths, seed, ordered, invertHash); + int invertHash = 0) => new OutPipelineColumn(input, numberOfBits, ngramLength, skipLength, allLengths, seed, ordered, invertHash); } /// @@ -485,8 +485,8 @@ public override IEstimator Reconcile(IHostEnvironment env, /// Produces a bag of counts of ngrams (sequences of consecutive words ) in a given tokenized text. /// It does so by building a dictionary of ngrams and using the id in the dictionary as the index in the bag. /// - /// /// is different from - /// in a way that takes tokenized text as input while tokenizes text internally. + /// /// is different from + /// in a way that takes tokenized text as input while tokenizes text internally. /// /// The column to apply to. /// Ngram length. @@ -571,8 +571,8 @@ public override IEstimator Reconcile(IHostEnvironment env, /// Produces a bag of counts of ngrams (sequences of consecutive words of length 1-n) in a given tokenized text. /// It does so by hashing each ngram and using the hash value as the index in the bag. /// - /// is different from - /// in a way that takes tokenized text as input while tokenizes text internally. + /// is different from + /// in a way that takes tokenized text as input while tokenizes text internally. /// /// The column to apply to. /// Number of bits to hash into. Must be between 1 and 30, inclusive. 
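A minimal usage sketch of the two renamed static-pipe extensions above, written in the same style as the test pipelines in this patch; `data` is assumed to be a statically-typed data view with `label` and `text` columns:

    // bagOfWords takes the dictionary-based path (maximumNgramsCount caps the dictionary size),
    // bagOfHashedWords takes the hashing path (numberOfBits sets the output dimensionality).
    var est = data.MakeNewEstimator()
        .Append(r => (
            r.label,
            bagOfWords: r.text.ProduceWordBags(ngramLength: 2, maximumNgramsCount: 100000),
            bagOfHashedWords: r.text.ProduceHashedWordBags(numberOfBits: 18, ngramLength: 2)));
    var transformed = est.Fit(data).Transform(data);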
diff --git a/src/Microsoft.ML.Transforms/Text/NgramHashingTransformer.cs b/src/Microsoft.ML.Transforms/Text/NgramHashingTransformer.cs index 86bfdc3e87..2c7eae634d 100644 --- a/src/Microsoft.ML.Transforms/Text/NgramHashingTransformer.cs +++ b/src/Microsoft.ML.Transforms/Text/NgramHashingTransformer.cs @@ -945,11 +945,14 @@ public ColumnOptions(string name, if (invertHash != 0 && numberOfBits >= 31) throw Contracts.ExceptParam(nameof(numberOfBits), $"Cannot support invertHash for a {0} bit hash. 30 is the maximum possible.", numberOfBits); - if (NgramLength + SkipLength > NgramBufferBuilder.MaxSkipNgramLength) + if (ngramLength == 1 && skipLength != 0) + throw Contracts.ExceptUserArg(nameof(skipLength), $"Number of skips can only be zero when the maximum n-gram's length is one."); + if (ngramLength + skipLength > NgramBufferBuilder.MaxSkipNgramLength) { throw Contracts.ExceptUserArg(nameof(skipLength), $"The sum of skipLength and ngramLength must be less than or equal to {NgramBufferBuilder.MaxSkipNgramLength}"); } + FriendlyNames = null; Name = name; InputColumnNamesArray = inputColumnNames; diff --git a/src/Microsoft.ML.Transforms/Text/NgramTransform.cs b/src/Microsoft.ML.Transforms/Text/NgramTransform.cs index 8e360e7123..7731a2dd62 100644 --- a/src/Microsoft.ML.Transforms/Text/NgramTransform.cs +++ b/src/Microsoft.ML.Transforms/Text/NgramTransform.cs @@ -846,20 +846,15 @@ internal ColumnOptions(string name, int[] maximumNgramsCounts, string inputColumnName = null) { - Name = name; - InputColumnName = inputColumnName ?? name; - NgramLength = ngramLength; - Contracts.CheckUserArg(0 < NgramLength && NgramLength <= NgramBufferBuilder.MaxSkipNgramLength, nameof(ngramLength)); - SkipLength = skipLength; - if (NgramLength + SkipLength > NgramBufferBuilder.MaxSkipNgramLength) - { + if (ngramLength == 1 && skipLength != 0) + throw Contracts.ExceptUserArg(nameof(skipLength), $"Number of skips can only be zero when the maximum n-gram's length is one."); + if (ngramLength + skipLength > NgramBufferBuilder.MaxSkipNgramLength) throw Contracts.ExceptUserArg(nameof(skipLength), $"The sum of skipLength and ngramLength must be less than or equal to {NgramBufferBuilder.MaxSkipNgramLength}"); - } - AllLengths = allLengths; - Weighting = weighting; + Contracts.CheckUserArg(0 < ngramLength && ngramLength <= NgramBufferBuilder.MaxSkipNgramLength, nameof(ngramLength)); + var limits = new int[ngramLength]; - if (!AllLengths) + if (!allLengths) { Contracts.CheckUserArg(Utils.Size(maximumNgramsCounts) == 0 || Utils.Size(maximumNgramsCounts) == 1 && maximumNgramsCounts[0] > 0, nameof(maximumNgramsCounts)); @@ -873,6 +868,13 @@ internal ColumnOptions(string name, limits = Utils.BuildArray(ngramLength, i => i < Utils.Size(maximumNgramsCounts) ? maximumNgramsCounts[i] : extend); } _maximumNgramsCounts = ImmutableArray.Create(limits); + + Name = name; + InputColumnName = inputColumnName ?? name; + NgramLength = ngramLength; + SkipLength = skipLength; + AllLengths = allLengths; + Weighting = weighting; } } diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs index 3a370c8739..486835c4a7 100644 --- a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs +++ b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs @@ -310,7 +310,7 @@ public static CustomStopWordsRemovingEstimator RemoveStopWords(this TransformsCa /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. /// Whether to include all ngram lengths up to or only . 
- /// Maximum number of ngrams to store in the dictionary. + /// Maximum number of ngrams to store in the dictionary. /// Statistical measure used to evaluate how important a word is to a document in a corpus. public static WordBagEstimator ProduceWordBags(this TransformsCatalog.TextTransforms catalog, string outputColumnName, @@ -318,10 +318,10 @@ public static WordBagEstimator ProduceWordBags(this TransformsCatalog.TextTransf int ngramLength = NgramExtractingEstimator.Defaults.NgramLength, int skipLength = NgramExtractingEstimator.Defaults.SkipLength, bool allLengths = NgramExtractingEstimator.Defaults.AllLengths, - int maxNumTerms = NgramExtractingEstimator.Defaults.MaximumNgramsCount, + int maximumNgramsCount = NgramExtractingEstimator.Defaults.MaximumNgramsCount, NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.WeightingCriteria.Tf) => new WordBagEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), - outputColumnName, inputColumnName, ngramLength, skipLength, allLengths, maxNumTerms); + outputColumnName, inputColumnName, ngramLength, skipLength, allLengths, maximumNgramsCount); /// /// Produces a bag of counts of ngrams (sequences of consecutive words) in @@ -333,7 +333,7 @@ public static WordBagEstimator ProduceWordBags(this TransformsCatalog.TextTransf /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. /// Whether to include all ngram lengths up to or only . - /// Maximum number of ngrams to store in the dictionary. + /// Maximum number of ngrams to store in the dictionary. /// Statistical measure used to evaluate how important a word is to a document in a corpus. public static WordBagEstimator ProduceWordBags(this TransformsCatalog.TextTransforms catalog, string outputColumnName, @@ -341,30 +341,10 @@ public static WordBagEstimator ProduceWordBags(this TransformsCatalog.TextTransf int ngramLength = NgramExtractingEstimator.Defaults.NgramLength, int skipLength = NgramExtractingEstimator.Defaults.SkipLength, bool allLengths = NgramExtractingEstimator.Defaults.AllLengths, - int maxNumTerms = NgramExtractingEstimator.Defaults.MaximumNgramsCount, + int maximumNgramsCount = NgramExtractingEstimator.Defaults.MaximumNgramsCount, NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.WeightingCriteria.Tf) => new WordBagEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), - outputColumnName, inputColumnNames, ngramLength, skipLength, allLengths, maxNumTerms, weighting); - - /// - /// Produces a bag of counts of ngrams (sequences of consecutive words) in - /// and outputs bag of word vector for each output in - /// - /// The text-related transform's catalog. - /// Pairs of columns to compute bag of word vector. - /// Ngram length. - /// Maximum number of tokens to skip when constructing an ngram. - /// Whether to include all ngram lengths up to or only . - /// Maximum number of ngrams to store in the dictionary. - /// Statistical measure used to evaluate how important a word is to a document in a corpus. 
- public static WordBagEstimator ProduceWordBags(this TransformsCatalog.TextTransforms catalog, - (string outputColumnName, string[] inputColumnNames)[] columns, - int ngramLength = NgramExtractingEstimator.Defaults.NgramLength, - int skipLength = NgramExtractingEstimator.Defaults.SkipLength, - bool allLengths = NgramExtractingEstimator.Defaults.AllLengths, - int maxNumTerms = NgramExtractingEstimator.Defaults.MaximumNgramsCount, - NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.WeightingCriteria.Tf) - => new WordBagEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), columns, ngramLength, skipLength, allLengths, maxNumTerms, weighting); + outputColumnName, inputColumnNames, ngramLength, skipLength, allLengths, maximumNgramsCount, weighting); /// /// Produces a bag of counts of hashed ngrams in @@ -426,34 +406,6 @@ public static WordHashBagEstimator ProduceHashedWordBags(this TransformsCatalog. => new WordHashBagEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, inputColumnNames, numberOfBits, ngramLength, skipLength, allLengths, seed, ordered, invertHash); - /// - /// Produces a bag of counts of hashed ngrams in - /// and outputs bag of word vector for each output in - /// - /// The text-related transform's catalog. - /// Pairs of columns to compute bag of word vector. - /// Number of bits to hash into. Must be between 1 and 30, inclusive. - /// Ngram length. - /// Maximum number of tokens to skip when constructing an ngram. - /// Whether to include all ngram lengths up to or only . - /// Hashing seed. - /// Whether the position of each source column should be included in the hash (when there are multiple source columns). - /// During hashing we constuct mappings between original values and the produced hash values. - /// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. - /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. - /// 0 does not retain any input values. -1 retains all input values mapping to each hash. - public static WordHashBagEstimator ProduceHashedWordBags(this TransformsCatalog.TextTransforms catalog, - (string outputColumnName, string[] inputColumnNames)[] columns, - int numberOfBits = NgramHashExtractingTransformer.DefaultArguments.HashBits, - int ngramLength = NgramHashExtractingTransformer.DefaultArguments.NgramLength, - int skipLength = NgramHashExtractingTransformer.DefaultArguments.SkipLength, - bool allLengths = NgramHashExtractingTransformer.DefaultArguments.AllLengths, - uint seed = NgramHashExtractingTransformer.DefaultArguments.Seed, - bool ordered = NgramHashExtractingTransformer.DefaultArguments.Ordered, - int invertHash = NgramHashExtractingTransformer.DefaultArguments.InvertHash) - => new WordHashBagEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), - columns, numberOfBits, ngramLength, skipLength, allLengths, seed, ordered, invertHash); - /// /// Produces a bag of counts of hashed ngrams in /// and outputs ngram vector as @@ -474,7 +426,7 @@ public static WordHashBagEstimator ProduceHashedWordBags(this TransformsCatalog. /// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. 
/// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. /// 0 does not retain any input values. -1 retains all input values mapping to each hash. - public static NgramHashingEstimator ApplyNgramHashing(this TransformsCatalog.TextTransforms catalog, + public static NgramHashingEstimator ProduceHashedNgrams(this TransformsCatalog.TextTransforms catalog, string outputColumnName, string inputColumnName = null, int numberOfBits = NgramHashingEstimator.Defaults.NumberOfBits, @@ -496,7 +448,7 @@ public static NgramHashingEstimator ApplyNgramHashing(this TransformsCatalog.Tex /// /// The text-related transform's catalog. /// Pairs of columns to compute n-grams. Note that gram indices are generated by hashing. - public static NgramHashingEstimator ApplyNgramHashing(this TransformsCatalog.TextTransforms catalog, + public static NgramHashingEstimator ProduceHashedNgrams(this TransformsCatalog.TextTransforms catalog, NgramHashingEstimator.ColumnOptions[] columns) => new NgramHashingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), columns); diff --git a/src/Microsoft.ML.Transforms/Text/WrappedTextTransformers.cs b/src/Microsoft.ML.Transforms/Text/WrappedTextTransformers.cs index cf8722495a..19f87ea3fc 100644 --- a/src/Microsoft.ML.Transforms/Text/WrappedTextTransformers.cs +++ b/src/Microsoft.ML.Transforms/Text/WrappedTextTransformers.cs @@ -36,7 +36,7 @@ public sealed class WordBagEstimator : IEstimator /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. /// Whether to include all ngram lengths up to or only . - /// Maximum number of ngrams to store in the dictionary. + /// Maximum number of ngrams to store in the dictionary. /// Statistical measure used to evaluate how important a word is to a document in a corpus. internal WordBagEstimator(IHostEnvironment env, string outputColumnName, @@ -44,9 +44,9 @@ internal WordBagEstimator(IHostEnvironment env, int ngramLength = 1, int skipLength = 0, bool allLengths = true, - int maxNumTerms = 10000000, + int maximumNgramsCount = 10000000, NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.WeightingCriteria.Tf) - : this(env, outputColumnName, new[] { inputColumnName ?? outputColumnName }, ngramLength, skipLength, allLengths, maxNumTerms, weighting) + : this(env, outputColumnName, new[] { inputColumnName ?? outputColumnName }, ngramLength, skipLength, allLengths, maximumNgramsCount, weighting) { } @@ -60,7 +60,7 @@ internal WordBagEstimator(IHostEnvironment env, /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. /// Whether to include all ngram lengths up to or only . - /// Maximum number of ngrams to store in the dictionary. + /// Maximum number of ngrams to store in the dictionary. /// Statistical measure used to evaluate how important a word is to a document in a corpus. 
internal WordBagEstimator(IHostEnvironment env, string outputColumnName, @@ -68,9 +68,9 @@ internal WordBagEstimator(IHostEnvironment env, int ngramLength = 1, int skipLength = 0, bool allLengths = true, - int maxNumTerms = 10000000, + int maximumNgramsCount = 10000000, NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.WeightingCriteria.Tf) - : this(env, new[] { (outputColumnName, inputColumnNames) }, ngramLength, skipLength, allLengths, maxNumTerms, weighting) + : this(env, new[] { (outputColumnName, inputColumnNames) }, ngramLength, skipLength, allLengths, maximumNgramsCount, weighting) { } @@ -83,14 +83,14 @@ internal WordBagEstimator(IHostEnvironment env, /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. /// Whether to include all ngram lengths up to or only . - /// Maximum number of ngrams to store in the dictionary. + /// Maximum number of ngrams to store in the dictionary. /// Statistical measure used to evaluate how important a word is to a document in a corpus. internal WordBagEstimator(IHostEnvironment env, (string outputColumnName, string[] inputColumnNames)[] columns, int ngramLength = 1, int skipLength = 0, bool allLengths = true, - int maxNumTerms = 10000000, + int maximumNgramsCount = 10000000, NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.WeightingCriteria.Tf) { Contracts.CheckValue(env, nameof(env)); @@ -106,7 +106,7 @@ internal WordBagEstimator(IHostEnvironment env, _ngramLength = ngramLength; _skipLength = skipLength; _allLengths = allLengths; - _maxNumTerms = maxNumTerms; + _maxNumTerms = maximumNgramsCount; _weighting = weighting; } diff --git a/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs b/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs index 5cd6dbfb85..926e770187 100644 --- a/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs +++ b/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs @@ -575,8 +575,8 @@ public void ConvertToWordBag() var est = data.MakeNewEstimator() .Append(r => ( r.label, - bagofword: r.text.ToBagofWords(), - bagofhashedword: r.text.ToBagofHashedWords())); + bagofword: r.text.ProduceWordBags(), + bagofhashedword: r.text.ProduceHashedWordBags())); var tdata = est.Fit(data).Transform(data); var schema = tdata.AsDynamic.Schema; @@ -675,7 +675,7 @@ public void LdaTopicModel() var est = data.MakeNewEstimator() .Append(r => ( r.label, - topics: r.text.ToBagofWords().LatentDirichletAllocation(numberOfTopics: 3, numberOfSummaryTermsPerTopic:5, alphaSum: 10, onFit: m => ldaSummary = m.LdaTopicSummary))); + topics: r.text.ProduceWordBags().LatentDirichletAllocation(numberOfTopics: 3, numberOfSummaryTermsPerTopic:5, alphaSum: 10, onFit: m => ldaSummary = m.LdaTopicSummary))); var transformer = est.Fit(data); var tdata = transformer.Transform(data); @@ -700,8 +700,8 @@ public void FeatureSelection() var est = data.MakeNewEstimator() .Append(r => ( r.label, - bag_of_words_count: r.text.ToBagofWords().SelectFeaturesBasedOnCount(10), - bag_of_words_mi: r.text.ToBagofWords().SelectFeaturesBasedOnMutualInformation(r.label))); + bag_of_words_count: r.text.ProduceWordBags().SelectFeaturesBasedOnCount(10), + bag_of_words_mi: r.text.ProduceWordBags().SelectFeaturesBasedOnMutualInformation(r.label))); var tdata = est.Fit(data).Transform(data); var schema = tdata.AsDynamic.Schema; diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs 
b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs index 64d81361d4..0261377a49 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs @@ -461,10 +461,10 @@ private void TextFeaturizationOn(string dataPath) TextFeatures: r.Message.FeaturizeText(), // NLP pipeline 1: bag of words. - BagOfWords: r.Message.NormalizeText().ToBagofWords(), + BagOfWords: r.Message.NormalizeText().ProduceWordBags(), // NLP pipeline 2: bag of bigrams, using hashes instead of dictionary indices. - BagOfBigrams: r.Message.NormalizeText().ToBagofHashedWords(ngramLength: 2, allLengths: false), + BagOfBigrams: r.Message.NormalizeText().ProduceHashedWordBags(ngramLength: 2, allLengths: false), // NLP pipeline 3: bag of tri-character sequences with TF-IDF weighting. BagOfTrichar: r.Message.TokenizeIntoCharacters().ProduceNgrams(ngramLength: 3, weighting: NgramExtractingEstimator.WeightingCriteria.TfIdf), From b60d6a496dccf8db05c8530bddadaa8a7a1d76c6 Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Mon, 11 Mar 2019 16:37:18 -0700 Subject: [PATCH 08/12] Address comment --- src/Microsoft.ML.Transforms/Text/NgramHashingTransformer.cs | 3 ++- src/Microsoft.ML.Transforms/Text/NgramTransform.cs | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/Microsoft.ML.Transforms/Text/NgramHashingTransformer.cs b/src/Microsoft.ML.Transforms/Text/NgramHashingTransformer.cs index 2c7eae634d..d0fb5e5a1f 100644 --- a/src/Microsoft.ML.Transforms/Text/NgramHashingTransformer.cs +++ b/src/Microsoft.ML.Transforms/Text/NgramHashingTransformer.cs @@ -946,7 +946,8 @@ public ColumnOptions(string name, throw Contracts.ExceptParam(nameof(numberOfBits), $"Cannot support invertHash for a {0} bit hash. 
30 is the maximum possible.", numberOfBits); if (ngramLength == 1 && skipLength != 0) - throw Contracts.ExceptUserArg(nameof(skipLength), $"Number of skips can only be zero when the maximum n-gram's length is one."); + throw Contracts.ExceptUserArg(nameof(skipLength), string.Format( + "{0} (actual value: {1}) can only be zero when {2} set to one.", nameof(skipLength), skipLength, nameof(ngramLength))); if (ngramLength + skipLength > NgramBufferBuilder.MaxSkipNgramLength) { throw Contracts.ExceptUserArg(nameof(skipLength), diff --git a/src/Microsoft.ML.Transforms/Text/NgramTransform.cs b/src/Microsoft.ML.Transforms/Text/NgramTransform.cs index 7731a2dd62..8bfad84278 100644 --- a/src/Microsoft.ML.Transforms/Text/NgramTransform.cs +++ b/src/Microsoft.ML.Transforms/Text/NgramTransform.cs @@ -847,7 +847,8 @@ internal ColumnOptions(string name, string inputColumnName = null) { if (ngramLength == 1 && skipLength != 0) - throw Contracts.ExceptUserArg(nameof(skipLength), $"Number of skips can only be zero when the maximum n-gram's length is one."); + throw Contracts.ExceptUserArg(nameof(skipLength), string.Format( + "{0} (actual value: {1}) can only be zero when {2} set to one.", nameof(skipLength), skipLength, nameof(ngramLength))); if (ngramLength + skipLength > NgramBufferBuilder.MaxSkipNgramLength) throw Contracts.ExceptUserArg(nameof(skipLength), $"The sum of skipLength and ngramLength must be less than or equal to {NgramBufferBuilder.MaxSkipNgramLength}"); From 97469bc54bd2a9112f51951f3feee3c3241bf375 Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Tue, 12 Mar 2019 16:42:20 -0700 Subject: [PATCH 09/12] ordered ---> useOrderedHashing --- .../TextStaticExtensions.cs | 40 +++++++++---------- .../Text/NgramHashingTransformer.cs | 26 ++++++------ .../Text/TextCatalog.cs | 19 +++++---- .../Text/WrappedTextTransformers.cs | 18 ++++----- 4 files changed, 51 insertions(+), 52 deletions(-) diff --git a/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs b/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs index 4d8644afc9..babd909ae4 100644 --- a/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs +++ b/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs @@ -336,9 +336,9 @@ public OutPipelineColumn(Scalar input, int skipLength, bool allLengths, uint seed, - bool ordered, + bool useOrderedHashing, int invertHash) - : base(new Reconciler(hashBits, ngramLength, skipLength, allLengths, seed, ordered, invertHash), input) + : base(new Reconciler(hashBits, ngramLength, skipLength, allLengths, seed, useOrderedHashing, invertHash), input) { Input = input; } @@ -351,17 +351,17 @@ private sealed class Reconciler : EstimatorReconciler, IEquatable private readonly int _skipLength; private readonly bool _allLengths; private readonly uint _seed; - private readonly bool _ordered; + private readonly bool _useOrderedHashing; private readonly int _invertHash; - public Reconciler(int hashBits, int ngramLength, int skipLength, bool allLengths, uint seed, bool ordered, int invertHash) + public Reconciler(int hashBits, int ngramLength, int skipLength, bool allLengths, uint seed, bool useOrderedHashing, int invertHash) { _hashBits = hashBits; _ngramLength = ngramLength; _skipLength = skipLength; _allLengths = allLengths; _seed = seed; - _ordered = ordered; + _useOrderedHashing = useOrderedHashing; _invertHash = invertHash; } @@ -372,7 +372,7 @@ public bool Equals(Reconciler other) _skipLength == other._skipLength && _allLengths == other._allLengths && _seed == other._seed && - _ordered == other._ordered && + 
_useOrderedHashing == other._useOrderedHashing && _invertHash == other._invertHash; } @@ -388,7 +388,7 @@ public override IEstimator Reconcile(IHostEnvironment env, foreach (var outCol in toOutput) pairs.Add((outputNames[outCol], new[] { inputNames[((OutPipelineColumn)outCol).Input] })); - return new WordHashBagEstimator(env, pairs.ToArray(), _hashBits, _ngramLength, _skipLength, _allLengths, _seed, _ordered, _invertHash); + return new WordHashBagEstimator(env, pairs.ToArray(), _hashBits, _ngramLength, _skipLength, _allLengths, _seed, _useOrderedHashing, _invertHash); } } @@ -402,7 +402,7 @@ public override IEstimator Reconcile(IHostEnvironment env, /// Maximum number of tokens to skip when constructing an ngram. /// Whether to include all ngram lengths up to or only . /// Hashing seed. - /// Whether the position of each source column should be included in the hash (when there are multiple source columns). + /// Whether the position of each source column should be included in the hash (when there are multiple source columns). /// During hashing we constuct mappings between original values and the produced hash values. /// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. @@ -413,8 +413,8 @@ public static Vector ProduceHashedWordBags(this Scalar input, int skipLength = 0, bool allLengths = true, uint seed = 314489979, - bool ordered = true, - int invertHash = 0) => new OutPipelineColumn(input, numberOfBits, ngramLength, skipLength, allLengths, seed, ordered, invertHash); + bool useOrderedHashing = true, + int invertHash = 0) => new OutPipelineColumn(input, numberOfBits, ngramLength, skipLength, allLengths, seed, useOrderedHashing, invertHash); } /// @@ -512,8 +512,8 @@ private sealed class OutPipelineColumn : Vector { public readonly VarVector> Input; - public OutPipelineColumn(VarVector> input, int numberOfBits, int ngramLength, int skipLength, bool allLengths, uint seed, bool ordered, int invertHash) - : base(new Reconciler(numberOfBits, ngramLength, skipLength, allLengths, seed, ordered, invertHash), input) + public OutPipelineColumn(VarVector> input, int numberOfBits, int ngramLength, int skipLength, bool allLengths, uint seed, bool useOrderedHashing, int invertHash) + : base(new Reconciler(numberOfBits, ngramLength, skipLength, allLengths, seed, useOrderedHashing, invertHash), input) { Input = input; } @@ -526,17 +526,17 @@ private sealed class Reconciler : EstimatorReconciler, IEquatable private readonly int _skipLength; private readonly bool _allLengths; private readonly uint _seed; - private readonly bool _ordered; + private readonly bool _useOrderedHashing; private readonly int _invertHash; - public Reconciler(int numberOfBits, int ngramLength, int skipLength, bool allLengths, uint seed, bool ordered, int invertHash) + public Reconciler(int numberOfBits, int ngramLength, int skipLength, bool allLengths, uint seed, bool useOrderedHashing, int invertHash) { _numberOfBits = numberOfBits; _ngramLength = ngramLength; _skipLength = skipLength; _allLengths = allLengths; _seed = seed; - _ordered = ordered; + _useOrderedHashing = useOrderedHashing; _invertHash = invertHash; } @@ -547,7 +547,7 @@ public bool Equals(Reconciler other) _skipLength == other._skipLength && _allLengths == other._allLengths && _seed == other._seed && - _ordered == other._ordered && + 
_useOrderedHashing == other._useOrderedHashing && _invertHash == other._invertHash; } @@ -561,7 +561,7 @@ public override IEstimator Reconcile(IHostEnvironment env, var columns = new List(); foreach (var outCol in toOutput) columns.Add(new NgramHashingEstimator.ColumnOptions(outputNames[outCol], new[] { inputNames[((OutPipelineColumn)outCol).Input] }, - _ngramLength, _skipLength, _allLengths, _numberOfBits, _seed, _ordered, _invertHash)); + _ngramLength, _skipLength, _allLengths, _numberOfBits, _seed, _useOrderedHashing, _invertHash)); return new NgramHashingEstimator(env, columns.ToArray()); } @@ -580,7 +580,7 @@ public override IEstimator Reconcile(IHostEnvironment env, /// Maximum number of tokens to skip when constructing an ngram. /// Whether to include all ngram lengths up to or only . /// Hashing seed. - /// Whether the position of each source column should be included in the hash (when there are multiple source columns). + /// Whether the position of each source column should be included in the hash (when there are multiple source columns). /// During hashing we constuct mappings between original values and the produced hash values. /// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. @@ -591,7 +591,7 @@ public static Vector ProduceHashedNgrams(this VarVector int skipLength = 0, bool allLengths = true, uint seed = 314489979, - bool ordered = true, - int invertHash = 0) => new OutPipelineColumn(input, numberOfBits, ngramLength, skipLength, allLengths, seed, ordered, invertHash); + bool useOrderedHashing = true, + int invertHash = 0) => new OutPipelineColumn(input, numberOfBits, ngramLength, skipLength, allLengths, seed, useOrderedHashing, invertHash); } } diff --git a/src/Microsoft.ML.Transforms/Text/NgramHashingTransformer.cs b/src/Microsoft.ML.Transforms/Text/NgramHashingTransformer.cs index d0fb5e5a1f..9acd8ccc4c 100644 --- a/src/Microsoft.ML.Transforms/Text/NgramHashingTransformer.cs +++ b/src/Microsoft.ML.Transforms/Text/NgramHashingTransformer.cs @@ -145,7 +145,7 @@ internal sealed class Options [Argument(ArgumentType.AtMostOnce, HelpText = "Whether the position of each source column should be included in the hash (when there are multiple source columns).", ShortName = "ord", SortOrder = 6)] - public bool Ordered = NgramHashingEstimator.Defaults.Ordered; + public bool Ordered = NgramHashingEstimator.Defaults.UseOrderedHashing; [Argument(ArgumentType.AtMostOnce, HelpText = "Limit the number of keys used to generate the slot name to this many. 0 means no invert hashing, -1 means no limit.", ShortName = "ih")] @@ -417,7 +417,7 @@ private NgramIdFinder GetNgramIdFinder(int iinfo) uint mask = (1U << _parent._columns[iinfo].NumberOfBits) - 1; int ngramLength = _parent._columns[iinfo].NgramLength; bool rehash = _parent._columns[iinfo].RehashUnigrams; - bool ordered = _parent._columns[iinfo].Ordered; + bool ordered = _parent._columns[iinfo].UseOrderedHashing; bool all = _parent._columns[iinfo].AllLengths; uint seed = _parent._columns[iinfo].Seed; @@ -891,7 +891,7 @@ public sealed class ColumnOptions /// Hashing seed. public readonly uint Seed; /// Whether the position of each term should be included in the hash. 
- public readonly bool Ordered; + public readonly bool UseOrderedHashing; /// /// During hashing we constuct mappings between original values and the produced hash values. /// Text representation of original values are stored in the slot names of the metadata for the new column. @@ -916,7 +916,7 @@ public sealed class ColumnOptions /// Whether to store all ngram lengths up to , or only . /// Number of bits to hash into. Must be between 1 and 31, inclusive. /// Hashing seed. - /// Whether the position of each term should be included in the hash. + /// Whether the position of each term should be included in the hash. /// During hashing we constuct mappings between original values and the produced hash values. /// Text representation of original values are stored in the slot names of the metadata for the new column. /// Hashing, as such, can map many initial values to one. @@ -930,7 +930,7 @@ public ColumnOptions(string name, bool allLengths = NgramHashingEstimator.Defaults.AllLengths, int numberOfBits = NgramHashingEstimator.Defaults.NumberOfBits, uint seed = NgramHashingEstimator.Defaults.Seed, - bool ordered = NgramHashingEstimator.Defaults.Ordered, + bool useOrderedHashing = NgramHashingEstimator.Defaults.UseOrderedHashing, int invertHash = NgramHashingEstimator.Defaults.InvertHash, bool rehashUnigrams = NgramHashingEstimator.Defaults.RehashUnigrams) { @@ -962,7 +962,7 @@ public ColumnOptions(string name, AllLengths = allLengths; NumberOfBits = numberOfBits; Seed = seed; - Ordered = ordered; + UseOrderedHashing = useOrderedHashing; InvertHash = invertHash; RehashUnigrams = rehashUnigrams; } @@ -996,7 +996,7 @@ internal ColumnOptions(ModelLoadContext ctx) Contracts.CheckDecode(1 <= NumberOfBits && NumberOfBits <= 30); Seed = ctx.Reader.ReadUInt32(); RehashUnigrams = ctx.Reader.ReadBoolByte(); - Ordered = ctx.Reader.ReadBoolByte(); + UseOrderedHashing = ctx.Reader.ReadBoolByte(); AllLengths = ctx.Reader.ReadBoolByte(); } @@ -1026,7 +1026,7 @@ internal ColumnOptions(ModelLoadContext ctx, string name, string[] inputColumnNa Contracts.CheckDecode(1 <= NumberOfBits && NumberOfBits <= 30); Seed = ctx.Reader.ReadUInt32(); RehashUnigrams = ctx.Reader.ReadBoolByte(); - Ordered = ctx.Reader.ReadBoolByte(); + UseOrderedHashing = ctx.Reader.ReadBoolByte(); AllLengths = ctx.Reader.ReadBoolByte(); } @@ -1060,7 +1060,7 @@ internal void Save(ModelSaveContext ctx) ctx.Writer.Write(NumberOfBits); ctx.Writer.Write(Seed); ctx.Writer.WriteBoolByte(RehashUnigrams); - ctx.Writer.WriteBoolByte(Ordered); + ctx.Writer.WriteBoolByte(UseOrderedHashing); ctx.Writer.WriteBoolByte(AllLengths); } } @@ -1073,7 +1073,7 @@ internal static class Defaults internal const int NumberOfBits = 16; internal const uint Seed = 314489979; internal const bool RehashUnigrams = false; - internal const bool Ordered = true; + internal const bool UseOrderedHashing = true; internal const int InvertHash = 0; } @@ -1095,7 +1095,7 @@ internal static class Defaults /// Maximum number of tokens to skip when constructing an ngram. /// Whether to include all ngram lengths up to or only . /// Hashing seed. - /// Whether the position of each source column should be included in the hash (when there are multiple source columns). + /// Whether the position of each source column should be included in the hash (when there are multiple source columns). /// During hashing we constuct mappings between original values and the produced hash values. 
/// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. @@ -1108,9 +1108,9 @@ internal NgramHashingEstimator(IHostEnvironment env, int skipLength = 0, bool allLengths = true, uint seed = 314489979, - bool ordered = true, + bool useOrderedHashing = true, int invertHash = 0) - : this(env, new ColumnOptions(outputColumnName, new[] { inputColumnName ?? outputColumnName }, ngramLength, skipLength, allLengths, numberOfBits, seed, ordered, invertHash)) + : this(env, new ColumnOptions(outputColumnName, new[] { inputColumnName ?? outputColumnName }, ngramLength, skipLength, allLengths, numberOfBits, seed, useOrderedHashing, invertHash)) { } diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs index 486835c4a7..2f3c3b0603 100644 --- a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs +++ b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs @@ -2,7 +2,6 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. -using System.Collections.Generic; using Microsoft.ML.Data; using Microsoft.ML.Runtime; using Microsoft.ML.Transforms.Text; @@ -358,7 +357,7 @@ public static WordBagEstimator ProduceWordBags(this TransformsCatalog.TextTransf /// Maximum number of tokens to skip when constructing an ngram. /// Whether to include all ngram lengths up to or only . /// Hashing seed. - /// Whether the position of each source column should be included in the hash (when there are multiple source columns). + /// Whether the position of each source column should be included in the hash (when there are multiple source columns). /// During hashing we constuct mappings between original values and the produced hash values. /// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. @@ -371,10 +370,10 @@ public static WordHashBagEstimator ProduceHashedWordBags(this TransformsCatalog. int skipLength = NgramHashExtractingTransformer.DefaultArguments.SkipLength, bool allLengths = NgramHashExtractingTransformer.DefaultArguments.AllLengths, uint seed = NgramHashExtractingTransformer.DefaultArguments.Seed, - bool ordered = NgramHashExtractingTransformer.DefaultArguments.Ordered, + bool useOrderedHashing = NgramHashExtractingTransformer.DefaultArguments.Ordered, int invertHash = NgramHashExtractingTransformer.DefaultArguments.InvertHash) => new WordHashBagEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), - outputColumnName, inputColumnName, numberOfBits, ngramLength, skipLength, allLengths, seed, ordered, invertHash); + outputColumnName, inputColumnName, numberOfBits, ngramLength, skipLength, allLengths, seed, useOrderedHashing, invertHash); /// /// Produces a bag of counts of hashed ngrams in @@ -388,7 +387,7 @@ public static WordHashBagEstimator ProduceHashedWordBags(this TransformsCatalog. /// Maximum number of tokens to skip when constructing an ngram. /// Whether to include all ngram lengths up to or only . /// Hashing seed. - /// Whether the position of each source column should be included in the hash (when there are multiple source columns). 
+ /// Whether the position of each source column should be included in the hash (when there are multiple source columns). /// During hashing we constuct mappings between original values and the produced hash values. /// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. @@ -401,10 +400,10 @@ public static WordHashBagEstimator ProduceHashedWordBags(this TransformsCatalog. int skipLength = NgramHashExtractingTransformer.DefaultArguments.SkipLength, bool allLengths = NgramHashExtractingTransformer.DefaultArguments.AllLengths, uint seed = NgramHashExtractingTransformer.DefaultArguments.Seed, - bool ordered = NgramHashExtractingTransformer.DefaultArguments.Ordered, + bool useOrderedHashing = NgramHashExtractingTransformer.DefaultArguments.Ordered, int invertHash = NgramHashExtractingTransformer.DefaultArguments.InvertHash) => new WordHashBagEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), - outputColumnName, inputColumnNames, numberOfBits, ngramLength, skipLength, allLengths, seed, ordered, invertHash); + outputColumnName, inputColumnNames, numberOfBits, ngramLength, skipLength, allLengths, seed, useOrderedHashing, invertHash); /// /// Produces a bag of counts of hashed ngrams in @@ -421,7 +420,7 @@ public static WordHashBagEstimator ProduceHashedWordBags(this TransformsCatalog. /// Maximum number of tokens to skip when constructing an ngram. /// Whether to include all ngram lengths up to or only . /// Hashing seed. - /// Whether the position of each source column should be included in the hash (when there are multiple source columns). + /// Whether the position of each source column should be included in the hash (when there are multiple source columns). /// During hashing we constuct mappings between original values and the produced hash values. /// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. @@ -434,10 +433,10 @@ public static NgramHashingEstimator ProduceHashedNgrams(this TransformsCatalog.T int skipLength = NgramHashingEstimator.Defaults.SkipLength, bool allLengths = NgramHashingEstimator.Defaults.AllLengths, uint seed = NgramHashingEstimator.Defaults.Seed, - bool ordered = NgramHashingEstimator.Defaults.Ordered, + bool useOrderedHashing = NgramHashingEstimator.Defaults.UseOrderedHashing, int invertHash = NgramHashingEstimator.Defaults.InvertHash) => new NgramHashingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), - outputColumnName, inputColumnName, numberOfBits, ngramLength, skipLength, allLengths, seed, ordered, invertHash); + outputColumnName, inputColumnName, numberOfBits, ngramLength, skipLength, allLengths, seed, useOrderedHashing, invertHash); /// /// Produces a bag of counts of hashed ngrams for each . 
For each column, diff --git a/src/Microsoft.ML.Transforms/Text/WrappedTextTransformers.cs b/src/Microsoft.ML.Transforms/Text/WrappedTextTransformers.cs index 19f87ea3fc..bd7df9abc0 100644 --- a/src/Microsoft.ML.Transforms/Text/WrappedTextTransformers.cs +++ b/src/Microsoft.ML.Transforms/Text/WrappedTextTransformers.cs @@ -169,7 +169,7 @@ public sealed class WordHashBagEstimator : IEstimator /// Maximum number of tokens to skip when constructing an ngram. /// Whether to include all ngram lengths up to or only . /// Hashing seed. - /// Whether the position of each source column should be included in the hash (when there are multiple source columns). + /// Whether the position of each source column should be included in the hash (when there are multiple source columns). /// During hashing we constuct mappings between original values and the produced hash values. /// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. @@ -182,9 +182,9 @@ internal WordHashBagEstimator(IHostEnvironment env, int skipLength = 0, bool allLengths = true, uint seed = 314489979, - bool ordered = true, + bool useOrderedHashing = true, int invertHash = 0) - : this(env, new[] { (outputColumnName, new[] { inputColumnName ?? outputColumnName }) }, hashBits, ngramLength, skipLength, allLengths, seed, ordered, invertHash) + : this(env, new[] { (outputColumnName, new[] { inputColumnName ?? outputColumnName }) }, hashBits, ngramLength, skipLength, allLengths, seed, useOrderedHashing, invertHash) { } @@ -200,7 +200,7 @@ internal WordHashBagEstimator(IHostEnvironment env, /// Maximum number of tokens to skip when constructing an ngram. /// Whether to include all ngram lengths up to or only . /// Hashing seed. - /// Whether the position of each source column should be included in the hash (when there are multiple source columns). + /// Whether the position of each source column should be included in the hash (when there are multiple source columns). /// During hashing we constuct mappings between original values and the produced hash values. /// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. @@ -213,9 +213,9 @@ internal WordHashBagEstimator(IHostEnvironment env, int skipLength = 0, bool allLengths = true, uint seed = 314489979, - bool ordered = true, + bool useOrderedHashing = true, int invertHash = 0) - : this(env, new[] { (outputColumnName, inputColumnNames) }, hashBits, ngramLength, skipLength, allLengths, seed, ordered, invertHash) + : this(env, new[] { (outputColumnName, inputColumnNames) }, hashBits, ngramLength, skipLength, allLengths, seed, useOrderedHashing, invertHash) { } @@ -230,7 +230,7 @@ internal WordHashBagEstimator(IHostEnvironment env, /// Maximum number of tokens to skip when constructing an ngram. /// Whether to include all ngram lengths up to or only . /// Hashing seed. - /// Whether the position of each source column should be included in the hash (when there are multiple source columns). + /// Whether the position of each source column should be included in the hash (when there are multiple source columns). 
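ProduceHashedNgrams, renamed in the same way in the hunk above, operates on a vector of keys rather than raw text, so a sketch of its use needs an upstream tokenization and key-mapping step. The TokenizeWords plus MapValueToKey prefix below is an assumption about how such a pipeline is typically assembled, and the column names are hypothetical.

    var mlContext = new MLContext();
    var pipeline = mlContext.Transforms.Text.TokenizeWords("Tokens", "ReviewText")
        .Append(mlContext.Transforms.Conversion.MapValueToKey("Tokens"))
        .Append(mlContext.Transforms.Text.ProduceHashedNgrams(
            "NgramHash", "Tokens",
            numberOfBits: 16,
            ngramLength: 2,
            useAllLengths: false,            // keep only bigrams
            useOrderedHashing: true,
            maximumNumberOfInverts: 1));     // retain one original value per hash slot for slot names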
/// During hashing we constuct mappings between original values and the produced hash values. /// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. @@ -242,7 +242,7 @@ internal WordHashBagEstimator(IHostEnvironment env, int skipLength = 0, bool allLengths = true, uint seed = 314489979, - bool ordered = true, + bool useOrderedHashing = true, int invertHash = 0) { Contracts.CheckValue(env, nameof(env)); @@ -260,7 +260,7 @@ internal WordHashBagEstimator(IHostEnvironment env, _skipLength = skipLength; _allLengths = allLengths; _seed = seed; - _ordered = ordered; + _ordered = useOrderedHashing; _invertHash = invertHash; } From d1d2e66020ecd035b6c0b2ecb2996fdafe561ac9 Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Tue, 12 Mar 2019 22:53:45 -0700 Subject: [PATCH 10/12] Fix a name --- src/Microsoft.ML.Transforms/Text/TextCatalog.cs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs index 1f5b129af4..8eec4ef097 100644 --- a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs +++ b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs @@ -193,7 +193,7 @@ public static WordTokenizingEstimator TokenizeWords(this TransformsCatalog.TextT /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. /// Whether to include all ngram lengths up to or only . - /// Maximum number of n-grams to store in the dictionary. + /// Maximum number of n-grams to store in the dictionary. /// Statistical measure used to evaluate how important a word is to a document in a corpus. 
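The useOrderedHashing remark about multiple source columns is easiest to see on the overload that takes several input columns. A hedged sketch, with hypothetical Title and Body columns:

    var mlContext = new MLContext();
    var titleBodyBag = mlContext.Transforms.Text.ProduceHashedWordBags(
        "TitleBodyBag",
        new[] { "Title", "Body" },
        numberOfBits: 18,
        ngramLength: 1,
        useOrderedHashing: true);   // folds the source-column position into the hash, so the same
                                    // word coming from Title and from Body lands in different slots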
/// /// @@ -208,10 +208,10 @@ public static NgramExtractingEstimator ProduceNgrams(this TransformsCatalog.Text int ngramLength = NgramExtractingEstimator.Defaults.NgramLength, int skipLength = NgramExtractingEstimator.Defaults.SkipLength, bool allLengths = NgramExtractingEstimator.Defaults.AllLengths, - int maximumNgramsCounts = NgramExtractingEstimator.Defaults.MaximumNgramsCount, + int maximumNgramsCount = NgramExtractingEstimator.Defaults.MaximumNgramsCount, NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.Defaults.Weighting) => new NgramExtractingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, inputColumnName, - ngramLength, skipLength, allLengths, maximumNgramsCounts, weighting); + ngramLength, skipLength, allLengths, maximumNgramsCount, weighting); /// /// Produces a bag of counts of ngrams (sequences of consecutive words) in From fa57beab542b7c7edcaee0a2fd58b273b187218e Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Tue, 12 Mar 2019 23:28:21 -0700 Subject: [PATCH 11/12] Deal with most allLengths --- docs/code/MlNetCookBook.md | 2 +- .../TextStaticExtensions.cs | 32 ++++++------- .../Text/NgramHashingTransformer.cs | 42 ++++++++-------- .../Text/NgramTransform.cs | 40 ++++++++-------- .../Text/TextCatalog.cs | 36 +++++++------- .../Text/TextFeaturizingEstimator.cs | 6 +-- .../Text/WordBagTransform.cs | 25 +++++----- .../Text/WordHashBagProducingTransform.cs | 18 +++---- .../Text/WrappedTextTransformers.cs | 48 +++++++++---------- .../DataTransformation.cs | 2 +- .../CookbookSamplesDynamicApi.cs | 2 +- 11 files changed, 127 insertions(+), 126 deletions(-) diff --git a/docs/code/MlNetCookBook.md b/docs/code/MlNetCookBook.md index 8edb9a626a..7a73a1178a 100644 --- a/docs/code/MlNetCookBook.md +++ b/docs/code/MlNetCookBook.md @@ -772,7 +772,7 @@ var pipeline = // NLP pipeline 2: bag of bigrams, using hashes instead of dictionary indices. .Append(new WordHashBagEstimator(mlContext, "BagOfBigrams","NormalizedMessage", - ngramLength: 2, allLengths: false)) + ngramLength: 2, useAllLengths: false)) // NLP pipeline 3: bag of tri-character sequences with TF-IDF weighting. 
.Append(mlContext.Transforms.Text.TokenizeCharacters("MessageChars", "Message")) diff --git a/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs b/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs index ed18a6a53e..4d597837f8 100644 --- a/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs +++ b/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs @@ -263,7 +263,7 @@ private sealed class Reconciler : EstimatorReconciler, IEquatable { private readonly int _ngramLength; private readonly int _skipLength; - private readonly bool _allLengths; + private readonly bool _useAllLengths; private readonly int _maxNumTerms; private readonly NgramExtractingEstimator.WeightingCriteria _weighting; @@ -271,7 +271,7 @@ public Reconciler(int ngramLength, int skipLength, bool allLengths, int maxNumTe { _ngramLength = ngramLength; _skipLength = skipLength; - _allLengths = allLengths; + _useAllLengths = allLengths; _maxNumTerms = maxNumTerms; _weighting = weighting; @@ -281,7 +281,7 @@ public bool Equals(Reconciler other) { return _ngramLength == other._ngramLength && _skipLength == other._skipLength && - _allLengths == other._allLengths && + _useAllLengths == other._useAllLengths && _maxNumTerms == other._maxNumTerms && _weighting == other._weighting; } @@ -298,7 +298,7 @@ public override IEstimator Reconcile(IHostEnvironment env, foreach (var outCol in toOutput) pairs.Add((outputNames[outCol], new[] { inputNames[((OutPipelineColumn)outCol).Input] })); - return new WordBagEstimator(env, pairs.ToArray(), _ngramLength, _skipLength, _allLengths, _maxNumTerms, _weighting); + return new WordBagEstimator(env, pairs.ToArray(), _ngramLength, _skipLength, _useAllLengths, _maxNumTerms, _weighting); } } @@ -349,7 +349,7 @@ private sealed class Reconciler : EstimatorReconciler, IEquatable private readonly int _numberOfBits; private readonly int _ngramLength; private readonly int _skipLength; - private readonly bool _allLengths; + private readonly bool _useAllLengths; private readonly uint _seed; private readonly bool _useOrderedHashing; private readonly int _maximumNumberOfInverts; @@ -359,7 +359,7 @@ public Reconciler(int numberOfBits, int ngramLength, int skipLength, bool allLen _numberOfBits = numberOfBits; _ngramLength = ngramLength; _skipLength = skipLength; - _allLengths = allLengths; + _useAllLengths = allLengths; _seed = seed; _useOrderedHashing = useOrderedHashing; _maximumNumberOfInverts = maximumNumberOfInverts; @@ -370,7 +370,7 @@ public bool Equals(Reconciler other) return _numberOfBits == other._numberOfBits && _ngramLength == other._ngramLength && _skipLength == other._skipLength && - _allLengths == other._allLengths && + _useAllLengths == other._useAllLengths && _seed == other._seed && _useOrderedHashing == other._useOrderedHashing && _maximumNumberOfInverts == other._maximumNumberOfInverts; @@ -388,7 +388,7 @@ public override IEstimator Reconcile(IHostEnvironment env, foreach (var outCol in toOutput) pairs.Add((outputNames[outCol], new[] { inputNames[((OutPipelineColumn)outCol).Input] })); - return new WordHashBagEstimator(env, pairs.ToArray(), _numberOfBits, _ngramLength, _skipLength, _allLengths, _seed, _useOrderedHashing, _maximumNumberOfInverts); + return new WordHashBagEstimator(env, pairs.ToArray(), _numberOfBits, _ngramLength, _skipLength, _useAllLengths, _seed, _useOrderedHashing, _maximumNumberOfInverts); } } @@ -442,7 +442,7 @@ private sealed class Reconciler : EstimatorReconciler, IEquatable { private readonly int _ngramLength; private readonly int _skipLength; - private readonly bool 
_allLengths; + private readonly bool _useAllLengths; private readonly int _maxNgramsCount; private readonly NgramExtractingEstimator.WeightingCriteria _weighting; @@ -450,7 +450,7 @@ public Reconciler(int ngramLength, int skipLength, bool allLengths, int maxNumTe { _ngramLength = ngramLength; _skipLength = skipLength; - _allLengths = allLengths; + _useAllLengths = allLengths; _maxNgramsCount = maxNumTerms; _weighting = weighting; @@ -460,7 +460,7 @@ public bool Equals(Reconciler other) { return _ngramLength == other._ngramLength && _skipLength == other._skipLength && - _allLengths == other._allLengths && + _useAllLengths == other._useAllLengths && _maxNgramsCount == other._maxNgramsCount && _weighting == other._weighting; } @@ -477,7 +477,7 @@ public override IEstimator Reconcile(IHostEnvironment env, foreach (var outCol in toOutput) pairs.Add((outputNames[outCol], inputNames[((OutPipelineColumn)outCol).Input])); - return new NgramExtractingEstimator(env, pairs.ToArray(), _ngramLength, _skipLength, _allLengths, _maxNgramsCount, _weighting); + return new NgramExtractingEstimator(env, pairs.ToArray(), _ngramLength, _skipLength, _useAllLengths, _maxNgramsCount, _weighting); } } @@ -524,7 +524,7 @@ private sealed class Reconciler : EstimatorReconciler, IEquatable private readonly int _numberOfBits; private readonly int _ngramLength; private readonly int _skipLength; - private readonly bool _allLengths; + private readonly bool _useAllLengths; private readonly uint _seed; private readonly bool _useOrderedHashing; private readonly int _maximumNumberOfInverts; @@ -534,7 +534,7 @@ public Reconciler(int numberOfBits, int ngramLength, int skipLength, bool allLen _numberOfBits = numberOfBits; _ngramLength = ngramLength; _skipLength = skipLength; - _allLengths = allLengths; + _useAllLengths = allLengths; _seed = seed; _useOrderedHashing = useOrderedHashing; _maximumNumberOfInverts = maximumNumberOfInverts; @@ -545,7 +545,7 @@ public bool Equals(Reconciler other) return _numberOfBits == other._numberOfBits && _ngramLength == other._ngramLength && _skipLength == other._skipLength && - _allLengths == other._allLengths && + _useAllLengths == other._useAllLengths && _seed == other._seed && _useOrderedHashing == other._useOrderedHashing && _maximumNumberOfInverts == other._maximumNumberOfInverts; @@ -561,7 +561,7 @@ public override IEstimator Reconcile(IHostEnvironment env, var columns = new List(); foreach (var outCol in toOutput) columns.Add(new NgramHashingEstimator.ColumnOptions(outputNames[outCol], new[] { inputNames[((OutPipelineColumn)outCol).Input] }, - _ngramLength, _skipLength, _allLengths, _numberOfBits, _seed, _useOrderedHashing, _maximumNumberOfInverts)); + _ngramLength, _skipLength, _useAllLengths, _numberOfBits, _seed, _useOrderedHashing, _maximumNumberOfInverts)); return new NgramHashingEstimator(env, columns.ToArray()); } diff --git a/src/Microsoft.ML.Transforms/Text/NgramHashingTransformer.cs b/src/Microsoft.ML.Transforms/Text/NgramHashingTransformer.cs index 8de00d8106..21337758ca 100644 --- a/src/Microsoft.ML.Transforms/Text/NgramHashingTransformer.cs +++ b/src/Microsoft.ML.Transforms/Text/NgramHashingTransformer.cs @@ -42,8 +42,8 @@ internal sealed class Column : ManyToOneColumn public int? NgramLength; [Argument(ArgumentType.AtMostOnce, HelpText = - "Whether to include all ngram lengths up to " + nameof(NgramLength) + " or only " + nameof(NgramLength), ShortName = "all")] - public bool? 
AllLengths; + "Whether to include all ngram lengths up to " + nameof(NgramLength) + " or only " + nameof(NgramLength), Name = "AllLengths", ShortName = "all")] + public bool? UseAllLengths; [Argument(ArgumentType.AtMostOnce, HelpText = "Maximum number of tokens to skip when constructing an ngram", @@ -98,7 +98,7 @@ private protected override bool TryParse(string str) internal bool TryUnparse(StringBuilder sb) { Contracts.AssertValue(sb); - if (NgramLength != null || AllLengths != null || SkipLength != null || Seed != null || + if (NgramLength != null || UseAllLengths != null || SkipLength != null || Seed != null || RehashUnigrams != null || Ordered != null || MaximumNumberOfInverts != null) { return false; @@ -123,8 +123,8 @@ internal sealed class Options [Argument(ArgumentType.AtMostOnce, HelpText = "Whether to include all ngram lengths up to " + nameof(NgramLength) + " or only " + nameof(NgramLength), - ShortName = "all", SortOrder = 4)] - public bool AllLengths = NgramHashingEstimator.Defaults.AllLengths; + Name = "AllLengths", ShortName = "all", SortOrder = 4)] + public bool UseAllLengths = NgramHashingEstimator.Defaults.UseAllLengths; [Argument(ArgumentType.AtMostOnce, HelpText = "Maximum number of tokens to skip when constructing an ngram", @@ -352,7 +352,7 @@ private static IDataTransform Create(IHostEnvironment env, Options options, IDat item.Source ?? new string[] { item.Name }, item.NgramLength ?? options.NgramLength, item.SkipLength ?? options.SkipLength, - item.AllLengths ?? options.AllLengths, + item.UseAllLengths ?? options.UseAllLengths, item.NumberOfBits ?? options.NumberOfBits, item.Seed ?? options.Seed, item.Ordered ?? options.Ordered, @@ -418,7 +418,7 @@ private NgramIdFinder GetNgramIdFinder(int iinfo) int ngramLength = _parent._columns[iinfo].NgramLength; bool rehash = _parent._columns[iinfo].RehashUnigrams; bool ordered = _parent._columns[iinfo].UseOrderedHashing; - bool all = _parent._columns[iinfo].AllLengths; + bool all = _parent._columns[iinfo].UseAllLengths; uint seed = _parent._columns[iinfo].Seed; // REVIEW: Consider the case when: @@ -885,7 +885,7 @@ public sealed class ColumnOptions /// Maximum number of tokens to skip when constructing an ngram. public readonly int SkipLength; /// Whether to store all ngram lengths up to , or only . - public readonly bool AllLengths; + public readonly bool UseAllLengths; /// Number of bits to hash into. Must be between 1 and 31, inclusive. public readonly int NumberOfBits; /// Hashing seed. @@ -913,7 +913,7 @@ public sealed class ColumnOptions /// Names of the columns to transform. /// Maximum ngram length. /// Maximum number of tokens to skip when constructing an ngram. - /// Whether to store all ngram lengths up to , or only . + /// Whether to store all ngram lengths up to , or only . /// Number of bits to hash into. Must be between 1 and 31, inclusive. /// Hashing seed. /// Whether the position of each term should be included in the hash. 
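For readers skimming the rename, the semantics of UseAllLengths (previously AllLengths) are unchanged: it controls whether every ngram length up to NgramLength is emitted, or only NgramLength itself. A small worked example, with an arbitrary separator in the comments:

    // Token sequence: ["the", "quick", "brown"], ngramLength = 2, skipLength = 0
    //   useAllLengths = true  -> the, quick, brown, the|quick, quick|brown
    //   useAllLengths = false -> the|quick, quick|brown
    // Raising skipLength to 1 additionally admits the skip-bigram the|brown.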
@@ -927,7 +927,7 @@ public ColumnOptions(string name, string[] inputColumnNames, int ngramLength = NgramHashingEstimator.Defaults.NgramLength, int skipLength = NgramHashingEstimator.Defaults.SkipLength, - bool allLengths = NgramHashingEstimator.Defaults.AllLengths, + bool useAllLengths = NgramHashingEstimator.Defaults.UseAllLengths, int numberOfBits = NgramHashingEstimator.Defaults.NumberOfBits, uint seed = NgramHashingEstimator.Defaults.Seed, bool useOrderedHashing = NgramHashingEstimator.Defaults.UseOrderedHashing, @@ -959,7 +959,7 @@ public ColumnOptions(string name, InputColumnNamesArray = inputColumnNames; NgramLength = ngramLength; SkipLength = skipLength; - AllLengths = allLengths; + UseAllLengths = useAllLengths; NumberOfBits = numberOfBits; Seed = seed; UseOrderedHashing = useOrderedHashing; @@ -997,7 +997,7 @@ internal ColumnOptions(ModelLoadContext ctx) Seed = ctx.Reader.ReadUInt32(); RehashUnigrams = ctx.Reader.ReadBoolByte(); UseOrderedHashing = ctx.Reader.ReadBoolByte(); - AllLengths = ctx.Reader.ReadBoolByte(); + UseAllLengths = ctx.Reader.ReadBoolByte(); } internal ColumnOptions(ModelLoadContext ctx, string name, string[] inputColumnNames) @@ -1027,7 +1027,7 @@ internal ColumnOptions(ModelLoadContext ctx, string name, string[] inputColumnNa Seed = ctx.Reader.ReadUInt32(); RehashUnigrams = ctx.Reader.ReadBoolByte(); UseOrderedHashing = ctx.Reader.ReadBoolByte(); - AllLengths = ctx.Reader.ReadBoolByte(); + UseAllLengths = ctx.Reader.ReadBoolByte(); } internal void Save(ModelSaveContext ctx) @@ -1061,14 +1061,14 @@ internal void Save(ModelSaveContext ctx) ctx.Writer.Write(Seed); ctx.Writer.WriteBoolByte(RehashUnigrams); ctx.Writer.WriteBoolByte(UseOrderedHashing); - ctx.Writer.WriteBoolByte(AllLengths); + ctx.Writer.WriteBoolByte(UseAllLengths); } } internal static class Defaults { internal const int NgramLength = 2; - internal const bool AllLengths = true; + internal const bool UseAllLengths = true; internal const int SkipLength = 0; internal const int NumberOfBits = 16; internal const uint Seed = 314489979; @@ -1093,7 +1093,7 @@ internal static class Defaults /// Number of bits to hash into. Must be between 1 and 30, inclusive. /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. - /// Whether to include all ngram lengths up to or only . + /// Whether to include all ngram lengths up to or only . /// Hashing seed. /// Whether the position of each source column should be included in the hash (when there are multiple source columns). /// During hashing we constuct mappings between original values and the produced hash values. @@ -1106,11 +1106,11 @@ internal NgramHashingEstimator(IHostEnvironment env, int numberOfBits = 16, int ngramLength = 2, int skipLength = 0, - bool allLengths = true, + bool useAllLengths = true, uint seed = 314489979, bool useOrderedHashing = true, int maximumNumberOfInverts = 0) - : this(env, outputColumnName, new[] { inputColumnName ?? outputColumnName }, numberOfBits, ngramLength, skipLength, allLengths, seed, useOrderedHashing, maximumNumberOfInverts) + : this(env, outputColumnName, new[] { inputColumnName ?? outputColumnName }, numberOfBits, ngramLength, skipLength, useAllLengths, seed, useOrderedHashing, maximumNumberOfInverts) { } @@ -1127,7 +1127,7 @@ internal NgramHashingEstimator(IHostEnvironment env, /// Number of bits to hash into. Must be between 1 and 30, inclusive. /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. - /// Whether to include all ngram lengths up to or only . 
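The per-column form of the same options can be sketched as below; only parameters visible in the hunk above are spelled out, the column names are hypothetical, and the remaining arguments are left at their defaults. End users would normally reach this through ProduceHashedNgrams rather than building the options directly.

    var column = new NgramHashingEstimator.ColumnOptions(
        name: "NgramHash",
        inputColumnNames: new[] { "Tokens" },
        ngramLength: 2,
        skipLength: 0,
        useAllLengths: true,
        numberOfBits: 16,                 // output vector has 1 << 16 slots
        useOrderedHashing: true);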
+ /// Whether to include all ngram lengths up to or only . /// Hashing seed. /// Whether the position of each source column should be included in the hash (when there are multiple source columns). /// During hashing we constuct mappings between original values and the produced hash values. @@ -1140,11 +1140,11 @@ internal NgramHashingEstimator(IHostEnvironment env, int numberOfBits = 16, int ngramLength = 2, int skipLength = 0, - bool allLengths = true, + bool useAllLengths = true, uint seed = 314489979, bool useOrderedHashing = true, int maximumNumberOfInverts = 0) - : this(env, new ColumnOptions(outputColumnName, inputColumnNames, ngramLength, skipLength, allLengths, numberOfBits, seed, useOrderedHashing, maximumNumberOfInverts)) + : this(env, new ColumnOptions(outputColumnName, inputColumnNames, ngramLength, skipLength, useAllLengths, numberOfBits, seed, useOrderedHashing, maximumNumberOfInverts)) { } diff --git a/src/Microsoft.ML.Transforms/Text/NgramTransform.cs b/src/Microsoft.ML.Transforms/Text/NgramTransform.cs index 8bfad84278..6e0cfb35f3 100644 --- a/src/Microsoft.ML.Transforms/Text/NgramTransform.cs +++ b/src/Microsoft.ML.Transforms/Text/NgramTransform.cs @@ -42,8 +42,8 @@ internal sealed class Column : OneToOneColumn public int? NgramLength; [Argument(ArgumentType.AtMostOnce, HelpText = - "Whether to include all ngram lengths up to " + nameof(NgramLength) + " or only " + nameof(NgramLength), ShortName = "all")] - public bool? AllLengths; + "Whether to include all ngram lengths up to " + nameof(NgramLength) + " or only " + nameof(NgramLength), Name = "AllLengths", ShortName = "all")] + public bool? UseAllLengths; [Argument(ArgumentType.AtMostOnce, HelpText = "Maximum number of tokens to skip when constructing an ngram", @@ -69,7 +69,7 @@ internal static Column Parse(string str) internal bool TryUnparse(StringBuilder sb) { Contracts.AssertValue(sb); - if (NgramLength != null || AllLengths != null || SkipLength != null || Utils.Size(MaxNumTerms) != 0) + if (NgramLength != null || UseAllLengths != null || SkipLength != null || Utils.Size(MaxNumTerms) != 0) return false; return TryUnparseCore(sb); } @@ -84,8 +84,8 @@ internal sealed class Options : TransformInputBase public int NgramLength = NgramExtractingEstimator.Defaults.NgramLength; [Argument(ArgumentType.AtMostOnce, HelpText = - "Whether to store all ngram lengths up to ngramLength, or only ngramLength", ShortName = "all")] - public bool AllLengths = NgramExtractingEstimator.Defaults.AllLengths; + "Whether to store all ngram lengths up to ngramLength, or only ngramLength", Name = "AllLengths", ShortName = "all")] + public bool UseAllLengths = NgramExtractingEstimator.Defaults.UseAllLengths; [Argument(ArgumentType.AtMostOnce, HelpText = "Maximum number of tokens to skip when constructing an ngram", @@ -424,7 +424,7 @@ internal static IDataTransform Create(IHostEnvironment env, Options options, IDa item.Name, item.NgramLength ?? options.NgramLength, item.SkipLength ?? options.SkipLength, - item.AllLengths ?? options.AllLengths, + item.UseAllLengths ?? options.UseAllLengths, item.Weighting ?? options.Weighting, maxNumTerms, item.Source ?? 
item.Name); @@ -693,7 +693,7 @@ public enum WeightingCriteria internal static class Defaults { public const int NgramLength = 2; - public const bool AllLengths = true; + public const bool UseAllLengths = true; public const int SkipLength = 0; public const int MaximumNgramsCount = 10000000; public const WeightingCriteria Weighting = WeightingCriteria.Tf; @@ -711,17 +711,17 @@ internal static class Defaults /// Name of the column to transform. If set to , the value of the will be used as source. /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. - /// Whether to include all ngram lengths up to or only . + /// Whether to include all ngram lengths up to or only . /// Maximum number of n-grams to store in the dictionary. /// Statistical measure used to evaluate how important a word is to a document in a corpus. internal NgramExtractingEstimator(IHostEnvironment env, string outputColumnName, string inputColumnName = null, int ngramLength = Defaults.NgramLength, int skipLength = Defaults.SkipLength, - bool allLengths = Defaults.AllLengths, + bool useAllLengths = Defaults.UseAllLengths, int maximumNgramsCount = Defaults.MaximumNgramsCount, WeightingCriteria weighting = Defaults.Weighting) - : this(env, new[] { (outputColumnName, inputColumnName ?? outputColumnName) }, ngramLength, skipLength, allLengths, maximumNgramsCount, weighting) + : this(env, new[] { (outputColumnName, inputColumnName ?? outputColumnName) }, ngramLength, skipLength, useAllLengths, maximumNgramsCount, weighting) { } @@ -733,17 +733,17 @@ internal NgramExtractingEstimator(IHostEnvironment env, /// Pairs of columns to compute bag of word vector. /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. - /// Whether to include all ngram lengths up to or only . + /// Whether to include all ngram lengths up to or only . /// Maximum number of n-grams to store in the dictionary. /// Statistical measure used to evaluate how important a word is to a document in a corpus. internal NgramExtractingEstimator(IHostEnvironment env, (string outputColumnName, string inputColumnName)[] columns, int ngramLength = Defaults.NgramLength, int skipLength = Defaults.SkipLength, - bool allLengths = Defaults.AllLengths, + bool useAllLengths = Defaults.UseAllLengths, int maximumNgramsCount = Defaults.MaximumNgramsCount, WeightingCriteria weighting = Defaults.Weighting) - : this(env, columns.Select(x => new ColumnOptions(x.outputColumnName, x.inputColumnName, ngramLength, skipLength, allLengths, weighting, maximumNgramsCount)).ToArray()) + : this(env, columns.Select(x => new ColumnOptions(x.outputColumnName, x.inputColumnName, ngramLength, skipLength, useAllLengths, weighting, maximumNgramsCount)).ToArray()) { } @@ -805,7 +805,7 @@ public sealed class ColumnOptions /// Maximum number of tokens to skip when constructing an ngram. public readonly int SkipLength; /// Whether to store all ngram lengths up to ngramLength, or only ngramLength. - public readonly bool AllLengths; + public readonly bool UseAllLengths; /// The weighting criteria. public readonly WeightingCriteria Weighting; /// @@ -825,23 +825,23 @@ public sealed class ColumnOptions /// Name of column to transform. If set to , the value of the will be used as source. /// Maximum ngram length. /// Maximum number of tokens to skip when constructing an ngram. - /// Whether to store all ngram lengths up to ngramLength, or only ngramLength. + /// Whether to store all ngram lengths up to ngramLength, or only ngramLength. 
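The dictionary-based counterpart, NgramExtractingEstimator, is exposed through ProduceNgrams with the same renames (useAllLengths, maximumNgramsCount). A sketch of a pipeline using it follows; the tokenize-then-MapValueToKey prefix and the column names are assumptions, since ProduceNgrams consumes a vector of keys.

    var mlContext = new MLContext();
    var ngrams = mlContext.Transforms.Text.TokenizeWords("Tokens", "ReviewText")
        .Append(mlContext.Transforms.Conversion.MapValueToKey("Tokens"))
        .Append(mlContext.Transforms.Text.ProduceNgrams(
            "Ngrams", "Tokens",
            ngramLength: 3,
            skipLength: 0,
            useAllLengths: true,                                           // uni-, bi- and trigrams
            maximumNgramsCount: 10000000,
            weighting: NgramExtractingEstimator.WeightingCriteria.TfIdf));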
/// The weighting criteria. /// Maximum number of n-grams to store in the dictionary. public ColumnOptions(string name, string inputColumnName = null, int ngramLength = Defaults.NgramLength, int skipLength = Defaults.SkipLength, - bool allLengths = Defaults.AllLengths, + bool useAllLengths = Defaults.UseAllLengths, WeightingCriteria weighting = Defaults.Weighting, int maximumNgramsCount = Defaults.MaximumNgramsCount) - : this(name, ngramLength, skipLength, allLengths, weighting, new int[] { maximumNgramsCount }, inputColumnName ?? name) + : this(name, ngramLength, skipLength, useAllLengths, weighting, new int[] { maximumNgramsCount }, inputColumnName ?? name) { } internal ColumnOptions(string name, int ngramLength, int skipLength, - bool allLengths, + bool useAllLengths, WeightingCriteria weighting, int[] maximumNgramsCounts, string inputColumnName = null) @@ -855,7 +855,7 @@ internal ColumnOptions(string name, Contracts.CheckUserArg(0 < ngramLength && ngramLength <= NgramBufferBuilder.MaxSkipNgramLength, nameof(ngramLength)); var limits = new int[ngramLength]; - if (!allLengths) + if (!useAllLengths) { Contracts.CheckUserArg(Utils.Size(maximumNgramsCounts) == 0 || Utils.Size(maximumNgramsCounts) == 1 && maximumNgramsCounts[0] > 0, nameof(maximumNgramsCounts)); @@ -874,7 +874,7 @@ internal ColumnOptions(string name, InputColumnName = inputColumnName ?? name; NgramLength = ngramLength; SkipLength = skipLength; - AllLengths = allLengths; + UseAllLengths = useAllLengths; Weighting = weighting; } } diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs index 8eec4ef097..3aa10978ac 100644 --- a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs +++ b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs @@ -192,7 +192,7 @@ public static WordTokenizingEstimator TokenizeWords(this TransformsCatalog.TextT /// Name of the column to transform. If set to , the value of the will be used as source. /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. - /// Whether to include all ngram lengths up to or only . + /// Whether to include all ngram lengths up to or only . /// Maximum number of n-grams to store in the dictionary. /// Statistical measure used to evaluate how important a word is to a document in a corpus. /// @@ -207,11 +207,11 @@ public static NgramExtractingEstimator ProduceNgrams(this TransformsCatalog.Text string inputColumnName = null, int ngramLength = NgramExtractingEstimator.Defaults.NgramLength, int skipLength = NgramExtractingEstimator.Defaults.SkipLength, - bool allLengths = NgramExtractingEstimator.Defaults.AllLengths, + bool useAllLengths = NgramExtractingEstimator.Defaults.UseAllLengths, int maximumNgramsCount = NgramExtractingEstimator.Defaults.MaximumNgramsCount, NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.Defaults.Weighting) => new NgramExtractingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, inputColumnName, - ngramLength, skipLength, allLengths, maximumNgramsCount, weighting); + ngramLength, skipLength, useAllLengths, maximumNgramsCount, weighting); /// /// Produces a bag of counts of ngrams (sequences of consecutive words) in @@ -308,7 +308,7 @@ public static CustomStopWordsRemovingEstimator RemoveStopWords(this TransformsCa /// Name of the column to transform. If set to , the value of the will be used as source. /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. 
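ProduceWordBags takes the raw text column directly (word tokenization happens inside the estimator), so the corresponding sketch is shorter; column names are again hypothetical.

    var mlContext = new MLContext();
    var wordBags = mlContext.Transforms.Text.ProduceWordBags(
        "BagOfWords", "ReviewText",
        ngramLength: 2,
        useAllLengths: true,            // count unigrams and bigrams
        maximumNgramsCount: 200000);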
- /// Whether to include all ngram lengths up to or only . + /// Whether to include all ngram lengths up to or only . /// Maximum number of ngrams to store in the dictionary. /// Statistical measure used to evaluate how important a word is to a document in a corpus. public static WordBagEstimator ProduceWordBags(this TransformsCatalog.TextTransforms catalog, @@ -316,11 +316,11 @@ public static WordBagEstimator ProduceWordBags(this TransformsCatalog.TextTransf string inputColumnName = null, int ngramLength = NgramExtractingEstimator.Defaults.NgramLength, int skipLength = NgramExtractingEstimator.Defaults.SkipLength, - bool allLengths = NgramExtractingEstimator.Defaults.AllLengths, + bool useAllLengths = NgramExtractingEstimator.Defaults.UseAllLengths, int maximumNgramsCount = NgramExtractingEstimator.Defaults.MaximumNgramsCount, NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.WeightingCriteria.Tf) => new WordBagEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), - outputColumnName, inputColumnName, ngramLength, skipLength, allLengths, maximumNgramsCount); + outputColumnName, inputColumnName, ngramLength, skipLength, useAllLengths, maximumNgramsCount); /// /// Produces a bag of counts of ngrams (sequences of consecutive words) in @@ -331,7 +331,7 @@ public static WordBagEstimator ProduceWordBags(this TransformsCatalog.TextTransf /// Name of the columns to transform. /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. - /// Whether to include all ngram lengths up to or only . + /// Whether to include all ngram lengths up to or only . /// Maximum number of ngrams to store in the dictionary. /// Statistical measure used to evaluate how important a word is to a document in a corpus. public static WordBagEstimator ProduceWordBags(this TransformsCatalog.TextTransforms catalog, @@ -339,11 +339,11 @@ public static WordBagEstimator ProduceWordBags(this TransformsCatalog.TextTransf string[] inputColumnNames, int ngramLength = NgramExtractingEstimator.Defaults.NgramLength, int skipLength = NgramExtractingEstimator.Defaults.SkipLength, - bool allLengths = NgramExtractingEstimator.Defaults.AllLengths, + bool useAllLengths = NgramExtractingEstimator.Defaults.UseAllLengths, int maximumNgramsCount = NgramExtractingEstimator.Defaults.MaximumNgramsCount, NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.WeightingCriteria.Tf) => new WordBagEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), - outputColumnName, inputColumnNames, ngramLength, skipLength, allLengths, maximumNgramsCount, weighting); + outputColumnName, inputColumnNames, ngramLength, skipLength, useAllLengths, maximumNgramsCount, weighting); /// /// Produces a bag of counts of hashed ngrams in @@ -355,7 +355,7 @@ public static WordBagEstimator ProduceWordBags(this TransformsCatalog.TextTransf /// Number of bits to hash into. Must be between 1 and 30, inclusive. /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. - /// Whether to include all ngram lengths up to or only . + /// Whether to include all ngram lengths up to or only . /// Hashing seed. /// Whether the position of each source column should be included in the hash (when there are multiple source columns). /// During hashing we constuct mappings between original values and the produced hash values. @@ -368,13 +368,13 @@ public static WordHashBagEstimator ProduceHashedWordBags(this TransformsCatalog. 
int numberOfBits = NgramHashExtractingTransformer.DefaultArguments.NumberOfBits, int ngramLength = NgramHashExtractingTransformer.DefaultArguments.NgramLength, int skipLength = NgramHashExtractingTransformer.DefaultArguments.SkipLength, - bool allLengths = NgramHashExtractingTransformer.DefaultArguments.AllLengths, + bool useAllLengths = NgramHashExtractingTransformer.DefaultArguments.UseAllLengths, uint seed = NgramHashExtractingTransformer.DefaultArguments.Seed, bool useOrderedHashing = NgramHashExtractingTransformer.DefaultArguments.Ordered, int maximumNumberOfInverts = NgramHashExtractingTransformer.DefaultArguments.MaximumNumberOfInverts) => new WordHashBagEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, inputColumnName, numberOfBits: numberOfBits, ngramLength: ngramLength, - skipLength: skipLength, allLengths: allLengths, seed: seed, useOrderedHashing: useOrderedHashing, + skipLength: skipLength, useAllLengths: useAllLengths, seed: seed, useOrderedHashing: useOrderedHashing, maximumNumberOfInverts: maximumNumberOfInverts); /// @@ -387,7 +387,7 @@ public static WordHashBagEstimator ProduceHashedWordBags(this TransformsCatalog. /// Number of bits to hash into. Must be between 1 and 30, inclusive. /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. - /// Whether to include all ngram lengths up to or only . + /// Whether to include all ngram lengths up to or only . /// Hashing seed. /// Whether the position of each source column should be included in the hash (when there are multiple source columns). /// During hashing we constuct mappings between original values and the produced hash values. @@ -400,13 +400,13 @@ public static WordHashBagEstimator ProduceHashedWordBags(this TransformsCatalog. int numberOfBits = NgramHashExtractingTransformer.DefaultArguments.NumberOfBits, int ngramLength = NgramHashExtractingTransformer.DefaultArguments.NgramLength, int skipLength = NgramHashExtractingTransformer.DefaultArguments.SkipLength, - bool allLengths = NgramHashExtractingTransformer.DefaultArguments.AllLengths, + bool useAllLengths = NgramHashExtractingTransformer.DefaultArguments.UseAllLengths, uint seed = NgramHashExtractingTransformer.DefaultArguments.Seed, bool useOrderedHashing = NgramHashExtractingTransformer.DefaultArguments.Ordered, int maximumNumberOfInverts = NgramHashExtractingTransformer.DefaultArguments.MaximumNumberOfInverts) => new WordHashBagEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, inputColumnNames, numberOfBits: numberOfBits, ngramLength: ngramLength, - skipLength: skipLength, allLengths: allLengths, seed: seed, useOrderedHashing: useOrderedHashing, + skipLength: skipLength, useAllLengths: useAllLengths, seed: seed, useOrderedHashing: useOrderedHashing, maximumNumberOfInverts: maximumNumberOfInverts); /// @@ -422,7 +422,7 @@ public static WordHashBagEstimator ProduceHashedWordBags(this TransformsCatalog. /// Number of bits to hash into. Must be between 1 and 30, inclusive. /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. - /// Whether to include all ngram lengths up to or only . + /// Whether to include all ngram lengths up to or only . /// Hashing seed. /// Whether the position of each source column should be included in the hash (when there are multiple source columns). /// During hashing we constuct mappings between original values and the produced hash values. 
@@ -435,13 +435,13 @@ public static NgramHashingEstimator ProduceHashedNgrams(this TransformsCatalog.T int numberOfBits = NgramHashingEstimator.Defaults.NumberOfBits, int ngramLength = NgramHashingEstimator.Defaults.NgramLength, int skipLength = NgramHashingEstimator.Defaults.SkipLength, - bool allLengths = NgramHashingEstimator.Defaults.AllLengths, + bool useAllLengths = NgramHashingEstimator.Defaults.UseAllLengths, uint seed = NgramHashingEstimator.Defaults.Seed, bool useOrderedHashing = NgramHashingEstimator.Defaults.UseOrderedHashing, int maximumNumberOfInverts = NgramHashingEstimator.Defaults.MaximumNumberOfInverts) => new NgramHashingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, inputColumnName, numberOfBits: numberOfBits, ngramLength: ngramLength, skipLength: skipLength, - allLengths: allLengths, seed: seed, useOrderedHashing: useOrderedHashing, maximumNumberOfInverts: maximumNumberOfInverts); + useAllLengths: useAllLengths, seed: seed, useOrderedHashing: useOrderedHashing, maximumNumberOfInverts: maximumNumberOfInverts); /// /// Produces a bag of counts of hashed ngrams for each . For each column, diff --git a/src/Microsoft.ML.Transforms/Text/TextFeaturizingEstimator.cs b/src/Microsoft.ML.Transforms/Text/TextFeaturizingEstimator.cs index 6c89740117..d7cca7752a 100644 --- a/src/Microsoft.ML.Transforms/Text/TextFeaturizingEstimator.cs +++ b/src/Microsoft.ML.Transforms/Text/TextFeaturizingEstimator.cs @@ -141,7 +141,7 @@ public WordBagEstimator.Options WordFeatureExtractor extractor = new NgramExtractorTransform.NgramExtractorArguments(); extractor.NgramLength = _wordFeatureExtractor.NgramLength; extractor.SkipLength = _wordFeatureExtractor.SkipLength; - extractor.AllLengths = _wordFeatureExtractor.AllLengths; + extractor.UseAllLengths = _wordFeatureExtractor.UseAllLengths; extractor.MaxNumTerms = _wordFeatureExtractor.MaximumNgramsCount; extractor.Weighting = _wordFeatureExtractor.Weighting; } @@ -173,7 +173,7 @@ public WordBagEstimator.Options CharFeatureExtractor extractor = new NgramExtractorTransform.NgramExtractorArguments(); extractor.NgramLength = _charFeatureExtractor.NgramLength; extractor.SkipLength = _charFeatureExtractor.SkipLength; - extractor.AllLengths = _charFeatureExtractor.AllLengths; + extractor.UseAllLengths = _charFeatureExtractor.UseAllLengths; extractor.MaxNumTerms = _charFeatureExtractor.MaximumNgramsCount; extractor.Weighting = _charFeatureExtractor.Weighting; } @@ -187,7 +187,7 @@ public WordBagEstimator.Options CharFeatureExtractor public Options() { WordFeatureExtractor = new WordBagEstimator.Options(); - CharFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 3, AllLengths = false }; + CharFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 3, UseAllLengths = false }; } } diff --git a/src/Microsoft.ML.Transforms/Text/WordBagTransform.cs b/src/Microsoft.ML.Transforms/Text/WordBagTransform.cs index c2fa970088..934a2253ee 100644 --- a/src/Microsoft.ML.Transforms/Text/WordBagTransform.cs +++ b/src/Microsoft.ML.Transforms/Text/WordBagTransform.cs @@ -54,8 +54,8 @@ internal sealed class Column : ManyToOneColumn [Argument(ArgumentType.AtMostOnce, HelpText = "Whether to include all ngram lengths up to " + nameof(NgramLength) + " or only " + nameof(NgramLength), - ShortName = "all")] - public bool? AllLengths; + Name = "AllLengths", ShortName = "all")] + public bool? 
UseAllLengths; [Argument(ArgumentType.Multiple, HelpText = "Maximum number of ngrams to store in the dictionary", ShortName = "max")] public int[] MaxNumTerms = null; @@ -76,7 +76,7 @@ internal static Column Parse(string str) internal bool TryUnparse(StringBuilder sb) { Contracts.AssertValue(sb); - if (NgramLength != null || SkipLength != null || AllLengths != null || Utils.Size(MaxNumTerms) > 0 || + if (NgramLength != null || SkipLength != null || UseAllLengths != null || Utils.Size(MaxNumTerms) > 0 || Weighting != null) { return false; @@ -123,7 +123,7 @@ internal static IDataTransform Create(IHostEnvironment env, Options options, IDa MaxNumTerms = options.MaxNumTerms, NgramLength = options.NgramLength, SkipLength = options.SkipLength, - AllLengths = options.AllLengths, + UseAllLengths = options.UseAllLengths, Weighting = options.Weighting, Columns = new NgramExtractorTransform.Column[options.Columns.Length] }; @@ -146,7 +146,7 @@ internal static IDataTransform Create(IHostEnvironment env, Options options, IDa NgramLength = column.NgramLength, SkipLength = column.SkipLength, Weighting = column.Weighting, - AllLengths = column.AllLengths + UseAllLengths = column.UseAllLengths }; } @@ -175,8 +175,9 @@ internal sealed class Column : OneToOneColumn public int? SkipLength; [Argument(ArgumentType.AtMostOnce, HelpText = - "Whether to include all ngram lengths up to " + nameof(NgramLength) + " or only " + nameof(NgramLength), ShortName = "all")] - public bool? AllLengths; + "Whether to include all ngram lengths up to " + nameof(NgramLength) + " or only " + nameof(NgramLength), + Name = "AllLengths", ShortName = "all")] + public bool? UseAllLengths; // REVIEW: This argument is actually confusing. If you set only one value we will use this value for all ngrams respectfully for example, // if we specify 3 ngrams we will have maxNumTerms * 3. And it also pick first value from this array to run term transform, so if you specify @@ -200,7 +201,7 @@ internal static Column Parse(string str) internal bool TryUnparse(StringBuilder sb) { Contracts.AssertValue(sb); - if (NgramLength != null || SkipLength != null || AllLengths != null || Utils.Size(MaxNumTerms) > 0 || + if (NgramLength != null || SkipLength != null || UseAllLengths != null || Utils.Size(MaxNumTerms) > 0 || Weighting != null) { return false; @@ -225,8 +226,8 @@ internal abstract class ArgumentsBase [Argument(ArgumentType.AtMostOnce, HelpText = "Whether to include all ngram lengths up to " + nameof(NgramLength) + " or only " + nameof(NgramLength), - ShortName = "all")] - public bool AllLengths = NgramExtractingEstimator.Defaults.AllLengths; + Name = "AllLengths", ShortName = "all")] + public bool UseAllLengths = NgramExtractingEstimator.Defaults.UseAllLengths; [Argument(ArgumentType.Multiple, HelpText = "Maximum number of ngrams to store in the dictionary", ShortName = "max")] public int[] MaxNumTerms = new int[] { NgramExtractingEstimator.Defaults.MaximumNgramsCount }; @@ -347,7 +348,7 @@ internal static IDataTransform Create(IHostEnvironment env, Options options, IDa ngramColumns[iinfo] = new NgramExtractingEstimator.ColumnOptions(column.Name, column.NgramLength ?? options.NgramLength, column.SkipLength ?? options.SkipLength, - column.AllLengths ?? options.AllLengths, + column.UseAllLengths ?? options.UseAllLengths, column.Weighting ?? options.Weighting, column.MaxNumTerms ?? options.MaxNumTerms, isTermCol[iinfo] ? 
column.Name : column.Source @@ -380,7 +381,7 @@ internal static IDataTransform Create(IHostEnvironment env, NgramExtractorArgume Columns = extractorCols, NgramLength = extractorArgs.NgramLength, SkipLength = extractorArgs.SkipLength, - AllLengths = extractorArgs.AllLengths, + UseAllLengths = extractorArgs.UseAllLengths, MaxNumTerms = extractorArgs.MaxNumTerms, Weighting = extractorArgs.Weighting }; diff --git a/src/Microsoft.ML.Transforms/Text/WordHashBagProducingTransform.cs b/src/Microsoft.ML.Transforms/Text/WordHashBagProducingTransform.cs index 1fe0c21b09..7a5641e2a2 100644 --- a/src/Microsoft.ML.Transforms/Text/WordHashBagProducingTransform.cs +++ b/src/Microsoft.ML.Transforms/Text/WordHashBagProducingTransform.cs @@ -129,7 +129,7 @@ internal static IDataTransform Create(IHostEnvironment env, Options options, IDa Ordered = column.Ordered, MaximumNumberOfInverts = column.MaximumNumberOfInverts, FriendlyNames = options.Columns[iinfo].Source, - AllLengths = column.AllLengths + UseAllLengths = column.UseAllLengths }; } @@ -138,7 +138,7 @@ internal static IDataTransform Create(IHostEnvironment env, Options options, IDa var featurizeArgs = new NgramHashExtractingTransformer.Options { - AllLengths = options.AllLengths, + UseAllLengths = options.UseAllLengths, NumberOfBits = options.NumberOfBits, NgramLength = options.NgramLength, SkipLength = options.SkipLength, @@ -189,8 +189,8 @@ internal abstract class ColumnBase : ManyToOneColumn [Argument(ArgumentType.AtMostOnce, HelpText = "Whether to include all ngram lengths up to " + nameof(NgramLength) + " or only " + nameof(NgramLength), - ShortName = "all", SortOrder = 4)] - public bool? AllLengths; + Name = "AllLengths", ShortName = "all", SortOrder = 4)] + public bool? UseAllLengths; } internal sealed class Column : ColumnBase @@ -279,8 +279,8 @@ internal abstract class ArgumentsBase [Argument(ArgumentType.AtMostOnce, HelpText = "Whether to include all ngram lengths up to ngramLength or only ngramLength", - ShortName = "all", SortOrder = 4)] - public bool AllLengths = true; + Name = "AllLengths", ShortName = "all", SortOrder = 4)] + public bool UseAllLengths = true; } internal static class DefaultArguments @@ -291,7 +291,7 @@ internal static class DefaultArguments public const uint Seed = 314489979; public const bool Ordered = true; public const int MaximumNumberOfInverts = 0; - public const bool AllLengths = true; + public const bool UseAllLengths = true; } [TlcModule.Component(Name = "NGramHash", FriendlyName = "NGram Hash Extractor Transform", Alias = "NGramHashExtractorTransform,NGramHashExtractor", @@ -369,7 +369,7 @@ internal static IDataTransform Create(IHostEnvironment env, Options options, IDa new NgramHashingEstimator.ColumnOptions(column.Name, tmpColNames[iinfo], column.NgramLength ?? options.NgramLength, column.SkipLength ?? options.SkipLength, - column.AllLengths ?? options.AllLengths, + column.UseAllLengths ?? options.UseAllLengths, column.NumberOfBits ?? options.NumberOfBits, column.Seed ?? options.Seed, column.Ordered ?? 
options.Ordered, @@ -439,7 +439,7 @@ internal static IDataTransform Create(NgramHashExtractorArguments extractorArgs, MaximumNumberOfInverts = extractorArgs.MaximumNumberOfInverts, Ordered = extractorArgs.Ordered, Seed = extractorArgs.Seed, - AllLengths = extractorArgs.AllLengths + UseAllLengths = extractorArgs.UseAllLengths }; return Create(h, options, input, termLoaderArgs); diff --git a/src/Microsoft.ML.Transforms/Text/WrappedTextTransformers.cs b/src/Microsoft.ML.Transforms/Text/WrappedTextTransformers.cs index 55fc359957..4f91ea42c3 100644 --- a/src/Microsoft.ML.Transforms/Text/WrappedTextTransformers.cs +++ b/src/Microsoft.ML.Transforms/Text/WrappedTextTransformers.cs @@ -22,7 +22,7 @@ public sealed class WordBagEstimator : IEstimator private readonly (string outputColumnName, string[] sourceColumnsNames)[] _columns; private readonly int _ngramLength; private readonly int _skipLength; - private readonly bool _allLengths; + private readonly bool _useAllLengths; private readonly int _maxNumTerms; private readonly NgramExtractingEstimator.WeightingCriteria _weighting; @@ -44,7 +44,7 @@ public class Options /// /// Whether to store all ngram lengths up to ngramLength, or only ngramLength. /// - public bool AllLengths; + public bool UseAllLengths; /// /// The maximum number of grams to store in the dictionary, for each level of ngrams, @@ -61,7 +61,7 @@ public Options() { NgramLength = 1; SkipLength = NgramExtractingEstimator.Defaults.SkipLength; - AllLengths = NgramExtractingEstimator.Defaults.AllLengths; + UseAllLengths = NgramExtractingEstimator.Defaults.UseAllLengths; MaximumNgramsCount = new int[] { NgramExtractingEstimator.Defaults.MaximumNgramsCount }; Weighting = NgramExtractingEstimator.Defaults.Weighting; } @@ -76,7 +76,7 @@ public Options() /// Name of the column to transform. If set to , the value of the will be used as source. /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. - /// Whether to include all ngram lengths up to or only . + /// Whether to include all ngram lengths up to or only . /// Maximum number of ngrams to store in the dictionary. /// Statistical measure used to evaluate how important a word is to a document in a corpus. internal WordBagEstimator(IHostEnvironment env, @@ -84,10 +84,10 @@ internal WordBagEstimator(IHostEnvironment env, string inputColumnName = null, int ngramLength = 1, int skipLength = 0, - bool allLengths = true, + bool useAllLengths = true, int maximumNgramsCount = 10000000, NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.WeightingCriteria.Tf) - : this(env, outputColumnName, new[] { inputColumnName ?? outputColumnName }, ngramLength, skipLength, allLengths, maximumNgramsCount, weighting) + : this(env, outputColumnName, new[] { inputColumnName ?? outputColumnName }, ngramLength, skipLength, useAllLengths, maximumNgramsCount, weighting) { } @@ -100,7 +100,7 @@ internal WordBagEstimator(IHostEnvironment env, /// The columns containing text to compute bag of word vector. /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. - /// Whether to include all ngram lengths up to or only . + /// Whether to include all ngram lengths up to or only . /// Maximum number of ngrams to store in the dictionary. /// Statistical measure used to evaluate how important a word is to a document in a corpus. 
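WordBagEstimator.Options now carries the renamed UseAllLengths member alongside the existing knobs. A hedged sketch of filling it in; MaximumNgramsCount takes one limit per ngram length, and per the REVIEW note earlier in this diff a single value is reused for all lengths.

    var wordBagOptions = new WordBagEstimator.Options
    {
        NgramLength = 2,
        SkipLength = 0,
        UseAllLengths = true,
        MaximumNgramsCount = new[] { 200000 },
        Weighting = NgramExtractingEstimator.WeightingCriteria.Tf
    };

This Options type is what TextFeaturizingEstimator's WordFeatureExtractor and CharFeatureExtractor accept, as in the FeaturizeText sketch further below.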
internal WordBagEstimator(IHostEnvironment env, @@ -108,10 +108,10 @@ internal WordBagEstimator(IHostEnvironment env, string[] inputColumnNames, int ngramLength = 1, int skipLength = 0, - bool allLengths = true, + bool useAllLengths = true, int maximumNgramsCount = 10000000, NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.WeightingCriteria.Tf) - : this(env, new[] { (outputColumnName, inputColumnNames) }, ngramLength, skipLength, allLengths, maximumNgramsCount, weighting) + : this(env, new[] { (outputColumnName, inputColumnNames) }, ngramLength, skipLength, useAllLengths, maximumNgramsCount, weighting) { } @@ -123,14 +123,14 @@ internal WordBagEstimator(IHostEnvironment env, /// Pairs of columns to compute bag of word vector. /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. - /// Whether to include all ngram lengths up to or only . + /// Whether to include all ngram lengths up to or only . /// Maximum number of ngrams to store in the dictionary. /// Statistical measure used to evaluate how important a word is to a document in a corpus. internal WordBagEstimator(IHostEnvironment env, (string outputColumnName, string[] inputColumnNames)[] columns, int ngramLength = 1, int skipLength = 0, - bool allLengths = true, + bool useAllLengths = true, int maximumNgramsCount = 10000000, NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.WeightingCriteria.Tf) { @@ -146,7 +146,7 @@ internal WordBagEstimator(IHostEnvironment env, _columns = columns; _ngramLength = ngramLength; _skipLength = skipLength; - _allLengths = allLengths; + _useAllLengths = useAllLengths; _maxNumTerms = maximumNgramsCount; _weighting = weighting; } @@ -160,7 +160,7 @@ public ITransformer Fit(IDataView input) Columns = _columns.Select(x => new WordBagBuildingTransformer.Column { Name = x.outputColumnName, Source = x.sourceColumnsNames }).ToArray(), NgramLength = _ngramLength, SkipLength = _skipLength, - AllLengths = _allLengths, + UseAllLengths = _useAllLengths, MaxNumTerms = new[] { _maxNumTerms }, Weighting = _weighting }; @@ -193,7 +193,7 @@ public sealed class WordHashBagEstimator : IEstimator private readonly int _numberOfBits; private readonly int _ngramLength; private readonly int _skipLength; - private readonly bool _allLengths; + private readonly bool _useAllLengths; private readonly uint _seed; private readonly bool _ordered; private readonly int _maximumNumberOfInverts; @@ -208,7 +208,7 @@ public sealed class WordHashBagEstimator : IEstimator /// Number of bits to hash into. Must be between 1 and 30, inclusive. /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. - /// Whether to include all ngram lengths up to or only . + /// Whether to include all ngram lengths up to or only . /// Hashing seed. /// Whether the position of each source column should be included in the hash (when there are multiple source columns). /// During hashing we constuct mappings between original values and the produced hash values. @@ -221,12 +221,12 @@ internal WordHashBagEstimator(IHostEnvironment env, int numberOfBits = 16, int ngramLength = 1, int skipLength = 0, - bool allLengths = true, + bool useAllLengths = true, uint seed = 314489979, bool useOrderedHashing = true, int maximumNumberOfInverts = 0) : this(env, new[] { (outputColumnName, new[] { inputColumnName ?? 
outputColumnName }) }, numberOfBits: numberOfBits, - ngramLength: ngramLength, skipLength: skipLength, allLengths: allLengths, seed: seed, + ngramLength: ngramLength, skipLength: skipLength, useAllLengths: useAllLengths, seed: seed, useOrderedHashing: useOrderedHashing, maximumNumberOfInverts: maximumNumberOfInverts) { } @@ -241,7 +241,7 @@ internal WordHashBagEstimator(IHostEnvironment env, /// Number of bits to hash into. Must be between 1 and 30, inclusive. /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. - /// Whether to include all ngram lengths up to or only . + /// Whether to include all ngram lengths up to or only . /// Hashing seed. /// Whether the position of each source column should be included in the hash (when there are multiple source columns). /// During hashing we constuct mappings between original values and the produced hash values. @@ -254,12 +254,12 @@ internal WordHashBagEstimator(IHostEnvironment env, int numberOfBits = 16, int ngramLength = 1, int skipLength = 0, - bool allLengths = true, + bool useAllLengths = true, uint seed = 314489979, bool useOrderedHashing = true, int maximumNumberOfInverts = 0) : this(env, new[] { (outputColumnName, inputColumnNames) }, numberOfBits: numberOfBits, - ngramLength: ngramLength, skipLength: skipLength, allLengths: allLengths, seed: seed, + ngramLength: ngramLength, skipLength: skipLength, useAllLengths: useAllLengths, seed: seed, useOrderedHashing: useOrderedHashing, maximumNumberOfInverts: maximumNumberOfInverts) { } @@ -273,7 +273,7 @@ internal WordHashBagEstimator(IHostEnvironment env, /// Number of bits to hash into. Must be between 1 and 30, inclusive. /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. - /// Whether to include all ngram lengths up to or only . + /// Whether to include all ngram lengths up to or only . /// Hashing seed. /// Whether the position of each source column should be included in the hash (when there are multiple source columns). /// During hashing we constuct mappings between original values and the produced hash values. 
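The cookbook and test samples in this series already show the renamed argument in context; the bigram hashing step there reads roughly as below, assuming an existing MLContext named mlContext and an upstream transform that produced the NormalizedMessage column.

    // Bag of bigrams over normalized text, using hashes instead of dictionary indices.
    var bagOfBigrams = new WordHashBagEstimator(mlContext, "BagOfBigrams", "NormalizedMessage",
        ngramLength: 2, useAllLengths: false);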
diff --git a/test/Microsoft.ML.Functional.Tests/DataTransformation.cs b/test/Microsoft.ML.Functional.Tests/DataTransformation.cs
index e020dd740f..50d4a38f63 100644
--- a/test/Microsoft.ML.Functional.Tests/DataTransformation.cs
+++ b/test/Microsoft.ML.Functional.Tests/DataTransformation.cs
@@ -138,7 +138,7 @@ void ExtensibilityModifyTextFeaturization()
 var pipeline = mlContext.Transforms.Text.FeaturizeText("Features",
 new TextFeaturizingEstimator.Options
 {
- CharFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 3, AllLengths = false },
+ CharFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 3, UseAllLengths = false },
 WordFeatureExtractor = new WordBagEstimator.Options(),
 VectorNormalizer = TextFeaturizingEstimator.NormFunction.L1
 }, "SentimentText")
diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs
index 84b864c84f..50c0439112 100644
--- a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs
+++ b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs
@@ -302,7 +302,7 @@ private void TextFeaturizationOn(string dataPath)
 // NLP pipeline 2: bag of bigrams, using hashes instead of dictionary indices.
 .Append(new WordHashBagEstimator(mlContext, "BagOfBigrams","NormalizedMessage",
- ngramLength: 2, allLengths: false))
+ ngramLength: 2, useAllLengths: false))
 // NLP pipeline 3: bag of tri-character sequences with TF-IDF weighting.
 .Append(mlContext.Transforms.Text.TokenizeCharacters("MessageChars", "Message"))
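Before the follow-up patch below extends the rename to the static-pipe extensions, a minimal sketch of the hashed variant as it reads after this patch. It is not part of either patch; the constructor and all column names are copied from the cookbook sample above, and the sizing comment is a general property of bit-based hashing:

    // Sketch only, modeled on the cookbook sample above; "NormalizedMessage" is
    // the column produced earlier in that pipeline.
    // numberOfBits keeps its default of 16 here, so the hashed bag of words has
    // 1 << 16 = 65,536 slots; useAllLengths: false emits bigrams only rather
    // than unigrams plus bigrams.
    var bagOfBigrams = new WordHashBagEstimator(mlContext, "BagOfBigrams", "NormalizedMessage",
        ngramLength: 2, useAllLengths: false);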
From 8a5c0c28f77d805f8978a1df189ceaeb7a807f5c Mon Sep 17 00:00:00 2001
From: Wei-Sheng Chin
Date: Tue, 12 Mar 2019 23:28:34 -0700
Subject: [PATCH 12/12] Finish replacement of allLengths

---
 .../TextStaticExtensions.cs                | 48 +++++++++----------
 .../Api/CookbookSamples/CookbookSamples.cs |  2 +-
 2 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs b/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs
index 4d597837f8..c4ef323c97 100644
--- a/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs
+++ b/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs
@@ -309,16 +309,16 @@ public override IEstimator Reconcile(IHostEnvironment env,
 /// The column to apply to.
 /// Ngram length.
 /// Maximum number of tokens to skip when constructing an ngram.
- /// Whether to include all ngram lengths up to or only .
+ /// Whether to include all ngram lengths up to or only .
 /// Maximum number of ngrams to store in the dictionary.
 /// Statistical measure used to evaluate how important a word is to a document in a corpus.
 public static Vector ProduceWordBags(this Scalar input,
 int ngramLength = 1,
 int skipLength = 0,
- bool allLengths = true,
+ bool useAllLengths = true,
 int maximumNgramsCount = 10000000,
 NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.WeightingCriteria.Tf)
- => new OutPipelineColumn(input, ngramLength, skipLength, allLengths, maximumNgramsCount, weighting);
+ => new OutPipelineColumn(input, ngramLength, skipLength, useAllLengths, maximumNgramsCount, weighting);
 }
 ///
@@ -334,11 +334,11 @@ public OutPipelineColumn(Scalar input,
 int numberOfBits,
 int ngramLength,
 int skipLength,
- bool allLengths,
+ bool useAllLengths,
 uint seed,
 bool useOrderedHashing,
 int maximumNumberOfInverts)
- : base(new Reconciler(numberOfBits, ngramLength, skipLength, allLengths, seed, useOrderedHashing, maximumNumberOfInverts), input)
+ : base(new Reconciler(numberOfBits, ngramLength, skipLength, useAllLengths, seed, useOrderedHashing, maximumNumberOfInverts), input)
 {
 Input = input;
 }
@@ -354,12 +354,12 @@ private sealed class Reconciler : EstimatorReconciler, IEquatable
 private readonly bool _useOrderedHashing;
 private readonly int _maximumNumberOfInverts;
- public Reconciler(int numberOfBits, int ngramLength, int skipLength, bool allLengths, uint seed, bool useOrderedHashing, int maximumNumberOfInverts)
+ public Reconciler(int numberOfBits, int ngramLength, int skipLength, bool useAllLengths, uint seed, bool useOrderedHashing, int maximumNumberOfInverts)
 {
 _numberOfBits = numberOfBits;
 _ngramLength = ngramLength;
 _skipLength = skipLength;
- _useAllLengths = allLengths;
+ _useAllLengths = useAllLengths;
 _seed = seed;
 _useOrderedHashing = useOrderedHashing;
 _maximumNumberOfInverts = maximumNumberOfInverts;
@@ -400,7 +400,7 @@ public override IEstimator Reconcile(IHostEnvironment env,
 /// Number of bits to hash into. Must be between 1 and 30, inclusive.
 /// Ngram length.
 /// Maximum number of tokens to skip when constructing an ngram.
- /// Whether to include all ngram lengths up to or only .
+ /// Whether to include all ngram lengths up to or only .
 /// Hashing seed.
 /// Whether the position of each source column should be included in the hash (when there are multiple source columns).
 /// During hashing we constuct mappings between original values and the produced hash values.
@@ -411,10 +411,10 @@ public static Vector ProduceHashedWordBags(this Scalar input,
 int numberOfBits = 16,
 int ngramLength = 1,
 int skipLength = 0,
- bool allLengths = true,
+ bool useAllLengths = true,
 uint seed = 314489979,
 bool useOrderedHashing = true,
- int maximumNumberOfInverts = 0) => new OutPipelineColumn(input, numberOfBits, ngramLength, skipLength, allLengths, seed, useOrderedHashing, maximumNumberOfInverts);
+ int maximumNumberOfInverts = 0) => new OutPipelineColumn(input, numberOfBits, ngramLength, skipLength, useAllLengths, seed, useOrderedHashing, maximumNumberOfInverts);
 }
 ///
@@ -429,10 +429,10 @@ private sealed class OutPipelineColumn : Vector
 public OutPipelineColumn(PipelineColumn input,
 int ngramLength,
 int skipLength,
- bool allLengths,
+ bool useAllLengths,
 int maxNumTerms,
 NgramExtractingEstimator.WeightingCriteria weighting)
- : base(new Reconciler(ngramLength, skipLength, allLengths, maxNumTerms, weighting), input)
+ : base(new Reconciler(ngramLength, skipLength, useAllLengths, maxNumTerms, weighting), input)
 {
 Input = input;
 }
@@ -446,11 +446,11 @@ private sealed class Reconciler : EstimatorReconciler, IEquatable
 private readonly int _maxNgramsCount;
 private readonly NgramExtractingEstimator.WeightingCriteria _weighting;
- public Reconciler(int ngramLength, int skipLength, bool allLengths, int maxNumTerms, NgramExtractingEstimator.WeightingCriteria weighting)
+ public Reconciler(int ngramLength, int skipLength, bool useAllLengths, int maxNumTerms, NgramExtractingEstimator.WeightingCriteria weighting)
 {
 _ngramLength = ngramLength;
 _skipLength = skipLength;
- _useAllLengths = allLengths;
+ _useAllLengths = useAllLengths;
 _maxNgramsCount = maxNumTerms;
 _weighting = weighting;
@@ -491,16 +491,16 @@ public override IEstimator Reconcile(IHostEnvironment env,
 /// The column to apply to.
 /// Ngram length.
 /// Maximum number of tokens to skip when constructing an ngram.
- /// Whether to include all ngram lengths up to or only .
+ /// Whether to include all ngram lengths up to or only .
 /// Maximum number of n-grams to store in the dictionary.
 /// Statistical measure used to evaluate how important a word is to a document in a corpus.
 public static Vector ProduceNgrams(this VarVector> input,
 int ngramLength = 1,
 int skipLength = 0,
- bool allLengths = true,
+ bool useAllLengths = true,
 int maximumNgramsCount = 10000000,
 NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.WeightingCriteria.Tf)
- => new OutPipelineColumn(input, ngramLength, skipLength, allLengths, maximumNgramsCount, weighting);
+ => new OutPipelineColumn(input, ngramLength, skipLength, useAllLengths, maximumNgramsCount, weighting);
 }
 ///
@@ -512,8 +512,8 @@ private sealed class OutPipelineColumn : Vector
 {
 public readonly VarVector> Input;
- public OutPipelineColumn(VarVector> input, int numberOfBits, int ngramLength, int skipLength, bool allLengths, uint seed, bool useOrderedHashing, int maximumNumberOfInverts)
- : base(new Reconciler(numberOfBits, ngramLength, skipLength, allLengths, seed, useOrderedHashing, maximumNumberOfInverts), input)
+ public OutPipelineColumn(VarVector> input, int numberOfBits, int ngramLength, int skipLength, bool useAllLengths, uint seed, bool useOrderedHashing, int maximumNumberOfInverts)
+ : base(new Reconciler(numberOfBits, ngramLength, skipLength, useAllLengths, seed, useOrderedHashing, maximumNumberOfInverts), input)
 {
 Input = input;
 }
@@ -529,12 +529,12 @@ private sealed class Reconciler : EstimatorReconciler, IEquatable
 private readonly bool _useOrderedHashing;
 private readonly int _maximumNumberOfInverts;
- public Reconciler(int numberOfBits, int ngramLength, int skipLength, bool allLengths, uint seed, bool useOrderedHashing, int maximumNumberOfInverts)
+ public Reconciler(int numberOfBits, int ngramLength, int skipLength, bool useAllLengths, uint seed, bool useOrderedHashing, int maximumNumberOfInverts)
 {
 _numberOfBits = numberOfBits;
 _ngramLength = ngramLength;
 _skipLength = skipLength;
- _useAllLengths = allLengths;
+ _useAllLengths = useAllLengths;
 _seed = seed;
 _useOrderedHashing = useOrderedHashing;
 _maximumNumberOfInverts = maximumNumberOfInverts;
@@ -578,7 +578,7 @@ public override IEstimator Reconcile(IHostEnvironment env,
 /// Number of bits to hash into. Must be between 1 and 30, inclusive.
 /// Ngram length.
 /// Maximum number of tokens to skip when constructing an ngram.
- /// Whether to include all ngram lengths up to or only .
+ /// Whether to include all ngram lengths up to or only .
 /// Hashing seed.
 /// Whether the position of each source column should be included in the hash (when there are multiple source columns).
 /// During hashing we constuct mappings between original values and the produced hash values.
@@ -589,9 +589,9 @@ public static Vector ProduceHashedNgrams(this VarVector
 int numberOfBits = 16,
 int ngramLength = 2,
 int skipLength = 0,
- bool allLengths = true,
+ bool useAllLengths = true,
 uint seed = 314489979,
 bool useOrderedHashing = true,
- int maximumNumberOfInverts = 0) => new OutPipelineColumn(input, numberOfBits, ngramLength, skipLength, allLengths, seed, useOrderedHashing, maximumNumberOfInverts);
+ int maximumNumberOfInverts = 0) => new OutPipelineColumn(input, numberOfBits, ngramLength, skipLength, useAllLengths, seed, useOrderedHashing, maximumNumberOfInverts);
 }
 }
diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs
index 0261377a49..e2bacb6309 100644
--- a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs
+++ b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs
@@ -464,7 +464,7 @@ private void TextFeaturizationOn(string dataPath)
 BagOfWords: r.Message.NormalizeText().ProduceWordBags(),
 // NLP pipeline 2: bag of bigrams, using hashes instead of dictionary indices.
- BagOfBigrams: r.Message.NormalizeText().ProduceHashedWordBags(ngramLength: 2, allLengths: false),
+ BagOfBigrams: r.Message.NormalizeText().ProduceHashedWordBags(ngramLength: 2, useAllLengths: false),
 // NLP pipeline 3: bag of tri-character sequences with TF-IDF weighting.
 BagOfTrichar: r.Message.TokenizeIntoCharacters().ProduceNgrams(ngramLength: 3, weighting: NgramExtractingEstimator.WeightingCriteria.TfIdf),