diff --git a/docs/code/MlNetCookBook.md b/docs/code/MlNetCookBook.md index 8edb9a626a..7a73a1178a 100644 --- a/docs/code/MlNetCookBook.md +++ b/docs/code/MlNetCookBook.md @@ -772,7 +772,7 @@ var pipeline = // NLP pipeline 2: bag of bigrams, using hashes instead of dictionary indices. .Append(new WordHashBagEstimator(mlContext, "BagOfBigrams","NormalizedMessage", - ngramLength: 2, allLengths: false)) + ngramLength: 2, useAllLengths: false)) // NLP pipeline 3: bag of tri-character sequences with TF-IDF weighting. .Append(mlContext.Transforms.Text.TokenizeCharacters("MessageChars", "Message")) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/NgramExtraction.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/NgramExtraction.cs index fa3c6317bf..d1f36d3731 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/NgramExtraction.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/NgramExtraction.cs @@ -61,7 +61,7 @@ public static void NgramTransform() // 'e' - 1 '' - 2 'd' - 1 '=' - 4 'R' - 1 'U' - 1 'D' - 2 'E' - 1 'u' - 1 ',' - 1 '2' - 1 // 'B' - 0 'e' - 6 's' - 3 't' - 6 '' - 9 'g' - 2 'a' - 2 'm' - 2 'I' - 0 ''' - 0 'v' - 0 ... // Preview of the CharsTwoGrams column obtained after processing the input. 
- var charsTwoGramColumn = transformedData_twochars.GetColumn>(transformedData_onechars.Schema["CharsUnigrams"]); + var charsTwoGramColumn = transformedData_twochars.GetColumn>(transformedData_twochars.Schema["CharsTwograms"]); transformedData_twochars.Schema["CharsTwograms"].GetSlotNames(ref slotNames); printHelper("CharsTwograms", charsTwoGramColumn, slotNames); diff --git a/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs b/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs index 62e3e20d10..c4ef323c97 100644 --- a/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs +++ b/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs @@ -263,7 +263,7 @@ private sealed class Reconciler : EstimatorReconciler, IEquatable { private readonly int _ngramLength; private readonly int _skipLength; - private readonly bool _allLengths; + private readonly bool _useAllLengths; private readonly int _maxNumTerms; private readonly NgramExtractingEstimator.WeightingCriteria _weighting; @@ -271,7 +271,7 @@ public Reconciler(int ngramLength, int skipLength, bool allLengths, int maxNumTe { _ngramLength = ngramLength; _skipLength = skipLength; - _allLengths = allLengths; + _useAllLengths = allLengths; _maxNumTerms = maxNumTerms; _weighting = weighting; @@ -281,7 +281,7 @@ public bool Equals(Reconciler other) { return _ngramLength == other._ngramLength && _skipLength == other._skipLength && - _allLengths == other._allLengths && + _useAllLengths == other._useAllLengths && _maxNumTerms == other._maxNumTerms && _weighting == other._weighting; } @@ -298,7 +298,7 @@ public override IEstimator Reconcile(IHostEnvironment env, foreach (var outCol in toOutput) pairs.Add((outputNames[outCol], new[] { inputNames[((OutPipelineColumn)outCol).Input] })); - return new WordBagEstimator(env, pairs.ToArray(), _ngramLength, _skipLength, _allLengths, _maxNumTerms, _weighting); + return new WordBagEstimator(env, pairs.ToArray(), _ngramLength, _skipLength, _useAllLengths, _maxNumTerms, _weighting); } } @@ -309,16 
+309,16 @@ public override IEstimator Reconcile(IHostEnvironment env, /// The column to apply to. /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. - /// Whether to include all ngram lengths up to or only . - /// Maximum number of ngrams to store in the dictionary. + /// Whether to include all ngram lengths up to or only . + /// Maximum number of ngrams to store in the dictionary. /// Statistical measure used to evaluate how important a word is to a document in a corpus. - public static Vector ToBagofWords(this Scalar input, + public static Vector ProduceWordBags(this Scalar input, int ngramLength = 1, int skipLength = 0, - bool allLengths = true, - int maxNumTerms = 10000000, + bool useAllLengths = true, + int maximumNgramsCount = 10000000, NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.WeightingCriteria.Tf) - => new OutPipelineColumn(input, ngramLength, skipLength, allLengths, maxNumTerms, weighting); + => new OutPipelineColumn(input, ngramLength, skipLength, useAllLengths, maximumNgramsCount, weighting); } /// @@ -334,11 +334,11 @@ public OutPipelineColumn(Scalar input, int numberOfBits, int ngramLength, int skipLength, - bool allLengths, + bool useAllLengths, uint seed, - bool ordered, + bool useOrderedHashing, int maximumNumberOfInverts) - : base(new Reconciler(numberOfBits, ngramLength, skipLength, allLengths, seed, ordered, maximumNumberOfInverts), input) + : base(new Reconciler(numberOfBits, ngramLength, skipLength, useAllLengths, seed, useOrderedHashing, maximumNumberOfInverts), input) { Input = input; } @@ -349,19 +349,19 @@ private sealed class Reconciler : EstimatorReconciler, IEquatable private readonly int _numberOfBits; private readonly int _ngramLength; private readonly int _skipLength; - private readonly bool _allLengths; + private readonly bool _useAllLengths; private readonly uint _seed; - private readonly bool _ordered; + private readonly bool _useOrderedHashing; private readonly 
int _maximumNumberOfInverts; - public Reconciler(int numberOfBits, int ngramLength, int skipLength, bool allLengths, uint seed, bool ordered, int maximumNumberOfInverts) + public Reconciler(int numberOfBits, int ngramLength, int skipLength, bool useAllLengths, uint seed, bool useOrderedHashing, int maximumNumberOfInverts) { _numberOfBits = numberOfBits; _ngramLength = ngramLength; _skipLength = skipLength; - _allLengths = allLengths; + _useAllLengths = useAllLengths; _seed = seed; - _ordered = ordered; + _useOrderedHashing = useOrderedHashing; _maximumNumberOfInverts = maximumNumberOfInverts; } @@ -370,9 +370,9 @@ public bool Equals(Reconciler other) return _numberOfBits == other._numberOfBits && _ngramLength == other._ngramLength && _skipLength == other._skipLength && - _allLengths == other._allLengths && + _useAllLengths == other._useAllLengths && _seed == other._seed && - _ordered == other._ordered && + _useOrderedHashing == other._useOrderedHashing && _maximumNumberOfInverts == other._maximumNumberOfInverts; } @@ -388,7 +388,7 @@ public override IEstimator Reconcile(IHostEnvironment env, foreach (var outCol in toOutput) pairs.Add((outputNames[outCol], new[] { inputNames[((OutPipelineColumn)outCol).Input] })); - return new WordHashBagEstimator(env, pairs.ToArray(), _numberOfBits, _ngramLength, _skipLength, _allLengths, _seed, _ordered, _maximumNumberOfInverts); + return new WordHashBagEstimator(env, pairs.ToArray(), _numberOfBits, _ngramLength, _skipLength, _useAllLengths, _seed, _useOrderedHashing, _maximumNumberOfInverts); } } @@ -400,21 +400,21 @@ public override IEstimator Reconcile(IHostEnvironment env, /// Number of bits to hash into. Must be between 1 and 30, inclusive. /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. - /// Whether to include all ngram lengths up to or only . + /// Whether to include all ngram lengths up to or only . /// Hashing seed. 
- /// Whether the position of each source column should be included in the hash (when there are multiple source columns). + /// Whether the position of each source column should be included in the hash (when there are multiple source columns). /// During hashing we constuct mappings between original values and the produced hash values. /// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. /// 0 does not retain any input values. -1 retains all input values mapping to each hash. - public static Vector ToBagofHashedWords(this Scalar input, + public static Vector ProduceHashedWordBags(this Scalar input, int numberOfBits = 16, int ngramLength = 1, int skipLength = 0, - bool allLengths = true, + bool useAllLengths = true, uint seed = 314489979, - bool ordered = true, - int maximumNumberOfInverts = 0) => new OutPipelineColumn(input, numberOfBits, ngramLength, skipLength, allLengths, seed, ordered, maximumNumberOfInverts); + bool useOrderedHashing = true, + int maximumNumberOfInverts = 0) => new OutPipelineColumn(input, numberOfBits, ngramLength, skipLength, useAllLengths, seed, useOrderedHashing, maximumNumberOfInverts); } /// @@ -429,10 +429,10 @@ private sealed class OutPipelineColumn : Vector public OutPipelineColumn(PipelineColumn input, int ngramLength, int skipLength, - bool allLengths, + bool useAllLengths, int maxNumTerms, NgramExtractingEstimator.WeightingCriteria weighting) - : base(new Reconciler(ngramLength, skipLength, allLengths, maxNumTerms, weighting), input) + : base(new Reconciler(ngramLength, skipLength, useAllLengths, maxNumTerms, weighting), input) { Input = input; } @@ -442,16 +442,16 @@ private sealed class Reconciler : EstimatorReconciler, IEquatable { private readonly int _ngramLength; private readonly int _skipLength; - private 
readonly bool _allLengths; - private readonly int _maxNumTerms; + private readonly bool _useAllLengths; + private readonly int _maxNgramsCount; private readonly NgramExtractingEstimator.WeightingCriteria _weighting; - public Reconciler(int ngramLength, int skipLength, bool allLengths, int maxNumTerms, NgramExtractingEstimator.WeightingCriteria weighting) + public Reconciler(int ngramLength, int skipLength, bool useAllLengths, int maxNumTerms, NgramExtractingEstimator.WeightingCriteria weighting) { _ngramLength = ngramLength; _skipLength = skipLength; - _allLengths = allLengths; - _maxNumTerms = maxNumTerms; + _useAllLengths = useAllLengths; + _maxNgramsCount = maxNumTerms; _weighting = weighting; } @@ -460,8 +460,8 @@ public bool Equals(Reconciler other) { return _ngramLength == other._ngramLength && _skipLength == other._skipLength && - _allLengths == other._allLengths && - _maxNumTerms == other._maxNumTerms && + _useAllLengths == other._useAllLengths && + _maxNgramsCount == other._maxNgramsCount && _weighting == other._weighting; } @@ -477,7 +477,7 @@ public override IEstimator Reconcile(IHostEnvironment env, foreach (var outCol in toOutput) pairs.Add((outputNames[outCol], inputNames[((OutPipelineColumn)outCol).Input])); - return new NgramExtractingEstimator(env, pairs.ToArray(), _ngramLength, _skipLength, _allLengths, _maxNumTerms, _weighting); + return new NgramExtractingEstimator(env, pairs.ToArray(), _ngramLength, _skipLength, _useAllLengths, _maxNgramsCount, _weighting); } } @@ -485,22 +485,22 @@ public override IEstimator Reconcile(IHostEnvironment env, /// Produces a bag of counts of ngrams (sequences of consecutive words ) in a given tokenized text. /// It does so by building a dictionary of ngrams and using the id in the dictionary as the index in the bag. /// - /// /// is different from - /// in a way that takes tokenized text as input while tokenizes text internally. 
+ /// /// is different from + /// in a way that takes tokenized text as input while tokenizes text internally. /// /// The column to apply to. /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. - /// Whether to include all ngram lengths up to or only . - /// Maximum number of ngrams to store in the dictionary. + /// Whether to include all ngram lengths up to or only . + /// Maximum number of n-grams to store in the dictionary. /// Statistical measure used to evaluate how important a word is to a document in a corpus. - public static Vector ToNgrams(this VarVector> input, + public static Vector ProduceNgrams(this VarVector> input, int ngramLength = 1, int skipLength = 0, - bool allLengths = true, - int maxNumTerms = 10000000, + bool useAllLengths = true, + int maximumNgramsCount = 10000000, NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.WeightingCriteria.Tf) - => new OutPipelineColumn(input, ngramLength, skipLength, allLengths, maxNumTerms, weighting); + => new OutPipelineColumn(input, ngramLength, skipLength, useAllLengths, maximumNgramsCount, weighting); } /// @@ -512,8 +512,8 @@ private sealed class OutPipelineColumn : Vector { public readonly VarVector> Input; - public OutPipelineColumn(VarVector> input, int numberOfBits, int ngramLength, int skipLength, bool allLengths, uint seed, bool ordered, int maximumNumberOfInverts) - : base(new Reconciler(numberOfBits, ngramLength, skipLength, allLengths, seed, ordered, maximumNumberOfInverts), input) + public OutPipelineColumn(VarVector> input, int numberOfBits, int ngramLength, int skipLength, bool useAllLengths, uint seed, bool useOrderedHashing, int maximumNumberOfInverts) + : base(new Reconciler(numberOfBits, ngramLength, skipLength, useAllLengths, seed, useOrderedHashing, maximumNumberOfInverts), input) { Input = input; } @@ -524,19 +524,19 @@ private sealed class Reconciler : EstimatorReconciler, IEquatable private readonly int _numberOfBits; private 
readonly int _ngramLength; private readonly int _skipLength; - private readonly bool _allLengths; + private readonly bool _useAllLengths; private readonly uint _seed; - private readonly bool _ordered; + private readonly bool _useOrderedHashing; private readonly int _maximumNumberOfInverts; - public Reconciler(int numberOfBits, int ngramLength, int skipLength, bool allLengths, uint seed, bool ordered, int maximumNumberOfInverts) + public Reconciler(int numberOfBits, int ngramLength, int skipLength, bool useAllLengths, uint seed, bool useOrderedHashing, int maximumNumberOfInverts) { _numberOfBits = numberOfBits; _ngramLength = ngramLength; _skipLength = skipLength; - _allLengths = allLengths; + _useAllLengths = useAllLengths; _seed = seed; - _ordered = ordered; + _useOrderedHashing = useOrderedHashing; _maximumNumberOfInverts = maximumNumberOfInverts; } @@ -545,9 +545,9 @@ public bool Equals(Reconciler other) return _numberOfBits == other._numberOfBits && _ngramLength == other._ngramLength && _skipLength == other._skipLength && - _allLengths == other._allLengths && + _useAllLengths == other._useAllLengths && _seed == other._seed && - _ordered == other._ordered && + _useOrderedHashing == other._useOrderedHashing && _maximumNumberOfInverts == other._maximumNumberOfInverts; } @@ -561,7 +561,7 @@ public override IEstimator Reconcile(IHostEnvironment env, var columns = new List(); foreach (var outCol in toOutput) columns.Add(new NgramHashingEstimator.ColumnOptions(outputNames[outCol], new[] { inputNames[((OutPipelineColumn)outCol).Input] }, - _ngramLength, _skipLength, _allLengths, _numberOfBits, _seed, _ordered, _maximumNumberOfInverts)); + _ngramLength, _skipLength, _useAllLengths, _numberOfBits, _seed, _useOrderedHashing, _maximumNumberOfInverts)); return new NgramHashingEstimator(env, columns.ToArray()); } @@ -571,27 +571,27 @@ public override IEstimator Reconcile(IHostEnvironment env, /// Produces a bag of counts of ngrams (sequences of consecutive words of length 
1-n) in a given tokenized text. /// It does so by hashing each ngram and using the hash value as the index in the bag. /// - /// is different from - /// in a way that takes tokenized text as input while tokenizes text internally. + /// is different from + /// in a way that takes tokenized text as input while tokenizes text internally. /// /// The column to apply to. /// Number of bits to hash into. Must be between 1 and 30, inclusive. /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. - /// Whether to include all ngram lengths up to or only . + /// Whether to include all ngram lengths up to or only . /// Hashing seed. - /// Whether the position of each source column should be included in the hash (when there are multiple source columns). + /// Whether the position of each source column should be included in the hash (when there are multiple source columns). /// During hashing we constuct mappings between original values and the produced hash values. /// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. /// 0 does not retain any input values. -1 retains all input values mapping to each hash. 
- public static Vector ToNgramsHash(this VarVector> input, + public static Vector ProduceHashedNgrams(this VarVector> input, int numberOfBits = 16, int ngramLength = 2, int skipLength = 0, - bool allLengths = true, + bool useAllLengths = true, uint seed = 314489979, - bool ordered = true, - int maximumNumberOfInverts = 0) => new OutPipelineColumn(input, numberOfBits, ngramLength, skipLength, allLengths, seed, ordered, maximumNumberOfInverts); + bool useOrderedHashing = true, + int maximumNumberOfInverts = 0) => new OutPipelineColumn(input, numberOfBits, ngramLength, skipLength, useAllLengths, seed, useOrderedHashing, maximumNumberOfInverts); } } diff --git a/src/Microsoft.ML.Transforms/Text/NgramHashingTransformer.cs b/src/Microsoft.ML.Transforms/Text/NgramHashingTransformer.cs index d13a9e7daf..21337758ca 100644 --- a/src/Microsoft.ML.Transforms/Text/NgramHashingTransformer.cs +++ b/src/Microsoft.ML.Transforms/Text/NgramHashingTransformer.cs @@ -42,8 +42,8 @@ internal sealed class Column : ManyToOneColumn public int? NgramLength; [Argument(ArgumentType.AtMostOnce, HelpText = - "Whether to include all ngram lengths up to " + nameof(NgramLength) + " or only " + nameof(NgramLength), ShortName = "all")] - public bool? AllLengths; + "Whether to include all ngram lengths up to " + nameof(NgramLength) + " or only " + nameof(NgramLength), Name = "AllLengths", ShortName = "all")] + public bool? UseAllLengths; [Argument(ArgumentType.AtMostOnce, HelpText = "Maximum number of tokens to skip when constructing an ngram", @@ -52,7 +52,7 @@ internal sealed class Column : ManyToOneColumn [Argument(ArgumentType.AtMostOnce, HelpText = "Number of bits to hash into. Must be between 1 and 30, inclusive.", - ShortName = "bits")] + Name = "HashBits", ShortName = "bits")] public int? 
NumberOfBits; [Argument(ArgumentType.AtMostOnce, HelpText = "Hashing seed")] @@ -98,7 +98,7 @@ private protected override bool TryParse(string str) internal bool TryUnparse(StringBuilder sb) { Contracts.AssertValue(sb); - if (NgramLength != null || AllLengths != null || SkipLength != null || Seed != null || + if (NgramLength != null || UseAllLengths != null || SkipLength != null || Seed != null || RehashUnigrams != null || Ordered != null || MaximumNumberOfInverts != null) { return false; @@ -123,8 +123,8 @@ internal sealed class Options [Argument(ArgumentType.AtMostOnce, HelpText = "Whether to include all ngram lengths up to " + nameof(NgramLength) + " or only " + nameof(NgramLength), - ShortName = "all", SortOrder = 4)] - public bool AllLengths = NgramHashingEstimator.Defaults.AllLengths; + Name = "AllLengths", ShortName = "all", SortOrder = 4)] + public bool UseAllLengths = NgramHashingEstimator.Defaults.UseAllLengths; [Argument(ArgumentType.AtMostOnce, HelpText = "Maximum number of tokens to skip when constructing an ngram", @@ -133,7 +133,7 @@ internal sealed class Options [Argument(ArgumentType.AtMostOnce, HelpText = "Number of bits to hash into. Must be between 1 and 30, inclusive.", - ShortName = "bits", SortOrder = 2)] + Name = "HashBits", ShortName = "bits", SortOrder = 2)] public int NumberOfBits = NgramHashingEstimator.Defaults.NumberOfBits; [Argument(ArgumentType.AtMostOnce, HelpText = "Hashing seed")] @@ -145,7 +145,7 @@ internal sealed class Options [Argument(ArgumentType.AtMostOnce, HelpText = "Whether the position of each source column should be included in the hash (when there are multiple source columns).", ShortName = "ord", SortOrder = 6)] - public bool Ordered = NgramHashingEstimator.Defaults.Ordered; + public bool Ordered = NgramHashingEstimator.Defaults.UseOrderedHashing; [Argument(ArgumentType.AtMostOnce, HelpText = "Limit the number of keys used to generate the slot name to this many. 
0 means no invert hashing, -1 means no limit.", ShortName = "ih")] @@ -352,7 +352,7 @@ private static IDataTransform Create(IHostEnvironment env, Options options, IDat item.Source ?? new string[] { item.Name }, item.NgramLength ?? options.NgramLength, item.SkipLength ?? options.SkipLength, - item.AllLengths ?? options.AllLengths, + item.UseAllLengths ?? options.UseAllLengths, item.NumberOfBits ?? options.NumberOfBits, item.Seed ?? options.Seed, item.Ordered ?? options.Ordered, @@ -417,8 +417,8 @@ private NgramIdFinder GetNgramIdFinder(int iinfo) uint mask = (1U << _parent._columns[iinfo].NumberOfBits) - 1; int ngramLength = _parent._columns[iinfo].NgramLength; bool rehash = _parent._columns[iinfo].RehashUnigrams; - bool ordered = _parent._columns[iinfo].Ordered; - bool all = _parent._columns[iinfo].AllLengths; + bool ordered = _parent._columns[iinfo].UseOrderedHashing; + bool all = _parent._columns[iinfo].UseAllLengths; uint seed = _parent._columns[iinfo].Seed; // REVIEW: Consider the case when: @@ -885,13 +885,13 @@ public sealed class ColumnOptions /// Maximum number of tokens to skip when constructing an ngram. public readonly int SkipLength; /// Whether to store all ngram lengths up to , or only . - public readonly bool AllLengths; + public readonly bool UseAllLengths; /// Number of bits to hash into. Must be between 1 and 31, inclusive. public readonly int NumberOfBits; /// Hashing seed. public readonly uint Seed; /// Whether the position of each term should be included in the hash. - public readonly bool Ordered; + public readonly bool UseOrderedHashing; /// /// During hashing we constuct mappings between original values and the produced hash values. /// Text representation of original values are stored in the slot names of the metadata for the new column. @@ -907,16 +907,16 @@ public sealed class ColumnOptions internal string[] FriendlyNames; /// - /// Describes how the transformer handles one column pair. 
+ /// Describes how the transformer maps several input columns, , to a output column, . /// /// Name of the column resulting from the transformation of . /// Names of the columns to transform. /// Maximum ngram length. /// Maximum number of tokens to skip when constructing an ngram. - /// Whether to store all ngram lengths up to , or only . + /// Whether to store all ngram lengths up to , or only . /// Number of bits to hash into. Must be between 1 and 31, inclusive. /// Hashing seed. - /// Whether the position of each term should be included in the hash. + /// Whether the position of each term should be included in the hash. /// During hashing we constuct mappings between original values and the produced hash values. /// Text representation of original values are stored in the slot names of the metadata for the new column. /// Hashing, as such, can map many initial values to one. @@ -927,10 +927,10 @@ public ColumnOptions(string name, string[] inputColumnNames, int ngramLength = NgramHashingEstimator.Defaults.NgramLength, int skipLength = NgramHashingEstimator.Defaults.SkipLength, - bool allLengths = NgramHashingEstimator.Defaults.AllLengths, + bool useAllLengths = NgramHashingEstimator.Defaults.UseAllLengths, int numberOfBits = NgramHashingEstimator.Defaults.NumberOfBits, uint seed = NgramHashingEstimator.Defaults.Seed, - bool ordered = NgramHashingEstimator.Defaults.Ordered, + bool useOrderedHashing = NgramHashingEstimator.Defaults.UseOrderedHashing, int maximumNumberOfInverts = NgramHashingEstimator.Defaults.MaximumNumberOfInverts, bool rehashUnigrams = NgramHashingEstimator.Defaults.RehashUnigrams) { @@ -945,20 +945,24 @@ public ColumnOptions(string name, if (maximumNumberOfInverts != 0 && numberOfBits >= 31) throw Contracts.ExceptParam(nameof(numberOfBits), $"Cannot support maximumNumberOfInverts for a {0} bit hash. 
30 is the maximum possible.", numberOfBits); - if (NgramLength + SkipLength > NgramBufferBuilder.MaxSkipNgramLength) + if (ngramLength == 1 && skipLength != 0) + throw Contracts.ExceptUserArg(nameof(skipLength), string.Format( + "{0} (actual value: {1}) can only be zero when {2} set to one.", nameof(skipLength), skipLength, nameof(ngramLength))); + if (ngramLength + skipLength > NgramBufferBuilder.MaxSkipNgramLength) { throw Contracts.ExceptUserArg(nameof(skipLength), $"The sum of skipLength and ngramLength must be less than or equal to {NgramBufferBuilder.MaxSkipNgramLength}"); } + FriendlyNames = null; Name = name; InputColumnNamesArray = inputColumnNames; NgramLength = ngramLength; SkipLength = skipLength; - AllLengths = allLengths; + UseAllLengths = useAllLengths; NumberOfBits = numberOfBits; Seed = seed; - Ordered = ordered; + UseOrderedHashing = useOrderedHashing; MaximumNumberOfInverts = maximumNumberOfInverts; RehashUnigrams = rehashUnigrams; } @@ -992,8 +996,8 @@ internal ColumnOptions(ModelLoadContext ctx) Contracts.CheckDecode(1 <= NumberOfBits && NumberOfBits <= 30); Seed = ctx.Reader.ReadUInt32(); RehashUnigrams = ctx.Reader.ReadBoolByte(); - Ordered = ctx.Reader.ReadBoolByte(); - AllLengths = ctx.Reader.ReadBoolByte(); + UseOrderedHashing = ctx.Reader.ReadBoolByte(); + UseAllLengths = ctx.Reader.ReadBoolByte(); } internal ColumnOptions(ModelLoadContext ctx, string name, string[] inputColumnNames) @@ -1022,8 +1026,8 @@ internal ColumnOptions(ModelLoadContext ctx, string name, string[] inputColumnNa Contracts.CheckDecode(1 <= NumberOfBits && NumberOfBits <= 30); Seed = ctx.Reader.ReadUInt32(); RehashUnigrams = ctx.Reader.ReadBoolByte(); - Ordered = ctx.Reader.ReadBoolByte(); - AllLengths = ctx.Reader.ReadBoolByte(); + UseOrderedHashing = ctx.Reader.ReadBoolByte(); + UseAllLengths = ctx.Reader.ReadBoolByte(); } internal void Save(ModelSaveContext ctx) @@ -1056,20 +1060,20 @@ internal void Save(ModelSaveContext ctx) ctx.Writer.Write(NumberOfBits); 
ctx.Writer.Write(Seed); ctx.Writer.WriteBoolByte(RehashUnigrams); - ctx.Writer.WriteBoolByte(Ordered); - ctx.Writer.WriteBoolByte(AllLengths); + ctx.Writer.WriteBoolByte(UseOrderedHashing); + ctx.Writer.WriteBoolByte(UseAllLengths); } } internal static class Defaults { internal const int NgramLength = 2; - internal const bool AllLengths = true; + internal const bool UseAllLengths = true; internal const int SkipLength = 0; internal const int NumberOfBits = 16; internal const uint Seed = 314489979; internal const bool RehashUnigrams = false; - internal const bool Ordered = true; + internal const bool UseOrderedHashing = true; internal const int MaximumNumberOfInverts = 0; } @@ -1089,9 +1093,9 @@ internal static class Defaults /// Number of bits to hash into. Must be between 1 and 30, inclusive. /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. - /// Whether to include all ngram lengths up to or only . + /// Whether to include all ngram lengths up to or only . /// Hashing seed. - /// Whether the position of each source column should be included in the hash (when there are multiple source columns). + /// Whether the position of each source column should be included in the hash (when there are multiple source columns). /// During hashing we constuct mappings between original values and the produced hash values. /// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. 
@@ -1102,11 +1106,11 @@ internal NgramHashingEstimator(IHostEnvironment env, int numberOfBits = 16, int ngramLength = 2, int skipLength = 0, - bool allLengths = true, + bool useAllLengths = true, uint seed = 314489979, - bool ordered = true, + bool useOrderedHashing = true, int maximumNumberOfInverts = 0) - : this(env, new[] { (outputColumnName, new[] { inputColumnName ?? outputColumnName }) }, numberOfBits, ngramLength, skipLength, allLengths, seed, ordered, maximumNumberOfInverts) + : this(env, outputColumnName, new[] { inputColumnName ?? outputColumnName }, numberOfBits, ngramLength, skipLength, useAllLengths, seed, useOrderedHashing, maximumNumberOfInverts) { } @@ -1123,9 +1127,9 @@ internal NgramHashingEstimator(IHostEnvironment env, /// Number of bits to hash into. Must be between 1 and 30, inclusive. /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. - /// Whether to include all ngram lengths up to or only . + /// Whether to include all ngram lengths up to or only . /// Hashing seed. - /// Whether the position of each source column should be included in the hash (when there are multiple source columns). + /// Whether the position of each source column should be included in the hash (when there are multiple source columns). /// During hashing we constuct mappings between original values and the produced hash values. /// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. 
@@ -1136,47 +1140,14 @@ internal NgramHashingEstimator(IHostEnvironment env, int numberOfBits = 16, int ngramLength = 2, int skipLength = 0, - bool allLengths = true, + bool useAllLengths = true, uint seed = 314489979, - bool ordered = true, + bool useOrderedHashing = true, int maximumNumberOfInverts = 0) - : this(env, new[] { (outputColumnName, inputColumnNames) }, numberOfBits, ngramLength, skipLength, allLengths, seed, ordered, maximumNumberOfInverts) + : this(env, new ColumnOptions(outputColumnName, inputColumnNames, ngramLength, skipLength, useAllLengths, numberOfBits, seed, useOrderedHashing, maximumNumberOfInverts)) { } - /// - /// Produces a bag of counts of hashed ngrams in - /// and outputs ngram vector for each output in - /// - /// is different from in a way that - /// takes tokenized text as input while tokenizes text internally. - /// - /// The environment. - /// Pairs of input columns to output column mappings on which to compute ngram vector. - /// Number of bits to hash into. Must be between 1 and 30, inclusive. - /// Ngram length. - /// Maximum number of tokens to skip when constructing an ngram. - /// Whether to include all ngram lengths up to or only . - /// Hashing seed. - /// Whether the position of each source column should be included in the hash (when there are multiple source columns). - /// During hashing we constuct mappings between original values and the produced hash values. - /// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. - /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. - /// 0 does not retain any input values. -1 retains all input values mapping to each hash. 
- internal NgramHashingEstimator(IHostEnvironment env, - (string outputColumnName, string[] inputColumnName)[] columns, - int numberOfBits = 16, - int ngramLength = 2, - int skipLength = 0, - bool allLengths = true, - uint seed = 314489979, - bool ordered = true, - int maximumNumberOfInverts = 0) - : this(env, columns.Select(x => new ColumnOptions(x.outputColumnName, x.inputColumnName, ngramLength, skipLength, allLengths, numberOfBits, seed, ordered, maximumNumberOfInverts)).ToArray()) - { - - } - /// /// Produces a bag of counts of hashed ngrams in /// and outputs ngram vector for each output in diff --git a/src/Microsoft.ML.Transforms/Text/NgramTransform.cs b/src/Microsoft.ML.Transforms/Text/NgramTransform.cs index ce5fb31119..6e0cfb35f3 100644 --- a/src/Microsoft.ML.Transforms/Text/NgramTransform.cs +++ b/src/Microsoft.ML.Transforms/Text/NgramTransform.cs @@ -42,8 +42,8 @@ internal sealed class Column : OneToOneColumn public int? NgramLength; [Argument(ArgumentType.AtMostOnce, HelpText = - "Whether to include all ngram lengths up to " + nameof(NgramLength) + " or only " + nameof(NgramLength), ShortName = "all")] - public bool? AllLengths; + "Whether to include all ngram lengths up to " + nameof(NgramLength) + " or only " + nameof(NgramLength), Name = "AllLengths", ShortName = "all")] + public bool? 
UseAllLengths; [Argument(ArgumentType.AtMostOnce, HelpText = "Maximum number of tokens to skip when constructing an ngram", @@ -69,7 +69,7 @@ internal static Column Parse(string str) internal bool TryUnparse(StringBuilder sb) { Contracts.AssertValue(sb); - if (NgramLength != null || AllLengths != null || SkipLength != null || Utils.Size(MaxNumTerms) != 0) + if (NgramLength != null || UseAllLengths != null || SkipLength != null || Utils.Size(MaxNumTerms) != 0) return false; return TryUnparseCore(sb); } @@ -84,8 +84,8 @@ internal sealed class Options : TransformInputBase public int NgramLength = NgramExtractingEstimator.Defaults.NgramLength; [Argument(ArgumentType.AtMostOnce, HelpText = - "Whether to store all ngram lengths up to ngramLength, or only ngramLength", ShortName = "all")] - public bool AllLengths = NgramExtractingEstimator.Defaults.AllLengths; + "Whether to store all ngram lengths up to ngramLength, or only ngramLength", Name = "AllLengths", ShortName = "all")] + public bool UseAllLengths = NgramExtractingEstimator.Defaults.UseAllLengths; [Argument(ArgumentType.AtMostOnce, HelpText = "Maximum number of tokens to skip when constructing an ngram", @@ -93,7 +93,7 @@ internal sealed class Options : TransformInputBase public int SkipLength = NgramExtractingEstimator.Defaults.SkipLength; [Argument(ArgumentType.Multiple, HelpText = "Maximum number of ngrams to store in the dictionary", ShortName = "max")] - public int[] MaxNumTerms = new int[] { NgramExtractingEstimator.Defaults.MaxNumTerms }; + public int[] MaxNumTerms = new int[] { NgramExtractingEstimator.Defaults.MaximumNgramsCount }; [Argument(ArgumentType.AtMostOnce, HelpText = "The weighting criteria")] public NgramExtractingEstimator.WeightingCriteria Weighting = NgramExtractingEstimator.Defaults.Weighting; @@ -253,7 +253,7 @@ private static SequencePool[] Train(IHostEnvironment env, NgramExtractingEstimat // Note: GetNgramIdFinderAdd will control how many ngrams of a specific length will // be added 
(using lims[iinfo]), therefore we set slotLim to the maximum helpers[iinfo] = new NgramBufferBuilder(ngramLength, skipLength, Utils.ArrayMaxSize, - GetNgramIdFinderAdd(env, counts[iinfo], columns[iinfo].Limits, ngramMaps[iinfo], transformInfos[iinfo].RequireIdf)); + GetNgramIdFinderAdd(env, counts[iinfo], columns[iinfo].MaximumNgramsCounts, ngramMaps[iinfo], transformInfos[iinfo].RequireIdf)); } int cInfoFull = 0; @@ -293,7 +293,7 @@ private static SequencePool[] Train(IHostEnvironment env, NgramExtractingEstimat } } } - AssertValid(env, counts[iinfo], columns[iinfo].Limits, ngramMaps[iinfo]); + AssertValid(env, counts[iinfo], columns[iinfo].MaximumNgramsCounts, ngramMaps[iinfo]); } } @@ -307,7 +307,7 @@ private static SequencePool[] Train(IHostEnvironment env, NgramExtractingEstimat for (int iinfo = 0; iinfo < columns.Length; iinfo++) { - AssertValid(env, counts[iinfo], columns[iinfo].Limits, ngramMaps[iinfo]); + AssertValid(env, counts[iinfo], columns[iinfo].MaximumNgramsCounts, ngramMaps[iinfo]); int ngramLength = transformInfos[iinfo].NgramLength; for (int i = 0; i < ngramLength; i++) @@ -319,11 +319,11 @@ private static SequencePool[] Train(IHostEnvironment env, NgramExtractingEstimat } [Conditional("DEBUG")] - private static void AssertValid(IHostEnvironment env, int[] counts, ImmutableArray lims, SequencePool pool) + private static void AssertValid(IHostEnvironment env, int[] counts, IReadOnlyList lims, SequencePool pool) { int count = 0; int countFull = 0; - for (int i = 0; i < lims.Length; i++) + for (int i = 0; i < lims.Count; i++) { env.Assert(counts[i] >= 0); env.Assert(counts[i] <= lims[i]); @@ -334,20 +334,20 @@ private static void AssertValid(IHostEnvironment env, int[] counts, ImmutableArr env.Assert(count == pool.Count); } - private static NgramIdFinder GetNgramIdFinderAdd(IHostEnvironment env, int[] counts, ImmutableArray lims, SequencePool pool, bool requireIdf) + private static NgramIdFinder GetNgramIdFinderAdd(IHostEnvironment env, int[] 
counts, IReadOnlyList lims, SequencePool pool, bool requireIdf) { Contracts.AssertValue(env); - env.Assert(lims.Length > 0); - env.Assert(lims.Length == Utils.Size(counts)); + env.Assert(lims.Count > 0); + env.Assert(lims.Count == Utils.Size(counts)); int numFull = lims.Count(l => l <= 0); - int ngramLength = lims.Length; + int ngramLength = lims.Count; return (uint[] ngram, int lim, int icol, ref bool more) => { env.Assert(0 < lim && lim <= Utils.Size(ngram)); env.Assert(lim <= Utils.Size(counts)); - env.Assert(lim <= lims.Length); + env.Assert(lim <= lims.Count); env.Assert(icol == 0); var max = lim - 1; @@ -424,7 +424,7 @@ internal static IDataTransform Create(IHostEnvironment env, Options options, IDa item.Name, item.NgramLength ?? options.NgramLength, item.SkipLength ?? options.SkipLength, - item.AllLengths ?? options.AllLengths, + item.UseAllLengths ?? options.UseAllLengths, item.Weighting ?? options.Weighting, maxNumTerms, item.Source ?? item.Name); @@ -693,9 +693,9 @@ public enum WeightingCriteria internal static class Defaults { public const int NgramLength = 2; - public const bool AllLengths = true; + public const bool UseAllLengths = true; public const int SkipLength = 0; - public const int MaxNumTerms = 10000000; + public const int MaximumNgramsCount = 10000000; public const WeightingCriteria Weighting = WeightingCriteria.Tf; } @@ -711,17 +711,17 @@ internal static class Defaults /// Name of the column to transform. If set to , the value of the will be used as source. /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. - /// Whether to include all ngram lengths up to or only . - /// Maximum number of ngrams to store in the dictionary. + /// Whether to include all ngram lengths up to or only . + /// Maximum number of n-grams to store in the dictionary. /// Statistical measure used to evaluate how important a word is to a document in a corpus. 
internal NgramExtractingEstimator(IHostEnvironment env, string outputColumnName, string inputColumnName = null, int ngramLength = Defaults.NgramLength, int skipLength = Defaults.SkipLength, - bool allLengths = Defaults.AllLengths, - int maxNumTerms = Defaults.MaxNumTerms, + bool useAllLengths = Defaults.UseAllLengths, + int maximumNgramsCount = Defaults.MaximumNgramsCount, WeightingCriteria weighting = Defaults.Weighting) - : this(env, new[] { (outputColumnName, inputColumnName ?? outputColumnName) }, ngramLength, skipLength, allLengths, maxNumTerms, weighting) + : this(env, new[] { (outputColumnName, inputColumnName ?? outputColumnName) }, ngramLength, skipLength, useAllLengths, maximumNgramsCount, weighting) { } @@ -733,17 +733,17 @@ internal NgramExtractingEstimator(IHostEnvironment env, /// Pairs of columns to compute bag of word vector. /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. - /// Whether to include all ngram lengths up to or only . - /// Maximum number of ngrams to store in the dictionary. + /// Whether to include all ngram lengths up to or only . + /// Maximum number of n-grams to store in the dictionary. /// Statistical measure used to evaluate how important a word is to a document in a corpus. 
internal NgramExtractingEstimator(IHostEnvironment env, (string outputColumnName, string inputColumnName)[] columns, int ngramLength = Defaults.NgramLength, int skipLength = Defaults.SkipLength, - bool allLengths = Defaults.AllLengths, - int maxNumTerms = Defaults.MaxNumTerms, + bool useAllLengths = Defaults.UseAllLengths, + int maximumNgramsCount = Defaults.MaximumNgramsCount, WeightingCriteria weighting = Defaults.Weighting) - : this(env, columns.Select(x => new ColumnOptions(x.outputColumnName, x.inputColumnName, ngramLength, skipLength, allLengths, weighting, maxNumTerms)).ToArray()) + : this(env, columns.Select(x => new ColumnOptions(x.outputColumnName, x.inputColumnName, ngramLength, skipLength, useAllLengths, weighting, maximumNgramsCount)).ToArray()) { } @@ -805,14 +805,18 @@ public sealed class ColumnOptions /// Maximum number of tokens to skip when constructing an ngram. public readonly int SkipLength; /// Whether to store all ngram lengths up to ngramLength, or only ngramLength. - public readonly bool AllLengths; + public readonly bool UseAllLengths; /// The weighting criteria. public readonly WeightingCriteria Weighting; /// - /// Contains the maximum number of grams to store in the dictionary, for each level of ngrams, - /// from 1 (in position 0) up to ngramLength (in position ngramLength-1) + /// Underlying state of . /// - public readonly ImmutableArray Limits; + private readonly ImmutableArray _maximumNgramsCounts; + /// + /// Contains the maximum number of terms (that is, n-grams) to store in the dictionary, for each level of n-grams, + /// from n=1 (in position 0) up to n= (in position -1) + /// + public IReadOnlyList MaximumNgramsCounts => _maximumNgramsCounts; /// /// Describes how the transformer handles one Gcn column pair. @@ -821,54 +825,57 @@ public sealed class ColumnOptions /// Name of column to transform. If set to , the value of the will be used as source. /// Maximum ngram length. 
/// Maximum number of tokens to skip when constructing an ngram. - /// Whether to store all ngram lengths up to ngramLength, or only ngramLength. + /// Whether to store all ngram lengths up to ngramLength, or only ngramLength. /// The weighting criteria. - /// Maximum number of ngrams to store in the dictionary. + /// Maximum number of n-grams to store in the dictionary. public ColumnOptions(string name, string inputColumnName = null, int ngramLength = Defaults.NgramLength, int skipLength = Defaults.SkipLength, - bool allLengths = Defaults.AllLengths, + bool useAllLengths = Defaults.UseAllLengths, WeightingCriteria weighting = Defaults.Weighting, - int maxNumTerms = Defaults.MaxNumTerms) - : this(name, ngramLength, skipLength, allLengths, weighting, new int[] { maxNumTerms }, inputColumnName ?? name) + int maximumNgramsCount = Defaults.MaximumNgramsCount) + : this(name, ngramLength, skipLength, useAllLengths, weighting, new int[] { maximumNgramsCount }, inputColumnName ?? name) { } internal ColumnOptions(string name, int ngramLength, int skipLength, - bool allLengths, + bool useAllLengths, WeightingCriteria weighting, - int[] maxNumTerms, + int[] maximumNgramsCounts, string inputColumnName = null) { - Name = name; - InputColumnName = inputColumnName ?? 
name; - NgramLength = ngramLength; - Contracts.CheckUserArg(0 < NgramLength && NgramLength <= NgramBufferBuilder.MaxSkipNgramLength, nameof(ngramLength)); - SkipLength = skipLength; - if (NgramLength + SkipLength > NgramBufferBuilder.MaxSkipNgramLength) - { + if (ngramLength == 1 && skipLength != 0) + throw Contracts.ExceptUserArg(nameof(skipLength), string.Format( + "{0} (actual value: {1}) can only be zero when {2} set to one.", nameof(skipLength), skipLength, nameof(ngramLength))); + if (ngramLength + skipLength > NgramBufferBuilder.MaxSkipNgramLength) throw Contracts.ExceptUserArg(nameof(skipLength), $"The sum of skipLength and ngramLength must be less than or equal to {NgramBufferBuilder.MaxSkipNgramLength}"); - } - AllLengths = allLengths; - Weighting = weighting; + Contracts.CheckUserArg(0 < ngramLength && ngramLength <= NgramBufferBuilder.MaxSkipNgramLength, nameof(ngramLength)); + var limits = new int[ngramLength]; - if (!AllLengths) + if (!useAllLengths) { - Contracts.CheckUserArg(Utils.Size(maxNumTerms) == 0 || - Utils.Size(maxNumTerms) == 1 && maxNumTerms[0] > 0, nameof(maxNumTerms)); - limits[ngramLength - 1] = Utils.Size(maxNumTerms) == 0 ? Defaults.MaxNumTerms : maxNumTerms[0]; + Contracts.CheckUserArg(Utils.Size(maximumNgramsCounts) == 0 || + Utils.Size(maximumNgramsCounts) == 1 && maximumNgramsCounts[0] > 0, nameof(maximumNgramsCounts)); + limits[ngramLength - 1] = Utils.Size(maximumNgramsCounts) == 0 ? Defaults.MaximumNgramsCount : maximumNgramsCounts[0]; } else { - Contracts.CheckUserArg(Utils.Size(maxNumTerms) <= ngramLength, nameof(maxNumTerms)); - Contracts.CheckUserArg(Utils.Size(maxNumTerms) == 0 || maxNumTerms.All(i => i >= 0) && maxNumTerms[maxNumTerms.Length - 1] > 0, nameof(maxNumTerms)); - var extend = Utils.Size(maxNumTerms) == 0 ? Defaults.MaxNumTerms : maxNumTerms[maxNumTerms.Length - 1]; - limits = Utils.BuildArray(ngramLength, i => i < Utils.Size(maxNumTerms) ? 
maxNumTerms[i] : extend); + Contracts.CheckUserArg(Utils.Size(maximumNgramsCounts) <= ngramLength, nameof(maximumNgramsCounts)); + Contracts.CheckUserArg(Utils.Size(maximumNgramsCounts) == 0 || maximumNgramsCounts.All(i => i >= 0) && maximumNgramsCounts[maximumNgramsCounts.Length - 1] > 0, nameof(maximumNgramsCounts)); + var extend = Utils.Size(maximumNgramsCounts) == 0 ? Defaults.MaximumNgramsCount : maximumNgramsCounts[maximumNgramsCounts.Length - 1]; + limits = Utils.BuildArray(ngramLength, i => i < Utils.Size(maximumNgramsCounts) ? maximumNgramsCounts[i] : extend); } - Limits = ImmutableArray.Create(limits); + _maximumNgramsCounts = ImmutableArray.Create(limits); + + Name = name; + InputColumnName = inputColumnName ?? name; + NgramLength = ngramLength; + SkipLength = skipLength; + UseAllLengths = useAllLengths; + Weighting = weighting; } } diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs index 5cebe57c94..3aa10978ac 100644 --- a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs +++ b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs @@ -2,7 +2,6 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. -using System.Collections.Generic; using Microsoft.ML.Data; using Microsoft.ML.Runtime; using Microsoft.ML.Transforms.Text; @@ -193,8 +192,8 @@ public static WordTokenizingEstimator TokenizeWords(this TransformsCatalog.TextT /// Name of the column to transform. If set to , the value of the will be used as source. /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. - /// Whether to include all ngram lengths up to or only . - /// Maximum number of ngrams to store in the dictionary. + /// Whether to include all ngram lengths up to or only . + /// Maximum number of n-grams to store in the dictionary. /// Statistical measure used to evaluate how important a word is to a document in a corpus. 
/// /// @@ -208,32 +207,11 @@ public static NgramExtractingEstimator ProduceNgrams(this TransformsCatalog.Text string inputColumnName = null, int ngramLength = NgramExtractingEstimator.Defaults.NgramLength, int skipLength = NgramExtractingEstimator.Defaults.SkipLength, - bool allLengths = NgramExtractingEstimator.Defaults.AllLengths, - int maxNumTerms = NgramExtractingEstimator.Defaults.MaxNumTerms, + bool useAllLengths = NgramExtractingEstimator.Defaults.UseAllLengths, + int maximumNgramsCount = NgramExtractingEstimator.Defaults.MaximumNgramsCount, NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.Defaults.Weighting) => new NgramExtractingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, inputColumnName, - ngramLength, skipLength, allLengths, maxNumTerms, weighting); - - /// - /// Produces a bag of counts of ngrams (sequences of consecutive words) in - /// and outputs bag of word vector for each output in - /// - /// The text-related transform's catalog. - /// Pairs of columns to compute bag of word vector. - /// Ngram length. - /// Maximum number of tokens to skip when constructing an ngram. - /// Whether to include all ngram lengths up to or only . - /// Maximum number of ngrams to store in the dictionary. - /// Statistical measure used to evaluate how important a word is to a document in a corpus. 
- public static NgramExtractingEstimator ProduceNgrams(this TransformsCatalog.TextTransforms catalog, - (string outputColumnName, string inputColumnName)[] columns, - int ngramLength = NgramExtractingEstimator.Defaults.NgramLength, - int skipLength = NgramExtractingEstimator.Defaults.SkipLength, - bool allLengths = NgramExtractingEstimator.Defaults.AllLengths, - int maxNumTerms = NgramExtractingEstimator.Defaults.MaxNumTerms, - NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.Defaults.Weighting) - => new NgramExtractingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), columns, - ngramLength, skipLength, allLengths, maxNumTerms, weighting); + ngramLength, skipLength, useAllLengths, maximumNgramsCount, weighting); /// /// Produces a bag of counts of ngrams (sequences of consecutive words) in @@ -330,19 +308,19 @@ public static CustomStopWordsRemovingEstimator RemoveStopWords(this TransformsCa /// Name of the column to transform. If set to , the value of the will be used as source. /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. - /// Whether to include all ngram lengths up to or only . - /// Maximum number of ngrams to store in the dictionary. + /// Whether to include all ngram lengths up to or only . + /// Maximum number of ngrams to store in the dictionary. /// Statistical measure used to evaluate how important a word is to a document in a corpus. 
public static WordBagEstimator ProduceWordBags(this TransformsCatalog.TextTransforms catalog, string outputColumnName, string inputColumnName = null, int ngramLength = NgramExtractingEstimator.Defaults.NgramLength, int skipLength = NgramExtractingEstimator.Defaults.SkipLength, - bool allLengths = NgramExtractingEstimator.Defaults.AllLengths, - int maxNumTerms = NgramExtractingEstimator.Defaults.MaxNumTerms, + bool useAllLengths = NgramExtractingEstimator.Defaults.UseAllLengths, + int maximumNgramsCount = NgramExtractingEstimator.Defaults.MaximumNgramsCount, NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.WeightingCriteria.Tf) => new WordBagEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), - outputColumnName, inputColumnName, ngramLength, skipLength, allLengths, maxNumTerms); + outputColumnName, inputColumnName, ngramLength, skipLength, useAllLengths, maximumNgramsCount, weighting); /// /// Produces a bag of counts of ngrams (sequences of consecutive words) in @@ -353,39 +331,19 @@ public static WordBagEstimator ProduceWordBags(this TransformsCatalog.TextTransf /// Name of the columns to transform. /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. - /// Whether to include all ngram lengths up to or only . - /// Maximum number of ngrams to store in the dictionary. + /// Whether to include all ngram lengths up to or only . + /// Maximum number of ngrams to store in the dictionary. /// Statistical measure used to evaluate how important a word is to a document in a corpus. 
public static WordBagEstimator ProduceWordBags(this TransformsCatalog.TextTransforms catalog, string outputColumnName, string[] inputColumnNames, int ngramLength = NgramExtractingEstimator.Defaults.NgramLength, int skipLength = NgramExtractingEstimator.Defaults.SkipLength, - bool allLengths = NgramExtractingEstimator.Defaults.AllLengths, - int maxNumTerms = NgramExtractingEstimator.Defaults.MaxNumTerms, + bool useAllLengths = NgramExtractingEstimator.Defaults.UseAllLengths, + int maximumNgramsCount = NgramExtractingEstimator.Defaults.MaximumNgramsCount, NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.WeightingCriteria.Tf) => new WordBagEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), - outputColumnName, inputColumnNames, ngramLength, skipLength, allLengths, maxNumTerms, weighting); - - /// - /// Produces a bag of counts of ngrams (sequences of consecutive words) in - /// and outputs bag of word vector for each output in - /// - /// The text-related transform's catalog. - /// Pairs of columns to compute bag of word vector. - /// Ngram length. - /// Maximum number of tokens to skip when constructing an ngram. - /// Whether to include all ngram lengths up to or only . - /// Maximum number of ngrams to store in the dictionary. - /// Statistical measure used to evaluate how important a word is to a document in a corpus. 
- public static WordBagEstimator ProduceWordBags(this TransformsCatalog.TextTransforms catalog, - (string outputColumnName, string[] inputColumnNames)[] columns, - int ngramLength = NgramExtractingEstimator.Defaults.NgramLength, - int skipLength = NgramExtractingEstimator.Defaults.SkipLength, - bool allLengths = NgramExtractingEstimator.Defaults.AllLengths, - int maxNumTerms = NgramExtractingEstimator.Defaults.MaxNumTerms, - NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.WeightingCriteria.Tf) - => new WordBagEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), columns, ngramLength, skipLength, allLengths, maxNumTerms, weighting); + outputColumnName, inputColumnNames, ngramLength, skipLength, useAllLengths, maximumNgramsCount, weighting); /// /// Produces a bag of counts of hashed ngrams in @@ -397,9 +355,9 @@ public static WordBagEstimator ProduceWordBags(this TransformsCatalog.TextTransf /// Number of bits to hash into. Must be between 1 and 30, inclusive. /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. - /// Whether to include all ngram lengths up to or only . + /// Whether to include all ngram lengths up to or only . /// Hashing seed. - /// Whether the position of each source column should be included in the hash (when there are multiple source columns). + /// Whether the position of each source column should be included in the hash (when there are multiple source columns). /// During hashing we constuct mappings between original values and the produced hash values. /// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. @@ -410,12 +368,14 @@ public static WordHashBagEstimator ProduceHashedWordBags(this TransformsCatalog. 
int numberOfBits = NgramHashExtractingTransformer.DefaultArguments.NumberOfBits, int ngramLength = NgramHashExtractingTransformer.DefaultArguments.NgramLength, int skipLength = NgramHashExtractingTransformer.DefaultArguments.SkipLength, - bool allLengths = NgramHashExtractingTransformer.DefaultArguments.AllLengths, + bool useAllLengths = NgramHashExtractingTransformer.DefaultArguments.UseAllLengths, uint seed = NgramHashExtractingTransformer.DefaultArguments.Seed, - bool ordered = NgramHashExtractingTransformer.DefaultArguments.Ordered, + bool useOrderedHashing = NgramHashExtractingTransformer.DefaultArguments.Ordered, int maximumNumberOfInverts = NgramHashExtractingTransformer.DefaultArguments.MaximumNumberOfInverts) => new WordHashBagEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), - outputColumnName, inputColumnName, numberOfBits, ngramLength, skipLength, allLengths, seed, ordered, maximumNumberOfInverts); + outputColumnName, inputColumnName, numberOfBits: numberOfBits, ngramLength: ngramLength, + skipLength: skipLength, useAllLengths: useAllLengths, seed: seed, useOrderedHashing: useOrderedHashing, + maximumNumberOfInverts: maximumNumberOfInverts); /// /// Produces a bag of counts of hashed ngrams in @@ -427,9 +387,9 @@ public static WordHashBagEstimator ProduceHashedWordBags(this TransformsCatalog. /// Number of bits to hash into. Must be between 1 and 30, inclusive. /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. - /// Whether to include all ngram lengths up to or only . + /// Whether to include all ngram lengths up to or only . /// Hashing seed. - /// Whether the position of each source column should be included in the hash (when there are multiple source columns). + /// Whether the position of each source column should be included in the hash (when there are multiple source columns). /// During hashing we constuct mappings between original values and the produced hash values. 
/// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. @@ -440,40 +400,14 @@ public static WordHashBagEstimator ProduceHashedWordBags(this TransformsCatalog. int numberOfBits = NgramHashExtractingTransformer.DefaultArguments.NumberOfBits, int ngramLength = NgramHashExtractingTransformer.DefaultArguments.NgramLength, int skipLength = NgramHashExtractingTransformer.DefaultArguments.SkipLength, - bool allLengths = NgramHashExtractingTransformer.DefaultArguments.AllLengths, - uint seed = NgramHashExtractingTransformer.DefaultArguments.Seed, - bool ordered = NgramHashExtractingTransformer.DefaultArguments.Ordered, - int maximumNumberOfInverts = NgramHashExtractingTransformer.DefaultArguments.MaximumNumberOfInverts) - => new WordHashBagEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), - outputColumnName, inputColumnNames, numberOfBits, ngramLength, skipLength, allLengths, seed, ordered, maximumNumberOfInverts); - - /// - /// Produces a bag of counts of hashed ngrams in - /// and outputs bag of word vector for each output in - /// - /// The text-related transform's catalog. - /// Pairs of columns to compute bag of word vector. - /// Number of bits to hash into. Must be between 1 and 30, inclusive. - /// Ngram length. - /// Maximum number of tokens to skip when constructing an ngram. - /// Whether to include all ngram lengths up to or only . - /// Hashing seed. - /// Whether the position of each source column should be included in the hash (when there are multiple source columns). - /// During hashing we constuct mappings between original values and the produced hash values. - /// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. 
- /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. - /// 0 does not retain any input values. -1 retains all input values mapping to each hash. - public static WordHashBagEstimator ProduceHashedWordBags(this TransformsCatalog.TextTransforms catalog, - (string outputColumnName, string[] inputColumnNames)[] columns, - int numberOfBits = NgramHashExtractingTransformer.DefaultArguments.NumberOfBits, - int ngramLength = NgramHashExtractingTransformer.DefaultArguments.NgramLength, - int skipLength = NgramHashExtractingTransformer.DefaultArguments.SkipLength, - bool allLengths = NgramHashExtractingTransformer.DefaultArguments.AllLengths, + bool useAllLengths = NgramHashExtractingTransformer.DefaultArguments.UseAllLengths, uint seed = NgramHashExtractingTransformer.DefaultArguments.Seed, - bool ordered = NgramHashExtractingTransformer.DefaultArguments.Ordered, + bool useOrderedHashing = NgramHashExtractingTransformer.DefaultArguments.Ordered, int maximumNumberOfInverts = NgramHashExtractingTransformer.DefaultArguments.MaximumNumberOfInverts) => new WordHashBagEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), - columns, numberOfBits, ngramLength, skipLength, allLengths, seed, ordered, maximumNumberOfInverts); + outputColumnName, inputColumnNames, numberOfBits: numberOfBits, ngramLength: ngramLength, + skipLength: skipLength, useAllLengths: useAllLengths, seed: seed, useOrderedHashing: useOrderedHashing, + maximumNumberOfInverts: maximumNumberOfInverts); /// /// Produces a bag of counts of hashed ngrams in @@ -488,9 +422,9 @@ public static WordHashBagEstimator ProduceHashedWordBags(this TransformsCatalog. /// Number of bits to hash into. Must be between 1 and 30, inclusive. /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. - /// Whether to include all ngram lengths up to or only . + /// Whether to include all ngram lengths up to or only . /// Hashing seed. 
- /// Whether the position of each source column should be included in the hash (when there are multiple source columns). + /// Whether the position of each source column should be included in the hash (when there are multiple source columns). /// During hashing we constuct mappings between original values and the produced hash values. /// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. @@ -501,76 +435,26 @@ public static NgramHashingEstimator ProduceHashedNgrams(this TransformsCatalog.T int numberOfBits = NgramHashingEstimator.Defaults.NumberOfBits, int ngramLength = NgramHashingEstimator.Defaults.NgramLength, int skipLength = NgramHashingEstimator.Defaults.SkipLength, - bool allLengths = NgramHashingEstimator.Defaults.AllLengths, + bool useAllLengths = NgramHashingEstimator.Defaults.UseAllLengths, uint seed = NgramHashingEstimator.Defaults.Seed, - bool ordered = NgramHashingEstimator.Defaults.Ordered, + bool useOrderedHashing = NgramHashingEstimator.Defaults.UseOrderedHashing, int maximumNumberOfInverts = NgramHashingEstimator.Defaults.MaximumNumberOfInverts) => new NgramHashingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), - outputColumnName, inputColumnName, numberOfBits, ngramLength, skipLength, allLengths, seed, ordered, maximumNumberOfInverts); + outputColumnName, inputColumnName, numberOfBits: numberOfBits, ngramLength: ngramLength, skipLength: skipLength, + useAllLengths: useAllLengths, seed: seed, useOrderedHashing: useOrderedHashing, maximumNumberOfInverts: maximumNumberOfInverts); /// - /// Produces a bag of counts of hashed ngrams in - /// and outputs ngram vector as + /// Produces a bag of counts of hashed ngrams for each . For each column, + /// are the input columns of the output column named as . 
/// /// is different from in a way that /// takes tokenized text as input while tokenizes text internally. /// /// The text-related transform's catalog. - /// Name of the column resulting from the transformation of . - /// Name of the columns to transform. - /// Number of bits to hash into. Must be between 1 and 30, inclusive. - /// Ngram length. - /// Maximum number of tokens to skip when constructing an ngram. - /// Whether to include all ngram lengths up to or only . - /// Hashing seed. - /// Whether the position of each source column should be included in the hash (when there are multiple source columns). - /// During hashing we constuct mappings between original values and the produced hash values. - /// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. - /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. - /// 0 does not retain any input values. -1 retains all input values mapping to each hash. + /// Pairs of columns to compute n-grams. Note that gram indices are generated by hashing. 
public static NgramHashingEstimator ProduceHashedNgrams(this TransformsCatalog.TextTransforms catalog, - string outputColumnName, - string[] inputColumnNames, - int numberOfBits = NgramHashingEstimator.Defaults.NumberOfBits, - int ngramLength = NgramHashingEstimator.Defaults.NgramLength, - int skipLength = NgramHashingEstimator.Defaults.SkipLength, - bool allLengths = NgramHashingEstimator.Defaults.AllLengths, - uint seed = NgramHashingEstimator.Defaults.Seed, - bool ordered = NgramHashingEstimator.Defaults.Ordered, - int maximumNumberOfInverts = NgramHashingEstimator.Defaults.MaximumNumberOfInverts) - => new NgramHashingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), - outputColumnName, inputColumnNames, numberOfBits, ngramLength, skipLength, allLengths, seed, ordered, maximumNumberOfInverts); - - /// - /// Produces a bag of counts of hashed ngrams in - /// and outputs ngram vector for each output in - /// - /// is different from in a way that - /// takes tokenized text as input while tokenizes text internally. - /// - /// The text-related transform's catalog. - /// Pairs of columns to compute bag of word vector. - /// Number of bits to hash into. Must be between 1 and 30, inclusive. - /// Ngram length. - /// Maximum number of tokens to skip when constructing an ngram. - /// Whether to include all ngram lengths up to or only . - /// Hashing seed. - /// Whether the position of each source column should be included in the hash (when there are multiple source columns). - /// During hashing we constuct mappings between original values and the produced hash values. - /// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. - /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. - /// 0 does not retain any input values. -1 retains all input values mapping to each hash. 
- public static NgramHashingEstimator ProduceHashedNgrams(this TransformsCatalog.TextTransforms catalog, - (string outputColumnName, string[] inputColumnNames)[] columns, - int numberOfBits = NgramHashingEstimator.Defaults.NumberOfBits, - int ngramLength = NgramHashingEstimator.Defaults.NgramLength, - int skipLength = NgramHashingEstimator.Defaults.SkipLength, - bool allLengths = NgramHashingEstimator.Defaults.AllLengths, - uint seed = NgramHashingEstimator.Defaults.Seed, - bool ordered = NgramHashingEstimator.Defaults.Ordered, - int maximumNumberOfInverts = NgramHashingEstimator.Defaults.MaximumNumberOfInverts) - => new NgramHashingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), - columns, numberOfBits, ngramLength, skipLength, allLengths, seed, ordered, maximumNumberOfInverts); + NgramHashingEstimator.ColumnOptions[] columns) + => new NgramHashingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), columns); /// /// Uses LightLDA to transform a document (represented as a vector of floats) diff --git a/src/Microsoft.ML.Transforms/Text/TextFeaturizingEstimator.cs b/src/Microsoft.ML.Transforms/Text/TextFeaturizingEstimator.cs index 6c89740117..d7cca7752a 100644 --- a/src/Microsoft.ML.Transforms/Text/TextFeaturizingEstimator.cs +++ b/src/Microsoft.ML.Transforms/Text/TextFeaturizingEstimator.cs @@ -141,7 +141,7 @@ public WordBagEstimator.Options WordFeatureExtractor extractor = new NgramExtractorTransform.NgramExtractorArguments(); extractor.NgramLength = _wordFeatureExtractor.NgramLength; extractor.SkipLength = _wordFeatureExtractor.SkipLength; - extractor.AllLengths = _wordFeatureExtractor.AllLengths; + extractor.UseAllLengths = _wordFeatureExtractor.UseAllLengths; extractor.MaxNumTerms = _wordFeatureExtractor.MaximumNgramsCount; extractor.Weighting = _wordFeatureExtractor.Weighting; } @@ -173,7 +173,7 @@ public WordBagEstimator.Options CharFeatureExtractor extractor = new 
NgramExtractorTransform.NgramExtractorArguments(); extractor.NgramLength = _charFeatureExtractor.NgramLength; extractor.SkipLength = _charFeatureExtractor.SkipLength; - extractor.AllLengths = _charFeatureExtractor.AllLengths; + extractor.UseAllLengths = _charFeatureExtractor.UseAllLengths; extractor.MaxNumTerms = _charFeatureExtractor.MaximumNgramsCount; extractor.Weighting = _charFeatureExtractor.Weighting; } @@ -187,7 +187,7 @@ public WordBagEstimator.Options CharFeatureExtractor public Options() { WordFeatureExtractor = new WordBagEstimator.Options(); - CharFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 3, AllLengths = false }; + CharFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 3, UseAllLengths = false }; } } diff --git a/src/Microsoft.ML.Transforms/Text/WordBagTransform.cs b/src/Microsoft.ML.Transforms/Text/WordBagTransform.cs index ec5bf8dd1b..934a2253ee 100644 --- a/src/Microsoft.ML.Transforms/Text/WordBagTransform.cs +++ b/src/Microsoft.ML.Transforms/Text/WordBagTransform.cs @@ -54,8 +54,8 @@ internal sealed class Column : ManyToOneColumn [Argument(ArgumentType.AtMostOnce, HelpText = "Whether to include all ngram lengths up to " + nameof(NgramLength) + " or only " + nameof(NgramLength), - ShortName = "all")] - public bool? AllLengths; + Name = "AllLengths", ShortName = "all")] + public bool? 
UseAllLengths; [Argument(ArgumentType.Multiple, HelpText = "Maximum number of ngrams to store in the dictionary", ShortName = "max")] public int[] MaxNumTerms = null; @@ -76,7 +76,7 @@ internal static Column Parse(string str) internal bool TryUnparse(StringBuilder sb) { Contracts.AssertValue(sb); - if (NgramLength != null || SkipLength != null || AllLengths != null || Utils.Size(MaxNumTerms) > 0 || + if (NgramLength != null || SkipLength != null || UseAllLengths != null || Utils.Size(MaxNumTerms) > 0 || Weighting != null) { return false; @@ -123,7 +123,7 @@ internal static IDataTransform Create(IHostEnvironment env, Options options, IDa MaxNumTerms = options.MaxNumTerms, NgramLength = options.NgramLength, SkipLength = options.SkipLength, - AllLengths = options.AllLengths, + UseAllLengths = options.UseAllLengths, Weighting = options.Weighting, Columns = new NgramExtractorTransform.Column[options.Columns.Length] }; @@ -146,7 +146,7 @@ internal static IDataTransform Create(IHostEnvironment env, Options options, IDa NgramLength = column.NgramLength, SkipLength = column.SkipLength, Weighting = column.Weighting, - AllLengths = column.AllLengths + UseAllLengths = column.UseAllLengths }; } @@ -175,8 +175,9 @@ internal sealed class Column : OneToOneColumn public int? SkipLength; [Argument(ArgumentType.AtMostOnce, HelpText = - "Whether to include all ngram lengths up to " + nameof(NgramLength) + " or only " + nameof(NgramLength), ShortName = "all")] - public bool? AllLengths; + "Whether to include all ngram lengths up to " + nameof(NgramLength) + " or only " + nameof(NgramLength), + Name = "AllLengths", ShortName = "all")] + public bool? UseAllLengths; // REVIEW: This argument is actually confusing. If you set only one value we will use this value for all ngrams respectfully for example, // if we specify 3 ngrams we will have maxNumTerms * 3. 
And it also pick first value from this array to run term transform, so if you specify @@ -200,7 +201,7 @@ internal static Column Parse(string str) internal bool TryUnparse(StringBuilder sb) { Contracts.AssertValue(sb); - if (NgramLength != null || SkipLength != null || AllLengths != null || Utils.Size(MaxNumTerms) > 0 || + if (NgramLength != null || SkipLength != null || UseAllLengths != null || Utils.Size(MaxNumTerms) > 0 || Weighting != null) { return false; @@ -225,11 +226,11 @@ internal abstract class ArgumentsBase [Argument(ArgumentType.AtMostOnce, HelpText = "Whether to include all ngram lengths up to " + nameof(NgramLength) + " or only " + nameof(NgramLength), - ShortName = "all")] - public bool AllLengths = NgramExtractingEstimator.Defaults.AllLengths; + Name = "AllLengths", ShortName = "all")] + public bool UseAllLengths = NgramExtractingEstimator.Defaults.UseAllLengths; [Argument(ArgumentType.Multiple, HelpText = "Maximum number of ngrams to store in the dictionary", ShortName = "max")] - public int[] MaxNumTerms = new int[] { NgramExtractingEstimator.Defaults.MaxNumTerms }; + public int[] MaxNumTerms = new int[] { NgramExtractingEstimator.Defaults.MaximumNgramsCount }; [Argument(ArgumentType.AtMostOnce, HelpText = "The weighting criteria")] public NgramExtractingEstimator.WeightingCriteria Weighting = NgramExtractingEstimator.Defaults.Weighting; @@ -315,7 +316,7 @@ internal static IDataTransform Create(IHostEnvironment env, Options options, IDa termArgs = new ValueToKeyMappingTransformer.Options() { - MaxNumTerms = Utils.Size(options.MaxNumTerms) > 0 ? options.MaxNumTerms[0] : NgramExtractingEstimator.Defaults.MaxNumTerms, + MaxNumTerms = Utils.Size(options.MaxNumTerms) > 0 ? 
options.MaxNumTerms[0] : NgramExtractingEstimator.Defaults.MaximumNgramsCount, Columns = new ValueToKeyMappingTransformer.Column[termCols.Count] }; } @@ -347,7 +348,7 @@ internal static IDataTransform Create(IHostEnvironment env, Options options, IDa ngramColumns[iinfo] = new NgramExtractingEstimator.ColumnOptions(column.Name, column.NgramLength ?? options.NgramLength, column.SkipLength ?? options.SkipLength, - column.AllLengths ?? options.AllLengths, + column.UseAllLengths ?? options.UseAllLengths, column.Weighting ?? options.Weighting, column.MaxNumTerms ?? options.MaxNumTerms, isTermCol[iinfo] ? column.Name : column.Source @@ -380,7 +381,7 @@ internal static IDataTransform Create(IHostEnvironment env, NgramExtractorArgume Columns = extractorCols, NgramLength = extractorArgs.NgramLength, SkipLength = extractorArgs.SkipLength, - AllLengths = extractorArgs.AllLengths, + UseAllLengths = extractorArgs.UseAllLengths, MaxNumTerms = extractorArgs.MaxNumTerms, Weighting = extractorArgs.Weighting }; diff --git a/src/Microsoft.ML.Transforms/Text/WordHashBagProducingTransform.cs b/src/Microsoft.ML.Transforms/Text/WordHashBagProducingTransform.cs index 1fe0c21b09..7a5641e2a2 100644 --- a/src/Microsoft.ML.Transforms/Text/WordHashBagProducingTransform.cs +++ b/src/Microsoft.ML.Transforms/Text/WordHashBagProducingTransform.cs @@ -129,7 +129,7 @@ internal static IDataTransform Create(IHostEnvironment env, Options options, IDa Ordered = column.Ordered, MaximumNumberOfInverts = column.MaximumNumberOfInverts, FriendlyNames = options.Columns[iinfo].Source, - AllLengths = column.AllLengths + UseAllLengths = column.UseAllLengths }; } @@ -138,7 +138,7 @@ internal static IDataTransform Create(IHostEnvironment env, Options options, IDa var featurizeArgs = new NgramHashExtractingTransformer.Options { - AllLengths = options.AllLengths, + UseAllLengths = options.UseAllLengths, NumberOfBits = options.NumberOfBits, NgramLength = options.NgramLength, SkipLength = options.SkipLength, @@ -189,8 
+189,8 @@ internal abstract class ColumnBase : ManyToOneColumn [Argument(ArgumentType.AtMostOnce, HelpText = "Whether to include all ngram lengths up to " + nameof(NgramLength) + " or only " + nameof(NgramLength), - ShortName = "all", SortOrder = 4)] - public bool? AllLengths; + Name = "AllLengths", ShortName = "all", SortOrder = 4)] + public bool? UseAllLengths; } internal sealed class Column : ColumnBase @@ -279,8 +279,8 @@ internal abstract class ArgumentsBase [Argument(ArgumentType.AtMostOnce, HelpText = "Whether to include all ngram lengths up to ngramLength or only ngramLength", - ShortName = "all", SortOrder = 4)] - public bool AllLengths = true; + Name = "AllLengths", ShortName = "all", SortOrder = 4)] + public bool UseAllLengths = true; } internal static class DefaultArguments @@ -291,7 +291,7 @@ internal static class DefaultArguments public const uint Seed = 314489979; public const bool Ordered = true; public const int MaximumNumberOfInverts = 0; - public const bool AllLengths = true; + public const bool UseAllLengths = true; } [TlcModule.Component(Name = "NGramHash", FriendlyName = "NGram Hash Extractor Transform", Alias = "NGramHashExtractorTransform,NGramHashExtractor", @@ -369,7 +369,7 @@ internal static IDataTransform Create(IHostEnvironment env, Options options, IDa new NgramHashingEstimator.ColumnOptions(column.Name, tmpColNames[iinfo], column.NgramLength ?? options.NgramLength, column.SkipLength ?? options.SkipLength, - column.AllLengths ?? options.AllLengths, + column.UseAllLengths ?? options.UseAllLengths, column.NumberOfBits ?? options.NumberOfBits, column.Seed ?? options.Seed, column.Ordered ?? 
options.Ordered, @@ -439,7 +439,7 @@ internal static IDataTransform Create(NgramHashExtractorArguments extractorArgs, MaximumNumberOfInverts = extractorArgs.MaximumNumberOfInverts, Ordered = extractorArgs.Ordered, Seed = extractorArgs.Seed, - AllLengths = extractorArgs.AllLengths + UseAllLengths = extractorArgs.UseAllLengths }; return Create(h, options, input, termLoaderArgs); diff --git a/src/Microsoft.ML.Transforms/Text/WrappedTextTransformers.cs b/src/Microsoft.ML.Transforms/Text/WrappedTextTransformers.cs index 9e31cee9ed..4f91ea42c3 100644 --- a/src/Microsoft.ML.Transforms/Text/WrappedTextTransformers.cs +++ b/src/Microsoft.ML.Transforms/Text/WrappedTextTransformers.cs @@ -22,7 +22,7 @@ public sealed class WordBagEstimator : IEstimator private readonly (string outputColumnName, string[] sourceColumnsNames)[] _columns; private readonly int _ngramLength; private readonly int _skipLength; - private readonly bool _allLengths; + private readonly bool _useAllLengths; private readonly int _maxNumTerms; private readonly NgramExtractingEstimator.WeightingCriteria _weighting; @@ -44,7 +44,7 @@ public class Options /// /// Whether to store all ngram lengths up to ngramLength, or only ngramLength. /// - public bool AllLengths; + public bool UseAllLengths; /// /// The maximum number of grams to store in the dictionary, for each level of ngrams, @@ -61,8 +61,8 @@ public Options() { NgramLength = 1; SkipLength = NgramExtractingEstimator.Defaults.SkipLength; - AllLengths = NgramExtractingEstimator.Defaults.AllLengths; - MaximumNgramsCount = new int[] { NgramExtractingEstimator.Defaults.MaxNumTerms }; + UseAllLengths = NgramExtractingEstimator.Defaults.UseAllLengths; + MaximumNgramsCount = new int[] { NgramExtractingEstimator.Defaults.MaximumNgramsCount }; Weighting = NgramExtractingEstimator.Defaults.Weighting; } } @@ -76,18 +76,18 @@ public Options() /// Name of the column to transform. If set to , the value of the will be used as source. /// Ngram length. 
/// Maximum number of tokens to skip when constructing an ngram. - /// Whether to include all ngram lengths up to or only . - /// Maximum number of ngrams to store in the dictionary. + /// Whether to include all ngram lengths up to or only . + /// Maximum number of ngrams to store in the dictionary. /// Statistical measure used to evaluate how important a word is to a document in a corpus. internal WordBagEstimator(IHostEnvironment env, string outputColumnName, string inputColumnName = null, int ngramLength = 1, int skipLength = 0, - bool allLengths = true, - int maxNumTerms = 10000000, + bool useAllLengths = true, + int maximumNgramsCount = 10000000, NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.WeightingCriteria.Tf) - : this(env, outputColumnName, new[] { inputColumnName ?? outputColumnName }, ngramLength, skipLength, allLengths, maxNumTerms, weighting) + : this(env, outputColumnName, new[] { inputColumnName ?? outputColumnName }, ngramLength, skipLength, useAllLengths, maximumNgramsCount, weighting) { } @@ -100,18 +100,18 @@ internal WordBagEstimator(IHostEnvironment env, /// The columns containing text to compute bag of word vector. /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. - /// Whether to include all ngram lengths up to or only . - /// Maximum number of ngrams to store in the dictionary. + /// Whether to include all ngram lengths up to or only . + /// Maximum number of ngrams to store in the dictionary. /// Statistical measure used to evaluate how important a word is to a document in a corpus. 
internal WordBagEstimator(IHostEnvironment env, string outputColumnName, string[] inputColumnNames, int ngramLength = 1, int skipLength = 0, - bool allLengths = true, - int maxNumTerms = 10000000, + bool useAllLengths = true, + int maximumNgramsCount = 10000000, NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.WeightingCriteria.Tf) - : this(env, new[] { (outputColumnName, inputColumnNames) }, ngramLength, skipLength, allLengths, maxNumTerms, weighting) + : this(env, new[] { (outputColumnName, inputColumnNames) }, ngramLength, skipLength, useAllLengths, maximumNgramsCount, weighting) { } @@ -123,15 +123,15 @@ internal WordBagEstimator(IHostEnvironment env, /// Pairs of columns to compute bag of word vector. /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. - /// Whether to include all ngram lengths up to or only . - /// Maximum number of ngrams to store in the dictionary. + /// Whether to include all ngram lengths up to or only . + /// Maximum number of ngrams to store in the dictionary. /// Statistical measure used to evaluate how important a word is to a document in a corpus. 
internal WordBagEstimator(IHostEnvironment env, (string outputColumnName, string[] inputColumnNames)[] columns, int ngramLength = 1, int skipLength = 0, - bool allLengths = true, - int maxNumTerms = 10000000, + bool useAllLengths = true, + int maximumNgramsCount = 10000000, NgramExtractingEstimator.WeightingCriteria weighting = NgramExtractingEstimator.WeightingCriteria.Tf) { Contracts.CheckValue(env, nameof(env)); @@ -146,8 +146,8 @@ internal WordBagEstimator(IHostEnvironment env, _columns = columns; _ngramLength = ngramLength; _skipLength = skipLength; - _allLengths = allLengths; - _maxNumTerms = maxNumTerms; + _useAllLengths = useAllLengths; + _maxNumTerms = maximumNgramsCount; _weighting = weighting; } @@ -160,7 +160,7 @@ public ITransformer Fit(IDataView input) Columns = _columns.Select(x => new WordBagBuildingTransformer.Column { Name = x.outputColumnName, Source = x.sourceColumnsNames }).ToArray(), NgramLength = _ngramLength, SkipLength = _skipLength, - AllLengths = _allLengths, + UseAllLengths = _useAllLengths, MaxNumTerms = new[] { _maxNumTerms }, Weighting = _weighting }; @@ -193,7 +193,7 @@ public sealed class WordHashBagEstimator : IEstimator private readonly int _numberOfBits; private readonly int _ngramLength; private readonly int _skipLength; - private readonly bool _allLengths; + private readonly bool _useAllLengths; private readonly uint _seed; private readonly bool _ordered; private readonly int _maximumNumberOfInverts; @@ -208,9 +208,9 @@ public sealed class WordHashBagEstimator : IEstimator /// Number of bits to hash into. Must be between 1 and 30, inclusive. /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. - /// Whether to include all ngram lengths up to or only . + /// Whether to include all ngram lengths up to or only . /// Hashing seed. - /// Whether the position of each source column should be included in the hash (when there are multiple source columns). 
+ /// Whether the position of each source column should be included in the hash (when there are multiple source columns). /// During hashing we constuct mappings between original values and the produced hash values. /// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. @@ -221,11 +221,13 @@ internal WordHashBagEstimator(IHostEnvironment env, int numberOfBits = 16, int ngramLength = 1, int skipLength = 0, - bool allLengths = true, + bool useAllLengths = true, uint seed = 314489979, - bool ordered = true, + bool useOrderedHashing = true, int maximumNumberOfInverts = 0) - : this(env, new[] { (outputColumnName, new[] { inputColumnName ?? outputColumnName }) }, numberOfBits, ngramLength, skipLength, allLengths, seed, ordered, maximumNumberOfInverts) + : this(env, new[] { (outputColumnName, new[] { inputColumnName ?? outputColumnName }) }, numberOfBits: numberOfBits, + ngramLength: ngramLength, skipLength: skipLength, useAllLengths: useAllLengths, seed: seed, + useOrderedHashing: useOrderedHashing, maximumNumberOfInverts: maximumNumberOfInverts) { } @@ -239,9 +241,9 @@ internal WordHashBagEstimator(IHostEnvironment env, /// Number of bits to hash into. Must be between 1 and 30, inclusive. /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. - /// Whether to include all ngram lengths up to or only . + /// Whether to include all ngram lengths up to or only . /// Hashing seed. - /// Whether the position of each source column should be included in the hash (when there are multiple source columns). + /// Whether the position of each source column should be included in the hash (when there are multiple source columns). /// During hashing we constuct mappings between original values and the produced hash values. 
/// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. @@ -252,11 +254,13 @@ internal WordHashBagEstimator(IHostEnvironment env, int numberOfBits = 16, int ngramLength = 1, int skipLength = 0, - bool allLengths = true, + bool useAllLengths = true, uint seed = 314489979, - bool ordered = true, + bool useOrderedHashing = true, int maximumNumberOfInverts = 0) - : this(env, new[] { (outputColumnName, inputColumnNames) }, numberOfBits, ngramLength, skipLength, allLengths, seed, ordered, maximumNumberOfInverts) + : this(env, new[] { (outputColumnName, inputColumnNames) }, numberOfBits: numberOfBits, + ngramLength: ngramLength, skipLength: skipLength, useAllLengths: useAllLengths, seed: seed, + useOrderedHashing: useOrderedHashing, maximumNumberOfInverts: maximumNumberOfInverts) { } @@ -269,9 +273,9 @@ internal WordHashBagEstimator(IHostEnvironment env, /// Number of bits to hash into. Must be between 1 and 30, inclusive. /// Ngram length. /// Maximum number of tokens to skip when constructing an ngram. - /// Whether to include all ngram lengths up to or only . + /// Whether to include all ngram lengths up to or only . /// Hashing seed. - /// Whether the position of each source column should be included in the hash (when there are multiple source columns). + /// Whether the position of each source column should be included in the hash (when there are multiple source columns). /// During hashing we constuct mappings between original values and the produced hash values. /// Text representation of original values are stored in the slot names of the metadata for the new column.Hashing, as such, can map many initial values to one. /// specifies the upper bound of the number of distinct input values mapping to a hash that should be retained. 
@@ -281,9 +285,9 @@ internal WordHashBagEstimator(IHostEnvironment env, int numberOfBits = 16, int ngramLength = 1, int skipLength = 0, - bool allLengths = true, + bool useAllLengths = true, uint seed = 314489979, - bool ordered = true, + bool useOrderedHashing = true, int maximumNumberOfInverts = 0) { Contracts.CheckValue(env, nameof(env)); @@ -299,9 +303,9 @@ internal WordHashBagEstimator(IHostEnvironment env, _numberOfBits = numberOfBits; _ngramLength = ngramLength; _skipLength = skipLength; - _allLengths = allLengths; + _useAllLengths = useAllLengths; _seed = seed; - _ordered = ordered; + _ordered = useOrderedHashing; _maximumNumberOfInverts = maximumNumberOfInverts; } @@ -315,7 +319,7 @@ public ITransformer Fit(IDataView input) NumberOfBits = _numberOfBits, NgramLength = _ngramLength, SkipLength = _skipLength, - AllLengths = _allLengths, + UseAllLengths = _useAllLengths, Seed = _seed, Ordered = _ordered, MaximumNumberOfInverts = _maximumNumberOfInverts diff --git a/test/Microsoft.ML.Functional.Tests/DataTransformation.cs b/test/Microsoft.ML.Functional.Tests/DataTransformation.cs index e020dd740f..50d4a38f63 100644 --- a/test/Microsoft.ML.Functional.Tests/DataTransformation.cs +++ b/test/Microsoft.ML.Functional.Tests/DataTransformation.cs @@ -138,7 +138,7 @@ void ExtensibilityModifyTextFeaturization() var pipeline = mlContext.Transforms.Text.FeaturizeText("Features", new TextFeaturizingEstimator.Options { - CharFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 3, AllLengths = false }, + CharFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 3, UseAllLengths = false }, WordFeatureExtractor = new WordBagEstimator.Options(), VectorNormalizer = TextFeaturizingEstimator.NormFunction.L1 }, "SentimentText") diff --git a/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs b/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs index f18a707999..926e770187 100644 --- a/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs 
+++ b/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs @@ -575,8 +575,8 @@ public void ConvertToWordBag() var est = data.MakeNewEstimator() .Append(r => ( r.label, - bagofword: r.text.ToBagofWords(), - bagofhashedword: r.text.ToBagofHashedWords())); + bagofword: r.text.ProduceWordBags(), + bagofhashedword: r.text.ProduceHashedWordBags())); var tdata = est.Fit(data).Transform(data); var schema = tdata.AsDynamic.Schema; @@ -604,8 +604,8 @@ public void Ngrams() var est = data.MakeNewEstimator() .Append(r => ( r.label, - ngrams: r.text.TokenizeText().ToKey().ToNgrams(), - ngramshash: r.text.TokenizeText().ToKey().ToNgramsHash())); + ngrams: r.text.TokenizeText().ToKey().ProduceNgrams(), + ngramshash: r.text.TokenizeText().ToKey().ProduceHashedNgrams())); var tdata = est.Fit(data).Transform(data); var schema = tdata.AsDynamic.Schema; @@ -675,7 +675,7 @@ public void LdaTopicModel() var est = data.MakeNewEstimator() .Append(r => ( r.label, - topics: r.text.ToBagofWords().LatentDirichletAllocation(numberOfTopics: 3, numberOfSummaryTermsPerTopic:5, alphaSum: 10, onFit: m => ldaSummary = m.LdaTopicSummary))); + topics: r.text.ProduceWordBags().LatentDirichletAllocation(numberOfTopics: 3, numberOfSummaryTermsPerTopic:5, alphaSum: 10, onFit: m => ldaSummary = m.LdaTopicSummary))); var transformer = est.Fit(data); var tdata = transformer.Transform(data); @@ -700,8 +700,8 @@ public void FeatureSelection() var est = data.MakeNewEstimator() .Append(r => ( r.label, - bag_of_words_count: r.text.ToBagofWords().SelectFeaturesBasedOnCount(10), - bag_of_words_mi: r.text.ToBagofWords().SelectFeaturesBasedOnMutualInformation(r.label))); + bag_of_words_count: r.text.ProduceWordBags().SelectFeaturesBasedOnCount(10), + bag_of_words_mi: r.text.ProduceWordBags().SelectFeaturesBasedOnMutualInformation(r.label))); var tdata = est.Fit(data).Transform(data); var schema = tdata.AsDynamic.Schema; diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs 
b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs index b60afc07f2..e2bacb6309 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs @@ -461,13 +461,13 @@ private void TextFeaturizationOn(string dataPath) TextFeatures: r.Message.FeaturizeText(), // NLP pipeline 1: bag of words. - BagOfWords: r.Message.NormalizeText().ToBagofWords(), + BagOfWords: r.Message.NormalizeText().ProduceWordBags(), // NLP pipeline 2: bag of bigrams, using hashes instead of dictionary indices. - BagOfBigrams: r.Message.NormalizeText().ToBagofHashedWords(ngramLength: 2, allLengths: false), + BagOfBigrams: r.Message.NormalizeText().ProduceHashedWordBags(ngramLength: 2, useAllLengths: false), // NLP pipeline 3: bag of tri-character sequences with TF-IDF weighting. - BagOfTrichar: r.Message.TokenizeIntoCharacters().ToNgrams(ngramLength: 3, weighting: NgramExtractingEstimator.WeightingCriteria.TfIdf), + BagOfTrichar: r.Message.TokenizeIntoCharacters().ProduceNgrams(ngramLength: 3, weighting: NgramExtractingEstimator.WeightingCriteria.TfIdf), // NLP pipeline 4: word embeddings. // PretrainedModelKind.Sswe is used here for performance of the test. In a real diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs index 84b864c84f..50c0439112 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs @@ -302,7 +302,7 @@ private void TextFeaturizationOn(string dataPath) // NLP pipeline 2: bag of bigrams, using hashes instead of dictionary indices. 
.Append(new WordHashBagEstimator(mlContext, "BagOfBigrams","NormalizedMessage", - ngramLength: 2, allLengths: false)) + ngramLength: 2, useAllLengths: false)) // NLP pipeline 3: bag of tri-character sequences with TF-IDF weighting. .Append(mlContext.Transforms.Text.TokenizeCharacters("MessageChars", "Message"))