Skip to content

Scrub n-gram hashing and n-gram #2898

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Mar 13, 2019
2 changes: 1 addition & 1 deletion docs/code/MlNetCookBook.md
Original file line number Diff line number Diff line change
Expand Up @@ -772,7 +772,7 @@ var pipeline =

// NLP pipeline 2: bag of bigrams, using hashes instead of dictionary indices.
.Append(new WordHashBagEstimator(mlContext, "BagOfBigrams","NormalizedMessage",
ngramLength: 2, allLengths: false))
ngramLength: 2, useAllLengths: false))

// NLP pipeline 3: bag of tri-character sequences with TF-IDF weighting.
.Append(mlContext.Transforms.Text.TokenizeCharacters("MessageChars", "Message"))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ public static void NgramTransform()
// 'e' - 1 '<?>' - 2 'd' - 1 '=' - 4 'R' - 1 'U' - 1 'D' - 2 'E' - 1 'u' - 1 ',' - 1 '2' - 1
// 'B' - 0 'e' - 6 's' - 3 't' - 6 '<?>' - 9 'g' - 2 'a' - 2 'm' - 2 'I' - 0 ''' - 0 'v' - 0 ...
// Preview of the CharsTwoGrams column obtained after processing the input.
var charsTwoGramColumn = transformedData_twochars.GetColumn<VBuffer<float>>(transformedData_onechars.Schema["CharsUnigrams"]);
var charsTwoGramColumn = transformedData_twochars.GetColumn<VBuffer<float>>(transformedData_twochars.Schema["CharsTwograms"]);
transformedData_twochars.Schema["CharsTwograms"].GetSlotNames(ref slotNames);
printHelper("CharsTwograms", charsTwoGramColumn, slotNames);

Expand Down
126 changes: 63 additions & 63 deletions src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs

Large diffs are not rendered by default.

115 changes: 43 additions & 72 deletions src/Microsoft.ML.Transforms/Text/NgramHashingTransformer.cs

Large diffs are not rendered by default.

125 changes: 66 additions & 59 deletions src/Microsoft.ML.Transforms/Text/NgramTransform.cs

Large diffs are not rendered by default.

196 changes: 40 additions & 156 deletions src/Microsoft.ML.Transforms/Text/TextCatalog.cs

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions src/Microsoft.ML.Transforms/Text/TextFeaturizingEstimator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ public WordBagEstimator.Options WordFeatureExtractor
extractor = new NgramExtractorTransform.NgramExtractorArguments();
extractor.NgramLength = _wordFeatureExtractor.NgramLength;
extractor.SkipLength = _wordFeatureExtractor.SkipLength;
extractor.AllLengths = _wordFeatureExtractor.AllLengths;
extractor.UseAllLengths = _wordFeatureExtractor.UseAllLengths;
extractor.MaxNumTerms = _wordFeatureExtractor.MaximumNgramsCount;
extractor.Weighting = _wordFeatureExtractor.Weighting;
}
Expand Down Expand Up @@ -173,7 +173,7 @@ public WordBagEstimator.Options CharFeatureExtractor
extractor = new NgramExtractorTransform.NgramExtractorArguments();
extractor.NgramLength = _charFeatureExtractor.NgramLength;
extractor.SkipLength = _charFeatureExtractor.SkipLength;
extractor.AllLengths = _charFeatureExtractor.AllLengths;
extractor.UseAllLengths = _charFeatureExtractor.UseAllLengths;
extractor.MaxNumTerms = _charFeatureExtractor.MaximumNgramsCount;
extractor.Weighting = _charFeatureExtractor.Weighting;
}
Expand All @@ -187,7 +187,7 @@ public WordBagEstimator.Options CharFeatureExtractor
public Options()
{
WordFeatureExtractor = new WordBagEstimator.Options();
CharFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 3, AllLengths = false };
CharFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 3, UseAllLengths = false };
}
}

Expand Down
29 changes: 15 additions & 14 deletions src/Microsoft.ML.Transforms/Text/WordBagTransform.cs
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,8 @@ internal sealed class Column : ManyToOneColumn

[Argument(ArgumentType.AtMostOnce,
HelpText = "Whether to include all ngram lengths up to " + nameof(NgramLength) + " or only " + nameof(NgramLength),
ShortName = "all")]
public bool? AllLengths;
Name = "AllLengths", ShortName = "all")]
public bool? UseAllLengths;

[Argument(ArgumentType.Multiple, HelpText = "Maximum number of ngrams to store in the dictionary", ShortName = "max")]
public int[] MaxNumTerms = null;
Expand All @@ -76,7 +76,7 @@ internal static Column Parse(string str)
internal bool TryUnparse(StringBuilder sb)
{
Contracts.AssertValue(sb);
if (NgramLength != null || SkipLength != null || AllLengths != null || Utils.Size(MaxNumTerms) > 0 ||
if (NgramLength != null || SkipLength != null || UseAllLengths != null || Utils.Size(MaxNumTerms) > 0 ||
Weighting != null)
{
return false;
Expand Down Expand Up @@ -123,7 +123,7 @@ internal static IDataTransform Create(IHostEnvironment env, Options options, IDa
MaxNumTerms = options.MaxNumTerms,
NgramLength = options.NgramLength,
SkipLength = options.SkipLength,
AllLengths = options.AllLengths,
UseAllLengths = options.UseAllLengths,
Weighting = options.Weighting,
Columns = new NgramExtractorTransform.Column[options.Columns.Length]
};
Expand All @@ -146,7 +146,7 @@ internal static IDataTransform Create(IHostEnvironment env, Options options, IDa
NgramLength = column.NgramLength,
SkipLength = column.SkipLength,
Weighting = column.Weighting,
AllLengths = column.AllLengths
UseAllLengths = column.UseAllLengths
};
}

Expand Down Expand Up @@ -175,8 +175,9 @@ internal sealed class Column : OneToOneColumn
public int? SkipLength;

[Argument(ArgumentType.AtMostOnce, HelpText =
"Whether to include all ngram lengths up to " + nameof(NgramLength) + " or only " + nameof(NgramLength), ShortName = "all")]
public bool? AllLengths;
"Whether to include all ngram lengths up to " + nameof(NgramLength) + " or only " + nameof(NgramLength),
Name = "AllLengths", ShortName = "all")]
public bool? UseAllLengths;

// REVIEW: This argument is actually confusing. If you set only one value, we will use this value for all ngrams respectively; for example,
// if we specify 3 ngrams we will have maxNumTerms * 3. It also picks the first value from this array to run the term transform, so if you specify
Expand All @@ -200,7 +201,7 @@ internal static Column Parse(string str)
internal bool TryUnparse(StringBuilder sb)
{
Contracts.AssertValue(sb);
if (NgramLength != null || SkipLength != null || AllLengths != null || Utils.Size(MaxNumTerms) > 0 ||
if (NgramLength != null || SkipLength != null || UseAllLengths != null || Utils.Size(MaxNumTerms) > 0 ||
Weighting != null)
{
return false;
Expand All @@ -225,11 +226,11 @@ internal abstract class ArgumentsBase

[Argument(ArgumentType.AtMostOnce,
HelpText = "Whether to include all ngram lengths up to " + nameof(NgramLength) + " or only " + nameof(NgramLength),
ShortName = "all")]
public bool AllLengths = NgramExtractingEstimator.Defaults.AllLengths;
Name = "AllLengths", ShortName = "all")]
public bool UseAllLengths = NgramExtractingEstimator.Defaults.UseAllLengths;

[Argument(ArgumentType.Multiple, HelpText = "Maximum number of ngrams to store in the dictionary", ShortName = "max")]
public int[] MaxNumTerms = new int[] { NgramExtractingEstimator.Defaults.MaxNumTerms };
public int[] MaxNumTerms = new int[] { NgramExtractingEstimator.Defaults.MaximumNgramsCount };

[Argument(ArgumentType.AtMostOnce, HelpText = "The weighting criteria")]
public NgramExtractingEstimator.WeightingCriteria Weighting = NgramExtractingEstimator.Defaults.Weighting;
Expand Down Expand Up @@ -315,7 +316,7 @@ internal static IDataTransform Create(IHostEnvironment env, Options options, IDa
termArgs =
new ValueToKeyMappingTransformer.Options()
{
MaxNumTerms = Utils.Size(options.MaxNumTerms) > 0 ? options.MaxNumTerms[0] : NgramExtractingEstimator.Defaults.MaxNumTerms,
MaxNumTerms = Utils.Size(options.MaxNumTerms) > 0 ? options.MaxNumTerms[0] : NgramExtractingEstimator.Defaults.MaximumNgramsCount,
Columns = new ValueToKeyMappingTransformer.Column[termCols.Count]
};
}
Expand Down Expand Up @@ -347,7 +348,7 @@ internal static IDataTransform Create(IHostEnvironment env, Options options, IDa
ngramColumns[iinfo] = new NgramExtractingEstimator.ColumnOptions(column.Name,
column.NgramLength ?? options.NgramLength,
column.SkipLength ?? options.SkipLength,
column.AllLengths ?? options.AllLengths,
column.UseAllLengths ?? options.UseAllLengths,
column.Weighting ?? options.Weighting,
column.MaxNumTerms ?? options.MaxNumTerms,
isTermCol[iinfo] ? column.Name : column.Source
Expand Down Expand Up @@ -380,7 +381,7 @@ internal static IDataTransform Create(IHostEnvironment env, NgramExtractorArgume
Columns = extractorCols,
NgramLength = extractorArgs.NgramLength,
SkipLength = extractorArgs.SkipLength,
AllLengths = extractorArgs.AllLengths,
UseAllLengths = extractorArgs.UseAllLengths,
MaxNumTerms = extractorArgs.MaxNumTerms,
Weighting = extractorArgs.Weighting
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ internal static IDataTransform Create(IHostEnvironment env, Options options, IDa
Ordered = column.Ordered,
MaximumNumberOfInverts = column.MaximumNumberOfInverts,
FriendlyNames = options.Columns[iinfo].Source,
AllLengths = column.AllLengths
UseAllLengths = column.UseAllLengths
};
}

Expand All @@ -138,7 +138,7 @@ internal static IDataTransform Create(IHostEnvironment env, Options options, IDa
var featurizeArgs =
new NgramHashExtractingTransformer.Options
{
AllLengths = options.AllLengths,
UseAllLengths = options.UseAllLengths,
NumberOfBits = options.NumberOfBits,
NgramLength = options.NgramLength,
SkipLength = options.SkipLength,
Expand Down Expand Up @@ -189,8 +189,8 @@ internal abstract class ColumnBase : ManyToOneColumn

[Argument(ArgumentType.AtMostOnce,
HelpText = "Whether to include all ngram lengths up to " + nameof(NgramLength) + " or only " + nameof(NgramLength),
ShortName = "all", SortOrder = 4)]
public bool? AllLengths;
Name = "AllLengths", ShortName = "all", SortOrder = 4)]
public bool? UseAllLengths;
}

internal sealed class Column : ColumnBase
Expand Down Expand Up @@ -279,8 +279,8 @@ internal abstract class ArgumentsBase

[Argument(ArgumentType.AtMostOnce,
HelpText = "Whether to include all ngram lengths up to ngramLength or only ngramLength",
ShortName = "all", SortOrder = 4)]
public bool AllLengths = true;
Name = "AllLengths", ShortName = "all", SortOrder = 4)]
public bool UseAllLengths = true;
}

internal static class DefaultArguments
Expand All @@ -291,7 +291,7 @@ internal static class DefaultArguments
public const uint Seed = 314489979;
public const bool Ordered = true;
public const int MaximumNumberOfInverts = 0;
public const bool AllLengths = true;
public const bool UseAllLengths = true;
}

[TlcModule.Component(Name = "NGramHash", FriendlyName = "NGram Hash Extractor Transform", Alias = "NGramHashExtractorTransform,NGramHashExtractor",
Expand Down Expand Up @@ -369,7 +369,7 @@ internal static IDataTransform Create(IHostEnvironment env, Options options, IDa
new NgramHashingEstimator.ColumnOptions(column.Name, tmpColNames[iinfo],
column.NgramLength ?? options.NgramLength,
column.SkipLength ?? options.SkipLength,
column.AllLengths ?? options.AllLengths,
column.UseAllLengths ?? options.UseAllLengths,
column.NumberOfBits ?? options.NumberOfBits,
column.Seed ?? options.Seed,
column.Ordered ?? options.Ordered,
Expand Down Expand Up @@ -439,7 +439,7 @@ internal static IDataTransform Create(NgramHashExtractorArguments extractorArgs,
MaximumNumberOfInverts = extractorArgs.MaximumNumberOfInverts,
Ordered = extractorArgs.Ordered,
Seed = extractorArgs.Seed,
AllLengths = extractorArgs.AllLengths
UseAllLengths = extractorArgs.UseAllLengths
};

return Create(h, options, input, termLoaderArgs);
Expand Down
Loading