From c5601360e84e2aab68a42db7eceaf384d3aaf378 Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Mon, 11 Mar 2019 16:00:42 -0700 Subject: [PATCH 1/7] Polish char-level tokenizers --- .../Microsoft.ML.Samples/Dynamic/NgramExtraction.cs | 2 +- src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs | 2 +- src/Microsoft.ML.Transforms/Text/TextCatalog.cs | 10 ++++++---- .../Text/TokenizingByCharacters.cs | 12 ++++++++---- .../StaticPipeTests.cs | 2 +- .../Scenarios/Api/CookbookSamples/CookbookSamples.cs | 2 +- .../Api/CookbookSamples/CookbookSamplesDynamicApi.cs | 2 +- 7 files changed, 19 insertions(+), 13 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/NgramExtraction.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/NgramExtraction.cs index d1f36d3731..883c034d64 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/NgramExtraction.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/NgramExtraction.cs @@ -26,7 +26,7 @@ public static void NgramTransform() // A pipeline to tokenize text as characters and then combine them together into ngrams // The pipeline uses the default settings to featurize. - var charsPipeline = ml.Transforms.Text.TokenizeCharacters("Chars", "SentimentText", useMarkerCharacters: false); + var charsPipeline = ml.Transforms.Text.ProduceCharacterTokens("Chars", "SentimentText", useMarkerCharacters: false); var ngramOnePipeline = ml.Transforms.Text.ProduceNgrams("CharsUnigrams", "Chars", ngramLength: 1); var ngramTwpPipeline = ml.Transforms.Text.ProduceNgrams("CharsTwograms", "Chars"); var oneCharsPipeline = charsPipeline.Append(ngramOnePipeline); diff --git a/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs b/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs index c4ef323c97..668952e6d9 100644 --- a/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs +++ b/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs @@ -109,7 +109,7 @@ public override IEstimator Reconcile(IHostEnvironment env, /// /// The column to apply to. 
/// Whether to use marker characters to separate words. - public static VarVector> TokenizeIntoCharacters(this Scalar input, bool useMarkerCharacters = true) => new OutPipelineColumn(input, useMarkerCharacters); + public static VarVector> ProduceCharacterTokens(this Scalar input, bool useMarkerCharacters = true) => new OutPipelineColumn(input, useMarkerCharacters); } /// diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs index 3aa10978ac..c360c9228a 100644 --- a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs +++ b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs @@ -55,8 +55,9 @@ public static TextFeaturizingEstimator FeaturizeText(this TransformsCatalog.Text /// The text-related transform's catalog. /// Name of the column resulting from the transformation of . /// Name of the column to transform. If set to , the value of the will be used as source. - /// Whether to use marker characters to separate words. - public static TokenizingByCharactersEstimator TokenizeCharacters(this TransformsCatalog.TextTransforms catalog, + /// Whether to prepend a marker character, , to the beginning, + /// and append another marker character, , to the end of the output vector of characters. + public static TokenizingByCharactersEstimator ProduceCharacterTokens(this TransformsCatalog.TextTransforms catalog, string outputColumnName, string inputColumnName = null, bool useMarkerCharacters = CharTokenizingDefaults.UseMarkerCharacters) @@ -67,10 +68,11 @@ public static TokenizingByCharactersEstimator TokenizeCharacters(this Transforms /// Tokenize incoming text in input columns and output the tokens as output columns. /// /// The text-related transform's catalog. - /// Whether to use marker characters to separate words. + /// Whether to prepend a marker character, , to the beginning, + /// and append another marker character, , to the end of the output vector of characters. /// Pairs of columns to run the tokenization on. 
- public static TokenizingByCharactersEstimator TokenizeCharacters(this TransformsCatalog.TextTransforms catalog, + public static TokenizingByCharactersEstimator ProduceCharacterTokens(this TransformsCatalog.TextTransforms catalog, bool useMarkerCharacters = CharTokenizingDefaults.UseMarkerCharacters, params ColumnOptions[] columns) => new TokenizingByCharactersEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), useMarkerCharacters, ColumnOptions.ConvertToValueTuples(columns)); diff --git a/src/Microsoft.ML.Transforms/Text/TokenizingByCharacters.cs b/src/Microsoft.ML.Transforms/Text/TokenizingByCharacters.cs index 731f13990e..ac30aaf0aa 100644 --- a/src/Microsoft.ML.Transforms/Text/TokenizingByCharacters.cs +++ b/src/Microsoft.ML.Transforms/Text/TokenizingByCharacters.cs @@ -102,7 +102,8 @@ private static VersionInfo GetVersionInfo() /// Tokenize incoming text in input columns and output the tokens as output columns. /// /// The environment. - /// Whether to use marker characters to separate words. + /// Whether to prepend a marker character, , to the beginning, + /// and append another marker character, , to the end of the output vector of characters. /// Pairs of columns to run the tokenization on. internal TokenizingByCharactersTransformer(IHostEnvironment env, bool useMarkerCharacters = TokenizingByCharactersEstimator.Defaults.UseMarkerCharacters, params (string outputColumnName, string inputColumnName)[] columns) : @@ -114,7 +115,7 @@ internal TokenizingByCharactersTransformer(IHostEnvironment env, bool useMarkerC /// /// The names of the output and input column pairs on which the transformation is applied. 
/// - public IReadOnlyCollection<(string outputColumnName, string inputColumnName)> Columns => ColumnPairs.AsReadOnly(); + internal IReadOnlyCollection<(string outputColumnName, string inputColumnName)> Columns => ColumnPairs.AsReadOnly(); private protected override void CheckInputColumn(DataViewSchema inputSchema, int col, int srcCol) { @@ -555,6 +556,7 @@ internal static class Defaults { public const bool UseMarkerCharacters = true; } + internal static bool IsColumnTypeValid(DataViewType type) => type.GetItemType() is TextDataViewType; internal const string ExpectedColumnType = "Text"; @@ -565,7 +567,8 @@ internal static class Defaults /// The environment. /// Name of the column resulting from the transformation of . /// Name of the column to transform. If set to , the value of the will be used as source. - /// Whether to use marker characters to separate words. + /// Whether to prepend a marker character, , to the beginning, + /// and append another marker character, , to the end of the output vector of characters. internal TokenizingByCharactersEstimator(IHostEnvironment env, string outputColumnName, string inputColumnName = null, bool useMarkerCharacters = Defaults.UseMarkerCharacters) : this(env, useMarkerCharacters, new[] { (outputColumnName, inputColumnName ?? outputColumnName) }) @@ -576,7 +579,8 @@ internal TokenizingByCharactersEstimator(IHostEnvironment env, string outputColu /// Tokenize incoming text in input columns and output the tokens as output columns. /// /// The environment. - /// Whether to use marker characters to separate words. + /// Whether to prepend a marker character, , to the beginning, + /// and append another marker character, , to the end of the output vector of characters. /// Pairs of columns to run the tokenization on. 
internal TokenizingByCharactersEstimator(IHostEnvironment env, bool useMarkerCharacters = Defaults.UseMarkerCharacters, diff --git a/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs b/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs index 926e770187..296eabf5fa 100644 --- a/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs +++ b/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs @@ -520,7 +520,7 @@ public void Tokenize() .Append(r => ( r.label, tokens: r.text.TokenizeText(), - chars: r.text.TokenizeIntoCharacters())); + chars: r.text.ProduceCharacterTokens())); var tdata = est.Fit(data).Transform(data); var schema = tdata.AsDynamic.Schema; diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs index e2bacb6309..8a5920c781 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs @@ -467,7 +467,7 @@ private void TextFeaturizationOn(string dataPath) BagOfBigrams: r.Message.NormalizeText().ProduceHashedWordBags(ngramLength: 2, useAllLengths: false), // NLP pipeline 3: bag of tri-character sequences with TF-IDF weighting. - BagOfTrichar: r.Message.TokenizeIntoCharacters().ProduceNgrams(ngramLength: 3, weighting: NgramExtractingEstimator.WeightingCriteria.TfIdf), + BagOfTrichar: r.Message.ProduceCharacterTokens().ProduceNgrams(ngramLength: 3, weighting: NgramExtractingEstimator.WeightingCriteria.TfIdf), // NLP pipeline 4: word embeddings. // PretrainedModelKind.Sswe is used here for performance of the test. 
In a real diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs index 50c0439112..91405a7585 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs @@ -305,7 +305,7 @@ private void TextFeaturizationOn(string dataPath) ngramLength: 2, useAllLengths: false)) // NLP pipeline 3: bag of tri-character sequences with TF-IDF weighting. - .Append(mlContext.Transforms.Text.TokenizeCharacters("MessageChars", "Message")) + .Append(mlContext.Transforms.Text.ProduceCharacterTokens("MessageChars", "Message")) .Append(new NgramExtractingEstimator(mlContext, "BagOfTrichar", "MessageChars", ngramLength: 3, weighting: NgramExtractingEstimator.WeightingCriteria.TfIdf)) From ef85fa8f93ac1a98f9dff8231d3812ffa70e0a74 Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Mon, 11 Mar 2019 16:09:29 -0700 Subject: [PATCH 2/7] Polish word-level tokenizers --- .../Dynamic/KeyToValueValueToKey.cs | 4 ++-- .../Dynamic/StopWordRemoverTransform.cs | 2 +- .../Dynamic/TensorFlow/TextClassification.cs | 2 +- .../Dynamic/WordEmbeddingTransform.cs | 2 +- .../TextStaticExtensions.cs | 2 +- src/Microsoft.ML.Transforms/Text/TextCatalog.cs | 15 ++------------- .../Text/WordTokenizing.cs | 12 ++++++++++++ .../StaticPipeTests.cs | 8 ++++---- .../Api/CookbookSamples/CookbookSamples.cs | 2 +- .../CookbookSamples/CookbookSamplesDynamicApi.cs | 2 +- .../TensorflowTests.cs | 2 +- .../Transformers/CategoricalHashTests.cs | 2 +- .../Transformers/TextFeaturizerTests.cs | 2 +- .../Transformers/ValueMappingTests.cs | 2 +- .../Transformers/WordEmbeddingsTests.cs | 4 ++-- 15 files changed, 32 insertions(+), 31 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValueValueToKey.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValueValueToKey.cs 
index 47d3de37bc..951918ac04 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValueValueToKey.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValueValueToKey.cs @@ -30,7 +30,7 @@ public static void Example() // making use of default settings. string defaultColumnName = "DefaultKeys"; // REVIEW create through the catalog extension - var default_pipeline = ml.Transforms.Text.TokenizeWords("Review") + var default_pipeline = ml.Transforms.Text.ProduceWordTokens("Review") .Append(ml.Transforms.Conversion.MapValueToKey(defaultColumnName, "Review")); // Another pipeline, that customizes the advanced settings of the ValueToKeyMappingEstimator. @@ -38,7 +38,7 @@ public static void Example() // and condition the order in which they get evaluated by changing keyOrdinality from the default ByOccurence (order in which they get encountered) // to value/alphabetically. string customizedColumnName = "CustomizedKeys"; - var customized_pipeline = ml.Transforms.Text.TokenizeWords("Review") + var customized_pipeline = ml.Transforms.Text.ProduceWordTokens("Review") .Append(ml.Transforms.Conversion.MapValueToKey(customizedColumnName, "Review", maximumNumberOfKeys: 10, keyOrdinality: ValueToKeyMappingEstimator.KeyOrdinality.ByValue)); // The transformed data. diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/StopWordRemoverTransform.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/StopWordRemoverTransform.cs index 33a7fc5fd4..cdf53e09fb 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/StopWordRemoverTransform.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/StopWordRemoverTransform.cs @@ -25,7 +25,7 @@ public static void Example() // Let's take SentimentText column and break it into vector of words. 
string originalTextColumnName = "Words"; - var words = ml.Transforms.Text.TokenizeWords("SentimentText", originalTextColumnName); + var words = ml.Transforms.Text.ProduceWordTokens("SentimentText", originalTextColumnName); // Default pipeline will apply default stop word remover which is based on predifined set of words for certain languages. var defaultPipeline = words.Append(ml.Transforms.Text.RemoveDefaultStopWords(originalTextColumnName, "DefaultRemover")); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/TensorFlow/TextClassification.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/TensorFlow/TextClassification.cs index ef78a04e9a..9b61bbcb10 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/TensorFlow/TextClassification.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/TensorFlow/TextClassification.cs @@ -68,7 +68,7 @@ public static void Example() j.Features = features; }; - var engine = mlContext.Transforms.Text.TokenizeWords("TokenizedWords", "Sentiment_Text") + var engine = mlContext.Transforms.Text.ProduceWordTokens("TokenizedWords", "Sentiment_Text") .Append(mlContext.Transforms.Conversion.MapValue(lookupMap, "Words", "Ids", new ColumnOptions[] { ("VariableLenghtFeatures", "TokenizedWords") })) .Append(mlContext.Transforms.CustomMapping(ResizeFeaturesAction, "Resize")) .Append(tensorFlowModel.ScoreTensorFlowModel(new[] { "Prediction/Softmax" }, new[] { "Features" })) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/WordEmbeddingTransform.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/WordEmbeddingTransform.cs index 63428fcdf9..c14166e8ff 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/WordEmbeddingTransform.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/WordEmbeddingTransform.cs @@ -26,7 +26,7 @@ public static void Example() // Pipeline which goes through SentimentText and normalizes it, tokenize it by words, and removes default stopwords. 
var wordsPipeline = ml.Transforms.Text.NormalizeText("NormalizedText", "SentimentText", keepDiacritics: false, keepPunctuations: false) - .Append(ml.Transforms.Text.TokenizeWords("Words", "NormalizedText")) + .Append(ml.Transforms.Text.ProduceWordTokens("Words", "NormalizedText")) .Append(ml.Transforms.Text.RemoveDefaultStopWords("CleanWords", "Words")); var wordsDataview = wordsPipeline.Fit(trainData).Transform(trainData); diff --git a/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs b/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs index 668952e6d9..5bdb7b6c76 100644 --- a/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs +++ b/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs @@ -55,7 +55,7 @@ public override IEstimator Reconcile(IHostEnvironment env, /// /// The column to apply to. /// The separators to use (uses space character by default). - public static VarVector TokenizeText(this Scalar input, char[] separators = null) => new OutPipelineColumn(input, separators); + public static VarVector ProduceWordTokens(this Scalar input, char[] separators = null) => new OutPipelineColumn(input, separators); } /// diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs index c360c9228a..9e10fa180d 100644 --- a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs +++ b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs @@ -159,29 +159,18 @@ public static WordEmbeddingEstimator ApplyWordEmbedding(this TransformsCatalog.T /// Name of the column resulting from the transformation of . /// Name of the column to transform. If set to , the value of the will be used as source. /// The separators to use (uses space character by default). 
- public static WordTokenizingEstimator TokenizeWords(this TransformsCatalog.TextTransforms catalog, + public static WordTokenizingEstimator ProduceWordTokens(this TransformsCatalog.TextTransforms catalog, string outputColumnName, string inputColumnName = null, char[] separators = null) => new WordTokenizingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, inputColumnName, separators); - /// - /// Tokenizes incoming text in input columns and outputs the tokens using as separators. - /// - /// The text-related transform's catalog. - /// Pairs of columns to run the tokenization on. - /// The separators to use (uses space character by default). - public static WordTokenizingEstimator TokenizeWords(this TransformsCatalog.TextTransforms catalog, - (string outputColumnName, string inputColumnName)[] columns, - char[] separators = null) - => new WordTokenizingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), columns, separators); - /// /// Tokenizes incoming text in input columns, using per-column configurations, and outputs the tokens. /// /// The text-related transform's catalog. /// Pairs of columns to run the tokenization on. 
- public static WordTokenizingEstimator TokenizeWords(this TransformsCatalog.TextTransforms catalog, + public static WordTokenizingEstimator ProduceWordTokens(this TransformsCatalog.TextTransforms catalog, params WordTokenizingEstimator.ColumnOptions[] columns) => new WordTokenizingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), columns); diff --git a/src/Microsoft.ML.Transforms/Text/WordTokenizing.cs b/src/Microsoft.ML.Transforms/Text/WordTokenizing.cs index 6cc72d22c6..e08fd51273 100644 --- a/src/Microsoft.ML.Transforms/Text/WordTokenizing.cs +++ b/src/Microsoft.ML.Transforms/Text/WordTokenizing.cs @@ -441,9 +441,21 @@ internal WordTokenizingEstimator(IHostEnvironment env, params ColumnOptions[] co } public sealed class ColumnOptions { + /// + /// Output column name that will be used to store the tokenization result of column. + /// public readonly string Name; + /// + /// Input column name that will be tokenized into words. + /// public readonly string InputColumnName; + /// + /// Separator list used to tokenize input string. If not specified, space will be used. + /// public IReadOnlyList Separators => SeparatorsArray; + /// + /// State of . Since [] is mutable, it's not safe to directly expose this field to users.
+ /// internal readonly char[] SeparatorsArray; /// diff --git a/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs b/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs index 296eabf5fa..c366b83b60 100644 --- a/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs +++ b/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs @@ -519,7 +519,7 @@ public void Tokenize() var est = data.MakeNewEstimator() .Append(r => ( r.label, - tokens: r.text.TokenizeText(), + tokens: r.text.ProduceWordTokens(), chars: r.text.ProduceCharacterTokens())); var tdata = est.Fit(data).Transform(data); @@ -547,7 +547,7 @@ public void NormalizeTextAndRemoveStopWords() .Append(r => ( r.label, normalized_text: r.text.NormalizeText(), - words_without_stopwords: r.text.TokenizeText().RemoveStopwords())); + words_without_stopwords: r.text.ProduceWordTokens().RemoveStopwords())); var tdata = est.Fit(data).Transform(data); var schema = tdata.AsDynamic.Schema; @@ -604,8 +604,8 @@ public void Ngrams() var est = data.MakeNewEstimator() .Append(r => ( r.label, - ngrams: r.text.TokenizeText().ToKey().ProduceNgrams(), - ngramshash: r.text.TokenizeText().ToKey().ProduceHashedNgrams())); + ngrams: r.text.ProduceWordTokens().ToKey().ProduceNgrams(), + ngramshash: r.text.ProduceWordTokens().ToKey().ProduceHashedNgrams())); var tdata = est.Fit(data).Transform(data); var schema = tdata.AsDynamic.Schema; diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs index 8a5920c781..6ad76d4153 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs @@ -472,7 +472,7 @@ private void TextFeaturizationOn(string dataPath) // NLP pipeline 4: word embeddings. // PretrainedModelKind.Sswe is used here for performance of the test. 
In a real // scenario, it is best to use a different model for more accuracy. - Embeddings: r.Message.NormalizeText().TokenizeText().WordEmbeddings(WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding) + Embeddings: r.Message.NormalizeText().ProduceWordTokens().WordEmbeddings(WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding) )); // Let's train our pipeline, and then apply it to the same data. diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs index 91405a7585..edfb40d5fd 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs @@ -312,7 +312,7 @@ private void TextFeaturizationOn(string dataPath) // NLP pipeline 4: word embeddings. // PretrainedModelKind.Sswe is used here for performance of the test. In a real // scenario, it is best to use a different model for more accuracy. 
- .Append(mlContext.Transforms.Text.TokenizeWords("TokenizedMessage", "NormalizedMessage")) + .Append(mlContext.Transforms.Text.ProduceWordTokens("TokenizedMessage", "NormalizedMessage")) .Append(mlContext.Transforms.Text.ApplyWordEmbedding("Embeddings", "TokenizedMessage", WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding)); diff --git a/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs b/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs index e8043a0051..a42f1cea61 100644 --- a/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs +++ b/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs @@ -999,7 +999,7 @@ public void TensorFlowSentimentClassificationTest() // The first pipeline 'dataPipe' tokenzies the string into words and maps each word to an integer which is an index in the dictionary. // Then this integer vector is retrieved from the pipeline and resized to fixed length. // The second pipeline 'tfEnginePipe' takes the resized integer vector and passes it to TensoFlow and gets the classification scores. 
- var estimator = mlContext.Transforms.Text.TokenizeWords("TokenizedWords", "Sentiment_Text") + var estimator = mlContext.Transforms.Text.ProduceWordTokens("TokenizedWords", "Sentiment_Text") .Append(mlContext.Transforms.Conversion.MapValue(lookupMap, "Words", "Ids", new ColumnOptions[] { ("Features", "TokenizedWords") })); var dataPipe = estimator.Fit(dataView) .CreatePredictionEngine(mlContext); diff --git a/test/Microsoft.ML.Tests/Transformers/CategoricalHashTests.cs b/test/Microsoft.ML.Tests/Transformers/CategoricalHashTests.cs index e8cb49fe0a..f5dc6c59bc 100644 --- a/test/Microsoft.ML.Tests/Transformers/CategoricalHashTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/CategoricalHashTests.cs @@ -76,7 +76,7 @@ public void CategoricalHashStatic() row.ScalarString, row.VectorString, // Create a VarVector column - VarVectorString: row.ScalarString.TokenizeText())). + VarVectorString: row.ScalarString.ProduceWordTokens())). Append(row => ( A: row.ScalarString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashScalarOutputKind.Ind), B: row.VectorString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashVectorOutputKind.Ind), diff --git a/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs b/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs index e2575557ec..8ecccfe350 100644 --- a/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs @@ -143,7 +143,7 @@ public void TextNormalizationAndStopwordRemoverWorkout() text: ctx.LoadFloat(1)), hasHeader: true) .Load(sentimentDataPath); var est = ML.Transforms.Text.NormalizeText("text") - .Append(ML.Transforms.Text.TokenizeWords("words", "text")) + .Append(ML.Transforms.Text.ProduceWordTokens("words", "text")) .Append(ML.Transforms.Text.RemoveDefaultStopWords("NoDefaultStopwords", "words")) .Append(ML.Transforms.Text.RemoveStopWords("NoStopWords", "words", "xbox", "this", "is", "a", "the", 
"THAT", "bY")); diff --git a/test/Microsoft.ML.Tests/Transformers/ValueMappingTests.cs b/test/Microsoft.ML.Tests/Transformers/ValueMappingTests.cs index 66f5ed0db2..0bb6e7c0f9 100644 --- a/test/Microsoft.ML.Tests/Transformers/ValueMappingTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/ValueMappingTests.cs @@ -546,7 +546,7 @@ public void ValueMappingInputIsVectorWorkout() var keys = new List>() { "foo".AsMemory(), "bar".AsMemory(), "test".AsMemory(), "wahoo".AsMemory() }; var values = new List() { 1, 2, 3, 4 }; - var est = ML.Transforms.Text.TokenizeWords("TokenizeB", "B") + var est = ML.Transforms.Text.ProduceWordTokens("TokenizeB", "B") .Append(ML.Transforms.Conversion.MapValue(keys, values, new ColumnOptions[] { ("VecB", "TokenizeB") })); TestEstimatorCore(est, validFitInput: dataView, invalidInput: badDataView); } diff --git a/test/Microsoft.ML.Tests/Transformers/WordEmbeddingsTests.cs b/test/Microsoft.ML.Tests/Transformers/WordEmbeddingsTests.cs index 95843dc330..3e42c515c2 100644 --- a/test/Microsoft.ML.Tests/Transformers/WordEmbeddingsTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/WordEmbeddingsTests.cs @@ -35,7 +35,7 @@ public void TestWordEmbeddings() }).Load(GetDataPath(dataPath)); var est = ML.Transforms.Text.NormalizeText("NormalizedText", "SentimentText", keepDiacritics: false, keepPunctuations: false) - .Append(ML.Transforms.Text.TokenizeWords("Words", "NormalizedText")) + .Append(ML.Transforms.Text.ProduceWordTokens("Words", "NormalizedText")) .Append(ML.Transforms.Text.RemoveDefaultStopWords("CleanWords", "Words")); var words = est.Fit(data).Transform(data); @@ -70,7 +70,7 @@ public void TestCustomWordEmbeddings() }).Load(GetDataPath(dataPath)); var est = ML.Transforms.Text.NormalizeText("NormalizedText", "SentimentText", keepDiacritics: false, keepPunctuations: false) - .Append(ML.Transforms.Text.TokenizeWords("Words", "NormalizedText")) + .Append(ML.Transforms.Text.ProduceWordTokens("Words", "NormalizedText")) 
.Append(ML.Transforms.Text.RemoveDefaultStopWords("CleanWords", "Words")); var words = est.Fit(data).Transform(data); var pathToCustomModel = DeleteOutputPath("custommodel.txt"); From 13e55d280a841a78c84c7818d3622536958da90f Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Mon, 11 Mar 2019 17:10:50 -0700 Subject: [PATCH 3/7] Scrub stopword removers --- .../TextStaticExtensions.cs | 4 +-- .../Text/StopWordsRemovingTransformer.cs | 4 +-- .../Text/TextCatalog.cs | 36 ------------------- .../StaticPipeTests.cs | 2 +- 4 files changed, 5 insertions(+), 41 deletions(-) diff --git a/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs b/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs index 5bdb7b6c76..de5865cfcd 100644 --- a/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs +++ b/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs @@ -162,8 +162,8 @@ public override IEstimator Reconcile(IHostEnvironment env, /// Remove stop words from incoming text. /// /// The column to apply to. - /// Langauge of the input text. + /// Language of the input text. It will be used to retrieve a built-in stopword list. - public static VarVector RemoveStopwords(this VarVector input, + public static VarVector RemoveDefaultStopWords(this VarVector input, StopWordsRemovingEstimator.Language language = StopWordsRemovingEstimator.Language.English) => new OutPipelineColumn(input, language); } diff --git a/src/Microsoft.ML.Transforms/Text/StopWordsRemovingTransformer.cs b/src/Microsoft.ML.Transforms/Text/StopWordsRemovingTransformer.cs index 2a36dbae24..2244ff58bc 100644 --- a/src/Microsoft.ML.Transforms/Text/StopWordsRemovingTransformer.cs +++ b/src/Microsoft.ML.Transforms/Text/StopWordsRemovingTransformer.cs @@ -133,7 +133,7 @@ private static VersionInfo GetVersionInfo() /// /// Defines the behavior of the transformer.
/// - public IReadOnlyCollection Columns => _columns.AsReadOnly(); + internal IReadOnlyCollection Columns => _columns.AsReadOnly(); private readonly StopWordsRemovingEstimator.ColumnOptions[] _columns; private static volatile NormStr.Pool[] _stopWords; @@ -828,7 +828,7 @@ private void LoadStopWords(IChannel ch, ReadOnlyMemory stopwords, string d /// /// The names of the input output column pairs on which this transformation is applied. /// - public IReadOnlyCollection<(string outputColumnName, string inputColumnName)> Columns => ColumnPairs.AsReadOnly(); + internal IReadOnlyCollection<(string outputColumnName, string inputColumnName)> Columns => ColumnPairs.AsReadOnly(); /// /// Custom stopword remover removes specified list of stop words. diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs index 9e10fa180d..c78e83430a 100644 --- a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs +++ b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs @@ -234,24 +234,6 @@ public static StopWordsRemovingEstimator RemoveDefaultStopWords(this TransformsC StopWordsRemovingEstimator.Language language = StopWordsRemovingEstimator.Language.English) => new StopWordsRemovingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, inputColumnName, language); - /// - /// Removes stop words from incoming token streams in input columns - /// and outputs the token streams without stop words as output columns. - /// - /// The text-related transform's catalog. - /// Pairs of columns to remove stop words on. - /// Langauge of the input text columns . 
- /// - /// - /// - /// - public static StopWordsRemovingEstimator RemoveDefaultStopWords(this TransformsCatalog.TextTransforms catalog, - (string outputColumnName, string inputColumnName)[] columns, - StopWordsRemovingEstimator.Language language = StopWordsRemovingEstimator.Language.English) - => new StopWordsRemovingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), columns, language); - /// /// Removes stop words from incoming token streams in /// and outputs the token streams without stopwords as . @@ -272,24 +254,6 @@ public static CustomStopWordsRemovingEstimator RemoveStopWords(this TransformsCa params string[] stopwords) => new CustomStopWordsRemovingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, inputColumnName, stopwords); - /// - /// Removes stop words from incoming token streams in input columns - /// and outputs the token streams without stop words as output columns. - /// - /// The text-related transform's catalog. - /// Pairs of columns to remove stop words on. - /// Array of words to remove. 
- /// - /// - /// - /// - public static CustomStopWordsRemovingEstimator RemoveStopWords(this TransformsCatalog.TextTransforms catalog, - (string outputColumnName, string inputColumnName)[] columns, - params string[] stopwords) - => new CustomStopWordsRemovingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), columns, stopwords); - /// /// Produces a bag of counts of ngrams (sequences of consecutive words) in /// and outputs bag of word vector as diff --git a/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs b/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs index c366b83b60..a6f9fae895 100644 --- a/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs +++ b/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs @@ -547,7 +547,7 @@ public void NormalizeTextAndRemoveStopWords() .Append(r => ( r.label, normalized_text: r.text.NormalizeText(), - words_without_stopwords: r.text.ProduceWordTokens().RemoveStopwords())); + words_without_stopwords: r.text.ProduceWordTokens().RemoveDefaultStopWords())); var tdata = est.Fit(data).Transform(data); var schema = tdata.AsDynamic.Schema; From f8096c0c6a10fc088a63278744dd6e2ed3b5b9f6 Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Mon, 11 Mar 2019 17:15:18 -0700 Subject: [PATCH 4/7] Update cookbook --- docs/code/MlNetCookBook.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/code/MlNetCookBook.md b/docs/code/MlNetCookBook.md index 7a73a1178a..25edf3c1b2 100644 --- a/docs/code/MlNetCookBook.md +++ b/docs/code/MlNetCookBook.md @@ -775,12 +775,12 @@ var pipeline = ngramLength: 2, useAllLengths: false)) // NLP pipeline 3: bag of tri-character sequences with TF-IDF weighting. 
- .Append(mlContext.Transforms.Text.TokenizeCharacters("MessageChars", "Message")) + .Append(mlContext.Transforms.Text.ProduceCharacterTokens("MessageChars", "Message")) .Append(new NgramExtractingEstimator(mlContext, "BagOfTrichar", "MessageChars", ngramLength: 3, weighting: NgramExtractingEstimator.WeightingCriteria.TfIdf)) // NLP pipeline 4: word embeddings. - .Append(mlContext.Transforms.Text.TokenizeWords("TokenizedMessage", "NormalizedMessage")) + .Append(mlContext.Transforms.Text.ProduceWordTokens("TokenizedMessage", "NormalizedMessage")) .Append(mlContext.Transforms.Text.ExtractWordEmbeddings("Embeddings", "TokenizedMessage", WordEmbeddingsExtractingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding)); From 8e5c515aaae3bcbd9c485a22e3adb5e932e75c6c Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Mon, 11 Mar 2019 17:19:06 -0700 Subject: [PATCH 5/7] Address a comment --- src/Microsoft.ML.Transforms/Text/WordTokenizing.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Microsoft.ML.Transforms/Text/WordTokenizing.cs b/src/Microsoft.ML.Transforms/Text/WordTokenizing.cs index e08fd51273..b6278e824a 100644 --- a/src/Microsoft.ML.Transforms/Text/WordTokenizing.cs +++ b/src/Microsoft.ML.Transforms/Text/WordTokenizing.cs @@ -105,7 +105,7 @@ private static VersionInfo GetVersionInfo() private const string RegistrationName = "DelimitedTokenize"; - public IReadOnlyCollection Columns => _columns.AsReadOnly(); + internal IReadOnlyCollection Columns => _columns.AsReadOnly(); private readonly WordTokenizingEstimator.ColumnOptions[] _columns; private static (string name, string inputColumnName)[] GetColumnPairs(WordTokenizingEstimator.ColumnOptions[] columns) From 883784a1935d50f39418f733208ea0279f942e4a Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Tue, 12 Mar 2019 09:25:56 -0700 Subject: [PATCH 6/7] Rename ProduceCharacterTokens to ProduceCharactersAsKeys --- docs/code/MlNetCookBook.md | 2 +- 
docs/samples/Microsoft.ML.Samples/Dynamic/NgramExtraction.cs | 2 +- src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs | 2 +- src/Microsoft.ML.Transforms/Text/TextCatalog.cs | 4 ++-- test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs | 2 +- .../Scenarios/Api/CookbookSamples/CookbookSamples.cs | 2 +- .../Api/CookbookSamples/CookbookSamplesDynamicApi.cs | 2 +- 7 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/code/MlNetCookBook.md b/docs/code/MlNetCookBook.md index 25edf3c1b2..9a7c1edad3 100644 --- a/docs/code/MlNetCookBook.md +++ b/docs/code/MlNetCookBook.md @@ -775,7 +775,7 @@ var pipeline = ngramLength: 2, useAllLengths: false)) // NLP pipeline 3: bag of tri-character sequences with TF-IDF weighting. - .Append(mlContext.Transforms.Text.ProduceCharacterTokens("MessageChars", "Message")) + .Append(mlContext.Transforms.Text.ProduceCharactersAsKeys("MessageChars", "Message")) .Append(new NgramExtractingEstimator(mlContext, "BagOfTrichar", "MessageChars", ngramLength: 3, weighting: NgramExtractingEstimator.WeightingCriteria.TfIdf)) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/NgramExtraction.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/NgramExtraction.cs index 883c034d64..8bd0a463c1 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/NgramExtraction.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/NgramExtraction.cs @@ -26,7 +26,7 @@ public static void NgramTransform() // A pipeline to tokenize text as characters and then combine them together into ngrams // The pipeline uses the default settings to featurize. 
- var charsPipeline = ml.Transforms.Text.ProduceCharacterTokens("Chars", "SentimentText", useMarkerCharacters: false); + var charsPipeline = ml.Transforms.Text.ProduceCharactersAsKeys("Chars", "SentimentText", useMarkerCharacters: false); var ngramOnePipeline = ml.Transforms.Text.ProduceNgrams("CharsUnigrams", "Chars", ngramLength: 1); var ngramTwpPipeline = ml.Transforms.Text.ProduceNgrams("CharsTwograms", "Chars"); var oneCharsPipeline = charsPipeline.Append(ngramOnePipeline); diff --git a/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs b/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs index de5865cfcd..b71c8a2d71 100644 --- a/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs +++ b/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs @@ -109,7 +109,7 @@ public override IEstimator Reconcile(IHostEnvironment env, /// /// The column to apply to. /// Whether to use marker characters to separate words. - public static VarVector> ProduceCharacterTokens(this Scalar input, bool useMarkerCharacters = true) => new OutPipelineColumn(input, useMarkerCharacters); + public static VarVector> ProduceCharactersAsKeys(this Scalar input, bool useMarkerCharacters = true) => new OutPipelineColumn(input, useMarkerCharacters); } /// diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs index c78e83430a..2e46acf916 100644 --- a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs +++ b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs @@ -57,7 +57,7 @@ public static TextFeaturizingEstimator FeaturizeText(this TransformsCatalog.Text /// Name of the column to transform. If set to , the value of the will be used as source. /// Whether to prepend a marker character, , to the beginning, /// and append another marker character, , to the end of the output vector of characters. 
- public static TokenizingByCharactersEstimator ProduceCharacterTokens(this TransformsCatalog.TextTransforms catalog, + public static TokenizingByCharactersEstimator ProduceCharactersAsKeys(this TransformsCatalog.TextTransforms catalog, string outputColumnName, string inputColumnName = null, bool useMarkerCharacters = CharTokenizingDefaults.UseMarkerCharacters) @@ -72,7 +72,7 @@ public static TokenizingByCharactersEstimator ProduceCharacterTokens(this Transf /// and append another marker character, , to the end of the output vector of characters. /// Pairs of columns to run the tokenization on. - public static TokenizingByCharactersEstimator ProduceCharacterTokens(this TransformsCatalog.TextTransforms catalog, + public static TokenizingByCharactersEstimator ProduceCharactersAsKeys(this TransformsCatalog.TextTransforms catalog, bool useMarkerCharacters = CharTokenizingDefaults.UseMarkerCharacters, params ColumnOptions[] columns) => new TokenizingByCharactersEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), useMarkerCharacters, ColumnOptions.ConvertToValueTuples(columns)); diff --git a/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs b/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs index a6f9fae895..f0fb5844d1 100644 --- a/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs +++ b/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs @@ -520,7 +520,7 @@ public void Tokenize() .Append(r => ( r.label, tokens: r.text.ProduceWordTokens(), - chars: r.text.ProduceCharacterTokens())); + chars: r.text.ProduceCharactersAsKeys())); var tdata = est.Fit(data).Transform(data); var schema = tdata.AsDynamic.Schema; diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs index 6ad76d4153..96a2bb6267 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs +++ 
b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs @@ -467,7 +467,7 @@ private void TextFeaturizationOn(string dataPath) BagOfBigrams: r.Message.NormalizeText().ProduceHashedWordBags(ngramLength: 2, useAllLengths: false), // NLP pipeline 3: bag of tri-character sequences with TF-IDF weighting. - BagOfTrichar: r.Message.ProduceCharacterTokens().ProduceNgrams(ngramLength: 3, weighting: NgramExtractingEstimator.WeightingCriteria.TfIdf), + BagOfTrichar: r.Message.ProduceCharactersAsKeys().ProduceNgrams(ngramLength: 3, weighting: NgramExtractingEstimator.WeightingCriteria.TfIdf), // NLP pipeline 4: word embeddings. // PretrainedModelKind.Sswe is used here for performance of the test. In a real diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs index edfb40d5fd..0e9c962539 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs @@ -305,7 +305,7 @@ private void TextFeaturizationOn(string dataPath) ngramLength: 2, useAllLengths: false)) // NLP pipeline 3: bag of tri-character sequences with TF-IDF weighting. 
- .Append(mlContext.Transforms.Text.ProduceCharacterTokens("MessageChars", "Message")) + .Append(mlContext.Transforms.Text.ProduceCharactersAsKeys("MessageChars", "Message")) .Append(new NgramExtractingEstimator(mlContext, "BagOfTrichar", "MessageChars", ngramLength: 3, weighting: NgramExtractingEstimator.WeightingCriteria.TfIdf)) From d99e19298c413b706fff72b49a1d252855dbb028 Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Wed, 13 Mar 2019 11:36:56 -0700 Subject: [PATCH 7/7] Address comments --- .../Dynamic/KeyToValueValueToKey.cs | 4 ++-- .../Microsoft.ML.Samples/Dynamic/NgramExtraction.cs | 2 +- .../Dynamic/StopWordRemoverTransform.cs | 2 +- .../Dynamic/TensorFlow/TextClassification.cs | 2 +- .../Dynamic/WordEmbeddingTransform.cs | 2 +- src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs | 4 ++-- src/Microsoft.ML.Transforms/Text/TextCatalog.cs | 8 ++++---- .../StaticPipeTests.cs | 10 +++++----- .../Scenarios/Api/CookbookSamples/CookbookSamples.cs | 4 ++-- .../Api/CookbookSamples/CookbookSamplesDynamicApi.cs | 4 ++-- .../TensorflowTests.cs | 2 +- .../Transformers/CategoricalHashTests.cs | 2 +- .../Transformers/TextFeaturizerTests.cs | 2 +- .../Transformers/ValueMappingTests.cs | 2 +- .../Transformers/WordEmbeddingsTests.cs | 4 ++-- 15 files changed, 27 insertions(+), 27 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValueValueToKey.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValueValueToKey.cs index 951918ac04..c7f5636d47 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValueValueToKey.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValueValueToKey.cs @@ -30,7 +30,7 @@ public static void Example() // making use of default settings. 
string defaultColumnName = "DefaultKeys"; // REVIEW create through the catalog extension - var default_pipeline = ml.Transforms.Text.ProduceWordTokens("Review") + var default_pipeline = ml.Transforms.Text.TokenizeIntoWords("Review") .Append(ml.Transforms.Conversion.MapValueToKey(defaultColumnName, "Review")); // Another pipeline, that customizes the advanced settings of the ValueToKeyMappingEstimator. @@ -38,7 +38,7 @@ public static void Example() // and condition the order in which they get evaluated by changing keyOrdinality from the default ByOccurence (order in which they get encountered) // to value/alphabetically. string customizedColumnName = "CustomizedKeys"; - var customized_pipeline = ml.Transforms.Text.ProduceWordTokens("Review") + var customized_pipeline = ml.Transforms.Text.TokenizeIntoWords("Review") .Append(ml.Transforms.Conversion.MapValueToKey(customizedColumnName, "Review", maximumNumberOfKeys: 10, keyOrdinality: ValueToKeyMappingEstimator.KeyOrdinality.ByValue)); // The transformed data. diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/NgramExtraction.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/NgramExtraction.cs index 8bd0a463c1..45f89fd80b 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/NgramExtraction.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/NgramExtraction.cs @@ -26,7 +26,7 @@ public static void NgramTransform() // A pipeline to tokenize text as characters and then combine them together into ngrams // The pipeline uses the default settings to featurize. 
- var charsPipeline = ml.Transforms.Text.ProduceCharactersAsKeys("Chars", "SentimentText", useMarkerCharacters: false); + var charsPipeline = ml.Transforms.Text.TokenizeIntoCharactersAsKeys("Chars", "SentimentText", useMarkerCharacters: false); var ngramOnePipeline = ml.Transforms.Text.ProduceNgrams("CharsUnigrams", "Chars", ngramLength: 1); var ngramTwpPipeline = ml.Transforms.Text.ProduceNgrams("CharsTwograms", "Chars"); var oneCharsPipeline = charsPipeline.Append(ngramOnePipeline); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/StopWordRemoverTransform.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/StopWordRemoverTransform.cs index cdf53e09fb..134a3ca3a0 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/StopWordRemoverTransform.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/StopWordRemoverTransform.cs @@ -25,7 +25,7 @@ public static void Example() // Let's take SentimentText column and break it into vector of words. string originalTextColumnName = "Words"; - var words = ml.Transforms.Text.ProduceWordTokens("SentimentText", originalTextColumnName); + var words = ml.Transforms.Text.TokenizeIntoWords("SentimentText", originalTextColumnName); // Default pipeline will apply default stop word remover which is based on predifined set of words for certain languages. 
var defaultPipeline = words.Append(ml.Transforms.Text.RemoveDefaultStopWords(originalTextColumnName, "DefaultRemover")); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/TensorFlow/TextClassification.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/TensorFlow/TextClassification.cs index 9b61bbcb10..b2b5363a8d 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/TensorFlow/TextClassification.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/TensorFlow/TextClassification.cs @@ -68,7 +68,7 @@ public static void Example() j.Features = features; }; - var engine = mlContext.Transforms.Text.ProduceWordTokens("TokenizedWords", "Sentiment_Text") + var engine = mlContext.Transforms.Text.TokenizeIntoWords("TokenizedWords", "Sentiment_Text") .Append(mlContext.Transforms.Conversion.MapValue(lookupMap, "Words", "Ids", new ColumnOptions[] { ("VariableLenghtFeatures", "TokenizedWords") })) .Append(mlContext.Transforms.CustomMapping(ResizeFeaturesAction, "Resize")) .Append(tensorFlowModel.ScoreTensorFlowModel(new[] { "Prediction/Softmax" }, new[] { "Features" })) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/WordEmbeddingTransform.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/WordEmbeddingTransform.cs index c14166e8ff..1830b3e171 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/WordEmbeddingTransform.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/WordEmbeddingTransform.cs @@ -26,7 +26,7 @@ public static void Example() // Pipeline which goes through SentimentText and normalizes it, tokenize it by words, and removes default stopwords. 
var wordsPipeline = ml.Transforms.Text.NormalizeText("NormalizedText", "SentimentText", keepDiacritics: false, keepPunctuations: false) - .Append(ml.Transforms.Text.ProduceWordTokens("Words", "NormalizedText")) + .Append(ml.Transforms.Text.TokenizeIntoWords("Words", "NormalizedText")) .Append(ml.Transforms.Text.RemoveDefaultStopWords("CleanWords", "Words")); var wordsDataview = wordsPipeline.Fit(trainData).Transform(trainData); diff --git a/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs b/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs index b71c8a2d71..1424c849a9 100644 --- a/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs +++ b/src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs @@ -55,7 +55,7 @@ public override IEstimator Reconcile(IHostEnvironment env, /// /// The column to apply to. /// The separators to use (uses space character by default). - public static VarVector ProduceWordTokens(this Scalar input, char[] separators = null) => new OutPipelineColumn(input, separators); + public static VarVector TokenizeIntoWords(this Scalar input, char[] separators = null) => new OutPipelineColumn(input, separators); } /// @@ -109,7 +109,7 @@ public override IEstimator Reconcile(IHostEnvironment env, /// /// The column to apply to. /// Whether to use marker characters to separate words. 
- public static VarVector> ProduceCharactersAsKeys(this Scalar input, bool useMarkerCharacters = true) => new OutPipelineColumn(input, useMarkerCharacters); + public static VarVector> TokenizeIntoCharactersAsKeys(this Scalar input, bool useMarkerCharacters = true) => new OutPipelineColumn(input, useMarkerCharacters); } /// diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs index 2e46acf916..ef56dbf065 100644 --- a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs +++ b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs @@ -57,7 +57,7 @@ public static TextFeaturizingEstimator FeaturizeText(this TransformsCatalog.Text /// Name of the column to transform. If set to , the value of the will be used as source. /// Whether to prepend a marker character, , to the beginning, /// and append another marker character, , to the end of the output vector of characters. - public static TokenizingByCharactersEstimator ProduceCharactersAsKeys(this TransformsCatalog.TextTransforms catalog, + public static TokenizingByCharactersEstimator TokenizeIntoCharactersAsKeys(this TransformsCatalog.TextTransforms catalog, string outputColumnName, string inputColumnName = null, bool useMarkerCharacters = CharTokenizingDefaults.UseMarkerCharacters) @@ -72,7 +72,7 @@ public static TokenizingByCharactersEstimator ProduceCharactersAsKeys(this Trans /// and append another marker character, , to the end of the output vector of characters. /// Pairs of columns to run the tokenization on. 
- public static TokenizingByCharactersEstimator ProduceCharactersAsKeys(this TransformsCatalog.TextTransforms catalog, + public static TokenizingByCharactersEstimator TokenizeIntoCharactersAsKeys(this TransformsCatalog.TextTransforms catalog, bool useMarkerCharacters = CharTokenizingDefaults.UseMarkerCharacters, params ColumnOptions[] columns) => new TokenizingByCharactersEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), useMarkerCharacters, ColumnOptions.ConvertToValueTuples(columns)); @@ -159,7 +159,7 @@ public static WordEmbeddingEstimator ApplyWordEmbedding(this TransformsCatalog.T /// Name of the column resulting from the transformation of . /// Name of the column to transform. If set to , the value of the will be used as source. /// The separators to use (uses space character by default). - public static WordTokenizingEstimator ProduceWordTokens(this TransformsCatalog.TextTransforms catalog, + public static WordTokenizingEstimator TokenizeIntoWords(this TransformsCatalog.TextTransforms catalog, string outputColumnName, string inputColumnName = null, char[] separators = null) @@ -170,7 +170,7 @@ public static WordTokenizingEstimator ProduceWordTokens(this TransformsCatalog.T /// /// The text-related transform's catalog. /// Pairs of columns to run the tokenization on. 
- public static WordTokenizingEstimator ProduceWordTokens(this TransformsCatalog.TextTransforms catalog, + public static WordTokenizingEstimator TokenizeIntoWords(this TransformsCatalog.TextTransforms catalog, params WordTokenizingEstimator.ColumnOptions[] columns) => new WordTokenizingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), columns); diff --git a/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs b/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs index f0fb5844d1..660c83209b 100644 --- a/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs +++ b/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs @@ -519,8 +519,8 @@ public void Tokenize() var est = data.MakeNewEstimator() .Append(r => ( r.label, - tokens: r.text.ProduceWordTokens(), - chars: r.text.ProduceCharactersAsKeys())); + tokens: r.text.TokenizeIntoWords(), + chars: r.text.TokenizeIntoCharactersAsKeys())); var tdata = est.Fit(data).Transform(data); var schema = tdata.AsDynamic.Schema; @@ -547,7 +547,7 @@ public void NormalizeTextAndRemoveStopWords() .Append(r => ( r.label, normalized_text: r.text.NormalizeText(), - words_without_stopwords: r.text.ProduceWordTokens().RemoveDefaultStopWords())); + words_without_stopwords: r.text.TokenizeIntoWords().RemoveDefaultStopWords())); var tdata = est.Fit(data).Transform(data); var schema = tdata.AsDynamic.Schema; @@ -604,8 +604,8 @@ public void Ngrams() var est = data.MakeNewEstimator() .Append(r => ( r.label, - ngrams: r.text.ProduceWordTokens().ToKey().ProduceNgrams(), - ngramshash: r.text.ProduceWordTokens().ToKey().ProduceHashedNgrams())); + ngrams: r.text.TokenizeIntoWords().ToKey().ProduceNgrams(), + ngramshash: r.text.TokenizeIntoWords().ToKey().ProduceHashedNgrams())); var tdata = est.Fit(data).Transform(data); var schema = tdata.AsDynamic.Schema; diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs 
b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs index 96a2bb6267..cbcbbcf231 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs @@ -467,12 +467,12 @@ private void TextFeaturizationOn(string dataPath) BagOfBigrams: r.Message.NormalizeText().ProduceHashedWordBags(ngramLength: 2, useAllLengths: false), // NLP pipeline 3: bag of tri-character sequences with TF-IDF weighting. - BagOfTrichar: r.Message.ProduceCharactersAsKeys().ProduceNgrams(ngramLength: 3, weighting: NgramExtractingEstimator.WeightingCriteria.TfIdf), + BagOfTrichar: r.Message.TokenizeIntoCharactersAsKeys().ProduceNgrams(ngramLength: 3, weighting: NgramExtractingEstimator.WeightingCriteria.TfIdf), // NLP pipeline 4: word embeddings. // PretrainedModelKind.Sswe is used here for performance of the test. In a real // scenario, it is best to use a different model for more accuracy. - Embeddings: r.Message.NormalizeText().ProduceWordTokens().WordEmbeddings(WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding) + Embeddings: r.Message.NormalizeText().TokenizeIntoWords().WordEmbeddings(WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding) )); // Let's train our pipeline, and then apply it to the same data. diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs index 0e9c962539..6fb2202692 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs @@ -305,14 +305,14 @@ private void TextFeaturizationOn(string dataPath) ngramLength: 2, useAllLengths: false)) // NLP pipeline 3: bag of tri-character sequences with TF-IDF weighting. 
- .Append(mlContext.Transforms.Text.ProduceCharactersAsKeys("MessageChars", "Message")) + .Append(mlContext.Transforms.Text.TokenizeIntoCharactersAsKeys("MessageChars", "Message")) .Append(new NgramExtractingEstimator(mlContext, "BagOfTrichar", "MessageChars", ngramLength: 3, weighting: NgramExtractingEstimator.WeightingCriteria.TfIdf)) // NLP pipeline 4: word embeddings. // PretrainedModelKind.Sswe is used here for performance of the test. In a real // scenario, it is best to use a different model for more accuracy. - .Append(mlContext.Transforms.Text.ProduceWordTokens("TokenizedMessage", "NormalizedMessage")) + .Append(mlContext.Transforms.Text.TokenizeIntoWords("TokenizedMessage", "NormalizedMessage")) .Append(mlContext.Transforms.Text.ApplyWordEmbedding("Embeddings", "TokenizedMessage", WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding)); diff --git a/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs b/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs index a42f1cea61..b6fecd7a8d 100644 --- a/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs +++ b/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs @@ -999,7 +999,7 @@ public void TensorFlowSentimentClassificationTest() // The first pipeline 'dataPipe' tokenzies the string into words and maps each word to an integer which is an index in the dictionary. // Then this integer vector is retrieved from the pipeline and resized to fixed length. // The second pipeline 'tfEnginePipe' takes the resized integer vector and passes it to TensoFlow and gets the classification scores. 
- var estimator = mlContext.Transforms.Text.ProduceWordTokens("TokenizedWords", "Sentiment_Text") + var estimator = mlContext.Transforms.Text.TokenizeIntoWords("TokenizedWords", "Sentiment_Text") .Append(mlContext.Transforms.Conversion.MapValue(lookupMap, "Words", "Ids", new ColumnOptions[] { ("Features", "TokenizedWords") })); var dataPipe = estimator.Fit(dataView) .CreatePredictionEngine(mlContext); diff --git a/test/Microsoft.ML.Tests/Transformers/CategoricalHashTests.cs b/test/Microsoft.ML.Tests/Transformers/CategoricalHashTests.cs index f5dc6c59bc..58940f19c2 100644 --- a/test/Microsoft.ML.Tests/Transformers/CategoricalHashTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/CategoricalHashTests.cs @@ -76,7 +76,7 @@ public void CategoricalHashStatic() row.ScalarString, row.VectorString, // Create a VarVector column - VarVectorString: row.ScalarString.ProduceWordTokens())). + VarVectorString: row.ScalarString.TokenizeIntoWords())). Append(row => ( A: row.ScalarString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashScalarOutputKind.Ind), B: row.VectorString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashVectorOutputKind.Ind), diff --git a/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs b/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs index 8ecccfe350..4445cbae7f 100644 --- a/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs @@ -143,7 +143,7 @@ public void TextNormalizationAndStopwordRemoverWorkout() text: ctx.LoadFloat(1)), hasHeader: true) .Load(sentimentDataPath); var est = ML.Transforms.Text.NormalizeText("text") - .Append(ML.Transforms.Text.ProduceWordTokens("words", "text")) + .Append(ML.Transforms.Text.TokenizeIntoWords("words", "text")) .Append(ML.Transforms.Text.RemoveDefaultStopWords("NoDefaultStopwords", "words")) .Append(ML.Transforms.Text.RemoveStopWords("NoStopWords", "words", "xbox", "this", "is", 
"a", "the", "THAT", "bY")); diff --git a/test/Microsoft.ML.Tests/Transformers/ValueMappingTests.cs b/test/Microsoft.ML.Tests/Transformers/ValueMappingTests.cs index 0bb6e7c0f9..c98617c8a8 100644 --- a/test/Microsoft.ML.Tests/Transformers/ValueMappingTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/ValueMappingTests.cs @@ -546,7 +546,7 @@ public void ValueMappingInputIsVectorWorkout() var keys = new List>() { "foo".AsMemory(), "bar".AsMemory(), "test".AsMemory(), "wahoo".AsMemory() }; var values = new List() { 1, 2, 3, 4 }; - var est = ML.Transforms.Text.ProduceWordTokens("TokenizeB", "B") + var est = ML.Transforms.Text.TokenizeIntoWords("TokenizeB", "B") .Append(ML.Transforms.Conversion.MapValue(keys, values, new ColumnOptions[] { ("VecB", "TokenizeB") })); TestEstimatorCore(est, validFitInput: dataView, invalidInput: badDataView); } diff --git a/test/Microsoft.ML.Tests/Transformers/WordEmbeddingsTests.cs b/test/Microsoft.ML.Tests/Transformers/WordEmbeddingsTests.cs index 3e42c515c2..dc105520e9 100644 --- a/test/Microsoft.ML.Tests/Transformers/WordEmbeddingsTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/WordEmbeddingsTests.cs @@ -35,7 +35,7 @@ public void TestWordEmbeddings() }).Load(GetDataPath(dataPath)); var est = ML.Transforms.Text.NormalizeText("NormalizedText", "SentimentText", keepDiacritics: false, keepPunctuations: false) - .Append(ML.Transforms.Text.ProduceWordTokens("Words", "NormalizedText")) + .Append(ML.Transforms.Text.TokenizeIntoWords("Words", "NormalizedText")) .Append(ML.Transforms.Text.RemoveDefaultStopWords("CleanWords", "Words")); var words = est.Fit(data).Transform(data); @@ -70,7 +70,7 @@ public void TestCustomWordEmbeddings() }).Load(GetDataPath(dataPath)); var est = ML.Transforms.Text.NormalizeText("NormalizedText", "SentimentText", keepDiacritics: false, keepPunctuations: false) - .Append(ML.Transforms.Text.ProduceWordTokens("Words", "NormalizedText")) + .Append(ML.Transforms.Text.TokenizeIntoWords("Words", "NormalizedText")) 
.Append(ML.Transforms.Text.RemoveDefaultStopWords("CleanWords", "Words")); var words = est.Fit(data).Transform(data); var pathToCustomModel = DeleteOutputPath("custommodel.txt");