From 15be23ce2cd48f8639a7b7acd75f2b25a1da04e7 Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Fri, 29 Mar 2019 12:38:32 -0700 Subject: [PATCH 1/6] Created sample for 'ApplyWordEmbedding' API. --- .../Text/ApplyCustomWordEmbedding.cs | 79 +++++++++++++++++++ .../Transforms/Text/ApplyWordEmbedding.cs | 68 ++++++++++++++++ .../Text/TextCatalog.cs | 4 +- 3 files changed, 149 insertions(+), 2 deletions(-) create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyCustomWordEmbedding.cs create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyWordEmbedding.cs diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyCustomWordEmbedding.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyCustomWordEmbedding.cs new file mode 100644 index 0000000000..297df0dffb --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyCustomWordEmbedding.cs @@ -0,0 +1,79 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.Text; + +namespace Microsoft.ML.Samples.Dynamic +{ + public static class ApplyCustomWordEmbedding + { + public static void Example() + { + // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, + // as well as the source of randomness. + var mlContext = new MLContext(); + + // Create an empty data sample list. The 'ApplyWordEmbedding' does not require training data as + // the estimator ('WordEmbeddingEstimator') created by 'ApplyWordEmbedding' API is not a trainable estimator. + // The empty list is only needed to pass input schema to the pipeline. + var emptySamples = new List(); + + // Convert sample list to an empty IDataView. 
+ var emptyDataView = mlContext.Data.LoadFromEnumerable(emptySamples); + + var pathToCustomModel = @".\custommodel.txt"; + using (StreamWriter file = new StreamWriter(pathToCustomModel, false)) + { + + file.WriteLine("This is custom file for 4 words with 3 dimensional word embedding vector. This first line in this file does not confirm to the ' ' pattern, and is therefore ignored"); + file.WriteLine("greate" + " " + string.Join(" ", 1.0f, 2.0f, 3.0f)); + file.WriteLine("product" + " " + string.Join(" ", -1.0f, -2.0f, -3.0f)); + file.WriteLine("like" + " " + string.Join(" ", -1f, 100.0f, -100f)); + file.WriteLine("buy" + " " + string.Join(" ", 0f, 0f, 20f)); + } + + // A pipeline for converting text into a 9-dimension word embedding vector using the custom word embedding model. + // The 'ApplyWordEmbedding' computes the minimum, average and maximum values for each token's embedding vector. + // Tokens in 'custommodel.txt' model are represented as 3-dimension vector. + // Therefore, the output is of 9-dimension [min, avg, max]. + // + // The 'ApplyWordEmbedding' API requires vector of text as input. + // The pipeline first normalizes and tokenizes text then applies word embedding transformation. + var textPipeline = mlContext.Transforms.Text.NormalizeText("Text") + .Append(mlContext.Transforms.Text.TokenizeIntoWords("Tokens", "Text")) + .Append(mlContext.Transforms.Text.ApplyWordEmbedding("Features", pathToCustomModel, "Tokens")); + + // Fit to data. + var textTransformer = textPipeline.Fit(emptyDataView); + + // Create the prediction engine to get the embedding vector from the input text/string. + var predictionEngine = mlContext.Model.CreatePredictionEngine(textTransformer); + + // Call the prediction API to convert the text into embedding vector. + var data = new TextData() { Text = "This is a greate product. I would like to buy it again." }; + var prediction = predictionEngine.Predict(data); + + // Print the length of the embedding vector. 
+ Console.WriteLine($"Number of Features: {prediction.Features.Length}"); + + // Print the embedding vector. + Console.Write("Features: "); + foreach (var f in prediction.Features) + Console.Write($"{f:F4} "); + + // Expected output: + // Number of Features: 9 + // Features: -1.0000 0.0000 -100.0000 0.0000 34.0000 -25.6667 1.0000 100.0000 20.0000 + } + + public class TextData + { + public string Text { get; set; } + } + + public class TransformedTextData : TextData + { + public float[] Features { get; set; } + } + } +} diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyWordEmbedding.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyWordEmbedding.cs new file mode 100644 index 0000000000..b7530be587 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyWordEmbedding.cs @@ -0,0 +1,68 @@ +using System; +using System.Collections.Generic; +using System.Text; + +namespace Microsoft.ML.Samples.Dynamic +{ + public static class ApplyWordEmbedding + { + public static void Example() + { + // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, + // as well as the source of randomness. + var mlContext = new MLContext(); + + // Create an empty data sample list. The 'ApplyWordEmbedding' does not require training data as + // the estimator ('WordEmbeddingEstimator') created by 'ApplyWordEmbedding' API is not a trainable estimator. + // The empty list is only needed to pass input schema to the pipeline. + var emptySamples = new List(); + + // Convert sample list to an empty IDataView. + var emptyDataView = mlContext.Data.LoadFromEnumerable(emptySamples); + + // A pipeline for converting text into a 150-dimension embedding vector using pretrained 'SentimentSpecificWordEmbedding' model. + // The 'ApplyWordEmbedding' computes the minimum, average and maximum values for each token's embedding vector. 
+ // Tokens in 'SentimentSpecificWordEmbedding' model are represented as 50-dimension vector. + // Therefore, the output is of 150-dimension [min, avg, max]. + // + // The 'ApplyWordEmbedding' API requires vector of text as input. + // The pipeline first normalizes and tokenizes text then applies word embedding transformation. + var textPipeline = mlContext.Transforms.Text.NormalizeText("Text") + .Append(mlContext.Transforms.Text.TokenizeIntoWords("Tokens", "Text")) + .Append(mlContext.Transforms.Text.ApplyWordEmbedding("Features", "Tokens", + Transforms.Text.WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding)); + + // Fit to data. + var textTransformer = textPipeline.Fit(emptyDataView); + + // Create the prediction engine to get the embedding vector from the input text/string. + var predictionEngine = mlContext.Model.CreatePredictionEngine(textTransformer); + + // Call the prediction API to convert the text into embedding vector. + var data = new TextData() { Text = "This is a greate product. I would like to buy it again." }; + var prediction = predictionEngine.Predict(data); + + // Print the length of the embedding vector. + Console.WriteLine($"Number of Features: {prediction.Features.Length}"); + + // Print the embedding vector. + Console.Write("Features: "); + foreach (var f in prediction.Features) + Console.Write($"{f:F4} "); + + // Expected output: + // Number of Features: 150 + // Features: -1.2489 0.2384 -1.3034 -0.9135 -3.4978 -0.1784 -1.3823 -0.3863 -2.5262 -0.8950 ... 
+ } + + public class TextData + { + public string Text { get; set; } + } + + public class TransformedTextData : TextData + { + public float[] Features { get; set; } + } + } +} diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs index 2be9e4dd7d..1d0ef69a33 100644 --- a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs +++ b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs @@ -125,7 +125,7 @@ public static TextNormalizingEstimator NormalizeText(this TransformsCatalog.Text /// /// /// /// /// @@ -143,7 +143,7 @@ public static WordEmbeddingEstimator ApplyWordEmbedding(this TransformsCatalog.T /// /// /// /// /// From 58e2d4be07734fcebb4b01034e31a6fda1109da9 Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Fri, 29 Mar 2019 18:24:20 -0700 Subject: [PATCH 2/6] Addressed reviewers' comments. --- .../Transforms/Text/ApplyCustomWordEmbedding.cs | 15 ++++++++------- .../Dynamic/Transforms/Text/ApplyWordEmbedding.cs | 2 +- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyCustomWordEmbedding.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyCustomWordEmbedding.cs index 297df0dffb..4ac4ab18da 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyCustomWordEmbedding.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyCustomWordEmbedding.cs @@ -21,15 +21,16 @@ public static void Example() // Convert sample list to an empty IDataView. var emptyDataView = mlContext.Data.LoadFromEnumerable(emptySamples); + // Write a custom 3-dimensional word embedding model with 4 words. + // Each line follows '<word> <float> <float> <float>' pattern. + // Lines that do not conform to the pattern are ignored. var pathToCustomModel = @".\custommodel.txt"; using (StreamWriter file = new StreamWriter(pathToCustomModel, false)) { - - file.WriteLine("This is custom file for 4 words with 3 dimensional word embedding vector. 
This first line in this file does not confirm to the ' ' pattern, and is therefore ignored"); - file.WriteLine("greate" + " " + string.Join(" ", 1.0f, 2.0f, 3.0f)); - file.WriteLine("product" + " " + string.Join(" ", -1.0f, -2.0f, -3.0f)); - file.WriteLine("like" + " " + string.Join(" ", -1f, 100.0f, -100f)); - file.WriteLine("buy" + " " + string.Join(" ", 0f, 0f, 20f)); + file.WriteLine("great 1.0 2.0 3.0"); + file.WriteLine("product -1.0 -2.0 -3.0"); + file.WriteLine("like -1 100.0 -100"); + file.WriteLine("buy 0 0 20"); } // A pipeline for converting text into a 9-dimension word embedding vector using the custom word embedding model. @@ -50,7 +51,7 @@ public static void Example() var predictionEngine = mlContext.Model.CreatePredictionEngine(textTransformer); // Call the prediction API to convert the text into embedding vector. - var data = new TextData() { Text = "This is a greate product. I would like to buy it again." }; + var data = new TextData() { Text = "This is a great product. I would like to buy it again." }; var prediction = predictionEngine.Predict(data); // Print the length of the embedding vector. diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyWordEmbedding.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyWordEmbedding.cs index b7530be587..0a58a2da07 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyWordEmbedding.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyWordEmbedding.cs @@ -39,7 +39,7 @@ public static void Example() var predictionEngine = mlContext.Model.CreatePredictionEngine(textTransformer); // Call the prediction API to convert the text into embedding vector. - var data = new TextData() { Text = "This is a greate product. I would like to buy it again." }; + var data = new TextData() { Text = "This is a great product. I would like to buy it again." }; var prediction = predictionEngine.Predict(data); // Print the length of the embedding vector. 
From a3ec5d3870a39c08206309dd50286bd717e2f2f2 Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Mon, 1 Apr 2019 10:39:12 -0700 Subject: [PATCH 3/6] Deleted old embedding sample. --- .../Dynamic/WordEmbeddingTransform.cs | 109 ------------------ 1 file changed, 109 deletions(-) delete mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/WordEmbeddingTransform.cs diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/WordEmbeddingTransform.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/WordEmbeddingTransform.cs deleted file mode 100644 index 1830b3e171..0000000000 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/WordEmbeddingTransform.cs +++ /dev/null @@ -1,109 +0,0 @@ -using System; -using System.Collections.Generic; -using System.IO; -using Microsoft.ML.Data; -using Microsoft.ML.Transforms.Text; -namespace Microsoft.ML.Samples.Dynamic -{ - public static class WordEmbeddingTransform - { - public static void Example() - { - // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, - // as well as the source of randomness. - var ml = new MLContext(); - - // Get a small dataset as an IEnumerable and convert to IDataView. - var data = SamplesUtils.DatasetUtils.GetSentimentData(); - var trainData = ml.Data.LoadFromEnumerable(data); - - // Preview of the data. - // - // Sentiment SentimentText - // true Best game I've ever played. - // false ==RUDE== Dude, 2. - // true Until the next game, this is the best Xbox game! - - // Pipeline which goes through SentimentText and normalizes it, tokenize it by words, and removes default stopwords. 
- var wordsPipeline = ml.Transforms.Text.NormalizeText("NormalizedText", "SentimentText", keepDiacritics: false, keepPunctuations: false) - .Append(ml.Transforms.Text.TokenizeIntoWords("Words", "NormalizedText")) - .Append(ml.Transforms.Text.RemoveDefaultStopWords("CleanWords", "Words")); - - var wordsDataview = wordsPipeline.Fit(trainData).Transform(trainData); - // Preview of the CleanWords column obtained after processing SentimentText. - var cleanWords = wordsDataview.GetColumn[]>(wordsDataview.Schema["CleanWords"]); - Console.WriteLine($" CleanWords column obtained post-transformation."); - foreach (var featureRow in cleanWords) - { - foreach (var value in featureRow) - Console.Write($"{value} "); - Console.WriteLine(""); - } - - Console.WriteLine("==================================================="); - // best game ive played - // == rude == dude 2 - // game best xbox game - - // Small helper to print wordembeddings in the console. - Action> printEmbeddings = (columnName, column) => - { - Console.WriteLine($"{columnName} column obtained post-transformation."); - foreach (var featureRow in column) - { - foreach (var value in featureRow) - Console.Write($"{value} "); - Console.WriteLine(""); - } - - Console.WriteLine("==================================================="); - }; - - // Let's apply pretrained word embedding model GloVeTwitter25D. - // 25D means each word mapped into 25 dimensional space, basically each word represented by 25 float values. - var gloveWordEmbedding = ml.Transforms.Text.ApplyWordEmbedding("GloveEmbeddings", "CleanWords", - WordEmbeddingEstimator.PretrainedModelKind.GloVeTwitter25D); - - // We also have option to apply custom word embedding models. - // Let's first create one. - // Format is following: - // First line is ignored if it is a header for your file. - // Each next line contains a single word followed by either a tab or space, and a list of floats also separated by a tab or space. 
- // Size of array of floats should be same for whole file. - var pathToCustomModel = @".\custommodel.txt"; - using (StreamWriter file = new StreamWriter(pathToCustomModel, false)) - { - - file.WriteLine("This is custom file for 4 words with 3 dimensional word embedding vector. This first line in this file does not conform to the ' ' pattern, and is therefore ignored"); - file.WriteLine("xbox" + " " + string.Join(" ", 1.0f, 2.0f, 3.0f)); - file.WriteLine("game" + " " + string.Join(" ", -1.0f, -2.0f, -3.0f)); - file.WriteLine("dude" + " " + string.Join(" ", -1f, 100.0f, -100f)); - file.WriteLine("best" + " " + string.Join(" ", 0f, 0f, 20f)); - } - // Now let's add custom embedding on top of same words. - var pipeline = gloveWordEmbedding.Append(ml.Transforms.Text.ApplyWordEmbedding("CustomEmbeddings", @".\custommodel.txt", "CleanWords")); - - // And do all required transformations. - var embeddingDataview = pipeline.Fit(wordsDataview).Transform(wordsDataview); - - var customEmbeddings = embeddingDataview.GetColumn(embeddingDataview.Schema["CustomEmbeddings"]); - printEmbeddings("GloveEmbeddings", customEmbeddings); - - // -1 -2 -3 -0.5 -1 8.5 0 0 20 - // -1 100 -100 -1 100 -100 -1 100 -100 - // 1 -2 -3 -0.25 -0.5 4.25 1 2 20 - // As you can see above we output 9 values for each line - // We go through each word present in row and extract 3 floats for it (if we can find that word in model). - // First 3 floats in output values represent minimum values (for each dimension) for extracted values. - // Second set of 3 floats in output represent average (for each dimension) for extracted values. - // Third set of 3 floats in output represent maximum values (for each dimension) for extracted values. - // Preview of GloveEmbeddings. 
- var gloveEmbeddings = embeddingDataview.GetColumn(embeddingDataview.Schema["GloveEmbeddings"]); - printEmbeddings("GloveEmbeddings", gloveEmbeddings); - // 0.23166 0.048825 0.26878 -1.3945 -0.86072 -0.026778 0.84075 -0.81987 -1.6681 -1.0658 -0.30596 0.50974 ... - //-0.094905 0.61109 0.52546 - 0.2516 0.054786 0.022661 1.1801 0.33329 - 0.85388 0.15471 - 0.5984 0.4364 ... - // 0.23166 0.048825 0.26878 - 1.3945 - 0.30044 - 0.16523 0.47251 0.10276 - 0.20978 - 0.68094 - 0.30596 ... - - } - } -} From 64ff94669b26a914d7e8feebe7f0663928d5660c Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Mon, 1 Apr 2019 11:29:04 -0700 Subject: [PATCH 4/6] Created samples for TokenizeIntoWords and RemoveStopWords APIs. --- .../Dynamic/StopWordRemoverTransform.cs | 82 ------------------- .../Transforms/Text/RemoveDefaultStopWords.cs | 59 +++++++++++++ .../Transforms/Text/RemoveStopWords.cs | 59 +++++++++++++ .../Transforms/Text/TokenizeIntoWords.cs | 57 +++++++++++++ .../Text/TextCatalog.cs | 17 +++- 5 files changed, 188 insertions(+), 86 deletions(-) delete mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/StopWordRemoverTransform.cs create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/RemoveDefaultStopWords.cs create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/RemoveStopWords.cs create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/TokenizeIntoWords.cs diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/StopWordRemoverTransform.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/StopWordRemoverTransform.cs deleted file mode 100644 index 134a3ca3a0..0000000000 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/StopWordRemoverTransform.cs +++ /dev/null @@ -1,82 +0,0 @@ -using System; -using System.Collections.Generic; -using Microsoft.ML.Data; - -namespace Microsoft.ML.Samples.Dynamic -{ - public static class StopWordRemoverTransform - { - public static void Example() - { - // Create a new ML context, for ML.NET 
operations. It can be used for exception tracking and logging, - // as well as the source of randomness. - var ml = new MLContext(); - - // Get a small dataset as an IEnumerable and convert to IDataView. - var data = SamplesUtils.DatasetUtils.GetSentimentData(); - var trainData = ml.Data.LoadFromEnumerable(data); - - // Preview of the data. - // - // Sentiment SentimentText - // true Best game I've ever played. - // false ==RUDE== Dude, 2. - // true Until the next game, this is the best Xbox game! - - // Let's take SentimentText column and break it into vector of words. - string originalTextColumnName = "Words"; - var words = ml.Transforms.Text.TokenizeIntoWords("SentimentText", originalTextColumnName); - - // Default pipeline will apply default stop word remover which is based on predifined set of words for certain languages. - var defaultPipeline = words.Append(ml.Transforms.Text.RemoveDefaultStopWords(originalTextColumnName, "DefaultRemover")); - - // Another pipeline, that removes words specified by user. We do case insensitive comparison for the stop words. - var customizedPipeline = words.Append(ml.Transforms.Text.RemoveStopWords(originalTextColumnName, "RemovedWords", - new[] { "XBOX" })); - - // The transformed data for both pipelines. - var transformedDataDefault = defaultPipeline.Fit(trainData).Transform(trainData); - var transformedDataCustomized = customizedPipeline.Fit(trainData).Transform(trainData); - - // Small helper to print the text inside the columns, in the console. - Action>>> printHelper = (columnName, column) => - { - Console.WriteLine($"{columnName} column obtained post-transformation."); - foreach (var featureRow in column) - { - foreach (var value in featureRow.GetValues()) - Console.Write($"{value}|"); - Console.WriteLine(""); - } - - Console.WriteLine("==================================================="); - }; - - // Preview the result of breaking string into array of words. 
- var originalText = transformedDataDefault.GetColumn>>(transformedDataDefault.Schema[originalTextColumnName]); - printHelper(originalTextColumnName, originalText); - // Best|game|I've|ever|played.| - // == RUDE ==| Dude,| 2 | - // Until | the | next | game,| this |is| the | best | Xbox | game!| - - // Preview the result of cleaning with default stop word remover. - var defaultRemoverData = transformedDataDefault.GetColumn>>(transformedDataDefault.Schema["DefaultRemover"]); - printHelper("DefaultRemover", defaultRemoverData); - // Best|game|I've|played.| - // == RUDE ==| Dude,| 2 | - // game,| best | Xbox | game!| - // As you can see "Until, the, next, this, is" was removed. - - - // Preview the result of cleaning with default customized stop word remover. - var customizeRemoverData = transformedDataCustomized.GetColumn>>(transformedDataCustomized.Schema["RemovedWords"]); - printHelper("RemovedWords", customizeRemoverData); - - // Best|game|I've|ever|played.| - // == RUDE ==| Dude,| 2 | - // Until | the | next | game,| this |is| the | best | game!| - //As you can see Xbox was removed. - - } - } -} diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/RemoveDefaultStopWords.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/RemoveDefaultStopWords.cs new file mode 100644 index 0000000000..6e5af5fb67 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/RemoveDefaultStopWords.cs @@ -0,0 +1,59 @@ +using System; +using System.Collections.Generic; +using Microsoft.ML.Transforms.Text; + +namespace Microsoft.ML.Samples.Dynamic +{ + public static class RemoveDefaultStopWords + { + public static void Example() + { + // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, + // as well as the source of randomness. + var mlContext = new MLContext(); + + // Create an empty data sample list. 
The 'RemoveDefaultStopWords' does not require training data as + // the estimator ('StopWordsRemovingEstimator') created by 'RemoveDefaultStopWords' API is not a trainable estimator. + // The empty list is only needed to pass input schema to the pipeline. + var emptySamples = new List(); + + // Convert sample list to an empty IDataView. + var emptyDataView = mlContext.Data.LoadFromEnumerable(emptySamples); + + // A pipeline for removing stop words from input text/string. + // The pipeline first tokenizes text into words then removes stop words. + var textPipeline = mlContext.Transforms.Text.TokenizeIntoWords("Words", "Text") + .Append(mlContext.Transforms.Text.RemoveDefaultStopWords("WordsWithoutStopWords", "Words", language: StopWordsRemovingEstimator.Language.English)); + + // Fit to data. + var textTransformer = textPipeline.Fit(emptyDataView); + + // Create the prediction engine to remove the stop words from the input text/string. + var predictionEngine = mlContext.Model.CreatePredictionEngine(textTransformer); + + // Call the prediction API to remove stop words. + var data = new TextData() { Text = "ML.NET's RemoveDefaultStopWords API removes stop words from the text/string. It requires the text/string to be tokenized beforehand." }; + var prediction = predictionEngine.Predict(data); + + // Print the length of the word vector after the stop words are removed. + Console.WriteLine($"Number of words: {prediction.WordsWithoutStopWords.Length}"); + + // Print the word vector without stop words. + Console.WriteLine($"\nWords without stop words: {string.Join(",", prediction.WordsWithoutStopWords)}"); + + // Expected output: + // Number of words: 11 + // Words without stop words: ML.NET's,RemoveDefaultStopWords,API,removes,stop,words,text/string.,requires,text/string,tokenized,beforehand. 
+ } + + public class TextData + { + public string Text { get; set; } + } + + public class TransformedTextData : TextData + { + public string[] WordsWithoutStopWords { get; set; } + } + } +} diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/RemoveStopWords.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/RemoveStopWords.cs new file mode 100644 index 0000000000..6b9a6a6a07 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/RemoveStopWords.cs @@ -0,0 +1,59 @@ +using System; +using System.Collections.Generic; +using Microsoft.ML.Transforms.Text; + +namespace Microsoft.ML.Samples.Dynamic +{ + public static class RemoveStopWords + { + public static void Example() + { + // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, + // as well as the source of randomness. + var mlContext = new MLContext(); + + // Create an empty data sample list. The 'RemoveStopWords' does not require training data as + // the estimator ('CustomStopWordsRemovingEstimator') created by 'RemoveStopWords' API is not a trainable estimator. + // The empty list is only needed to pass input schema to the pipeline. + var emptySamples = new List(); + + // Convert sample list to an empty IDataView. + var emptyDataView = mlContext.Data.LoadFromEnumerable(emptySamples); + + // A pipeline for removing stop words from input text/string. + // The pipeline first tokenizes text into words then removes stop words. + var textPipeline = mlContext.Transforms.Text.TokenizeIntoWords("Words", "Text") + .Append(mlContext.Transforms.Text.RemoveStopWords("WordsWithoutStopWords", "Words", stopwords: new[] { "a", "the", "from", "by" })); + + // Fit to data. + var textTransformer = textPipeline.Fit(emptyDataView); + + // Create the prediction engine to remove the stop words from the input text/string. 
+ var predictionEngine = mlContext.Model.CreatePredictionEngine(textTransformer); + + // Call the prediction API to remove stop words. + var data = new TextData() { Text = "ML.NET's RemoveStopWords API removes stop words from the text/string using a list of stop words provided by the user." }; + var prediction = predictionEngine.Predict(data); + + // Print the length of the word vector after the stop words are removed. + Console.WriteLine($"Number of words: {prediction.WordsWithoutStopWords.Length}"); + + // Print the word vector without stop words. + Console.WriteLine($"\nWords without stop words: {string.Join(",", prediction.WordsWithoutStopWords)}"); + + // Expected output: + // Number of words: 14 + // Words without stop words: ML.NET's,RemoveStopWords,API,removes,stop,words,text/string,using,list,of,stop,words,provided,user. + } + + public class TextData + { + public string Text { get; set; } + } + + public class TransformedTextData : TextData + { + public string[] WordsWithoutStopWords { get; set; } + } + } +} diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/TokenizeIntoWords.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/TokenizeIntoWords.cs new file mode 100644 index 0000000000..d3275e6482 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/TokenizeIntoWords.cs @@ -0,0 +1,57 @@ +using System; +using System.Collections.Generic; +using System.Text; + +namespace Microsoft.ML.Samples.Dynamic +{ + public static class TokenizeIntoWords + { + public static void Example() + { + // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, + // as well as the source of randomness. + var mlContext = new MLContext(); + + // Create an empty data sample list. The 'TokenizeIntoWords' does not require training data as + // the estimator ('WordTokenizingEstimator') created by 'TokenizeIntoWords' API is not a trainable estimator. 
+ // The empty list is only needed to pass input schema to the pipeline. + var emptySamples = new List(); + + // Convert sample list to an empty IDataView. + var emptyDataView = mlContext.Data.LoadFromEnumerable(emptySamples); + + // A pipeline for converting text into vector of words. + var textPipeline = mlContext.Transforms.Text.TokenizeIntoWords("Words", "Text", separators: new[] { ' ' }); + + // Fit to data. + var textTransformer = textPipeline.Fit(emptyDataView); + + // Create the prediction engine to get the word vector from the input text/string. + var predictionEngine = mlContext.Model.CreatePredictionEngine(textTransformer); + + // Call the prediction API to convert the text into words. + var data = new TextData() { Text = "ML.NET's TokenizeIntoWords API splits text/string into words using the list of characters provided as separators." }; + var prediction = predictionEngine.Predict(data); + + // Print the length of the word vector. + Console.WriteLine($"Number of words: {prediction.Words.Length}"); + + // Print the word vector. + Console.WriteLine($"\nWords: {string.Join(",", prediction.Words)}"); + + // Expected output: + // Number of words: 15 + // Words: ML.NET's,TokenizeIntoWords,API,splits,text/string,into,words,using,the,list,of,characters,provided,as,separators. + } + + public class TextData + { + public string Text { get; set; } + } + + public class TransformedTextData : TextData + { + public string[] Words { get; set; } + } + } +} diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs index 1d0ef69a33..f0294730e4 100644 --- a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs +++ b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs @@ -179,6 +179,13 @@ internal static WordEmbeddingEstimator ApplyWordEmbedding(this TransformsCatalog /// Name of the column resulting from the transformation of . /// Name of the column to transform. If set to , the value of the will be used as source. 
/// The separators to use (uses space character by default). + /// + /// + /// + /// + /// public static WordTokenizingEstimator TokenizeIntoWords(this TransformsCatalog.TextTransforms catalog, string outputColumnName, string inputColumnName = null, @@ -247,8 +254,9 @@ internal static NgramExtractingEstimator ProduceNgrams(this TransformsCatalog.Te /// /// /// + /// [!code-csharp[RemoveStopWords](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/RemoveDefaultStopWords.cs)] + /// ]]> + /// /// public static StopWordsRemovingEstimator RemoveDefaultStopWords(this TransformsCatalog.TextTransforms catalog, string outputColumnName, @@ -267,8 +275,9 @@ public static StopWordsRemovingEstimator RemoveDefaultStopWords(this TransformsC /// /// /// + /// [!code-csharp[RemoveStopWords](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/RemoveStopWords.cs)] + /// ]]> + /// /// public static CustomStopWordsRemovingEstimator RemoveStopWords(this TransformsCatalog.TextTransforms catalog, string outputColumnName, From 1bc241d37910e6e8c13cc760827b056a7bee5057 Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Mon, 1 Apr 2019 17:29:32 -0700 Subject: [PATCH 5/6] Addressed reviewers' comments. 
--- .../Dynamic/Transforms/Text/ApplyCustomWordEmbedding.cs | 2 +- .../Dynamic/Transforms/Text/ApplyWordEmbedding.cs | 2 +- .../Dynamic/Transforms/Text/NormalizeText.cs | 2 +- .../Dynamic/Transforms/Text/RemoveDefaultStopWords.cs | 2 +- .../Dynamic/Transforms/Text/RemoveStopWords.cs | 2 +- .../Dynamic/Transforms/Text/TokenizeIntoCharactersAsKeys.cs | 2 +- .../Dynamic/Transforms/Text/TokenizeIntoWords.cs | 4 +++- 7 files changed, 9 insertions(+), 7 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyCustomWordEmbedding.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyCustomWordEmbedding.cs index 4ac4ab18da..c3ee04dbbe 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyCustomWordEmbedding.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyCustomWordEmbedding.cs @@ -13,7 +13,7 @@ public static void Example() // as well as the source of randomness. var mlContext = new MLContext(); - // Create an empty data sample list. The 'ApplyWordEmbedding' does not require training data as + // Create an empty list as the dataset. The 'ApplyWordEmbedding' does not require training data as // the estimator ('WordEmbeddingEstimator') created by 'ApplyWordEmbedding' API is not a trainable estimator. // The empty list is only needed to pass input schema to the pipeline. var emptySamples = new List(); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyWordEmbedding.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyWordEmbedding.cs index 0a58a2da07..c1a62e21f5 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyWordEmbedding.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyWordEmbedding.cs @@ -12,7 +12,7 @@ public static void Example() // as well as the source of randomness. var mlContext = new MLContext(); - // Create an empty data sample list. 
The 'ApplyWordEmbedding' does not require training data as + // Create an empty list as the dataset. The 'ApplyWordEmbedding' does not require training data as // the estimator ('WordEmbeddingEstimator') created by 'ApplyWordEmbedding' API is not a trainable estimator. // The empty list is only needed to pass input schema to the pipeline. var emptySamples = new List(); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/NormalizeText.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/NormalizeText.cs index 920ea4353c..3fa83cf3ca 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/NormalizeText.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/NormalizeText.cs @@ -12,7 +12,7 @@ public static void Example() // as well as the source of randomness. var mlContext = new MLContext(); - // Create an empty data sample list. The 'NormalizeText' API does not require training data as + // Create an empty list as the dataset. The 'NormalizeText' API does not require training data as // the estimator ('TextNormalizingEstimator') created by 'NormalizeText' API is not a trainable estimator. // The empty list is only needed to pass input schema to the pipeline. var emptySamples = new List(); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/RemoveDefaultStopWords.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/RemoveDefaultStopWords.cs index 6e5af5fb67..a6bec688a3 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/RemoveDefaultStopWords.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/RemoveDefaultStopWords.cs @@ -12,7 +12,7 @@ public static void Example() // as well as the source of randomness. var mlContext = new MLContext(); - // Create an empty data sample list. The 'RemoveDefaultStopWords' does not require training data as + // Create an empty list as the dataset. 
The 'RemoveDefaultStopWords' does not require training data as // the estimator ('StopWordsRemovingEstimator') created by 'RemoveDefaultStopWords' API is not a trainable estimator. // The empty list is only needed to pass input schema to the pipeline. var emptySamples = new List(); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/RemoveStopWords.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/RemoveStopWords.cs index 6b9a6a6a07..501ab8ae68 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/RemoveStopWords.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/RemoveStopWords.cs @@ -12,7 +12,7 @@ public static void Example() // as well as the source of randomness. var mlContext = new MLContext(); - // Create an empty data sample list. The 'RemoveStopWords' does not require training data as + // Create an empty list as the dataset. The 'RemoveStopWords' does not require training data as // the estimator ('CustomStopWordsRemovingEstimator') created by 'RemoveStopWords' API is not a trainable estimator. // The empty list is only needed to pass input schema to the pipeline. var emptySamples = new List(); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/TokenizeIntoCharactersAsKeys.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/TokenizeIntoCharactersAsKeys.cs index 9c443b459a..922269d222 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/TokenizeIntoCharactersAsKeys.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/TokenizeIntoCharactersAsKeys.cs @@ -12,7 +12,7 @@ public static void Example() // as well as the source of randomness. var mlContext = new MLContext(); - // Create an empty data sample list. The 'TokenizeIntoCharactersAsKeys' does not require training data as + // Create an empty list as the dataset. 
The 'TokenizeIntoCharactersAsKeys' does not require training data as // the estimator ('TokenizingByCharactersEstimator') created by 'TokenizeIntoCharactersAsKeys' API is not a trainable estimator. // The empty list is only needed to pass input schema to the pipeline. var emptySamples = new List(); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/TokenizeIntoWords.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/TokenizeIntoWords.cs index d3275e6482..1f98bd5a21 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/TokenizeIntoWords.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/TokenizeIntoWords.cs @@ -12,7 +12,7 @@ public static void Example() // as well as the source of randomness. var mlContext = new MLContext(); - // Create an empty data sample list. The 'TokenizeIntoWords' does not require training data as + // Create an empty list as the dataset. The 'TokenizeIntoWords' does not require training data as // the estimator ('WordTokenizingEstimator') created by 'TokenizeIntoWords' API is not a trainable estimator. // The empty list is only needed to pass input schema to the pipeline. var emptySamples = new List(); @@ -21,6 +21,8 @@ public static void Example() var emptyDataView = mlContext.Data.LoadFromEnumerable(emptySamples); // A pipeline for converting text into vector of words. + // The following call to 'TokenizeIntoWords' tokenizes text/string into words using space as a separator. + // Space is also a default value for the 'separators' argument if it is not specified. var textPipeline = mlContext.Transforms.Text.TokenizeIntoWords("Words", "Text", separators: new[] { ' ' }); // Fit to data. From 672ade68572a9127cace1e825bb1fb35cf72e898 Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Tue, 2 Apr 2019 13:26:22 -0700 Subject: [PATCH 6/6] Addressed reviewers' comments. 
--- .../Dynamic/Transforms/Text/RemoveDefaultStopWords.cs | 3 ++- .../Dynamic/Transforms/Text/RemoveStopWords.cs | 3 ++- src/Microsoft.ML.Transforms/Text/TextCatalog.cs | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/RemoveDefaultStopWords.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/RemoveDefaultStopWords.cs index a6bec688a3..ddd5a56750 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/RemoveDefaultStopWords.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/RemoveDefaultStopWords.cs @@ -22,6 +22,7 @@ public static void Example() // A pipeline for removing stop words from input text/string. // The pipeline first tokenizes text into words then removes stop words. + // The 'RemoveDefaultStopWords' API ignores casing of the text/string, e.g., 'tHe' and 'the' are treated as the same stop word. var textPipeline = mlContext.Transforms.Text.TokenizeIntoWords("Words", "Text") .Append(mlContext.Transforms.Text.RemoveDefaultStopWords("WordsWithoutStopWords", "Words", language: StopWordsRemovingEstimator.Language.English)); @@ -32,7 +33,7 @@ public static void Example() var predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(textTransformer); // Call the prediction API to remove stop words. - var data = new TextData() { Text = "ML.NET's RemoveDefaultStopWords API removes stop words from the text/string. It requires the text/string to be tokenized beforehand." }; + var data = new TextData() { Text = "ML.NET's RemoveDefaultStopWords API removes stop words from tHe text/string. It requires the text/string to be tokenized beforehand." }; var prediction = predictionEngine.Predict(data); // Print the length of the word vector after the stop words removed.
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/RemoveStopWords.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/RemoveStopWords.cs index 501ab8ae68..a412920496 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/RemoveStopWords.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/RemoveStopWords.cs @@ -22,6 +22,7 @@ public static void Example() // A pipeline for removing stop words from input text/string. // The pipeline first tokenizes text into words then removes stop words. + // The 'RemoveStopWords' API ignores casing of the text/string, e.g., 'tHe' and 'the' are treated as the same stop word. var textPipeline = mlContext.Transforms.Text.TokenizeIntoWords("Words", "Text") .Append(mlContext.Transforms.Text.RemoveStopWords("WordsWithoutStopWords", "Words", stopwords: new[] { "a", "the", "from", "by" })); @@ -32,7 +33,7 @@ public static void Example() var predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(textTransformer); // Call the prediction API to remove stop words. - var data = new TextData() { Text = "ML.NET's RemoveStopWords API removes stop words from the text/string using a list of stop words provided by the user." }; + var data = new TextData() { Text = "ML.NET's RemoveStopWords API removes stop words from tHe text/string using a list of stop words provided by the user." }; var prediction = predictionEngine.Predict(data); // Print the length of the word vector after the stop words removed. diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs index 115ece0934..db412be77c 100644 --- a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs +++ b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs @@ -261,7 +261,7 @@ internal static NgramExtractingEstimator ProduceNgrams(this TransformsCatalog.Te /// /// /// /// ///