From 9471b5d8fbd4731cd5551400b822dc1ad5c57925 Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Tue, 2 Apr 2019 15:44:11 -0700 Subject: [PATCH 1/6] Created samples for 'ProduceWordBags' and 'ProduceHashedWordBags' API. --- .../Transforms/Text/ProduceHashedWordBags.cs | 69 +++++++++++++++ .../Transforms/Text/ProduceWordBags.cs | 87 +++++++++++++++++++ 2 files changed, 156 insertions(+) create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedWordBags.cs create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceWordBags.cs diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedWordBags.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedWordBags.cs new file mode 100644 index 0000000000..750b246210 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedWordBags.cs @@ -0,0 +1,69 @@ +using System; +using System.Collections.Generic; +using System.Text; +using Microsoft.ML.Data; +using Microsoft.ML.Transforms.Text; + +namespace Microsoft.ML.Samples.Dynamic +{ + public static class ProduceHashedWordBags + { + public static void Example() + { + // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, + // as well as the source of randomness. + var mlContext = new MLContext(); + + // Create a small dataset as an IEnumerable. + var samples = new List() + { + new TextData(){ Text = "This is an example to compute bag-of-word features using hashing." }, + new TextData(){ Text = "ML.NET's ProduceHashedWordBags API produces count of Ngrams and hashes it as an index into a vector of given bit length." }, + new TextData(){ Text = "It does so by first tokenizing text/string into words/tokens then " }, + new TextData(){ Text = "computing Ngram and hash them to the index given by hash value." 
}, + new TextData(){ Text = "The hashing schem reduces the size of the output feature vector" }, + new TextData(){ Text = "which is useful in case when number of Ngrams is very large." }, + }; + + // Convert training data to IDataView. + var dataview = mlContext.Data.LoadFromEnumerable(samples); + + // A pipeline for converting text into numeric bag-of-word features using hashing. + // The following call to 'ProduceHashedWordBags' implicitly tokenizes the text/string into words/tokens. + // Please note that the length of the output feature vector depends on the 'numberOfBits' settings. + var textPipeline = mlContext.Transforms.Text.ProduceHashedWordBags("BagOfWordFeatures", "Text", + numberOfBits: 8, ngramLength: 3, useAllLengths: false); + + // Fit to data. + var textTransformer = textPipeline.Fit(dataview); + + // Create the prediction engine to get the bag-of-word features extracted from the text. + var predictionEngine = mlContext.Model.CreatePredictionEngine(textTransformer); + + // Convert the text into numeric features. + var prediction = predictionEngine.Predict(samples[0]); + + // Print the length of the feature vector. + Console.WriteLine($"Number of Features: {prediction.BagOfWordFeatures.Length}"); + + // Print the first 10 feature values. 
+ Console.Write("Features: "); + for (int i = 0; i < 10; i++) + Console.Write($"{prediction.BagOfWordFeatures[i]:F4} "); + + // Expected output: + // Number of Features: 256 + // Features: 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 + } + + public class TextData + { + public string Text { get; set; } + } + + public class TransformedTextData : TextData + { + public float[] BagOfWordFeatures { get; set; } + } + } +} diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceWordBags.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceWordBags.cs new file mode 100644 index 0000000000..2548b935e5 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceWordBags.cs @@ -0,0 +1,87 @@ +using System; +using System.Collections.Generic; +using System.Text; +using Microsoft.ML.Data; +using Microsoft.ML.Transforms.Text; + +namespace Microsoft.ML.Samples.Dynamic +{ + public static class ProduceWordBags + { + public static void Example() + { + // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, + // as well as the source of randomness. + var mlContext = new MLContext(); + + // Create a small dataset as an IEnumerable. + var samples = new List() + { + new TextData(){ Text = "This is an example to compute bag-of-word features." }, + new TextData(){ Text = "ML.NET's ProduceWordBags API produces bag-of-word features from input text." }, + new TextData(){ Text = "It does so by first tokenizing text/string into words/tokens then " }, + new TextData(){ Text = "computing Ngram and their neumeric values." }, + new TextData(){ Text = "Each position in the output vector corresponds to a particular Ngram." 
}, + new TextData(){ Text = "The value at each position corresponds to," }, + new TextData(){ Text = "the number of times Ngram occured in the data (Tf), or" }, + new TextData(){ Text = "the inverse of the number of documents contain the Ngram (Idf), or." }, + new TextData(){ Text = "or compute both and multipy together (Tf-Idf)." }, + }; + + // Convert training data to IDataView. + var dataview = mlContext.Data.LoadFromEnumerable(samples); + + // A pipeline for converting text into numeric bag-of-word features. + // The following call to 'ProduceWordBags' implicitly tokenizes the text/string into words/tokens. + // Please note that the length of the output feature vector depends on the Ngram settings. + var textPipeline = mlContext.Transforms.Text.ProduceWordBags("BagOfWordFeatures", "Text", + ngramLength: 3, useAllLengths: false, weighting: NgramExtractingEstimator.WeightingCriteria.Tf); + + // Fit to data. + var textTransformer = textPipeline.Fit(dataview); + var transformedDataView = textTransformer.Transform(dataview); + + // Create the prediction engine to get the bag-of-word features extracted from the text. + var predictionEngine = mlContext.Model.CreatePredictionEngine(textTransformer); + + // Convert the text into numeric features. + var prediction = predictionEngine.Predict(samples[0]); + + // Print the length of the feature vector. + Console.WriteLine($"Number of Features: {prediction.BagOfWordFeatures.Length}"); + + // Preview of the produced Ngrams. + VBuffer> slotNames = default; + transformedDataView.Schema["BagOfWordFeatures"].GetSlotNames(ref slotNames); + var BagOfWordFeaturesColumn = transformedDataView.GetColumn>(transformedDataView.Schema["BagOfWordFeatures"]); + var slots = slotNames.GetValues(); + Console.Write("Ngrams: "); + foreach (var featureRow in BagOfWordFeaturesColumn) + { + foreach (var item in featureRow.Items()) + Console.Write($"{slots[item.Key]} "); + Console.WriteLine(); + } + + // Print the first 10 feature values. 
+ Console.Write("Features: "); + for (int i = 0; i < 10; i++) + Console.Write($"{prediction.BagOfWordFeatures[i]:F4} "); + + // Expected output: + // Number of Features: 62 + // Ngrams: This|is|an is|an|example an|example|to example|to|compute to|compute|bag-of-word compute|bag-of-word|features. ML.NET's|ProduceWordBags|API ProduceWordBags|API|produces API|produces|bag-of-word produces|bag-of-word|features ... + // Features: 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 0.0000 0.0000 0.0000 0.0000 ... + } + + public class TextData + { + public string Text { get; set; } + } + + public class TransformedTextData : TextData + { + public float[] BagOfWordFeatures { get; set; } + } + } +} From 1b2dacaef7df692701e4ad91c6eddc66f84736b1 Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Wed, 3 Apr 2019 10:51:51 -0700 Subject: [PATCH 2/6] Updated comments! --- .../Dynamic/Transforms/Text/ProduceWordBags.cs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceWordBags.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceWordBags.cs index 2548b935e5..03661bd95b 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceWordBags.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceWordBags.cs @@ -51,6 +51,8 @@ public static void Example() Console.WriteLine($"Number of Features: {prediction.BagOfWordFeatures.Length}"); // Preview of the produced Ngrams. + // Get the slot names from the column's metadata. + // If the column is a vector column the slot names corresponds to the names associated with each position in the vector. 
VBuffer> slotNames = default; transformedDataView.Schema["BagOfWordFeatures"].GetSlotNames(ref slotNames); var BagOfWordFeaturesColumn = transformedDataView.GetColumn>(transformedDataView.Schema["BagOfWordFeatures"]); From 202f05140e83159720ce511df2b8fa73b2d11c1e Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Wed, 3 Apr 2019 14:15:42 -0700 Subject: [PATCH 3/6] Addressed reviewers' comments. --- .../Transforms/Text/ProduceHashedWordBags.cs | 14 +++++++------- .../Dynamic/Transforms/Text/ProduceWordBags.cs | 16 ++++++++-------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedWordBags.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedWordBags.cs index 750b246210..56e2b0315f 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedWordBags.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedWordBags.cs @@ -18,11 +18,11 @@ public static void Example() var samples = new List() { new TextData(){ Text = "This is an example to compute bag-of-word features using hashing." }, - new TextData(){ Text = "ML.NET's ProduceHashedWordBags API produces count of Ngrams and hashes it as an index into a vector of given bit length." }, + new TextData(){ Text = "ML.NET's ProduceHashedWordBags API produces count of n-grams and hashes it as an index into a vector of given bit length." }, new TextData(){ Text = "It does so by first tokenizing text/string into words/tokens then " }, - new TextData(){ Text = "computing Ngram and hash them to the index given by hash value." }, - new TextData(){ Text = "The hashing schem reduces the size of the output feature vector" }, - new TextData(){ Text = "which is useful in case when number of Ngrams is very large." }, + new TextData(){ Text = "computing n-grams and hash them to the index given by hash value." 
}, + new TextData(){ Text = "The hashing reduces the size of the output feature vector" }, + new TextData(){ Text = "which is useful in case when number of n-grams is very large." }, }; // Convert training data to IDataView. @@ -32,7 +32,7 @@ public static void Example() // The following call to 'ProduceHashedWordBags' implicitly tokenizes the text/string into words/tokens. // Please note that the length of the output feature vector depends on the 'numberOfBits' settings. var textPipeline = mlContext.Transforms.Text.ProduceHashedWordBags("BagOfWordFeatures", "Text", - numberOfBits: 8, ngramLength: 3, useAllLengths: false); + numberOfBits: 5, ngramLength: 3, useAllLengths: false); // Fit to data. var textTransformer = textPipeline.Fit(dataview); @@ -52,8 +52,8 @@ public static void Example() Console.Write($"{prediction.BagOfWordFeatures[i]:F4} "); // Expected output: - // Number of Features: 256 - // Features: 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 + // Number of Features: 32 + // Features: 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 1.0000 2.0000 0.0000 0.0000 } public class TextData diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceWordBags.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceWordBags.cs index 03661bd95b..b55afcf0e9 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceWordBags.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceWordBags.cs @@ -20,11 +20,11 @@ public static void Example() new TextData(){ Text = "This is an example to compute bag-of-word features." }, new TextData(){ Text = "ML.NET's ProduceWordBags API produces bag-of-word features from input text." }, new TextData(){ Text = "It does so by first tokenizing text/string into words/tokens then " }, - new TextData(){ Text = "computing Ngram and their neumeric values." }, - new TextData(){ Text = "Each position in the output vector corresponds to a particular Ngram." 
}, + new TextData(){ Text = "computing n-grams and their neumeric values." }, + new TextData(){ Text = "Each position in the output vector corresponds to a particular n-gram." }, new TextData(){ Text = "The value at each position corresponds to," }, - new TextData(){ Text = "the number of times Ngram occured in the data (Tf), or" }, - new TextData(){ Text = "the inverse of the number of documents contain the Ngram (Idf), or." }, + new TextData(){ Text = "the number of times n-gram occured in the data (Tf), or" }, + new TextData(){ Text = "the inverse of the number of documents contain the n-gram (Idf)," }, new TextData(){ Text = "or compute both and multipy together (Tf-Idf)." }, }; @@ -33,7 +33,7 @@ public static void Example() // A pipeline for converting text into numeric bag-of-word features. // The following call to 'ProduceWordBags' implicitly tokenizes the text/string into words/tokens. - // Please note that the length of the output feature vector depends on the Ngram settings. + // Please note that the length of the output feature vector depends on the n-gram settings. var textPipeline = mlContext.Transforms.Text.ProduceWordBags("BagOfWordFeatures", "Text", ngramLength: 3, useAllLengths: false, weighting: NgramExtractingEstimator.WeightingCriteria.Tf); @@ -50,14 +50,14 @@ public static void Example() // Print the length of the feature vector. Console.WriteLine($"Number of Features: {prediction.BagOfWordFeatures.Length}"); - // Preview of the produced Ngrams. + // Preview of the produced n-grams. // Get the slot names from the column's metadata. // If the column is a vector column the slot names corresponds to the names associated with each position in the vector. 
VBuffer> slotNames = default; transformedDataView.Schema["BagOfWordFeatures"].GetSlotNames(ref slotNames); var BagOfWordFeaturesColumn = transformedDataView.GetColumn>(transformedDataView.Schema["BagOfWordFeatures"]); var slots = slotNames.GetValues(); - Console.Write("Ngrams: "); + Console.Write("N-grams: "); foreach (var featureRow in BagOfWordFeaturesColumn) { foreach (var item in featureRow.Items()) @@ -72,7 +72,7 @@ public static void Example() // Expected output: // Number of Features: 62 - // Ngrams: This|is|an is|an|example an|example|to example|to|compute to|compute|bag-of-word compute|bag-of-word|features. ML.NET's|ProduceWordBags|API ProduceWordBags|API|produces API|produces|bag-of-word produces|bag-of-word|features ... + // N-grams: This|is|an is|an|example an|example|to example|to|compute to|compute|bag-of-word compute|bag-of-word|features. ML.NET's|ProduceWordBags|API ProduceWordBags|API|produces API|produces|bag-of-word produces|bag-of-word|features ... // Features: 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 0.0000 0.0000 0.0000 0.0000 ... } From d288792d5fc74f3f6ac657846d096a591ef867b2 Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Wed, 3 Apr 2019 16:21:54 -0700 Subject: [PATCH 4/6] Addressed reviewers' comments. --- .../Transforms/Text/ProduceHashedWordBags.cs | 24 +++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedWordBags.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedWordBags.cs index 56e2b0315f..60144f5f7f 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedWordBags.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedWordBags.cs @@ -32,10 +32,14 @@ public static void Example() // The following call to 'ProduceHashedWordBags' implicitly tokenizes the text/string into words/tokens. 
// Please note that the length of the output feature vector depends on the 'numberOfBits' settings. var textPipeline = mlContext.Transforms.Text.ProduceHashedWordBags("BagOfWordFeatures", "Text", - numberOfBits: 5, ngramLength: 3, useAllLengths: false); + numberOfBits: 5, + ngramLength: 3, + useAllLengths: false, + maximumNumberOfInverts: 1); // Fit to data. var textTransformer = textPipeline.Fit(dataview); + var transformedDataView = textTransformer.Transform(dataview); // Create the prediction engine to get the bag-of-word features extracted from the text. var predictionEngine = mlContext.Model.CreatePredictionEngine(textTransformer); @@ -46,6 +50,21 @@ public static void Example() // Print the length of the feature vector. Console.WriteLine($"Number of Features: {prediction.BagOfWordFeatures.Length}"); + // Preview of the produced n-grams. + // Get the slot names from the column's metadata. + // If the column is a vector column the slot names corresponds to the names associated with each position in the vector. + VBuffer> slotNames = default; + transformedDataView.Schema["BagOfWordFeatures"].GetSlotNames(ref slotNames); + var BagOfWordFeaturesColumn = transformedDataView.GetColumn>(transformedDataView.Schema["BagOfWordFeatures"]); + var slots = slotNames.GetValues(); + Console.Write("N-grams: "); + foreach (var featureRow in BagOfWordFeaturesColumn) + { + foreach (var item in featureRow.Items()) + Console.Write($"{slots[item.Key]} "); + Console.WriteLine(); + } + // Print the first 10 feature values. Console.Write("Features: "); for (int i = 0; i < 10; i++) @@ -53,7 +72,8 @@ public static void Example() // Expected output: // Number of Features: 32 - // Features: 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 1.0000 2.0000 0.0000 0.0000 + // N-grams: an|example|to is|an|example example|to|compute This|is|an compute|bag-of-word|features bag-of-word|features|using to|compute|bag-of-word ML.NET's|ProduceHashedWordBags|API as|an|index API|produces|count ... 
+ // Features: 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 1.0000 2.0000 0.0000 0.0000 ... } public class TextData From bdca2a56816cbf5963f45f9c10ad1df8fd9cb6cd Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Thu, 4 Apr 2019 10:28:55 -0700 Subject: [PATCH 5/6] Changed input/output classes to private. --- .../Dynamic/Transforms/Text/ProduceHashedWordBags.cs | 4 ++-- .../Dynamic/Transforms/Text/ProduceWordBags.cs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedWordBags.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedWordBags.cs index 60144f5f7f..a5cf7f1fcf 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedWordBags.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedWordBags.cs @@ -76,12 +76,12 @@ public static void Example() // Features: 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 1.0000 2.0000 0.0000 0.0000 ... } - public class TextData + private class TextData { public string Text { get; set; } } - public class TransformedTextData : TextData + private class TransformedTextData : TextData { public float[] BagOfWordFeatures { get; set; } } diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceWordBags.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceWordBags.cs index b55afcf0e9..7052b9bbfb 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceWordBags.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceWordBags.cs @@ -76,12 +76,12 @@ public static void Example() // Features: 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 0.0000 0.0000 0.0000 0.0000 ... 
} - public class TextData + private class TextData { public string Text { get; set; } } - public class TransformedTextData : TextData + private class TransformedTextData : TextData { public float[] BagOfWordFeatures { get; set; } } From bc60f0936f21675af4945d15ca1e12540b4835b2 Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Thu, 4 Apr 2019 12:27:20 -0700 Subject: [PATCH 6/6] Addressed reviewers' comments. --- .../Dynamic/Transforms/Text/ProduceHashedWordBags.cs | 2 +- .../Dynamic/Transforms/Text/ProduceWordBags.cs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedWordBags.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedWordBags.cs index a5cf7f1fcf..8a23517149 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedWordBags.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedWordBags.cs @@ -52,7 +52,7 @@ public static void Example() // Preview of the produced n-grams. // Get the slot names from the column's metadata. - // If the column is a vector column the slot names corresponds to the names associated with each position in the vector. + // The slot names for a vector column correspond to the names associated with each position in the vector. 
VBuffer> slotNames = default; transformedDataView.Schema["BagOfWordFeatures"].GetSlotNames(ref slotNames); var BagOfWordFeaturesColumn = transformedDataView.GetColumn>(transformedDataView.Schema["BagOfWordFeatures"]); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceWordBags.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceWordBags.cs index 7052b9bbfb..9e9ab553a9 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceWordBags.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceWordBags.cs @@ -52,7 +52,7 @@ public static void Example() // Preview of the produced n-grams. // Get the slot names from the column's metadata. - // If the column is a vector column the slot names corresponds to the names associated with each position in the vector. + // The slot names for a vector column correspond to the names associated with each position in the vector. VBuffer> slotNames = default; transformedDataView.Schema["BagOfWordFeatures"].GetSlotNames(ref slotNames); var BagOfWordFeaturesColumn = transformedDataView.GetColumn>(transformedDataView.Schema["BagOfWordFeatures"]);