-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Created samples for TokenizeIntoWords and RemoveStopWords APIs. #3156
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
15be23c
58e2d4b
a3ec5d3
64ff946
9241a21
ad6f967
1bc241d
672ade6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
This file was deleted.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
using System; | ||
using System.Collections.Generic; | ||
using Microsoft.ML.Transforms.Text; | ||
|
||
namespace Microsoft.ML.Samples.Dynamic | ||
{ | ||
/// <summary> | ||
/// Sample showing how to strip stop words from text with the 'RemoveDefaultStopWords' API. | ||
/// The text is first split into words with 'TokenizeIntoWords'; the English stop word list is requested explicitly. | ||
/// </summary> | ||
public static class RemoveDefaultStopWords | ||
{ | ||
/// <summary> | ||
/// Builds the tokenize + remove-stop-words pipeline, fits it on an empty data view, runs it on a single | ||
/// sentence through a prediction engine and prints the remaining words to the console. | ||
/// </summary> | ||
public static void Example() | ||
{ | ||
// Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, | ||
// as well as the source of randomness. | ||
var mlContext = new MLContext(); | ||
|
||
// Create an empty list as the dataset. The 'RemoveDefaultStopWords' does not require training data as | ||
// the estimator ('StopWordsRemovingEstimator') created by 'RemoveDefaultStopWords' API is not a trainable estimator. | ||
// The empty list is only needed to pass input schema to the pipeline. | ||
var emptySamples = new List<TextData>(); | ||
|
||
// Convert sample list to an empty IDataView. | ||
var emptyDataView = mlContext.Data.LoadFromEnumerable(emptySamples); | ||
|
||
// A pipeline for removing stop words from input text/string. | ||
// The pipeline first tokenizes text into words then removes stop words. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
shall we add link to list of stop words? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. also if you can modify string to include "tHe" or something like that and show it was removed (because we compare by ignoring casing (I hope so) would be nice. In reply to: 271477611 [](ancestors = 271477611) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the link should go into the documentation instead of sample. In reply to: 271478137 [](ancestors = 271478137,271477611) |
||
// The 'RemoveDefaultStopWords' API ignores casing of the text/string e.g. 'tHe' and 'the' are considered the same stop words. | ||
var textPipeline = mlContext.Transforms.Text.TokenizeIntoWords("Words", "Text") | ||
.Append(mlContext.Transforms.Text.RemoveDefaultStopWords("WordsWithoutStopWords", "Words", language: StopWordsRemovingEstimator.Language.English)); | ||
|
||
// Fit to data. | ||
var textTransformer = textPipeline.Fit(emptyDataView); | ||
|
||
// Create the prediction engine to remove the stop words from the input text/string. | ||
var predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(textTransformer); | ||
|
||
// Call the prediction API to remove stop words. | ||
// The input deliberately contains the mixed-case token 'tHe' to demonstrate the case-insensitive matching. | ||
var data = new TextData() { Text = "ML.NET's RemoveDefaultStopWords API removes stop words from tHe text/string. It requires the text/string to be tokenized beforehand." }; | ||
var prediction = predictionEngine.Predict(data); | ||
|
||
// Print the length of the word vector after the stop words are removed. | ||
Console.WriteLine($"Number of words: {prediction.WordsWithoutStopWords.Length}"); | ||
|
||
// Print the word vector without stop words. | ||
Console.WriteLine($"\nWords without stop words: {string.Join(",", prediction.WordsWithoutStopWords)}"); | ||
|
||
// Note: the second WriteLine starts with "\n", so the console shows an empty line between the two lines listed below. | ||
// Expected output: | ||
// Number of words: 11 | ||
// Words without stop words: ML.NET's,RemoveDefaultStopWords,API,removes,stop,words,text/string.,requires,text/string,tokenized,beforehand. | ||
} | ||
|
||
/// <summary>Input row type: holds the raw text read by the pipeline's "Text" input column.</summary> | ||
public class TextData | ||
{ | ||
public string Text { get; set; } | ||
} | ||
|
||
/// <summary> | ||
/// Output row type: keeps the original text and adds the words left after stop word removal. | ||
/// The property name matches the "WordsWithoutStopWords" output column name used in the pipeline. | ||
/// </summary> | ||
public class TransformedTextData : TextData | ||
{ | ||
public string[] WordsWithoutStopWords { get; set; } | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
using System; | ||
using System.Collections.Generic; | ||
using Microsoft.ML.Transforms.Text; | ||
|
||
namespace Microsoft.ML.Samples.Dynamic | ||
{ | ||
/// <summary> | ||
/// Sample showing how to strip a caller-supplied list of stop words from text with the 'RemoveStopWords' API. | ||
/// The text is first split into words with 'TokenizeIntoWords'. | ||
/// </summary> | ||
public static class RemoveStopWords | ||
{ | ||
/// <summary> | ||
/// Builds the tokenize + remove-stop-words pipeline, fits it on an empty data view, runs it on a single | ||
/// sentence through a prediction engine and prints the remaining words to the console. | ||
/// </summary> | ||
public static void Example() | ||
{ | ||
// Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, | ||
// as well as the source of randomness. | ||
var mlContext = new MLContext(); | ||
|
||
// Create an empty list as the dataset. The 'RemoveStopWords' does not require training data as | ||
// the estimator ('CustomStopWordsRemovingEstimator') created by 'RemoveStopWords' API is not a trainable estimator. | ||
// The empty list is only needed to pass input schema to the pipeline. | ||
var emptySamples = new List<TextData>(); | ||
|
||
// Convert sample list to an empty IDataView. | ||
var emptyDataView = mlContext.Data.LoadFromEnumerable(emptySamples); | ||
|
||
// A pipeline for removing stop words from input text/string. | ||
// The pipeline first tokenizes text into words then removes stop words. | ||
// The 'RemoveStopWords' API ignores casing of the text/string e.g. 'tHe' and 'the' are considered the same stop words. | ||
var textPipeline = mlContext.Transforms.Text.TokenizeIntoWords("Words", "Text") | ||
.Append(mlContext.Transforms.Text.RemoveStopWords("WordsWithoutStopWords", "Words", stopwords: new[] { "a", "the", "from", "by" })); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I would throw few words regarding casing. |
||
|
||
||
// Fit to data. | ||
var textTransformer = textPipeline.Fit(emptyDataView); | ||
|
||
// Create the prediction engine to remove the stop words from the input text/string. | ||
var predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(textTransformer); | ||
|
||
// Call the prediction API to remove stop words. | ||
// The input deliberately contains the mixed-case token 'tHe' to demonstrate the case-insensitive matching. | ||
var data = new TextData() { Text = "ML.NET's RemoveStopWords API removes stop words from tHe text/string using a list of stop words provided by the user." }; | ||
var prediction = predictionEngine.Predict(data); | ||
|
||
// Print the length of the word vector after the stop words are removed. | ||
Console.WriteLine($"Number of words: {prediction.WordsWithoutStopWords.Length}"); | ||
|
||
// Print the word vector without stop words. | ||
Console.WriteLine($"\nWords without stop words: {string.Join(",", prediction.WordsWithoutStopWords)}"); | ||
|
||
// Note: the second WriteLine starts with "\n", so the console shows an empty line between the two lines listed below. | ||
// Expected output: | ||
// Number of words: 14 | ||
// Words without stop words: ML.NET's,RemoveStopWords,API,removes,stop,words,text/string,using,list,of,stop,words,provided,user. | ||
} | ||
|
||
/// <summary>Input row type: holds the raw text read by the pipeline's "Text" input column.</summary> | ||
public class TextData | ||
{ | ||
public string Text { get; set; } | ||
} | ||
|
||
/// <summary> | ||
/// Output row type: keeps the original text and adds the words left after stop word removal. | ||
/// The property name matches the "WordsWithoutStopWords" output column name used in the pipeline. | ||
/// </summary> | ||
public class TransformedTextData : TextData | ||
{ | ||
public string[] WordsWithoutStopWords { get; set; } | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
using System; | ||
using System.Collections.Generic; | ||
using System.Text; | ||
|
||
namespace Microsoft.ML.Samples.Dynamic | ||
{ | ||
/// <summary> | ||
/// Sample showing how to split text into a vector of words with the 'TokenizeIntoWords' API, | ||
/// using a single space character as the separator. | ||
/// </summary> | ||
public static class TokenizeIntoWords | ||
{ | ||
/// <summary> | ||
/// Builds the tokenizing pipeline, fits it on an empty data view, runs it on a single sentence | ||
/// through a prediction engine and prints the resulting words to the console. | ||
/// </summary> | ||
public static void Example() | ||
{ | ||
// Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, | ||
// as well as the source of randomness. | ||
var mlContext = new MLContext(); | ||
|
||
// Create an empty list as the dataset. The 'TokenizeIntoWords' does not require training data as | ||
// the estimator ('WordTokenizingEstimator') created by 'TokenizeIntoWords' API is not a trainable estimator. | ||
// The empty list is only needed to pass input schema to the pipeline. | ||
var emptySamples = new List<TextData>(); | ||
|
||
// Convert sample list to an empty IDataView. | ||
var emptyDataView = mlContext.Data.LoadFromEnumerable(emptySamples); | ||
|
||
// A pipeline for converting text into vector of words. | ||
// The following call to 'TokenizeIntoWords' tokenizes text/string into words using space as a separator. | ||
// Space is also a default value for the 'separators' argument if it is not specified. | ||
var textPipeline = mlContext.Transforms.Text.TokenizeIntoWords("Words", "Text", separators: new[] { ' ' }); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Can you add details about what to expect here with default values? One thing we may want to mention is removing any whitespace. #Resolved |
||
|
||
||
// Fit to data. | ||
var textTransformer = textPipeline.Fit(emptyDataView); | ||
|
||
// Create the prediction engine to get the word vector from the input text/string. | ||
var predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(textTransformer); | ||
|
||
// Call the prediction API to convert the text into words. | ||
var data = new TextData() { Text = "ML.NET's TokenizeIntoWords API splits text/string into words using the list of characters provided as separators." }; | ||
var prediction = predictionEngine.Predict(data); | ||
|
||
// Print the length of the word vector. | ||
Console.WriteLine($"Number of words: {prediction.Words.Length}"); | ||
|
||
// Print the word vector. | ||
Console.WriteLine($"\nWords: {string.Join(",", prediction.Words)}"); | ||
|
||
// Note: the second WriteLine starts with "\n", so the console shows an empty line between the two lines listed below. | ||
// Expected output: | ||
// Number of words: 15 | ||
// Words: ML.NET's,TokenizeIntoWords,API,splits,text/string,into,words,using,the,list,of,characters,provided,as,separators. | ||
} | ||
|
||
/// <summary>Input row type: holds the raw text read by the pipeline's "Text" input column.</summary> | ||
public class TextData | ||
{ | ||
public string Text { get; set; } | ||
} | ||
|
||
/// <summary> | ||
/// Output row type: keeps the original text and adds the tokenized words. | ||
/// The property name matches the "Words" output column name used in the pipeline. | ||
/// </summary> | ||
public class TransformedTextData : TextData | ||
{ | ||
public string[] Words { get; set; } | ||
} | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is there any reason why you prefer this format over
<see cref="StopWordsRemovingEstimator"/>
? #Pending
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I doubt that it works in normal comments; it only works in XML doc comments.
In reply to: 271080226 [](ancestors = 271080226)