Skip to content

Created samples for 'FeaturizeText' API. #3120

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Mar 28, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
using System;
using System.Collections.Generic;
using Microsoft.ML.Data;

namespace Microsoft.ML.Samples.Dynamic
{
/// <summary>
/// Sample showing how to convert a text column into a numeric feature vector by calling
/// <c>FeaturizeText</c> with its default settings.
/// </summary>
public static class FeaturizeText
{
    /// <summary>
    /// Fits the text featurizer on a small in-memory dataset, featurizes the first sample, and
    /// prints the length of the resulting feature vector followed by its leading values.
    /// </summary>
    public static void Example()
    {
        // Upper bound on how many feature values are written to the console.
        const int MaxFeaturesToPrint = 10;

        // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
        // as well as the source of randomness.
        var mlContext = new MLContext();

        // Create a small dataset as an IEnumerable.
        var samples = new List<TextData>()
        {
            new TextData(){ Text = "ML.NET's FeaturizeText API uses a composition of several basic transforms to convert text into numeric features." },
            new TextData(){ Text = "This API can be used as a featurizer to perform text classification." },
            new TextData(){ Text = "There are a number of approaches to text classification." },
            new TextData(){ Text = "One of the simplest and most common approaches is called “Bag of Words”." },
            new TextData(){ Text = "Text classification can be used for a wide variety of tasks" },
            new TextData(){ Text = "such as sentiment analysis, topic detection, intent identification etc." },
        };

        // Convert training data to IDataView.
        var dataview = mlContext.Data.LoadFromEnumerable(samples);

        // A pipeline for converting text into numeric features.
        // The following call to 'FeaturizeText' instantiates 'TextFeaturizingEstimator' with default parameters.
        // The default settings for the TextFeaturizingEstimator are
        //      * StopWordsRemover: None
        //      * CaseMode: Lowercase
        //      * OutputTokensColumnName: None
        //      * KeepDiacritics: false, KeepPunctuations: true, KeepNumbers: true
        //      * WordFeatureExtractor: NgramLength = 1
        //      * CharFeatureExtractor: NgramLength = 3, UseAllLengths = false
        // The length of the output feature vector depends on these settings.
        var textPipeline = mlContext.Transforms.Text.FeaturizeText("Features", "Text");

        // Fit to data.
        var textTransformer = textPipeline.Fit(dataview);

        // Create the prediction engine to get the features extracted from the text.
        var predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(textTransformer);

        // Convert the text into numeric features.
        var prediction = predictionEngine.Predict(samples[0]);

        // Print the length of the feature vector.
        Console.WriteLine($"Number of Features: {prediction.Features.Length}");

        // Print the first feature values. The vector length depends on the featurizer settings,
        // so clamp the loop bound instead of assuming at least 10 values are present.
        Console.Write("Features: ");
        int previewCount = Math.Min(MaxFeaturesToPrint, prediction.Features.Length);
        for (int i = 0; i < previewCount; i++)
        {
            Console.Write($"{prediction.Features[i]:F4} ");
        }

        //  Expected output:
        //   Number of Features: 332
        //   Features: 0.0857  0.0857  0.0857  0.0857  0.0857  0.0857  0.0857  0.0857  0.0857  0.1715 ...
    }

    /// <summary>
    /// Input row for the sample: a single piece of text.
    /// </summary>
    public class TextData
    {
        /// <summary>The raw text; the pipeline in <c>Example</c> reads the "Text" column.</summary>
        public string Text { get; set; }
    }

    /// <summary>
    /// Output row for the sample: the input text plus the feature vector read back in <c>Example</c>.
    /// </summary>
    public class TransformedTextData : TextData
    {
        /// <summary>Numeric features; named after the pipeline's "Features" output column.</summary>
        public float[] Features { get; set; }
    }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
using System;
using System.Collections.Generic;
using Microsoft.ML.Data;
using Microsoft.ML.Transforms.Text;

namespace Microsoft.ML.Samples.Dynamic
{
/// <summary>
/// Sample showing how to convert a text column into a numeric feature vector by calling
/// <c>FeaturizeText</c> with an explicit <c>TextFeaturizingEstimator.Options</c> instance,
/// and how to also retrieve the tokens it produced.
/// </summary>
public static class FeaturizeTextWithOptions
{
    /// <summary>
    /// Fits the configured text featurizer on a small in-memory dataset, featurizes the first
    /// sample, and prints the feature-vector length, its leading values, and the output tokens.
    /// </summary>
    public static void Example()
    {
        // Upper bound on how many feature values are written to the console.
        const int MaxFeaturesToPrint = 10;

        // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
        // as well as the source of randomness.
        var mlContext = new MLContext();

        // Create a small dataset as an IEnumerable.
        var samples = new List<TextData>()
        {
            new TextData(){ Text = "ML.NET's FeaturizeText API uses a composition of several basic transforms to convert text into numeric features." },
            new TextData(){ Text = "This API can be used as a featurizer to perform text classification." },
            new TextData(){ Text = "There are a number of approaches to text classification." },
            new TextData(){ Text = "One of the simplest and most common approaches is called “Bag of Words”." },
            new TextData(){ Text = "Text classification can be used for a wide variety of tasks" },
            new TextData(){ Text = "such as sentiment analysis, topic detection, intent identification etc." },
        };

        // Convert training data to IDataView.
        var dataview = mlContext.Data.LoadFromEnumerable(samples);

        // A pipeline for converting text into numeric features.
        // The following call to 'FeaturizeText' instantiates 'TextFeaturizingEstimator' with given parameters.
        // The length of the output feature vector depends on these settings.
        var options = new TextFeaturizingEstimator.Options()
        {
            // Also output tokenized words
            OutputTokensColumnName = "OutputTokens",
            CaseMode = TextNormalizingEstimator.CaseMode.Lower,
            // Use ML.NET's built-in stop word remover
            StopWordsRemoverOptions = new StopWordsRemovingEstimator.Options() { Language = TextFeaturizingEstimator.Language.English },
            WordFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 2, UseAllLengths = true },
            CharFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 3, UseAllLengths = false },
        };
        var textPipeline = mlContext.Transforms.Text.FeaturizeText("Features", options, "Text");

        // Fit to data.
        var textTransformer = textPipeline.Fit(dataview);

        // Create the prediction engine to get the features extracted from the text.
        var predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(textTransformer);

        // Convert the text into numeric features.
        var prediction = predictionEngine.Predict(samples[0]);

        // Print the length of the feature vector.
        Console.WriteLine($"Number of Features: {prediction.Features.Length}");

        // Print feature values and tokens. The vector length depends on the options above,
        // so clamp the loop bound instead of assuming at least 10 values are present.
        Console.Write("Features: ");
        int previewCount = Math.Min(MaxFeaturesToPrint, prediction.Features.Length);
        for (int i = 0; i < previewCount; i++)
        {
            Console.Write($"{prediction.Features[i]:F4} ");
        }

        Console.WriteLine($"\nTokens: {string.Join(",", prediction.OutputTokens)}");

        //  Expected output:
        //   Number of Features: 282
        //   Features: 0.0941  0.0941  0.0941  0.0941  0.0941  0.0941  0.0941  0.0941  0.0941  0.1881 ...
        //   Tokens: ml.net's,featurizetext,api,uses,composition,basic,transforms,convert,text,numeric,features.
    }

    /// <summary>
    /// Input row for the sample: a single piece of text.
    /// </summary>
    public class TextData
    {
        /// <summary>The raw text; the pipeline in <c>Example</c> reads the "Text" column.</summary>
        public string Text { get; set; }
    }

    /// <summary>
    /// Output row for the sample: the input text plus the feature vector and tokens read back
    /// in <c>Example</c>.
    /// </summary>
    public class TransformedTextData : TextData
    {
        /// <summary>Numeric features; named after the pipeline's "Features" output column.</summary>
        public float[] Features { get; set; }

        /// <summary>Tokens; named after the "OutputTokens" column requested in the options.</summary>
        public string[] OutputTokens { get; set; }
    }
}
}
9 changes: 8 additions & 1 deletion src/Microsoft.ML.Transforms/Text/TextCatalog.cs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,13 @@ public static class TextCatalog
/// <param name="catalog">The text-related transform's catalog.</param>
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// [!code-csharp[FeaturizeText](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/FeaturizeText.cs)]
/// ]]>
/// </format>
/// </example>
public static TextFeaturizingEstimator FeaturizeText(this TransformsCatalog.TextTransforms catalog,
string outputColumnName,
string inputColumnName = null)
Expand All @@ -38,7 +45,7 @@ public static TextFeaturizingEstimator FeaturizeText(this TransformsCatalog.Text
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// [!code-csharp[FeaturizeText](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/TextTransform.cs)]
/// [!code-csharp[FeaturizeText](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/FeaturizeTextWithOptions.cs)]
/// ]]>
/// </format>
/// </example>
Expand Down