Skip to content

Scrub word embedding transform #2891

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Mar 11, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/code/MlNetCookBook.md
Original file line number Diff line number Diff line change
Expand Up @@ -782,7 +782,7 @@ var pipeline =
// NLP pipeline 4: word embeddings.
.Append(mlContext.Transforms.Text.TokenizeWords("TokenizedMessage", "NormalizedMessage"))
.Append(mlContext.Transforms.Text.ExtractWordEmbeddings("Embeddings", "TokenizedMessage",
WordEmbeddingsExtractingEstimator.PretrainedModelKind.GloVeTwitter25D));
WordEmbeddingsExtractingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding));

// Let's train our pipeline, and then apply it to the same data.
// Note that even on a small dataset of 70KB the pipeline above can take up to a minute to completely train.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,8 @@ public static void Example()

// Let's apply pretrained word embedding model GloVeTwitter25D.
// 25D means each word is mapped into a 25-dimensional space; that is, each word is represented by 25 float values.
var gloveWordEmbedding = ml.Transforms.Text.ExtractWordEmbeddings("GloveEmbeddings", "CleanWords",
WordEmbeddingsExtractingEstimator.PretrainedModelKind.GloVeTwitter25D);
var gloveWordEmbedding = ml.Transforms.Text.ApplyWordEmbedding("GloveEmbeddings", "CleanWords",
WordEmbeddingEstimator.PretrainedModelKind.GloVeTwitter25D);

// We also have the option to apply custom word embedding models.
// Let's first create one.
Expand All @@ -81,7 +81,7 @@ public static void Example()
file.WriteLine("best" + " " + string.Join(" ", 0f, 0f, 20f));
}
// Now let's add the custom embedding on top of the same words.
var pipeline = gloveWordEmbedding.Append(ml.Transforms.Text.ExtractWordEmbeddings("CustomEmbeddings", @".\custommodel.txt", "CleanWords"));
var pipeline = gloveWordEmbedding.Append(ml.Transforms.Text.ApplyWordEmbedding("CustomEmbeddings", @".\custommodel.txt", "CleanWords"));

// And do all required transformations.
var embeddingDataview = pipeline.Fit(wordsDataview).Transform(wordsDataview);
Expand Down
16 changes: 8 additions & 8 deletions src/Microsoft.ML.StaticPipe/WordEmbeddingsStaticExtensions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ public static class WordEmbeddingsStaticExtensions
/// <param name="input">Vector of tokenized text.</param>
/// <param name="modelKind">The pretrained word embedding model.</param>
/// <returns></returns>
public static Vector<float> WordEmbeddings(this VarVector<string> input, WordEmbeddingsExtractingEstimator.PretrainedModelKind modelKind = WordEmbeddingsExtractingEstimator.PretrainedModelKind.Sswe)
public static Vector<float> WordEmbeddings(this VarVector<string> input, WordEmbeddingEstimator.PretrainedModelKind modelKind = WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding)
{
Contracts.CheckValue(input, nameof(input));
return new OutColumn(input, modelKind);
Expand All @@ -33,7 +33,7 @@ private sealed class OutColumn : Vector<float>
{
public PipelineColumn Input { get; }

public OutColumn(VarVector<string> input, WordEmbeddingsExtractingEstimator.PretrainedModelKind modelKind = WordEmbeddingsExtractingEstimator.PretrainedModelKind.Sswe)
public OutColumn(VarVector<string> input, WordEmbeddingEstimator.PretrainedModelKind modelKind = WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding)
: base(new Reconciler(modelKind), input)
{
Input = input;
Expand All @@ -48,10 +48,10 @@ public OutColumn(VarVector<string> input, string customModelFile = null)

private sealed class Reconciler : EstimatorReconciler
{
private readonly WordEmbeddingsExtractingEstimator.PretrainedModelKind? _modelKind;
private readonly WordEmbeddingEstimator.PretrainedModelKind? _modelKind;
private readonly string _customLookupTable;

public Reconciler(WordEmbeddingsExtractingEstimator.PretrainedModelKind modelKind = WordEmbeddingsExtractingEstimator.PretrainedModelKind.Sswe)
public Reconciler(WordEmbeddingEstimator.PretrainedModelKind modelKind = WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding)
{
_modelKind = modelKind;
_customLookupTable = null;
Expand All @@ -71,18 +71,18 @@ public override IEstimator<ITransformer> Reconcile(IHostEnvironment env,
{
Contracts.Assert(toOutput.Length == 1);

var cols = new WordEmbeddingsExtractingEstimator.ColumnOptions[toOutput.Length];
var cols = new WordEmbeddingEstimator.ColumnOptions[toOutput.Length];
for (int i = 0; i < toOutput.Length; ++i)
{
var outCol = (OutColumn)toOutput[i];
cols[i] = new WordEmbeddingsExtractingEstimator.ColumnOptions(outputNames[outCol], inputNames[outCol.Input]);
cols[i] = new WordEmbeddingEstimator.ColumnOptions(outputNames[outCol], inputNames[outCol.Input]);
}

bool customLookup = !string.IsNullOrWhiteSpace(_customLookupTable);
if (customLookup)
return new WordEmbeddingsExtractingEstimator(env, _customLookupTable, cols);
return new WordEmbeddingEstimator(env, _customLookupTable, cols);
else
return new WordEmbeddingsExtractingEstimator(env, _modelKind.Value, cols);
return new WordEmbeddingEstimator(env, _modelKind.Value, cols);
}
}
}
Expand Down
10 changes: 5 additions & 5 deletions src/Microsoft.ML.Transforms/EntryPoints/TextAnalytics.cs
Original file line number Diff line number Diff line change
Expand Up @@ -132,16 +132,16 @@ public static CommonOutputs.TransformOutput LightLda(IHostEnvironment env, Laten
}

[TlcModule.EntryPoint(Name = "Transforms.WordEmbeddings",
Desc = WordEmbeddingsExtractingTransformer.Summary,
UserName = WordEmbeddingsExtractingTransformer.UserName,
ShortName = WordEmbeddingsExtractingTransformer.ShortName)]
public static CommonOutputs.TransformOutput WordEmbeddings(IHostEnvironment env, WordEmbeddingsExtractingTransformer.Options input)
Desc = WordEmbeddingTransformer.Summary,
UserName = WordEmbeddingTransformer.UserName,
ShortName = WordEmbeddingTransformer.ShortName)]
public static CommonOutputs.TransformOutput WordEmbeddings(IHostEnvironment env, WordEmbeddingTransformer.Options input)
{
Contracts.CheckValue(env, nameof(env));
env.CheckValue(input, nameof(input));

var h = EntryPointUtils.CheckArgsAndCreateHost(env, "WordEmbeddings", input);
var view = WordEmbeddingsExtractingTransformer.Create(h, input, input.Data);
var view = WordEmbeddingTransformer.Create(h, input, input.Data);
return new CommonOutputs.TransformOutput()
{
Model = new TransformModelImpl(h, view, input.Data),
Expand Down
22 changes: 11 additions & 11 deletions src/Microsoft.ML.Transforms/Text/TextCatalog.cs
Original file line number Diff line number Diff line change
Expand Up @@ -101,19 +101,19 @@ public static TextNormalizingEstimator NormalizeText(this TransformsCatalog.Text
/// <param name="catalog">The text-related transform's catalog.</param>
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
/// <param name="modelKind">The embeddings <see cref="WordEmbeddingsExtractingEstimator.PretrainedModelKind"/> to use. </param>
/// <param name="modelKind">The embeddings <see cref="WordEmbeddingEstimator.PretrainedModelKind"/> to use. </param>
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// [!code-csharp[FeaturizeText](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/WordEmbeddingTransform.cs)]
/// ]]>
/// </format>
/// </example>
public static WordEmbeddingsExtractingEstimator ExtractWordEmbeddings(this TransformsCatalog.TextTransforms catalog,
public static WordEmbeddingEstimator ApplyWordEmbedding(this TransformsCatalog.TextTransforms catalog,
string outputColumnName,
string inputColumnName = null,
WordEmbeddingsExtractingEstimator.PretrainedModelKind modelKind = WordEmbeddingsExtractingEstimator.PretrainedModelKind.Sswe)
=> new WordEmbeddingsExtractingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, inputColumnName, modelKind);
WordEmbeddingEstimator.PretrainedModelKind modelKind = WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding)
=> new WordEmbeddingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, inputColumnName, modelKind);

/// <include file='doc.xml' path='doc/members/member[@name="WordEmbeddings"]/*' />
/// <param name="catalog">The text-related transform's catalog.</param>
Expand All @@ -127,16 +127,16 @@ public static WordEmbeddingsExtractingEstimator ExtractWordEmbeddings(this Trans
/// ]]>
/// </format>
/// </example>
public static WordEmbeddingsExtractingEstimator ExtractWordEmbeddings(this TransformsCatalog.TextTransforms catalog,
public static WordEmbeddingEstimator ApplyWordEmbedding(this TransformsCatalog.TextTransforms catalog,
string outputColumnName,
string customModelFile,
string inputColumnName = null)
=> new WordEmbeddingsExtractingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(),
=> new WordEmbeddingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(),
outputColumnName, customModelFile, inputColumnName ?? outputColumnName);

/// <include file='doc.xml' path='doc/members/member[@name="WordEmbeddings"]/*' />
/// <param name="catalog">The text-related transform's catalog.</param>
/// <param name="modelKind">The embeddings <see cref="WordEmbeddingsExtractingEstimator.PretrainedModelKind"/> to use. </param>
/// <param name="modelKind">The embeddings <see cref="WordEmbeddingEstimator.PretrainedModelKind"/> to use. </param>
/// <param name="columns">The array of columns, and per-column configurations, to extract embeddings from.</param>
/// <example>
/// <format type="text/markdown">
Expand All @@ -145,10 +145,10 @@ public static WordEmbeddingsExtractingEstimator ExtractWordEmbeddings(this Trans
/// ]]>
/// </format>
/// </example>
public static WordEmbeddingsExtractingEstimator ExtractWordEmbeddings(this TransformsCatalog.TextTransforms catalog,
WordEmbeddingsExtractingEstimator.PretrainedModelKind modelKind = WordEmbeddingsExtractingEstimator.PretrainedModelKind.Sswe,
params WordEmbeddingsExtractingEstimator.ColumnOptions[] columns)
=> new WordEmbeddingsExtractingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), modelKind, columns);
public static WordEmbeddingEstimator ApplyWordEmbedding(this TransformsCatalog.TextTransforms catalog,
WordEmbeddingEstimator.PretrainedModelKind modelKind = WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding,
params WordEmbeddingEstimator.ColumnOptions[] columns)
=> new WordEmbeddingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), modelKind, columns);

/// <summary>
/// Tokenizes incoming text in <paramref name="inputColumnName"/>, using <paramref name="separators"/> as separators,
Expand Down
Loading