Skip to content

Converted listed text transforms into transformers/estimators. #953

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Sep 21, 2018
Merged
4 changes: 2 additions & 2 deletions src/Microsoft.ML.Data/Transforms/TermEstimator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ public static class Defaults
/// Convenience constructor for public facing API.
/// </summary>
/// <param name="env">Host Environment.</param>
/// <param name="inputColumn">Name of the output column.</param>
/// <param name="outputColumn">Name of the column to be transformed. If this is null '<paramref name="inputColumn"/>' will be used.</param>
/// <param name="inputColumn">Name of the column to be transformed.</param>
/// <param name="outputColumn">Name of the output column. If this is null '<paramref name="inputColumn"/>' will be used.</param>
/// <param name="maxNumTerms">Maximum number of terms to keep per column when auto-training.</param>
/// <param name="sort">How items should be ordered when vectorized. By default, they will be in the order encountered.
/// If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a').</param>
Expand Down
479 changes: 479 additions & 0 deletions src/Microsoft.ML.Transforms/Text/TextStaticExtensions.cs

Large diffs are not rendered by default.

613 changes: 613 additions & 0 deletions src/Microsoft.ML.Transforms/Text/WrappedTextTransformers.cs

Large diffs are not rendered by default.

12 changes: 12 additions & 0 deletions test/BaselineOutput/SingleDebug/Text/bag_of_words.tsv

Large diffs are not rendered by default.

13 changes: 13 additions & 0 deletions test/BaselineOutput/SingleDebug/Text/ngrams.tsv

Large diffs are not rendered by default.

11 changes: 11 additions & 0 deletions test/BaselineOutput/SingleDebug/Text/words_without_stopwords.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#@ TextLoader{
#@ header+
#@ sep=tab
#@ col=text:TX:0
#@ col=words_without_stopwords:TX:1-**
#@ }
text
==rude== dude, you are rude upload that carl picture back, or else. ==rude== dude, you rude upload carl picture back, else.
== ok! == im going to vandalize wild ones wiki then!!! == ok! == im going vandalize wild ones wiki then!!!
stop trolling, zapatancas, calling me a liar merely demonstartes that you arer zapatancas. you may choose to chase every legitimate editor from this site and ignore me but i am an editor with a record that isnt 99% trolling and therefore my wishes are not to be completely ignored by a sockpuppet like yourself. the consensus is overwhelmingly against you and your trollin g lover zapatancas, stop trolling, zapatancas, calling liar merely demonstartes you arer zapatancas. you choose chase legitimate editor site ignore i editor record isnt 99% trolling wishes completely ignored sockpuppet like yourself. consensus overwhelmingly you your trollin g lover zapatancas,
==you're cool== you seem like a really cool guy... *bursts out laughing at sarcasm*. ==you're cool== you like really cool guy... *bursts laughing sarcasm*.
12 changes: 12 additions & 0 deletions test/BaselineOutput/SingleRelease/Text/bag_of_words.tsv

Large diffs are not rendered by default.

13 changes: 13 additions & 0 deletions test/BaselineOutput/SingleRelease/Text/ngrams.tsv

Large diffs are not rendered by default.

11 changes: 11 additions & 0 deletions test/BaselineOutput/SingleRelease/Text/words_without_stopwords.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#@ TextLoader{
#@ header+
#@ sep=tab
#@ col=text:TX:0
#@ col=words_without_stopwords:TX:1-**
#@ }
text
==rude== dude, you are rude upload that carl picture back, or else. ==rude== dude, you rude upload carl picture back, else.
== ok! == im going to vandalize wild ones wiki then!!! == ok! == im going vandalize wild ones wiki then!!!
stop trolling, zapatancas, calling me a liar merely demonstartes that you arer zapatancas. you may choose to chase every legitimate editor from this site and ignore me but i am an editor with a record that isnt 99% trolling and therefore my wishes are not to be completely ignored by a sockpuppet like yourself. the consensus is overwhelmingly against you and your trollin g lover zapatancas, stop trolling, zapatancas, calling liar merely demonstartes you arer zapatancas. you choose chase legitimate editor site ignore i editor record isnt 99% trolling wishes completely ignored sockpuppet like yourself. consensus overwhelmingly you your trollin g lover zapatancas,
==you're cool== you seem like a really cool guy... *bursts out laughing at sarcasm*. ==you're cool== you like really cool guy... *bursts laughing sarcasm*.
87 changes: 87 additions & 0 deletions test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -433,6 +433,93 @@ public void Tokenize()
Assert.True(type.ItemType.AsKey.RawKind == DataKind.U2);
}

[Fact]
public void NormalizeTextAndRemoveStopWords()
{
    // Arrange: load label/text pairs from the wikipedia detox sample file.
    var env = new ConsoleEnvironment(seed: 0);
    var dataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
    var reader = TextLoader.CreateReader(env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadText(1)), hasHeader: true);
    var data = reader.Read(new MultiFileSource(dataPath));

    // Act: normalize the raw text and, separately, tokenize it and drop stopwords.
    var pipeline = data.MakeNewEstimator()
        .Append(row => (
            row.label,
            normalized_text: row.text.NormalizeText(),
            words_without_stopwords: row.text.TokenizeText().RemoveStopwords()));

    var transformed = pipeline.Fit(data).Transform(data);
    var schema = transformed.AsDynamic.Schema;

    // Assert: the stopword-free tokens are a variable-length vector of text items.
    Assert.True(schema.TryGetColumnIndex("words_without_stopwords", out int stopwordsCol));
    var colType = schema.GetColumnType(stopwordsCol);
    Assert.True(colType.IsVector && !colType.IsKnownSizeVector && colType.ItemType.IsText);

    // Assert: normalized text remains a single scalar text column.
    Assert.True(schema.TryGetColumnIndex("normalized_text", out int normTextCol));
    colType = schema.GetColumnType(normTextCol);
    Assert.True(colType.IsText && !colType.IsVector);
}

[Fact]
public void ConvertToWordBag()
{
    // Arrange: load label/text pairs from the wikipedia detox sample file.
    var env = new ConsoleEnvironment(seed: 0);
    var dataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
    var reader = TextLoader.CreateReader(env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadText(1)), hasHeader: true);
    var data = reader.Read(new MultiFileSource(dataPath));

    // Act: build both the dictionary-based and the hash-based bag-of-words columns.
    var pipeline = data.MakeNewEstimator()
        .Append(row => (
            row.label,
            bagofword: row.text.ToBagofWords(),
            bagofhashedword: row.text.ToBagofHashedWords()));

    var transformed = pipeline.Fit(data).Transform(data);
    var schema = transformed.AsDynamic.Schema;

    // Assert: both bag representations are fixed-size numeric vectors.
    Assert.True(schema.TryGetColumnIndex("bagofword", out int bagofwordCol));
    var colType = schema.GetColumnType(bagofwordCol);
    Assert.True(colType.IsVector && colType.IsKnownSizeVector && colType.ItemType.IsNumber);

    Assert.True(schema.TryGetColumnIndex("bagofhashedword", out int bagofhashedwordCol));
    colType = schema.GetColumnType(bagofhashedwordCol);
    Assert.True(colType.IsVector && colType.IsKnownSizeVector && colType.ItemType.IsNumber);
}

[Fact]
public void Ngrams()
{
    // Arrange: load label/text pairs from the wikipedia detox sample file.
    var env = new ConsoleEnvironment(seed: 0);
    var dataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
    var reader = TextLoader.CreateReader(env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadText(1)), hasHeader: true);
    var data = reader.Read(new MultiFileSource(dataPath));

    // Act: tokenize, map tokens to keys, then produce dictionary and hashed ngrams.
    var pipeline = data.MakeNewEstimator()
        .Append(row => (
            row.label,
            ngrams: row.text.TokenizeText().ToKey().ToNgrams(),
            ngramshash: row.text.TokenizeText().ToKey().ToNgramsHash()));

    var transformed = pipeline.Fit(data).Transform(data);
    var schema = transformed.AsDynamic.Schema;

    // Assert: both ngram representations are fixed-size numeric vectors.
    Assert.True(schema.TryGetColumnIndex("ngrams", out int ngramsCol));
    var colType = schema.GetColumnType(ngramsCol);
    Assert.True(colType.IsVector && colType.IsKnownSizeVector && colType.ItemType.IsNumber);

    Assert.True(schema.TryGetColumnIndex("ngramshash", out int ngramshashCol));
    colType = schema.GetColumnType(ngramshashCol);
    Assert.True(colType.IsVector && colType.IsKnownSizeVector && colType.ItemType.IsNumber);
}


[Fact]
public void LpGcNormAndWhitening()
Expand Down
109 changes: 109 additions & 0 deletions test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -88,5 +88,114 @@ public void TextTokenizationWorkout()
CheckEquality("Text", "tokenized.tsv");
Done();
}


[Fact]
public void TextNormalizationAndStopwordRemoverWorkout()
{
    string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

    // Valid input: column 1 loaded as text.
    var data = TextLoader.CreateReader(Env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadText(1)), hasHeader: true)
        .Read(new MultiFileSource(sentimentDataPath));

    // Invalid input: the same column loaded as float, to exercise schema validation.
    var invalidData = TextLoader.CreateReader(Env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadFloat(1)), hasHeader: true)
        .Read(new MultiFileSource(sentimentDataPath));

    // Normalize the text, tokenize it into words, then strip stopwords.
    var est = new TextNormalizer(Env, "text")
        .Append(new WordTokenizer(Env, "text", "words"))
        .Append(new StopwordRemover(Env, "words", "words_without_stopwords"));
    TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

    // Save the first four transformed rows and compare against the checked-in baseline.
    var outputPath = GetOutputPath("Text", "words_without_stopwords.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
        IDataView savedData = TakeFilter.Create(Env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
        savedData = new ChooseColumnsTransform(Env, savedData, "text", "words_without_stopwords");

        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
    }

    CheckEquality("Text", "words_without_stopwords.tsv");
    Done();
}

[Fact]
public void WordBagWorkout()
{
    string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

    // Valid input: column 1 loaded as text.
    var data = TextLoader.CreateReader(Env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadText(1)), hasHeader: true)
        .Read(new MultiFileSource(sentimentDataPath));

    // Invalid input: the same column loaded as float, to exercise schema validation.
    var invalidData = TextLoader.CreateReader(Env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadFloat(1)), hasHeader: true)
        .Read(new MultiFileSource(sentimentDataPath));

    // Build dictionary-based and hash-based bag-of-words columns from the text.
    var est = new WordBagEstimator(Env, "text", "bag_of_words")
        .Append(new WordHashBagEstimator(Env, "text", "bag_of_wordshash"));

    // The following call fails because of the following issue
    // https://github.com/dotnet/machinelearning/issues/969
    // TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

    // Save the first four transformed rows and compare against the checked-in baseline.
    var outputPath = GetOutputPath("Text", "bag_of_words.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
        IDataView savedData = TakeFilter.Create(Env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
        savedData = new ChooseColumnsTransform(Env, savedData, "text", "bag_of_words", "bag_of_wordshash");

        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
    }

    CheckEquality("Text", "bag_of_words.tsv");
    Done();
}

[Fact]
public void NgramWorkout()
{
    string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

    // Valid input: column 1 loaded as text.
    var data = TextLoader.CreateReader(Env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadText(1)), hasHeader: true)
        .Read(new MultiFileSource(sentimentDataPath));

    // Invalid input: the same column loaded as float, to exercise schema validation.
    var invalidData = TextLoader.CreateReader(Env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadFloat(1)), hasHeader: true)
        .Read(new MultiFileSource(sentimentDataPath));

    // Tokenize in place, map tokens to term keys, then produce dictionary and hashed ngrams.
    var est = new WordTokenizer(Env, "text", "text")
        .Append(new TermEstimator(Env, "text", "terms"))
        .Append(new NgramEstimator(Env, "terms", "ngrams"))
        .Append(new NgramHashEstimator(Env, "terms", "ngramshash"));

    // The following call fails because of the following issue
    // https://github.com/dotnet/machinelearning/issues/969
    // TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

    // Save the first four transformed rows and compare against the checked-in baseline.
    var outputPath = GetOutputPath("Text", "ngrams.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
        IDataView savedData = TakeFilter.Create(Env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
        savedData = new ChooseColumnsTransform(Env, savedData, "text", "terms", "ngrams", "ngramshash");

        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
    }

    CheckEquality("Text", "ngrams.tsv");
    Done();
}
}
}