Skip to content

Added tests for text featurizer options (Part2). #3036

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Mar 21, 2019
102 changes: 96 additions & 6 deletions test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ private class TestClass
{
public string A;
public string[] OutputTokens;
public float[] Features = null;
}

[Fact]
Expand All @@ -41,7 +42,7 @@ public void TextFeaturizerWithPredefinedStopWordRemoverTest()
var dataView = ML.Data.LoadFromEnumerable(data);

var options = new TextFeaturizingEstimator.Options() { StopWordsRemoverOptions = new StopWordsRemovingEstimator.Options(), OutputTokensColumnName = "OutputTokens" };
var pipeline = ML.Transforms.Text.FeaturizeText("OutputText", options, "A");
var pipeline = ML.Transforms.Text.FeaturizeText("Features", options, "A");
var model = pipeline.Fit(dataView);
var engine = model.CreatePredictionEngine<TestClass, TestClass>(ML);
var prediction = engine.Predict(data[0]);
Expand All @@ -51,6 +52,95 @@ public void TextFeaturizerWithPredefinedStopWordRemoverTest()
Assert.Equal("stop words", string.Join(" ", prediction.OutputTokens));
}

/// <summary>
/// Featurizes two sentences with only a unigram word extractor (char extractor disabled,
/// no normalization) and checks both the emitted tokens and the resulting feature vector.
/// </summary>
[Fact]
public void TextFeaturizerWithWordFeatureExtractorTest()
{
    var data = new[] { new TestClass() { A = "This is some text in english", OutputTokens = null },
                       new TestClass() { A = "This is another example", OutputTokens = null } };
    var dataView = ML.Data.LoadFromEnumerable(data);

    var options = new TextFeaturizingEstimator.Options()
    {
        WordFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 1 },
        CharFeatureExtractor = null,
        Norm = TextFeaturizingEstimator.NormFunction.None,
        OutputTokensColumnName = "OutputTokens"
    };
    var pipeline = ML.Transforms.Text.FeaturizeText("Features", options, "A");
    var model = pipeline.Fit(dataView);
    var engine = model.CreatePredictionEngine<TestClass, TestClass>(ML);

    // The two inputs contain eight distinct words in total ("this" and "is" are shared),
    // hence the eight-slot expected vectors below.
    var prediction = engine.Predict(data[0]);

    // Use the invariant culture so the expected token string does not depend on the
    // culture of the thread running the test.
    Assert.Equal(data[0].A.ToLowerInvariant(), string.Join(" ", prediction.OutputTokens));
    var expected = new float[] { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f, 0.0f };
    Assert.Equal(expected, prediction.Features);

    prediction = engine.Predict(data[1]);
    Assert.Equal(data[1].A.ToLowerInvariant(), string.Join(" ", prediction.OutputTokens));
    expected = new float[] { 1.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 1.0f };
    Assert.Equal(expected, prediction.Features);
}

[Fact]
public void TextFeaturizerWithCharFeatureExtractorTest()
{
var data = new[] { new TestClass() { A = "abc efg", OutputTokens=null},
new TestClass() { A = "xyz", OutputTokens=null } };
var dataView = ML.Data.LoadFromEnumerable(data);

var options = new TextFeaturizingEstimator.Options()
{
WordFeatureExtractor = null,
CharFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 1 },
Norm = TextFeaturizingEstimator.NormFunction.None,
OutputTokensColumnName = "OutputTokens"
};
var pipeline = ML.Transforms.Text.FeaturizeText("Features", options, "A");
var model = pipeline.Fit(dataView);
var engine = model.CreatePredictionEngine<TestClass, TestClass>(ML);

var prediction = engine.Predict(data[0]);
Assert.Equal(data[0].A, string.Join(" ", prediction.OutputTokens));
var expected = new float[] { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f, 0.0f, 0.0f };
Copy link
Member

@sfilipi sfilipi Mar 21, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

1.0f [](start = 89, length = 4)

Should this be 0, or is this the end marker? #Resolved

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's the end marker.


In reply to: 267967436 [](ancestors = 267967436)

Assert.Equal(expected, prediction.Features);

prediction = engine.Predict(data[1]);
Assert.Equal(data[1].A, string.Join(" ", prediction.OutputTokens));
expected = new float[] { 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 1.0f, 1.0f, 1.0f };
Assert.Equal(expected, prediction.Features);
}

[Fact]
public void TextFeaturizerWithL2NormTest()
{
var data = new[] { new TestClass() { A = "abc xyz", OutputTokens=null},
new TestClass() { A = "xyz", OutputTokens=null } };
var dataView = ML.Data.LoadFromEnumerable(data);

var options = new TextFeaturizingEstimator.Options()
{
CharFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 1},
Copy link
Member

@sfilipi sfilipi Mar 21, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CharFeatureExtractor = new WordBagEstimator.Options() [](start = 16, length = 54)

Is this correct? It doesn't read right to initialize a CharExtractor with the options of a WordBagEstimator... #Resolved

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice catch! I have added this as part of issue #2895.


In reply to: 267968148 [](ancestors = 267968148)

Norm = TextFeaturizingEstimator.NormFunction.L2,
OutputTokensColumnName = "OutputTokens"
};
var pipeline = ML.Transforms.Text.FeaturizeText("Features", options, "A");
var model = pipeline.Fit(dataView);
var engine = model.CreatePredictionEngine<TestClass, TestClass>(ML);

var prediction = engine.Predict(data[0]);
Assert.Equal(data[0].A, string.Join(" ", prediction.OutputTokens));
var exp1 = 0.333333343f;
var exp2 = 0.707106769f;
var expected = new float[] { exp1, exp1, exp1, exp1, exp1, exp1, exp1, exp1, exp1, exp2, exp2 };
Assert.Equal(expected, prediction.Features);

prediction = engine.Predict(data[1]);
exp1 = 0.4472136f;
Assert.Equal(data[1].A, string.Join(" ", prediction.OutputTokens));
expected = new float[] { exp1, 0.0f, 0.0f, 0.0f, 0.0f, exp1, exp1, exp1, exp1, 0.0f, 1.0f };
Assert.Equal(expected, prediction.Features);
}

[Fact]
public void TextFeaturizerWithCustomStopWordRemoverTest()
{
Expand All @@ -67,7 +157,7 @@ public void TextFeaturizerWithCustomStopWordRemoverTest()
OutputTokensColumnName = "OutputTokens",
CaseMode = TextNormalizingEstimator.CaseMode.None
};
var pipeline = ML.Transforms.Text.FeaturizeText("OutputText", options, "A");
var pipeline = ML.Transforms.Text.FeaturizeText("Features", options, "A");
var model = pipeline.Fit(dataView);
var engine = model.CreatePredictionEngine<TestClass, TestClass>(ML);
var prediction = engine.Predict(data[0]);
Expand All @@ -84,7 +174,7 @@ private void TestCaseMode(IDataView dataView, TestClass[] data, TextNormalizingE
CaseMode = caseMode,
OutputTokensColumnName = "OutputTokens"
};
var pipeline = ML.Transforms.Text.FeaturizeText("OutputText", options, "A");
var pipeline = ML.Transforms.Text.FeaturizeText("Features", options, "A");
var model = pipeline.Fit(dataView);
var engine = model.CreatePredictionEngine<TestClass, TestClass>(ML);
var prediction1 = engine.Predict(data[0]);
Expand Down Expand Up @@ -133,7 +223,7 @@ private void TestKeepNumbers(IDataView dataView, TestClass[] data, bool keepNumb
CaseMode = TextNormalizingEstimator.CaseMode.None,
OutputTokensColumnName = "OutputTokens"
};
var pipeline = ML.Transforms.Text.FeaturizeText("OutputText", options, "A");
var pipeline = ML.Transforms.Text.FeaturizeText("Features", options, "A");
var model = pipeline.Fit(dataView);
var engine = model.CreatePredictionEngine<TestClass, TestClass>(ML);
var prediction1 = engine.Predict(data[0]);
Expand Down Expand Up @@ -170,7 +260,7 @@ private void TestKeepPunctuations(IDataView dataView, TestClass[] data, bool kee
CaseMode = TextNormalizingEstimator.CaseMode.None,
OutputTokensColumnName = "OutputTokens"
};
var pipeline = ML.Transforms.Text.FeaturizeText("OutputText", options, "A");
var pipeline = ML.Transforms.Text.FeaturizeText("Features", options, "A");
var model = pipeline.Fit(dataView);
var engine = model.CreatePredictionEngine<TestClass, TestClass>(ML);
var prediction1 = engine.Predict(data[0]);
Expand Down Expand Up @@ -208,7 +298,7 @@ private void TestKeepDiacritics(IDataView dataView, TestClass[] data, bool keepD
CaseMode = TextNormalizingEstimator.CaseMode.None,
OutputTokensColumnName = "OutputTokens"
};
var pipeline = ML.Transforms.Text.FeaturizeText("OutputText", options, "A");
var pipeline = ML.Transforms.Text.FeaturizeText("Features", options, "A");
var model = pipeline.Fit(dataView);
var engine = model.CreatePredictionEngine<TestClass, TestClass>(ML);
var prediction1 = engine.Predict(data[0]);
Expand Down