-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Added tests for text featurizer options (Part2). #3036
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
1820e23
b81b73a
a7f01b8
5a10ab4
90d3a53
797b90f
a2062ec
1ebf44a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -31,6 +31,7 @@ private class TestClass | |
{ | ||
public string A; | ||
public string[] OutputTokens; | ||
public float[] Features = null; | ||
} | ||
|
||
[Fact] | ||
|
@@ -41,7 +42,7 @@ public void TextFeaturizerWithPredefinedStopWordRemoverTest() | |
var dataView = ML.Data.LoadFromEnumerable(data); | ||
|
||
var options = new TextFeaturizingEstimator.Options() { StopWordsRemoverOptions = new StopWordsRemovingEstimator.Options(), OutputTokensColumnName = "OutputTokens" }; | ||
var pipeline = ML.Transforms.Text.FeaturizeText("OutputText", options, "A"); | ||
var pipeline = ML.Transforms.Text.FeaturizeText("Features", options, "A"); | ||
var model = pipeline.Fit(dataView); | ||
var engine = model.CreatePredictionEngine<TestClass, TestClass>(ML); | ||
var prediction = engine.Predict(data[0]); | ||
|
@@ -51,6 +52,95 @@ public void TextFeaturizerWithPredefinedStopWordRemoverTest() | |
Assert.Equal("stop words", string.Join(" ", prediction.OutputTokens)); | ||
} | ||
|
||
/// <summary>
/// Featurizes two short sentences with the word extractor set to unigrams,
/// the char extractor disabled and normalization turned off, then checks both
/// the emitted tokens and the resulting feature vector for each row.
/// </summary>
[Fact]
public void TextFeaturizerWithWordFeatureExtractorTest()
{
    var data = new[] { new TestClass() { A = "This is some text in english", OutputTokens=null},
                       new TestClass() { A = "This is another example", OutputTokens=null } };
    var dataView = ML.Data.LoadFromEnumerable(data);

    var options = new TextFeaturizingEstimator.Options()
    {
        WordFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 1 },
        CharFeatureExtractor = null,
        Norm = TextFeaturizingEstimator.NormFunction.None,
        OutputTokensColumnName = "OutputTokens"
    };
    var pipeline = ML.Transforms.Text.FeaturizeText("Features", options, "A");
    var model = pipeline.Fit(dataView);
    var engine = model.CreatePredictionEngine<TestClass, TestClass>(ML);

    var prediction = engine.Predict(data[0]);
    // Lower-case with the invariant culture so the expected string does not
    // depend on the current culture of the machine running the test.
    Assert.Equal(data[0].A.ToLowerInvariant(), string.Join(" ", prediction.OutputTokens));
    // Eight slots in total; presumably one per distinct word seen across both
    // rows, in order of first appearance (TODO confirm against the transformer).
    var expected = new float[] { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f, 0.0f };
    Assert.Equal(expected, prediction.Features);

    prediction = engine.Predict(data[1]);
    Assert.Equal(data[1].A.ToLowerInvariant(), string.Join(" ", prediction.OutputTokens));
    // The second row shares its first two words with the first row and
    // contributes the last two slots.
    expected = new float[] { 1.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 1.0f };
    Assert.Equal(expected, prediction.Features);
}
|
||
/// <summary>
/// Runs the text featurizer with only the char extractor enabled (unigrams,
/// no normalization) over two rows and verifies the output tokens and the
/// twelve-slot feature vector produced for each of them.
/// </summary>
[Fact]
public void TextFeaturizerWithCharFeatureExtractorTest()
{
    var firstRow = new TestClass() { A = "abc efg", OutputTokens = null };
    var secondRow = new TestClass() { A = "xyz", OutputTokens = null };
    var rows = new[] { firstRow, secondRow };
    var input = ML.Data.LoadFromEnumerable(rows);

    // Word features are switched off so the vector holds char features only.
    var featurizerOptions = new TextFeaturizingEstimator.Options()
    {
        WordFeatureExtractor = null,
        CharFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 1 },
        Norm = TextFeaturizingEstimator.NormFunction.None,
        OutputTokensColumnName = "OutputTokens"
    };
    var predictor = ML.Transforms.Text
        .FeaturizeText("Features", featurizerOptions, "A")
        .Fit(input)
        .CreatePredictionEngine<TestClass, TestClass>(ML);

    // Row 0: the first nine slots are set, the last three are not.
    var firstPrediction = predictor.Predict(rows[0]);
    Assert.Equal(rows[0].A, string.Join(" ", firstPrediction.OutputTokens));
    var firstExpected = new float[] { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f, 0.0f, 0.0f };
    Assert.Equal(firstExpected, firstPrediction.Features);

    // Row 1: slot 0 and slots 8-11 are set. Per the review thread on this
    // change, the set slot preceding the last three is the end marker.
    var secondPrediction = predictor.Predict(rows[1]);
    Assert.Equal(rows[1].A, string.Join(" ", secondPrediction.OutputTokens));
    var secondExpected = new float[] { 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 1.0f, 1.0f, 1.0f };
    Assert.Equal(secondExpected, secondPrediction.Features);
}
|
||
/// <summary>
/// Runs the text featurizer with L2 normalization and a unigram char
/// extractor over two rows and verifies the output tokens and the
/// normalized feature values produced for each of them.
/// </summary>
[Fact]
public void TextFeaturizerWithL2NormTest()
{
    var data = new[] { new TestClass() { A = "abc xyz", OutputTokens=null},
                       new TestClass() { A = "xyz", OutputTokens=null } };
    var dataView = ML.Data.LoadFromEnumerable(data);

    var options = new TextFeaturizingEstimator.Options()
    {
        // NOTE(review): the char extractor is configured through
        // WordBagEstimator.Options. A reviewer asked whether that is correct;
        // the thread was marked resolved and the code was left as is.
        CharFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 1},
        Norm = TextFeaturizingEstimator.NormFunction.L2,
        OutputTokensColumnName = "OutputTokens"
    };
    var pipeline = ML.Transforms.Text.FeaturizeText("Features", options, "A");
    var model = pipeline.Fit(dataView);
    var engine = model.CreatePredictionEngine<TestClass, TestClass>(ML);

    var prediction = engine.Predict(data[0]);
    Assert.Equal(data[0].A, string.Join(" ", prediction.OutputTokens));
    var exp1 = 0.333333343f; // ~ 1/3, i.e. 1/sqrt(9)
    var exp2 = 0.707106769f; // ~ 1/sqrt(2)
    // Nine slots at exp1 followed by two at exp2; presumably the char block
    // and the word block are each normalized separately (TODO confirm).
    var expected = new float[] { exp1, exp1, exp1, exp1, exp1, exp1, exp1, exp1, exp1, exp2, exp2 };
    AssertFeaturesEqual(expected, prediction.Features);

    prediction = engine.Predict(data[1]);
    exp1 = 0.4472136f; // ~ 1/sqrt(5)
    Assert.Equal(data[1].A, string.Join(" ", prediction.OutputTokens));
    expected = new float[] { exp1, 0.0f, 0.0f, 0.0f, 0.0f, exp1, exp1, exp1, exp1, 0.0f, 1.0f };
    AssertFeaturesEqual(expected, prediction.Features);
}

/// <summary>
/// Asserts that two feature vectors have the same length and that every pair
/// of elements agrees to six decimal places. Normalized values are computed
/// floats, so bit-exact comparison could fail on a last-digit difference.
/// </summary>
/// <param name="expected">The expected feature values.</param>
/// <param name="actual">The feature values produced by the model.</param>
private static void AssertFeaturesEqual(float[] expected, float[] actual)
{
    Assert.NotNull(actual);
    Assert.Equal(expected.Length, actual.Length);
    for (int i = 0; i < expected.Length; i++)
    {
        // The third argument is the number of decimal places both values
        // are rounded to before being compared.
        Assert.Equal(expected[i], actual[i], 6);
    }
}
|
||
[Fact] | ||
public void TextFeaturizerWithCustomStopWordRemoverTest() | ||
{ | ||
|
@@ -67,7 +157,7 @@ public void TextFeaturizerWithCustomStopWordRemoverTest() | |
OutputTokensColumnName = "OutputTokens", | ||
CaseMode = TextNormalizingEstimator.CaseMode.None | ||
}; | ||
var pipeline = ML.Transforms.Text.FeaturizeText("OutputText", options, "A"); | ||
var pipeline = ML.Transforms.Text.FeaturizeText("Features", options, "A"); | ||
var model = pipeline.Fit(dataView); | ||
var engine = model.CreatePredictionEngine<TestClass, TestClass>(ML); | ||
var prediction = engine.Predict(data[0]); | ||
|
@@ -84,7 +174,7 @@ private void TestCaseMode(IDataView dataView, TestClass[] data, TextNormalizingE | |
CaseMode = caseMode, | ||
OutputTokensColumnName = "OutputTokens" | ||
}; | ||
var pipeline = ML.Transforms.Text.FeaturizeText("OutputText", options, "A"); | ||
var pipeline = ML.Transforms.Text.FeaturizeText("Features", options, "A"); | ||
var model = pipeline.Fit(dataView); | ||
var engine = model.CreatePredictionEngine<TestClass, TestClass>(ML); | ||
var prediction1 = engine.Predict(data[0]); | ||
|
@@ -133,7 +223,7 @@ private void TestKeepNumbers(IDataView dataView, TestClass[] data, bool keepNumb | |
CaseMode = TextNormalizingEstimator.CaseMode.None, | ||
OutputTokensColumnName = "OutputTokens" | ||
}; | ||
var pipeline = ML.Transforms.Text.FeaturizeText("OutputText", options, "A"); | ||
var pipeline = ML.Transforms.Text.FeaturizeText("Features", options, "A"); | ||
var model = pipeline.Fit(dataView); | ||
var engine = model.CreatePredictionEngine<TestClass, TestClass>(ML); | ||
var prediction1 = engine.Predict(data[0]); | ||
|
@@ -170,7 +260,7 @@ private void TestKeepPunctuations(IDataView dataView, TestClass[] data, bool kee | |
CaseMode = TextNormalizingEstimator.CaseMode.None, | ||
OutputTokensColumnName = "OutputTokens" | ||
}; | ||
var pipeline = ML.Transforms.Text.FeaturizeText("OutputText", options, "A"); | ||
var pipeline = ML.Transforms.Text.FeaturizeText("Features", options, "A"); | ||
var model = pipeline.Fit(dataView); | ||
var engine = model.CreatePredictionEngine<TestClass, TestClass>(ML); | ||
var prediction1 = engine.Predict(data[0]); | ||
|
@@ -208,7 +298,7 @@ private void TestKeepDiacritics(IDataView dataView, TestClass[] data, bool keepD | |
CaseMode = TextNormalizingEstimator.CaseMode.None, | ||
OutputTokensColumnName = "OutputTokens" | ||
}; | ||
var pipeline = ML.Transforms.Text.FeaturizeText("OutputText", options, "A"); | ||
var pipeline = ML.Transforms.Text.FeaturizeText("Features", options, "A"); | ||
var model = pipeline.Fit(dataView); | ||
var engine = model.CreatePredictionEngine<TestClass, TestClass>(ML); | ||
var prediction1 = engine.Predict(data[0]); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should this be 0, or is this the end marker? #Resolved
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's the end marker.
In reply to: 267967436 [](ancestors = 267967436)