From 1820e2367197574061f2d20e357728e1658f760c Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Mon, 18 Mar 2019 17:26:09 -0700 Subject: [PATCH 1/5] Added tests for text featurizer options. --- .../Text/TextFeaturizingEstimator.cs | 2 +- .../Transformers/TextFeaturizerTests.cs | 215 +++++++++++++++++- 2 files changed, 215 insertions(+), 2 deletions(-) diff --git a/src/Microsoft.ML.Transforms/Text/TextFeaturizingEstimator.cs b/src/Microsoft.ML.Transforms/Text/TextFeaturizingEstimator.cs index 0ebdc836aa..98577a058b 100644 --- a/src/Microsoft.ML.Transforms/Text/TextFeaturizingEstimator.cs +++ b/src/Microsoft.ML.Transforms/Text/TextFeaturizingEstimator.cs @@ -395,7 +395,7 @@ internal TextFeaturizingEstimator(IHostEnvironment env, string name, IEnumerable if (options != null) OptionalSettings = options; - _stopWordsRemover = null; + _stopWordsRemover = OptionalSettings.StopWordsRemover; _dictionary = null; _wordFeatureExtractor = OptionalSettings.WordFeatureExtractorFactory; _charFeatureExtractor = OptionalSettings.CharFeatureExtractorFactory; diff --git a/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs b/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs index 64a8a93f9d..6a5ef3cf5e 100644 --- a/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs @@ -1,9 +1,10 @@ -// Licensed to the .NET Foundation under one or more agreements. +// Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. using System; using System.IO; +using System.Text.RegularExpressions; using Microsoft.ML; using Microsoft.ML.Data; using Microsoft.ML.Data.IO; @@ -26,6 +27,218 @@ public TextFeaturizerTests(ITestOutputHelper helper) { } + private class TestClass + { + public string A; + [ColumnName("OutputText_TransformedText")] + public string[] OutputText; + } + + [Fact] + public void TextFeaturizerWithPredefinedStopWordRemoverTest() + { + var data = new[] { new TestClass() { A = "This is some text with english stop words", OutputText=null}, + new TestClass() { A = "No stop words", OutputText=null } }; + var dataView = ML.Data.LoadFromEnumerable(data); + + var options = new TextFeaturizingEstimator.Options() { StopWordsRemoverOptions = new StopWordsRemovingEstimator.Options(), OutputTokens = true }; + var pipeline = ML.Transforms.Text.FeaturizeText("OutputText", options, "A"); + var model = pipeline.Fit(dataView); + var engine = model.CreatePredictionEngine< TestClass, TestClass>(ML); + var prediction = engine.Predict(data[0]); + Assert.Equal("text english stop words", string.Join(" ", prediction.OutputText)); + + prediction = engine.Predict(data[1]); + Assert.Equal("stop words", string.Join(" ", prediction.OutputText)); + } + + [Fact] + public void TextFeaturizerWithCustomStopWordRemoverTest() + { + var data = new[] { new TestClass() { A = "This is some text with english stop words", OutputText=null}, + new TestClass() { A = "No stop words", OutputText=null } }; + var dataView = ML.Data.LoadFromEnumerable(data); + + var options = new TextFeaturizingEstimator.Options() + { + StopWordsRemoverOptions = new CustomStopWordsRemovingEstimator.Options() + { + StopWords = new[] { "stop", "words" } + }, + OutputTokens = true, + CaseMode = TextNormalizingEstimator.CaseMode.None + }; + var pipeline = ML.Transforms.Text.FeaturizeText("OutputText", options, "A"); + var model = pipeline.Fit(dataView); + var engine = model.CreatePredictionEngine(ML); + var prediction = engine.Predict(data[0]); + Assert.Equal("This is some text with english", string.Join(" ", prediction.OutputText)); + + prediction = engine.Predict(data[1]); + Assert.Equal("No", string.Join(" ", prediction.OutputText)); + } + + private void TestCaseMode(IDataView dataView, TestClass[] data, TextNormalizingEstimator.CaseMode caseMode) + { + var options = new TextFeaturizingEstimator.Options() + { + CaseMode = caseMode, + OutputTokens = true + }; + var pipeline = ML.Transforms.Text.FeaturizeText("OutputText", options, "A"); + var model = pipeline.Fit(dataView); + var engine = model.CreatePredictionEngine(ML); + var prediction1 = engine.Predict(data[0]); + var prediction2 = engine.Predict(data[1]); + + string expected1 = null; + string expected2 = null; + if (caseMode == TextNormalizingEstimator.CaseMode.Upper) + { + expected1 = data[0].A.ToUpper(); + expected2 = data[1].A.ToUpper(); + } + else if (caseMode == TextNormalizingEstimator.CaseMode.Lower) + { + expected1 = data[0].A.ToLower(); + expected2 = data[1].A.ToLower(); + } + else if (caseMode == TextNormalizingEstimator.CaseMode.None) + { + expected1 = data[0].A; + expected2 = data[1].A; + } + + Assert.Equal(expected1, string.Join(" ", prediction1.OutputText)); + Assert.Equal(expected2, string.Join(" ", prediction2.OutputText)); + } + + [Fact] + public void TextFeaturizerWithUpperCaseTest() + { + var data = new[] { new TestClass() { A = "This is some text with english stop words", OutputText=null}, + new TestClass() { A = "No stop words", OutputText=null } }; + var dataView = ML.Data.LoadFromEnumerable(data); + + TestCaseMode(dataView, data, TextNormalizingEstimator.CaseMode.Lower); + TestCaseMode(dataView, data, TextNormalizingEstimator.CaseMode.Upper); + TestCaseMode(dataView, data, TextNormalizingEstimator.CaseMode.None); + } + + + private void TestKeepNumbers(IDataView dataView, TestClass[] data, bool keepNumbers) + { + var options = new TextFeaturizingEstimator.Options() + { + KeepNumbers = keepNumbers, + CaseMode = TextNormalizingEstimator.CaseMode.None, + OutputTokens = true + }; + var pipeline = ML.Transforms.Text.FeaturizeText("OutputText", options, "A"); + var model = pipeline.Fit(dataView); + var engine = model.CreatePredictionEngine(ML); + var prediction1 = engine.Predict(data[0]); + var prediction2 = engine.Predict(data[1]); + + if (keepNumbers) + { + Assert.Equal(data[0].A, string.Join(" ", prediction1.OutputText)); + Assert.Equal(data[1].A, string.Join(" ", prediction2.OutputText)); + } + else + { + Assert.Equal(data[0].A.Replace("123 ", "").Replace("425", "").Replace("25", "").Replace("23", ""), string.Join(" ", prediction1.OutputText)); + Assert.Equal(data[1].A, string.Join(" ", prediction2.OutputText)); + } + } + + [Fact] + public void TextFeaturizerWithKeepNumbersTest() + { + var data = new[] { new TestClass() { A = "This is some text with numbers 123 $425 25.23", OutputText=null}, + new TestClass() { A = "No numbers", OutputText=null } }; + var dataView = ML.Data.LoadFromEnumerable(data); + + TestKeepNumbers(dataView, data, true); + TestKeepNumbers(dataView, data, false); + } + + private void TestKeepPunctuations(IDataView dataView, TestClass[] data, bool keepPunctuations) + { + var options = new TextFeaturizingEstimator.Options() + { + KeepPunctuations = keepPunctuations, + CaseMode = TextNormalizingEstimator.CaseMode.None, + OutputTokens = true + }; + var pipeline = ML.Transforms.Text.FeaturizeText("OutputText", options, "A"); + var model = pipeline.Fit(dataView); + var engine = model.CreatePredictionEngine(ML); + var prediction1 = engine.Predict(data[0]); + var prediction2 = engine.Predict(data[1]); + + if (keepPunctuations) + { + Assert.Equal(data[0].A, string.Join(" ", prediction1.OutputText)); + Assert.Equal(data[1].A, string.Join(" ", prediction2.OutputText)); + } + else + { + var expected = Regex.Replace(data[0].A, "[,|_|'|\"|;|\\.]", ""); + Assert.Equal(expected, string.Join(" ", prediction1.OutputText)); + Assert.Equal(data[1].A, string.Join(" ", prediction2.OutputText)); + } + } + + [Fact] + public void TextFeaturizerWithKeepPunctuationsTest() + { + var data = new[] { new TestClass() { A = "This, is; some_ ,text 'with\" punctuations.", OutputText=null}, + new TestClass() { A = "No punctuations", OutputText=null } }; + var dataView = ML.Data.LoadFromEnumerable(data); + + TestKeepPunctuations(dataView, data, true); + TestKeepPunctuations(dataView, data, false); + } + + private void TestKeepDiacritics(IDataView dataView, TestClass[] data, bool keepDiacritics) + { + var options = new TextFeaturizingEstimator.Options() + { + KeepDiacritics = keepDiacritics, + CaseMode = TextNormalizingEstimator.CaseMode.None, + OutputTokens = true + }; + var pipeline = ML.Transforms.Text.FeaturizeText("OutputText", options, "A"); + var model = pipeline.Fit(dataView); + var engine = model.CreatePredictionEngine(ML); + var prediction1 = engine.Predict(data[0]); + var prediction2 = engine.Predict(data[1]); + + if (keepDiacritics) + { + Assert.Equal(data[0].A, string.Join(" ", prediction1.OutputText)); + Assert.Equal(data[1].A, string.Join(" ", prediction2.OutputText)); + } + else + { + Assert.Equal("This is some text with diacritics", string.Join(" ", prediction1.OutputText)); + Assert.Equal(data[1].A, string.Join(" ", prediction2.OutputText)); + } + } + + [Fact] + public void TextFeaturizerWithKeepDiacriticsTest() + { + var data = new[] { new TestClass() { A = "Thîs îs sóme text with diácrîtîcs", OutputText=null}, + new TestClass() { A = "No diacritics", OutputText=null } }; + var dataView = ML.Data.LoadFromEnumerable(data); + + TestKeepDiacritics(dataView, data, true); + TestKeepDiacritics(dataView, data, false); + } + + [Fact] public void TextFeaturizerWorkout() { From a7f01b84054aa3e45b0cfc2ca94ef37b37037df4 Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Tue, 19 Mar 2019 11:39:56 -0700 Subject: [PATCH 2/5] Addressed reviewers' comments. --- .../Transformers/TextFeaturizerTests.cs | 79 +++++++++---------- 1 file changed, 39 insertions(+), 40 deletions(-) diff --git a/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs b/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs index de824bfd49..e0f32bcaa1 100644 --- a/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs @@ -30,33 +30,32 @@ public TextFeaturizerTests(ITestOutputHelper helper) private class TestClass { public string A; - [ColumnName("OutputText_TransformedText")] - public string[] OutputText; + public string[] OutputTokens; } - [Fact] + [Fact] public void TextFeaturizerWithPredefinedStopWordRemoverTest() { - var data = new[] { new TestClass() { A = "This is some text with english stop words", OutputText=null}, - new TestClass() { A = "No stop words", OutputText=null } }; + var data = new[] { new TestClass() { A = "This is some text with english stop words", OutputTokens=null}, + new TestClass() { A = "No stop words", OutputTokens=null } }; var dataView = ML.Data.LoadFromEnumerable(data); - var options = new TextFeaturizingEstimator.Options() { StopWordsRemoverOptions = new StopWordsRemovingEstimator.Options(), OutputTokens = true }; + var options = new TextFeaturizingEstimator.Options() { StopWordsRemoverOptions = new StopWordsRemovingEstimator.Options(), OutputTokensColumnName = "OutputTokens" }; var pipeline = ML.Transforms.Text.FeaturizeText("OutputText", options, "A"); var model = pipeline.Fit(dataView); - var engine = model.CreatePredictionEngine< TestClass, TestClass>(ML); + var engine = model.CreatePredictionEngine(ML); var prediction = engine.Predict(data[0]); - Assert.Equal("text english stop words", string.Join(" ", prediction.OutputText)); + Assert.Equal("text english stop words", string.Join(" ", prediction.OutputTokens)); prediction = engine.Predict(data[1]); - Assert.Equal("stop words", string.Join(" ", prediction.OutputText)); + Assert.Equal("stop words", string.Join(" ", prediction.OutputTokens)); } [Fact] public void TextFeaturizerWithCustomStopWordRemoverTest() { - var data = new[] { new TestClass() { A = "This is some text with english stop words", OutputText=null}, - new TestClass() { A = "No stop words", OutputText=null } }; + var data = new[] { new TestClass() { A = "This is some text with english stop words", OutputTokens=null}, + new TestClass() { A = "No stop words", OutputTokens=null } }; var dataView = ML.Data.LoadFromEnumerable(data); var options = new TextFeaturizingEstimator.Options() @@ -65,17 +64,17 @@ public void TextFeaturizerWithCustomStopWordRemoverTest() { StopWords = new[] { "stop", "words" } }, - OutputTokens = true, + OutputTokensColumnName = "OutputTokens", CaseMode = TextNormalizingEstimator.CaseMode.None }; var pipeline = ML.Transforms.Text.FeaturizeText("OutputText", options, "A"); var model = pipeline.Fit(dataView); var engine = model.CreatePredictionEngine(ML); var prediction = engine.Predict(data[0]); - Assert.Equal("This is some text with english", string.Join(" ", prediction.OutputText)); + Assert.Equal("This is some text with english", string.Join(" ", prediction.OutputTokens)); prediction = engine.Predict(data[1]); - Assert.Equal("No", string.Join(" ", prediction.OutputText)); + Assert.Equal("No", string.Join(" ", prediction.OutputTokens)); } private void TestCaseMode(IDataView dataView, TestClass[] data, TextNormalizingEstimator.CaseMode caseMode) @@ -83,7 +82,7 @@ private void TestCaseMode(IDataView dataView, TestClass[] data, TextNormalizingE var options = new TextFeaturizingEstimator.Options() { CaseMode = caseMode, - OutputTokens = true + OutputTokensColumnName = "OutputTokens" }; var pipeline = ML.Transforms.Text.FeaturizeText("OutputText", options, "A"); var model = pipeline.Fit(dataView); @@ -109,15 +108,15 @@ private void TestCaseMode(IDataView dataView, TestClass[] data, TextNormalizingE expected2 = data[1].A; } - Assert.Equal(expected1, string.Join(" ", prediction1.OutputText)); - Assert.Equal(expected2, string.Join(" ", prediction2.OutputText)); + Assert.Equal(expected1, string.Join(" ", prediction1.OutputTokens)); + Assert.Equal(expected2, string.Join(" ", prediction2.OutputTokens)); } [Fact] public void TextFeaturizerWithUpperCaseTest() { - var data = new[] { new TestClass() { A = "This is some text with english stop words", OutputText=null}, - new TestClass() { A = "No stop words", OutputText=null } }; + var data = new[] { new TestClass() { A = "This is some text with english stop words", OutputTokens=null}, + new TestClass() { A = "No stop words", OutputTokens=null } }; var dataView = ML.Data.LoadFromEnumerable(data); TestCaseMode(dataView, data, TextNormalizingEstimator.CaseMode.Lower); @@ -132,7 +131,7 @@ private void TestKeepNumbers(IDataView dataView, TestClass[] data, bool keepNumb { KeepNumbers = keepNumbers, CaseMode = TextNormalizingEstimator.CaseMode.None, - OutputTokens = true + OutputTokensColumnName = "OutputTokens" }; var pipeline = ML.Transforms.Text.FeaturizeText("OutputText", options, "A"); var model = pipeline.Fit(dataView); @@ -142,21 +141,21 @@ private void TestKeepNumbers(IDataView dataView, TestClass[] data, bool keepNumb if (keepNumbers) { - Assert.Equal(data[0].A, string.Join(" ", prediction1.OutputText)); - Assert.Equal(data[1].A, string.Join(" ", prediction2.OutputText)); + Assert.Equal(data[0].A, string.Join(" ", prediction1.OutputTokens)); + Assert.Equal(data[1].A, string.Join(" ", prediction2.OutputTokens)); } else { - Assert.Equal(data[0].A.Replace("123 ", "").Replace("425", "").Replace("25", "").Replace("23", ""), string.Join(" ", prediction1.OutputText)); - Assert.Equal(data[1].A, string.Join(" ", prediction2.OutputText)); + Assert.Equal(data[0].A.Replace("123 ", "").Replace("425", "").Replace("25", "").Replace("23", ""), string.Join(" ", prediction1.OutputTokens)); + Assert.Equal(data[1].A, string.Join(" ", prediction2.OutputTokens)); } } [Fact] public void TextFeaturizerWithKeepNumbersTest() { - var data = new[] { new TestClass() { A = "This is some text with numbers 123 $425 25.23", OutputText=null}, - new TestClass() { A = "No numbers", OutputText=null } }; + var data = new[] { new TestClass() { A = "This is some text with numbers 123 $425 25.23", OutputTokens=null}, + new TestClass() { A = "No numbers", OutputTokens=null } }; var dataView = ML.Data.LoadFromEnumerable(data); TestKeepNumbers(dataView, data, true); @@ -169,7 +168,7 @@ private void TestKeepPunctuations(IDataView dataView, TestClass[] data, bool kee { KeepPunctuations = keepPunctuations, CaseMode = TextNormalizingEstimator.CaseMode.None, - OutputTokens = true + OutputTokensColumnName = "OutputTokens" }; var pipeline = ML.Transforms.Text.FeaturizeText("OutputText", options, "A"); var model = pipeline.Fit(dataView); @@ -179,22 +178,22 @@ private void TestKeepPunctuations(IDataView dataView, TestClass[] data, bool kee if (keepPunctuations) { - Assert.Equal(data[0].A, string.Join(" ", prediction1.OutputText)); - Assert.Equal(data[1].A, string.Join(" ", prediction2.OutputText)); + Assert.Equal(data[0].A, string.Join(" ", prediction1.OutputTokens)); + Assert.Equal(data[1].A, string.Join(" ", prediction2.OutputTokens)); } else { var expected = Regex.Replace(data[0].A, "[,|_|'|\"|;|\\.]", ""); - Assert.Equal(expected, string.Join(" ", prediction1.OutputText)); - Assert.Equal(data[1].A, string.Join(" ", prediction2.OutputText)); + Assert.Equal(expected, string.Join(" ", prediction1.OutputTokens)); + Assert.Equal(data[1].A, string.Join(" ", prediction2.OutputTokens)); } } [Fact] public void TextFeaturizerWithKeepPunctuationsTest() { - var data = new[] { new TestClass() { A = "This, is; some_ ,text 'with\" punctuations.", OutputText=null}, - new TestClass() { A = "No punctuations", OutputText=null } }; + var data = new[] { new TestClass() { A = "This, is; some_ ,text 'with\" punctuations.", OutputTokens=null}, + new TestClass() { A = "No punctuations", OutputTokens=null } }; var dataView = ML.Data.LoadFromEnumerable(data); TestKeepPunctuations(dataView, data, true); @@ -207,7 +206,7 @@ private void TestKeepDiacritics(IDataView dataView, TestClass[] data, bool keepD { KeepDiacritics = keepDiacritics, CaseMode = TextNormalizingEstimator.CaseMode.None, - OutputTokens = true + OutputTokensColumnName = "OutputTokens" }; var pipeline = ML.Transforms.Text.FeaturizeText("OutputText", options, "A"); var model = pipeline.Fit(dataView); @@ -217,21 +216,21 @@ private void TestKeepDiacritics(IDataView dataView, TestClass[] data, bool keepD if (keepDiacritics) { - Assert.Equal(data[0].A, string.Join(" ", prediction1.OutputText)); - Assert.Equal(data[1].A, string.Join(" ", prediction2.OutputText)); + Assert.Equal(data[0].A, string.Join(" ", prediction1.OutputTokens)); + Assert.Equal(data[1].A, string.Join(" ", prediction2.OutputTokens)); } else { - Assert.Equal("This is some text with diacritics", string.Join(" ", prediction1.OutputText)); - Assert.Equal(data[1].A, string.Join(" ", prediction2.OutputText)); + Assert.Equal("This is some text with diacritics", string.Join(" ", prediction1.OutputTokens)); + Assert.Equal(data[1].A, string.Join(" ", prediction2.OutputTokens)); } } [Fact] public void TextFeaturizerWithKeepDiacriticsTest() { - var data = new[] { new TestClass() { A = "Thîs îs sóme text with diácrîtîcs", OutputText=null}, - new TestClass() { A = "No diacritics", OutputText=null } }; + var data = new[] { new TestClass() { A = "Thîs îs sóme text with diácrîtîcs", OutputTokens=null}, + new TestClass() { A = "No diacritics", OutputTokens=null } }; var dataView = ML.Data.LoadFromEnumerable(data); TestKeepDiacritics(dataView, data, true); From 90d3a531123cf70b5da5ecf89e155e6b314d0346 Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Tue, 19 Mar 2019 16:16:05 -0700 Subject: [PATCH 3/5] Added tests for word and character ngram feature extraction using text featurizer. --- .../Transformers/TextFeaturizerTests.cs | 146 +++++++++++++++++- 1 file changed, 140 insertions(+), 6 deletions(-) diff --git a/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs b/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs index e0f32bcaa1..ae083ca61b 100644 --- a/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs @@ -31,6 +31,7 @@ private class TestClass { public string A; public string[] OutputTokens; + public float[] Features = null; } [Fact] @@ -41,7 +42,7 @@ public void TextFeaturizerWithPredefinedStopWordRemoverTest() var dataView = ML.Data.LoadFromEnumerable(data); var options = new TextFeaturizingEstimator.Options() { StopWordsRemoverOptions = new StopWordsRemovingEstimator.Options(), OutputTokensColumnName = "OutputTokens" }; - var pipeline = ML.Transforms.Text.FeaturizeText("OutputText", options, "A"); + var pipeline = ML.Transforms.Text.FeaturizeText("Features", options, "A"); var model = pipeline.Fit(dataView); var engine = model.CreatePredictionEngine(ML); var prediction = engine.Predict(data[0]); @@ -51,6 +52,139 @@ public void TextFeaturizerWithPredefinedStopWordRemoverTest() Assert.Equal("stop words", string.Join(" ", prediction.OutputTokens)); } + [Fact] + public void TextFeaturizerWithWordFeatureExtractorTest() + { + var data = new[] { new TestClass() { A = "This is some text in english", OutputTokens=null}, + new TestClass() { A = "This is another example", OutputTokens=null } }; + var dataView = ML.Data.LoadFromEnumerable(data); + + var options = new TextFeaturizingEstimator.Options() + { + WordFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 1 }, + CharFeatureExtractor = null, + Norm = TextFeaturizingEstimator.NormFunction.None, + OutputTokensColumnName = "OutputTokens" + }; + var pipeline = ML.Transforms.Text.FeaturizeText("Features", options, "A"); + var model = pipeline.Fit(dataView); + var engine = model.CreatePredictionEngine(ML); + + var prediction = engine.Predict(data[0]); + Assert.Equal("this is some text in english", string.Join(" ", prediction.OutputTokens)); + Assert.Equal(1.0f, prediction.Features[0]); + Assert.Equal(1.0f, prediction.Features[1]); + Assert.Equal(1.0f, prediction.Features[3]); + Assert.Equal(1.0f, prediction.Features[4]); + Assert.Equal(1.0f, prediction.Features[5]); + Assert.Equal(0.0f, prediction.Features[6]); + Assert.Equal(0.0f, prediction.Features[7]); + + prediction = engine.Predict(data[1]); + Assert.Equal("this is another example", string.Join(" ", prediction.OutputTokens)); + Assert.Equal(1.0f, prediction.Features[0]); + Assert.Equal(1.0f, prediction.Features[1]); + Assert.Equal(0.0f, prediction.Features[3]); + Assert.Equal(0.0f, prediction.Features[4]); + Assert.Equal(0.0f, prediction.Features[5]); + Assert.Equal(1.0f, prediction.Features[6]); + Assert.Equal(1.0f, prediction.Features[7]); + } + + [Fact] + public void TextFeaturizerWithCharFeatureExtractorTest() + { + var data = new[] { new TestClass() { A = "abc efg", OutputTokens=null}, + new TestClass() { A = "xyz", OutputTokens=null } }; + var dataView = ML.Data.LoadFromEnumerable(data); + + var options = new TextFeaturizingEstimator.Options() + { + WordFeatureExtractor = null, + CharFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 1 }, + Norm = TextFeaturizingEstimator.NormFunction.None, + OutputTokensColumnName = "OutputTokens" + }; + var pipeline = ML.Transforms.Text.FeaturizeText("Features", options, "A"); + var model = pipeline.Fit(dataView); + var engine = model.CreatePredictionEngine(ML); + + var prediction = engine.Predict(data[0]); + Assert.Equal("abc efg", string.Join(" ", prediction.OutputTokens)); + Assert.Equal(1.0f, prediction.Features[0]); + Assert.Equal(1.0f, prediction.Features[1]); + Assert.Equal(1.0f, prediction.Features[3]); + Assert.Equal(1.0f, prediction.Features[4]); + Assert.Equal(1.0f, prediction.Features[5]); + Assert.Equal(1.0f, prediction.Features[6]); + Assert.Equal(1.0f, prediction.Features[7]); + Assert.Equal(1.0f, prediction.Features[8]); + Assert.Equal(0.0f, prediction.Features[9]); + Assert.Equal(0.0f, prediction.Features[10]); + Assert.Equal(0.0f, prediction.Features[11]); + + prediction = engine.Predict(data[1]); + Assert.Equal("xyz", string.Join(" ", prediction.OutputTokens)); + Assert.Equal(1.0f, prediction.Features[0]); + Assert.Equal(0.0f, prediction.Features[1]); + Assert.Equal(0.0f, prediction.Features[3]); + Assert.Equal(0.0f, prediction.Features[4]); + Assert.Equal(0.0f, prediction.Features[5]); + Assert.Equal(0.0f, prediction.Features[6]); + Assert.Equal(0.0f, prediction.Features[7]); + Assert.Equal(1.0f, prediction.Features[8]); + Assert.Equal(1.0f, prediction.Features[9]); + Assert.Equal(1.0f, prediction.Features[10]); + Assert.Equal(1.0f, prediction.Features[11]); + } + + [Fact] + public void TextFeaturizerWithL2NormTest() + { + var data = new[] { new TestClass() { A = "abc xyz", OutputTokens=null}, + new TestClass() { A = "xyz", OutputTokens=null } }; + var dataView = ML.Data.LoadFromEnumerable(data); + + var options = new TextFeaturizingEstimator.Options() + { + CharFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 1}, + Norm = TextFeaturizingEstimator.NormFunction.L2, + OutputTokensColumnName = "OutputTokens" + }; + var pipeline = ML.Transforms.Text.FeaturizeText("Features", options, "A"); + var model = pipeline.Fit(dataView); + var engine = model.CreatePredictionEngine(ML); + + var prediction = engine.Predict(data[0]); + Assert.Equal("abc xyz", string.Join(" ", prediction.OutputTokens)); + var expected1 = 0.333333343f; + var expected2 = 0.707106769f; + Assert.Equal(expected1, prediction.Features[0], 4); + Assert.Equal(expected1, prediction.Features[1], 4); + Assert.Equal(expected1, prediction.Features[3], 4); + Assert.Equal(expected1, prediction.Features[4], 4); + Assert.Equal(expected1, prediction.Features[5], 4); + Assert.Equal(expected1, prediction.Features[6], 4); + Assert.Equal(expected1, prediction.Features[7], 4); + Assert.Equal(expected1, prediction.Features[8], 4); + Assert.Equal(expected2, prediction.Features[9], 4); + Assert.Equal(expected2, prediction.Features[10], 4); + + prediction = engine.Predict(data[1]); + expected1 = 0.4472136f; + Assert.Equal("xyz", string.Join(" ", prediction.OutputTokens)); + Assert.Equal(expected1, prediction.Features[0], 4); + Assert.Equal(0.0f, prediction.Features[1], 4); + Assert.Equal(0.0f, prediction.Features[3], 4); + Assert.Equal(0.0f, prediction.Features[4], 4); + Assert.Equal(expected1, prediction.Features[5], 4); + Assert.Equal(expected1, prediction.Features[6], 4); + Assert.Equal(expected1, prediction.Features[7], 4); + Assert.Equal(expected1, prediction.Features[8], 4); + Assert.Equal(0.0f, prediction.Features[9], 4); + Assert.Equal(1.0f, prediction.Features[10], 4); + } + [Fact] public void TextFeaturizerWithCustomStopWordRemoverTest() { @@ -67,7 +201,7 @@ public void TextFeaturizerWithCustomStopWordRemoverTest() OutputTokensColumnName = "OutputTokens", CaseMode = TextNormalizingEstimator.CaseMode.None }; - var pipeline = ML.Transforms.Text.FeaturizeText("OutputText", options, "A"); + var pipeline = ML.Transforms.Text.FeaturizeText("Features", options, "A"); var model = pipeline.Fit(dataView); var engine = model.CreatePredictionEngine(ML); var prediction = engine.Predict(data[0]); @@ -84,7 +218,7 @@ private void TestCaseMode(IDataView dataView, TestClass[] data, TextNormalizingE CaseMode = caseMode, OutputTokensColumnName = "OutputTokens" }; - var pipeline = ML.Transforms.Text.FeaturizeText("OutputText", options, "A"); + var pipeline = ML.Transforms.Text.FeaturizeText("Features", options, "A"); var model = pipeline.Fit(dataView); var engine = model.CreatePredictionEngine(ML); var prediction1 = engine.Predict(data[0]); @@ -133,7 +267,7 @@ private void TestKeepNumbers(IDataView dataView, TestClass[] data, bool keepNumb CaseMode = TextNormalizingEstimator.CaseMode.None, OutputTokensColumnName = "OutputTokens" }; - var pipeline = ML.Transforms.Text.FeaturizeText("OutputText", options, "A"); + var pipeline = ML.Transforms.Text.FeaturizeText("Features", options, "A"); var model = pipeline.Fit(dataView); var engine = model.CreatePredictionEngine(ML); var prediction1 = engine.Predict(data[0]); @@ -170,7 +304,7 @@ private void TestKeepPunctuations(IDataView dataView, TestClass[] data, bool kee CaseMode = TextNormalizingEstimator.CaseMode.None, OutputTokensColumnName = "OutputTokens" }; - var pipeline = ML.Transforms.Text.FeaturizeText("OutputText", options, "A"); + var pipeline = ML.Transforms.Text.FeaturizeText("Features", options, "A"); var model = pipeline.Fit(dataView); var engine = model.CreatePredictionEngine(ML); var prediction1 = engine.Predict(data[0]); @@ -208,7 +342,7 @@ private void TestKeepDiacritics(IDataView dataView, TestClass[] data, bool keepD CaseMode = TextNormalizingEstimator.CaseMode.None, OutputTokensColumnName = "OutputTokens" }; - var pipeline = ML.Transforms.Text.FeaturizeText("OutputText", options, "A"); + var pipeline = ML.Transforms.Text.FeaturizeText("Features", options, "A"); var model = pipeline.Fit(dataView); var engine = model.CreatePredictionEngine(ML); var prediction1 = engine.Predict(data[0]); From a2062ec19b506be55c30f36ee70e329da88be893 Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Wed, 20 Mar 2019 15:15:27 -0700 Subject: [PATCH 4/5] Addressed reviewers' comments. --- .../Transformers/TextFeaturizerTests.cs | 74 ++++--------------- 1 file changed, 15 insertions(+), 59 deletions(-) diff --git a/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs b/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs index ae083ca61b..1e7ddebf4f 100644 --- a/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs @@ -72,23 +72,13 @@ public void TextFeaturizerWithWordFeatureExtractorTest() var prediction = engine.Predict(data[0]); Assert.Equal("this is some text in english", string.Join(" ", prediction.OutputTokens)); - Assert.Equal(1.0f, prediction.Features[0]); - Assert.Equal(1.0f, prediction.Features[1]); - Assert.Equal(1.0f, prediction.Features[3]); - Assert.Equal(1.0f, prediction.Features[4]); - Assert.Equal(1.0f, prediction.Features[5]); - Assert.Equal(0.0f, prediction.Features[6]); - Assert.Equal(0.0f, prediction.Features[7]); + var expected = new float[] { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f, 0.0f }; + Assert.Equal(expected, prediction.Features); prediction = engine.Predict(data[1]); Assert.Equal("this is another example", string.Join(" ", prediction.OutputTokens)); - Assert.Equal(1.0f, prediction.Features[0]); - Assert.Equal(1.0f, prediction.Features[1]); - Assert.Equal(0.0f, prediction.Features[3]); - Assert.Equal(0.0f, prediction.Features[4]); - Assert.Equal(0.0f, prediction.Features[5]); - Assert.Equal(1.0f, prediction.Features[6]); - Assert.Equal(1.0f, prediction.Features[7]); + expected = new float[] { 1.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 1.0f }; + Assert.Equal(expected, prediction.Features); } [Fact] @@ -111,31 +101,13 @@ public void TextFeaturizerWithCharFeatureExtractorTest() var prediction = engine.Predict(data[0]); Assert.Equal("abc efg", string.Join(" ", prediction.OutputTokens)); - Assert.Equal(1.0f, prediction.Features[0]); - Assert.Equal(1.0f, prediction.Features[1]); - Assert.Equal(1.0f, prediction.Features[3]); - Assert.Equal(1.0f, prediction.Features[4]); - Assert.Equal(1.0f, prediction.Features[5]); - Assert.Equal(1.0f, prediction.Features[6]); - Assert.Equal(1.0f, prediction.Features[7]); - Assert.Equal(1.0f, prediction.Features[8]); - Assert.Equal(0.0f, prediction.Features[9]); - Assert.Equal(0.0f, prediction.Features[10]); - Assert.Equal(0.0f, prediction.Features[11]); + var expected = new float[] { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f, 0.0f, 0.0f }; + Assert.Equal(expected, prediction.Features); prediction = engine.Predict(data[1]); Assert.Equal("xyz", string.Join(" ", prediction.OutputTokens)); - Assert.Equal(1.0f, prediction.Features[0]); - Assert.Equal(0.0f, prediction.Features[1]); - Assert.Equal(0.0f, prediction.Features[3]); - Assert.Equal(0.0f, prediction.Features[4]); - Assert.Equal(0.0f, prediction.Features[5]); - Assert.Equal(0.0f, prediction.Features[6]); - Assert.Equal(0.0f, prediction.Features[7]); - Assert.Equal(1.0f, prediction.Features[8]); - Assert.Equal(1.0f, prediction.Features[9]); - Assert.Equal(1.0f, prediction.Features[10]); - Assert.Equal(1.0f, prediction.Features[11]); + expected = new float[] { 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 1.0f, 1.0f, 1.0f }; + Assert.Equal(expected, prediction.Features); } [Fact] @@ -157,32 +129,16 @@ public void TextFeaturizerWithL2NormTest() var prediction = engine.Predict(data[0]); Assert.Equal("abc xyz", string.Join(" ", prediction.OutputTokens)); - var expected1 = 0.333333343f; - var expected2 = 0.707106769f; - Assert.Equal(expected1, prediction.Features[0], 4); - Assert.Equal(expected1, prediction.Features[1], 4); - Assert.Equal(expected1, prediction.Features[3], 4); - Assert.Equal(expected1, prediction.Features[4], 4); - Assert.Equal(expected1, prediction.Features[5], 4); - Assert.Equal(expected1, prediction.Features[6], 4); - Assert.Equal(expected1, prediction.Features[7], 4); - Assert.Equal(expected1, prediction.Features[8], 4); - Assert.Equal(expected2, prediction.Features[9], 4); - Assert.Equal(expected2, prediction.Features[10], 4); + var exp1 = 0.333333343f; + var exp2 = 0.707106769f; + var expected = new float[] { exp1, exp1, exp1, exp1, exp1, exp1, exp1, exp1, exp1, exp2, exp2 }; + Assert.Equal(expected, prediction.Features); prediction = engine.Predict(data[1]); - expected1 = 0.4472136f; + exp1 = 0.4472136f; Assert.Equal("xyz", string.Join(" ", prediction.OutputTokens)); - Assert.Equal(expected1, prediction.Features[0], 4); - Assert.Equal(0.0f, prediction.Features[1], 4); - Assert.Equal(0.0f, prediction.Features[3], 4); - Assert.Equal(0.0f, prediction.Features[4], 4); - Assert.Equal(expected1, prediction.Features[5], 4); - Assert.Equal(expected1, prediction.Features[6], 4); - Assert.Equal(expected1, prediction.Features[7], 4); - Assert.Equal(expected1, prediction.Features[8], 4); - Assert.Equal(0.0f, prediction.Features[9], 4); - Assert.Equal(1.0f, prediction.Features[10], 4); + expected = new float[] { exp1, 0.0f, 0.0f, 0.0f, 0.0f, exp1, exp1, exp1, exp1, 0.0f, 1.0f }; + Assert.Equal(expected, prediction.Features); } [Fact] From 1ebf44a517e9c0671a820caa478640e0d887bf8c Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Thu, 21 Mar 2019 15:16:22 -0700 Subject: [PATCH 5/5] Addressed reviewers' comments. --- .../Transformers/TextFeaturizerTests.cs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs b/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs index 1e7ddebf4f..6604961c57 100644 --- a/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs @@ -71,12 +71,12 @@ public void TextFeaturizerWithWordFeatureExtractorTest() var engine = model.CreatePredictionEngine(ML); var prediction = engine.Predict(data[0]); - Assert.Equal("this is some text in english", string.Join(" ", prediction.OutputTokens)); + Assert.Equal(data[0].A.ToLower(), string.Join(" ", prediction.OutputTokens)); var expected = new float[] { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f, 0.0f }; Assert.Equal(expected, prediction.Features); prediction = engine.Predict(data[1]); - Assert.Equal("this is another example", string.Join(" ", prediction.OutputTokens)); + Assert.Equal(data[1].A.ToLower(), string.Join(" ", prediction.OutputTokens)); expected = new float[] { 1.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 1.0f }; Assert.Equal(expected, prediction.Features); } @@ -100,12 +100,12 @@ public void TextFeaturizerWithCharFeatureExtractorTest() var engine = model.CreatePredictionEngine(ML); var prediction = engine.Predict(data[0]); - Assert.Equal("abc efg", string.Join(" ", prediction.OutputTokens)); + Assert.Equal(data[0].A, string.Join(" ", prediction.OutputTokens)); var expected = new float[] { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f, 0.0f, 0.0f }; Assert.Equal(expected, prediction.Features); prediction = engine.Predict(data[1]); - Assert.Equal("xyz", string.Join(" ", prediction.OutputTokens)); + Assert.Equal(data[1].A, string.Join(" ", prediction.OutputTokens)); expected = new float[] { 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 1.0f, 1.0f, 1.0f }; Assert.Equal(expected, prediction.Features); } @@ -128,7 +128,7 @@ public void TextFeaturizerWithL2NormTest() var engine = model.CreatePredictionEngine(ML); var prediction = engine.Predict(data[0]); - Assert.Equal("abc xyz", string.Join(" ", prediction.OutputTokens)); + Assert.Equal(data[0].A, string.Join(" ", prediction.OutputTokens)); var exp1 = 0.333333343f; var exp2 = 0.707106769f; var expected = new float[] { exp1, exp1, exp1, exp1, exp1, exp1, exp1, exp1, exp1, exp2, exp2 }; @@ -136,7 +136,7 @@ public void TextFeaturizerWithL2NormTest() prediction = engine.Predict(data[1]); exp1 = 0.4472136f; - Assert.Equal("xyz", string.Join(" ", prediction.OutputTokens)); + Assert.Equal(data[1].A, string.Join(" ", prediction.OutputTokens)); expected = new float[] { exp1, 0.0f, 0.0f, 0.0f, 0.0f, exp1, exp1, exp1, exp1, 0.0f, 1.0f }; Assert.Equal(expected, prediction.Features); }