Skip to content

Converted listed text transforms into transformers/estimators. #953

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Sep 21, 2018
Merged
4 changes: 2 additions & 2 deletions src/Microsoft.ML.Data/Transforms/TermEstimator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ public static class Defaults
/// Convenience constructor for public facing API.
/// </summary>
/// <param name="env">Host Environment.</param>
/// <param name="inputColumn">Name of the output column.</param>
/// <param name="outputColumn">Name of the column to be transformed. If this is null '<paramref name="inputColumn"/>' will be used.</param>
/// <param name="inputColumn">Name of the column to be transformed.</param>
/// <param name="outputColumn">Name of the output column. If this is null '<paramref name="inputColumn"/>' will be used.</param>
/// <param name="maxNumTerms">Maximum number of terms to keep per column when auto-training.</param>
/// <param name="sort">How items should be ordered when vectorized. By default, they will be in the order encountered.
/// If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a').</param>
Expand Down
479 changes: 479 additions & 0 deletions src/Microsoft.ML.Transforms/Text/TextStaticExtensions.cs

Large diffs are not rendered by default.

613 changes: 613 additions & 0 deletions src/Microsoft.ML.Transforms/Text/WrappedTextTransformers.cs

Large diffs are not rendered by default.

12 changes: 12 additions & 0 deletions test/BaselineOutput/SingleDebug/Text/bag_of_words.tsv

Large diffs are not rendered by default.

13 changes: 13 additions & 0 deletions test/BaselineOutput/SingleDebug/Text/ngrams.tsv

Large diffs are not rendered by default.

11 changes: 11 additions & 0 deletions test/BaselineOutput/SingleDebug/Text/words_without_stopwords.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#@ TextLoader{
#@ header+
#@ sep=tab
#@ col=text:TX:0
#@ col=words_without_stopwords:TX:1-**
#@ }
text
==rude== dude, you are rude upload that carl picture back, or else. ==rude== dude, you rude upload carl picture back, else.
== ok! == im going to vandalize wild ones wiki then!!! == ok! == im going vandalize wild ones wiki then!!!
stop trolling, zapatancas, calling me a liar merely demonstartes that you arer zapatancas. you may choose to chase every legitimate editor from this site and ignore me but i am an editor with a record that isnt 99% trolling and therefore my wishes are not to be completely ignored by a sockpuppet like yourself. the consensus is overwhelmingly against you and your trollin g lover zapatancas, stop trolling, zapatancas, calling liar merely demonstartes you arer zapatancas. you choose chase legitimate editor site ignore i editor record isnt 99% trolling wishes completely ignored sockpuppet like yourself. consensus overwhelmingly you your trollin g lover zapatancas,
==you're cool== you seem like a really cool guy... *bursts out laughing at sarcasm*. ==you're cool== you like really cool guy... *bursts laughing sarcasm*.
12 changes: 12 additions & 0 deletions test/BaselineOutput/SingleRelease/Text/bag_of_words.tsv

Large diffs are not rendered by default.

13 changes: 13 additions & 0 deletions test/BaselineOutput/SingleRelease/Text/ngrams.tsv

Large diffs are not rendered by default.

11 changes: 11 additions & 0 deletions test/BaselineOutput/SingleRelease/Text/words_without_stopwords.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#@ TextLoader{
#@ header+
#@ sep=tab
#@ col=text:TX:0
#@ col=words_without_stopwords:TX:1-**
#@ }
text
==rude== dude, you are rude upload that carl picture back, or else. ==rude== dude, you rude upload carl picture back, else.
== ok! == im going to vandalize wild ones wiki then!!! == ok! == im going vandalize wild ones wiki then!!!
stop trolling, zapatancas, calling me a liar merely demonstartes that you arer zapatancas. you may choose to chase every legitimate editor from this site and ignore me but i am an editor with a record that isnt 99% trolling and therefore my wishes are not to be completely ignored by a sockpuppet like yourself. the consensus is overwhelmingly against you and your trollin g lover zapatancas, stop trolling, zapatancas, calling liar merely demonstartes you arer zapatancas. you choose chase legitimate editor site ignore i editor record isnt 99% trolling wishes completely ignored sockpuppet like yourself. consensus overwhelmingly you your trollin g lover zapatancas,
==you're cool== you seem like a really cool guy... *bursts out laughing at sarcasm*. ==you're cool== you like really cool guy... *bursts laughing sarcasm*.
87 changes: 87 additions & 0 deletions test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -433,6 +433,93 @@ public void Tokenize()
Assert.True(type.ItemType.AsKey.RawKind == DataKind.U2);
}

[Fact]
public void NormalizeTextAndRemoveStopWords()
{
    // Arrange: load label/text pairs from the wikipedia detox sample file.
    var env = new ConsoleEnvironment(seed: 0);
    var dataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
    var reader = TextLoader.CreateReader(env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadText(1)), hasHeader: true);
    var data = reader.Read(new MultiFileSource(dataPath));

    // Act: normalize the raw text and, separately, tokenize it and drop stopwords.
    var pipeline = data.MakeNewEstimator()
        .Append(row => (
            row.label,
            normalized_text: row.text.NormalizeText(),
            words_without_stopwords: row.text.TokenizeText().RemoveStopwords()));

    var transformed = pipeline.Fit(data).Transform(data);
    var schema = transformed.AsDynamic.Schema;

    // Assert: the stopword-free tokens are a variable-length vector of text items.
    Assert.True(schema.TryGetColumnIndex("words_without_stopwords", out int stopwordsCol));
    var colType = schema.GetColumnType(stopwordsCol);
    Assert.True(colType.IsVector && !colType.IsKnownSizeVector && colType.ItemType.IsText);

    // Assert: normalized text remains a single scalar text column.
    Assert.True(schema.TryGetColumnIndex("normalized_text", out int normTextCol));
    colType = schema.GetColumnType(normTextCol);
    Assert.True(colType.IsText && !colType.IsVector);
}

[Fact]
public void ConvertToWordBag()
{
    // Arrange: load label/text pairs from the wikipedia detox sample file.
    var env = new ConsoleEnvironment(seed: 0);
    var dataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
    var reader = TextLoader.CreateReader(env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadText(1)), hasHeader: true);
    var data = reader.Read(new MultiFileSource(dataPath));

    // Act: build both the dictionary-based and the hash-based bag-of-words columns.
    var pipeline = data.MakeNewEstimator()
        .Append(row => (
            row.label,
            bagofword: row.text.ToBagofWords(),
            bagofhashedword: row.text.ToBagofHashedWords()));

    var transformed = pipeline.Fit(data).Transform(data);
    var schema = transformed.AsDynamic.Schema;

    // Assert: both bag representations are fixed-size numeric vectors.
    Assert.True(schema.TryGetColumnIndex("bagofword", out int bagofwordCol));
    var colType = schema.GetColumnType(bagofwordCol);
    Assert.True(colType.IsVector && colType.IsKnownSizeVector && colType.ItemType.IsNumber);

    Assert.True(schema.TryGetColumnIndex("bagofhashedword", out int bagofhashedwordCol));
    colType = schema.GetColumnType(bagofhashedwordCol);
    Assert.True(colType.IsVector && colType.IsKnownSizeVector && colType.ItemType.IsNumber);
}

[Fact]
public void Ngrams()
{
    // Arrange: load label/text pairs from the wikipedia detox sample file.
    var env = new ConsoleEnvironment(seed: 0);
    var dataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
    var reader = TextLoader.CreateReader(env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadText(1)), hasHeader: true);
    var data = reader.Read(new MultiFileSource(dataPath));

    // Act: tokenize, map tokens to keys, then produce dictionary and hashed ngrams.
    var pipeline = data.MakeNewEstimator()
        .Append(row => (
            row.label,
            ngrams: row.text.TokenizeText().ToKey().ToNgrams(),
            ngramshash: row.text.TokenizeText().ToKey().ToNgramsHash()));

    var transformed = pipeline.Fit(data).Transform(data);
    var schema = transformed.AsDynamic.Schema;

    // Assert: both ngram representations are fixed-size numeric vectors.
    Assert.True(schema.TryGetColumnIndex("ngrams", out int ngramsCol));
    var colType = schema.GetColumnType(ngramsCol);
    Assert.True(colType.IsVector && colType.IsKnownSizeVector && colType.ItemType.IsNumber);

    Assert.True(schema.TryGetColumnIndex("ngramshash", out int ngramshashCol));
    colType = schema.GetColumnType(ngramshashCol);
    Assert.True(colType.IsVector && colType.IsKnownSizeVector && colType.ItemType.IsNumber);
}


[Fact]
public void LpGcNormAndWhitening()
Expand Down
109 changes: 109 additions & 0 deletions test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -88,5 +88,114 @@ public void TextTokenizationWorkout()
CheckEquality("Text", "tokenized.tsv");
Done();
}


[Fact]
public void TextNormalizationAndStopwordRemoverWorkout()
{
    string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

    // Valid input: column 1 loaded as text.
    var data = TextLoader.CreateReader(Env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadText(1)), hasHeader: true)
        .Read(new MultiFileSource(sentimentDataPath));

    // Invalid input: the same column loaded as float, to exercise schema validation.
    var invalidData = TextLoader.CreateReader(Env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadFloat(1)), hasHeader: true)
        .Read(new MultiFileSource(sentimentDataPath));

    // Normalize the text, tokenize it into words, then strip stopwords.
    var est = new TextNormalizer(Env, "text")
        .Append(new WordTokenizer(Env, "text", "words"))
        .Append(new StopwordRemover(Env, "words", "words_without_stopwords"));
    TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

    // Save the first four transformed rows and compare against the checked-in baseline.
    var outputPath = GetOutputPath("Text", "words_without_stopwords.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
        IDataView savedData = TakeFilter.Create(Env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
        savedData = new ChooseColumnsTransform(Env, savedData, "text", "words_without_stopwords");

        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
    }

    CheckEquality("Text", "words_without_stopwords.tsv");
    Done();
}

[Fact]
public void WordBagWorkout()
{
    string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

    // Valid input: column 1 loaded as text.
    var data = TextLoader.CreateReader(Env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadText(1)), hasHeader: true)
        .Read(new MultiFileSource(sentimentDataPath));

    // Invalid input: the same column loaded as float, to exercise schema validation.
    var invalidData = TextLoader.CreateReader(Env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadFloat(1)), hasHeader: true)
        .Read(new MultiFileSource(sentimentDataPath));

    // Build dictionary-based and hash-based bag-of-words columns from the text.
    var est = new WordBagEstimator(Env, "text", "bag_of_words")
        .Append(new WordHashBagEstimator(Env, "text", "bag_of_wordshash"));

    // The following call fails because of the following issue
    // https://github.com/dotnet/machinelearning/issues/969
    // TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

    // Save the first four transformed rows and compare against the checked-in baseline.
    var outputPath = GetOutputPath("Text", "bag_of_words.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
        IDataView savedData = TakeFilter.Create(Env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
        savedData = new ChooseColumnsTransform(Env, savedData, "text", "bag_of_words", "bag_of_wordshash");

        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
    }

    CheckEquality("Text", "bag_of_words.tsv");
    Done();
}

[Fact]
public void NgramWorkout()
{
    string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

    // Valid input: column 1 loaded as text.
    var data = TextLoader.CreateReader(Env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadText(1)), hasHeader: true)
        .Read(new MultiFileSource(sentimentDataPath));

    // Invalid input: the same column loaded as float, to exercise schema validation.
    var invalidData = TextLoader.CreateReader(Env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadFloat(1)), hasHeader: true)
        .Read(new MultiFileSource(sentimentDataPath));

    // Tokenize in place, map tokens to term keys, then produce dictionary and hashed ngrams.
    var est = new WordTokenizer(Env, "text", "text")
        .Append(new TermEstimator(Env, "text", "terms"))
        .Append(new NgramEstimator(Env, "terms", "ngrams"))
        .Append(new NgramHashEstimator(Env, "terms", "ngramshash"));

    // The following call fails because of the following issue
    // https://github.com/dotnet/machinelearning/issues/969
    // TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

    // Save the first four transformed rows and compare against the checked-in baseline.
    var outputPath = GetOutputPath("Text", "ngrams.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
        IDataView savedData = TakeFilter.Create(Env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
        savedData = new ChooseColumnsTransform(Env, savedData, "text", "terms", "ngrams", "ngramshash");

        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
    }

    CheckEquality("Text", "ngrams.tsv");
    Done();
}
}
}