From 1e5913d9e7c89e5a2f1a45f8e4d3d33578b26742 Mon Sep 17 00:00:00 2001 From: Anipik Date: Mon, 10 Sep 2018 16:54:43 -0700 Subject: [PATCH 1/3] wordEMbedding --- .../Text/WordEmbeddingsTransform.cs | 2 +- .../BigramAndTrigramBenchMark.cs | 24 ++++++++++++++++++- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/src/Microsoft.ML.Transforms/Text/WordEmbeddingsTransform.cs b/src/Microsoft.ML.Transforms/Text/WordEmbeddingsTransform.cs index bf85ddf42f..4c19248bfa 100644 --- a/src/Microsoft.ML.Transforms/Text/WordEmbeddingsTransform.cs +++ b/src/Microsoft.ML.Transforms/Text/WordEmbeddingsTransform.cs @@ -429,7 +429,7 @@ private Model GetVocabularyDictionary() float temp; string firstKey = wordsInFirstLine[0]; float[] firstValue = wordsInFirstLine.Skip(1).Select(x => float.TryParse(x, out temp) ? temp : Single.NaN).ToArray(); - if (!firstValue.Contains(Single.NaN)) + if (!firstValue.Contains(Single.NaN) && firstValue.Length == model.Dimension) model.AddWordVector(ch, firstKey, firstValue); else ch.Warning($"Parsing error while reading model file: '{_modelFileNameWithPath}', line number 1"); diff --git a/test/Microsoft.ML.Benchmarks/BigramAndTrigramBenchMark.cs b/test/Microsoft.ML.Benchmarks/BigramAndTrigramBenchMark.cs index f4c947abda..5dd6477dfa 100644 --- a/test/Microsoft.ML.Benchmarks/BigramAndTrigramBenchMark.cs +++ b/test/Microsoft.ML.Benchmarks/BigramAndTrigramBenchMark.cs @@ -27,7 +27,9 @@ public class BigramAndTrigramBenchmark [GlobalSetup(Targets = new string[] { nameof(CV_Multiclass_WikiDetox_BigramsAndTrichar_OVAAveragedPerceptron), - nameof(CV_Multiclass_WikiDetox_BigramsAndTrichar_LightGBMMulticlass) })] + nameof(CV_Multiclass_WikiDetox_BigramsAndTrichar_LightGBMMulticlass), + nameof(CV_Multiclass_WikiDetox_WordEmbeddings_OVAAveragedPerceptron), + nameof(CV_Multiclass_WikiDetox_WordEmbeddings_SDCAMC)})] public void SetupTrainingSpeedTests() { _dataPath_Wiki = Path.GetFullPath(TestDatasets.WikiDetox.trainFilename); @@ -81,5 +83,25 @@ public void Test_Multiclass_WikiDetox_BigramsAndTrichar_OVAAveragedPerceptron() Maml.MainCore(tlc, cmd, alwaysPrintStacktrace: false); } } + + [Benchmark] + public void CV_Multiclass_WikiDetox_WordEmbeddings_OVAAveragedPerceptron() + { + string cmd = @"CV tr=OVA{p=AveragedPerceptron{iter=10}} k=5 loader=TextLoader{quote=- sparse=- col=Label:R4:0 col=rev_id:TX:1 col=comment:TX:2 col=logged_in:BL:4 col=ns:TX:5 col=sample:TX:6 col=split:TX:7 col=year:R4:3 header=+} data=" + _dataPath_Wiki + " xf=Convert{col=logged_in type=R4} xf=CategoricalTransform{col=ns} xf=TextTransform{col=FeaturesText:comment tokens=+ wordExtractor=NGramExtractorTransform{ngram=2}} xf=WordEmbeddingsTransform{col=FeaturesWordEmbedding:FeaturesText_TransformedText model=FastTextWikipedia300D} xf=Concat{col=Features:FeaturesText,FeaturesWordEmbedding,logged_in,ns}"; + using (var tlc = new TlcEnvironment(verbose: false, sensitivity: MessageSensitivity.None, outWriter: EmptyWriter.Instance)) + { + Maml.MainCore(tlc, cmd, alwaysPrintStacktrace: false); + } + } + + [Benchmark] + public void CV_Multiclass_WikiDetox_WordEmbeddings_SDCAMC() + { + string cmd = @"CV tr=SDCAMC k=5 loader=TextLoader{quote=- sparse=- col=Label:R4:0 col=rev_id:TX:1 col=comment:TX:2 col=logged_in:BL:4 col=ns:TX:5 col=sample:TX:6 col=split:TX:7 col=year:R4:3 header=+} data=" + _dataPath_Wiki + " xf=Convert{col=logged_in type=R4} xf=CategoricalTransform{col=ns} xf=TextTransform{col=FeaturesText:comment tokens=+ wordExtractor={} charExtractor={}} xf=WordEmbeddingsTransform{col=FeaturesWordEmbedding:FeaturesText_TransformedText model=FastTextWikipedia300D} xf=Concat{col=Features:FeaturesWordEmbedding,logged_in,ns}"; + using (var tlc = new TlcEnvironment(verbose: false, sensitivity: MessageSensitivity.None, outWriter: EmptyWriter.Instance)) + { + Maml.MainCore(tlc, cmd, alwaysPrintStacktrace: false); + } + } } } From fd9024bf5cfe07b1f10b441ae3852557d267e361 Mon Sep 17 00:00:00 2001 From: Anipik Date: Tue, 11 Sep 2018 10:30:48 -0700 Subject: [PATCH 2/3] warning removed --- .../Text/WordEmbeddingsTransform.cs | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/Microsoft.ML.Transforms/Text/WordEmbeddingsTransform.cs b/src/Microsoft.ML.Transforms/Text/WordEmbeddingsTransform.cs index 4c19248bfa..83e420afff 100644 --- a/src/Microsoft.ML.Transforms/Text/WordEmbeddingsTransform.cs +++ b/src/Microsoft.ML.Transforms/Text/WordEmbeddingsTransform.cs @@ -426,13 +426,14 @@ private Model GetVocabularyDictionary() dimension = wordsInFirstLine.Length - 1; if (model == null) model = new Model(dimension); - float temp; - string firstKey = wordsInFirstLine[0]; - float[] firstValue = wordsInFirstLine.Skip(1).Select(x => float.TryParse(x, out temp) ? temp : Single.NaN).ToArray(); - if (!firstValue.Contains(Single.NaN) && firstValue.Length == model.Dimension) - model.AddWordVector(ch, firstKey, firstValue); - else - ch.Warning($"Parsing error while reading model file: '{_modelFileNameWithPath}', line number 1"); + if (model.Dimension == dimension) + { + float temp; + string firstKey = wordsInFirstLine[0]; + float[] firstValue = wordsInFirstLine.Skip(1).Select(x => float.TryParse(x, out temp) ? temp : Single.NaN).ToArray(); + if (!firstValue.Contains(Single.NaN)) + model.AddWordVector(ch, firstKey, firstValue); + } pch.Checkpoint(lineNumber); } } From 014fea014e5127a98d5d575d3674068ed952de5a Mon Sep 17 00:00:00 2001 From: Anipik Date: Tue, 11 Sep 2018 11:31:59 -0700 Subject: [PATCH 3/3] class renamed --- .../MultiClassClassification.cs} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename test/Microsoft.ML.Benchmarks/{BigramAndTrigramBenchMark.cs => Text/MultiClassClassification.cs} (99%) diff --git a/test/Microsoft.ML.Benchmarks/BigramAndTrigramBenchMark.cs b/test/Microsoft.ML.Benchmarks/Text/MultiClassClassification.cs similarity index 99% rename from test/Microsoft.ML.Benchmarks/BigramAndTrigramBenchMark.cs rename to test/Microsoft.ML.Benchmarks/Text/MultiClassClassification.cs index 5dd6477dfa..85c1cb0091 100644 --- a/test/Microsoft.ML.Benchmarks/BigramAndTrigramBenchMark.cs +++ b/test/Microsoft.ML.Benchmarks/Text/MultiClassClassification.cs @@ -20,7 +20,7 @@ internal class EmptyWriter : TextWriter public override Encoding Encoding => null; } - public class BigramAndTrigramBenchmark + public class MultiClassClassification { private string _dataPath_Wiki; private string _modelPath_Wiki;