Remove parsing perf bottleneck in WordEmbeddingsTransform (#1599) fixes #1608

adamsitnik · web-flow · commit feddc7259167 · 2018-11-21T07:22:58.000-08:00
* update benchmarking docs: mention required git submodule

* update BDN to latest version with ETW profiler

* use CopyLocalLockFileAssemblies to force MSBuild to copy all dependencies to output folder, even if they are not used - to allow for dynamic assembly loading for EtwProfiler when used from console app

* use new .AsDefault() bdn method to tell it that it's a default job which can be overwritten by TrainConfig

* add benchmark which isolates the performance bottleneck

* write unit tests!

* move the parsing logic to a separate method before applying any code changes

* apply the optimizations

* read the file in parallel for even x3 speedup!

* remove the temporary benchmark

* revert breaking benchmark config changes

* apply a workaround for BenchmarkDotNet bug

* code review fixes

* cleanup the comment

* update BDN to 0.11.3, remove all workarounds

* code review fixes

* missing license header
diff --git a/build/Dependencies.props b/build/Dependencies.props
@@ -39,7 +39,7 @@
 
   <!-- Test-only Dependencies -->
   <PropertyGroup>
-    <BenchmarkDotNetVersion>0.11.1</BenchmarkDotNetVersion>
+    <BenchmarkDotNetVersion>0.11.3</BenchmarkDotNetVersion>
     <MicrosoftMLTestModelsPackageVersion>0.0.3-test</MicrosoftMLTestModelsPackageVersion>
   </PropertyGroup>
 
diff --git a/src/Microsoft.ML.Core/Utilities/LineParser.cs b/src/Microsoft.ML.Core/Utilities/LineParser.cs
@@ -0,0 +1,58 @@
+﻿// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.Runtime.CompilerServices;
+
+namespace Microsoft.ML.Runtime.Internal.Utilities
+{
+    [BestFriend]
+    internal static class LineParser
+    {
+        public static (bool isSuccess, string key, float[] values) ParseKeyThenNumbers(string line)
+        {
+            if (string.IsNullOrWhiteSpace(line))
+                return (false, null, null);
+
+            ReadOnlySpan<char> trimmedLine = line.AsSpan().TrimEnd(); // TrimEnd creates a Span, no allocations
+
+            int firstSeparatorIndex = trimmedLine.IndexOfAny(' ', '\t'); // the first word is the key, we just skip it
+            ReadOnlySpan<char> valuesToParse = trimmedLine.Slice(start: firstSeparatorIndex + 1);
+
+            float[] values = AllocateFixedSizeArrayToStoreParsedValues(valuesToParse);
+
+            int toParseStartIndex = 0;
+            int valueIndex = 0;
+            for (int i = 0; i <= valuesToParse.Length; i++)
+            {
+                if (i == valuesToParse.Length || valuesToParse[i] == ' ' || valuesToParse[i] == '\t')
+                {
+                    if (DoubleParser.TryParse(valuesToParse.Slice(toParseStartIndex, i - toParseStartIndex), out float parsed))
+                        values[valueIndex++] = parsed;
+                    else
+                        return (false, null, null);
+
+                    toParseStartIndex = i + 1;
+                }
+            }
+
+            return (true, trimmedLine.Slice(0, firstSeparatorIndex).ToString(), values);
+        }
+
+        /// <summary>
+        /// we count the number of values first to allocate a single array of the proper size
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static float[] AllocateFixedSizeArrayToStoreParsedValues(ReadOnlySpan<char> valuesToParse)
+        {
+            int valuesCount = 0;
+
+            for (int i = 0; i < valuesToParse.Length; i++)
+                if (valuesToParse[i] == ' ' || valuesToParse[i] == '\t')
+                    valuesCount++;
+
+            return new float[valuesCount + 1]; // + 1 because the line is trimmed and there is no whitespace at the end
+        }
+    }
+}
diff --git a/src/Microsoft.ML.Transforms/Text/WordEmbeddingsExtractor.cs b/src/Microsoft.ML.Transforms/Text/WordEmbeddingsExtractor.cs
@@ -15,10 +15,12 @@
 using Microsoft.ML.StaticPipe.Runtime;
 using Microsoft.ML.Transforms.Text;
 using System;
+using System.Collections.Concurrent;
 using System.Collections.Generic;
 using System.IO;
 using System.Linq;
 using System.Text;
+using System.Threading.Tasks;
 
 [assembly: LoadableClass(WordEmbeddingsExtractingTransformer.Summary, typeof(IDataTransform), typeof(WordEmbeddingsExtractingTransformer), typeof(WordEmbeddingsExtractingTransformer.Arguments),
     typeof(SignatureDataTransform), WordEmbeddingsExtractingTransformer.UserName, "WordEmbeddingsTransform", WordEmbeddingsExtractingTransformer.ShortName, DocName = "transform/WordEmbeddingsTransform.md")]
@@ -207,7 +209,7 @@ public WordEmbeddingsExtractingTransformer(IHostEnvironment env, PretrainedModel
 
             _modelKind = modelKind;
             _modelFileNameWithPath = EnsureModelFile(env, out _linesToSkip, (PretrainedModelKind)_modelKind);
-            _currentVocab = GetVocabularyDictionary();
+            _currentVocab = GetVocabularyDictionary(env);
         }
 
         /// <summary>
@@ -225,7 +227,7 @@ public WordEmbeddingsExtractingTransformer(IHostEnvironment env, string customMo
             _modelKind = null;
             _customLookup = true;
             _modelFileNameWithPath = customModelFile;
-            _currentVocab = GetVocabularyDictionary();
+            _currentVocab = GetVocabularyDictionary(env);
         }
 
         private static (string input, string output)[] GetColumnPairs(ColumnInfo[] columns)
@@ -281,7 +283,7 @@ private WordEmbeddingsExtractingTransformer(IHost host, ModelLoadContext ctx)
             }
 
             Host.CheckNonWhiteSpace(_modelFileNameWithPath, nameof(_modelFileNameWithPath));
-            _currentVocab = GetVocabularyDictionary();
+            _currentVocab = GetVocabularyDictionary(host);
         }
 
         public static WordEmbeddingsExtractingTransformer Create(IHostEnvironment env, ModelLoadContext ctx)
@@ -696,7 +698,7 @@ private string EnsureModelFile(IHostEnvironment env, out int linesToSkip, Pretra
             throw Host.Except($"Can't map model kind = {kind} to specific file, please refer to https://aka.ms/MLNetIssue for assistance");
         }
 
-        private Model GetVocabularyDictionary()
+        private Model GetVocabularyDictionary(IHostEnvironment hostEnvironment)
         {
             int dimension = 0;
             if (!File.Exists(_modelFileNameWithPath))
@@ -722,62 +724,64 @@ private Model GetVocabularyDictionary()
                     }
                 }
 
-                Model model = null;
-                using (StreamReader sr = File.OpenText(_modelFileNameWithPath))
+                using (var ch = Host.Start(LoaderSignature))
+                using (var pch = Host.StartProgressChannel("Building Vocabulary from Model File for Word Embeddings Transform"))
                 {
-                    string line;
-                    int lineNumber = 1;
-                    char[] delimiters = { ' ', '\t' };
-                    using (var ch = Host.Start(LoaderSignature))
-                    using (var pch = Host.StartProgressChannel("Building Vocabulary from Model File for Word Embeddings Transform"))
-                    {
-                        var header = new ProgressHeader(new[] { "lines" });
-                        pch.SetHeader(header, e => e.SetProgress(0, lineNumber));
-                        string firstLine = sr.ReadLine();
-                        while ((line = sr.ReadLine()) != null)
+                    var parsedData = new ConcurrentBag<(string key, float[] values, long lineNumber)>();
+                    int skippedLinesCount = Math.Max(1, _linesToSkip);
+
+                    Parallel.ForEach(File.ReadLines(_modelFileNameWithPath).Skip(skippedLinesCount), GetParallelOptions(hostEnvironment),
+                        (line, parallelState, lineNumber) =>
                         {
-                            if (lineNumber >= _linesToSkip)
-                            {
-                                string[] words = line.TrimEnd().Split(delimiters);
-                                dimension = words.Length - 1;
-                                if (model == null)
-                                    model = new Model(dimension);
-                                if (model.Dimension != dimension)
-                                    ch.Warning($"Dimension mismatch while reading model file: '{_modelFileNameWithPath}', line number {lineNumber + 1}, expected dimension = {model.Dimension}, received dimension = {dimension}");
-                                else
-                                {
-                                    float tmp;
-                                    string key = words[0];
-                                    float[] value = words.Skip(1).Select(x => float.TryParse(x, out tmp) ? tmp : Single.NaN).ToArray();
-                                    if (!value.Contains(Single.NaN))
-                                        model.AddWordVector(ch, key, value);
-                                    else
-                                        ch.Warning($"Parsing error while reading model file: '{_modelFileNameWithPath}', line number {lineNumber + 1}");
-                                }
-                            }
-                            lineNumber++;
-                        }
+                            (bool isSuccess, string key, float[] values) = LineParser.ParseKeyThenNumbers(line);
+
+                            if (isSuccess)
+                                parsedData.Add((key, values, lineNumber + skippedLinesCount));
+                            else // we use shared state here (ch) but it's not our hot path and we don't care about unhappy-path performance
+                                ch.Warning($"Parsing error while reading model file: '{_modelFileNameWithPath}', line number {lineNumber + skippedLinesCount}");
+                        });
 
-                        // Handle first line of the embedding file separately since some embedding files including fastText have a single-line header
-                        string[] wordsInFirstLine = firstLine.TrimEnd().Split(delimiters);
-                        dimension = wordsInFirstLine.Length - 1;
+                    Model model = null;
+                    foreach (var parsedLine in parsedData.OrderBy(parsedLine => parsedLine.lineNumber))
+                    {
+                        dimension = parsedLine.values.Length;
                         if (model == null)
                             model = new Model(dimension);
-                        if (model.Dimension == dimension)
-                        {
-                            float temp;
-                            string firstKey = wordsInFirstLine[0];
-                            float[] firstValue = wordsInFirstLine.Skip(1).Select(x => float.TryParse(x, out temp) ? temp : Single.NaN).ToArray();
-                            if (!firstValue.Contains(Single.NaN))
-                                model.AddWordVector(ch, firstKey, firstValue);
-                        }
-                        pch.Checkpoint(lineNumber);
+                        if (model.Dimension != dimension)
+                            ch.Warning($"Dimension mismatch while reading model file: '{_modelFileNameWithPath}', line number {parsedLine.lineNumber}, expected dimension = {model.Dimension}, received dimension = {dimension}");
+                        else
+                            model.AddWordVector(ch, parsedLine.key, parsedLine.values);
                     }
+
+                    // Handle first line of the embedding file separately since some embedding files including fastText have a single-line header
+                    var firstLine = File.ReadLines(_modelFileNameWithPath).First();
+                    string[] wordsInFirstLine = firstLine.TrimEnd().Split(' ', '\t');
+                    dimension = wordsInFirstLine.Length - 1;
+                    if (model == null)
+                        model = new Model(dimension);
+                    if (model.Dimension == dimension)
+                    {
+                        float temp;
+                        string firstKey = wordsInFirstLine[0];
+                        float[] firstValue = wordsInFirstLine.Skip(1).Select(x => float.TryParse(x, out temp) ? temp : Single.NaN).ToArray();
+                        if (!firstValue.Contains(Single.NaN))
+                            model.AddWordVector(ch, firstKey, firstValue);
+                    }
+
+                    _vocab[_modelFileNameWithPath] = new WeakReference<Model>(model, false);
+                    return model;
                 }
-                _vocab[_modelFileNameWithPath] = new WeakReference<Model>(model, false);
-                return model;
             }
         }
+
+        private static ParallelOptions GetParallelOptions(IHostEnvironment hostEnvironment)
+        {
+            //  "Less than 1 means whatever the component views as ideal." (about ConcurrencyFactor)
+            if (hostEnvironment.ConcurrencyFactor < 1)
+                return new ParallelOptions(); // we provide default options and let the Parallel decide
+            else
+                return new ParallelOptions() { MaxDegreeOfParallelism = hostEnvironment.ConcurrencyFactor };
+        }
     }
 
     /// <include file='doc.xml' path='doc/members/member[@name="WordEmbeddings"]/*' />
diff --git a/test/Microsoft.ML.Benchmarks/Harness/Configs.cs b/test/Microsoft.ML.Benchmarks/Harness/Configs.cs
@@ -18,22 +18,21 @@ public RecommendedConfig()
                 .With(CreateToolchain())); // toolchain is responsible for generating, building and running dedicated executable per benchmark
 
             Add(new ExtraMetricColumn()); // an extra colum that can display additional metric reported by the benchmarks
-
-            UnionRule = ConfigUnionRule.AlwaysUseLocal; // global config can be overwritten with local (the one set via [ConfigAttribute])
         }
 
         protected virtual Job GetJobDefinition()
             => Job.Default
                 .WithWarmupCount(1) // ML.NET benchmarks are typically CPU-heavy benchmarks, 1 warmup is usually enough
-                .WithMaxIterationCount(20);
+                .WithMaxIterationCount(20)
+                .AsDefault(); // this way we tell BDN that it's a default config which can be overwritten
 
         /// <summary>
         /// we need our own toolchain because MSBuild by default does not copy recursive native dependencies to the output
         /// </summary>
         private IToolchain CreateToolchain()
         {
-            var tfm = GetTargetFrameworkMoniker();
-            var csProj = CsProjCoreToolchain.From(new NetCoreAppSettings(targetFrameworkMoniker: tfm, runtimeFrameworkVersion: null, name: tfm));
+            var tfm = NetCoreAppSettings.Current.Value.TargetFrameworkMoniker;
+            var csProj = CsProjCoreToolchain.Current.Value;
 
             return new Toolchain(
                 tfm,
@@ -42,15 +41,6 @@ private IToolchain CreateToolchain()
                 csProj.Executor);
         }
 
-        private static string GetTargetFrameworkMoniker()
-        {
-#if NETCOREAPP3_0 // todo: remove the #IF DEFINES when BDN 0.11.2 gets released (BDN gains the 3.0 support)
-            return "netcoreapp3.0";
-#else
-            return NetCoreAppSettings.Current.Value.TargetFrameworkMoniker;
-#endif
-        }
-
         private static string GetBuildConfigurationName()
         {
 #if NETCOREAPP3_0
diff --git a/test/Microsoft.ML.Benchmarks/Harness/ProjectGenerator.cs b/test/Microsoft.ML.Benchmarks/Harness/ProjectGenerator.cs
@@ -27,7 +27,7 @@ namespace Microsoft.ML.Benchmarks.Harness
     /// </summary>
     public class ProjectGenerator : CsProjGenerator
     {
-        public ProjectGenerator(string targetFrameworkMoniker) : base(targetFrameworkMoniker, platform => platform.ToConfig(), null)
+        public ProjectGenerator(string targetFrameworkMoniker) : base(targetFrameworkMoniker, null, null, null)
         {
         }
 
diff --git a/test/Microsoft.ML.Benchmarks/Microsoft.ML.Benchmarks.csproj b/test/Microsoft.ML.Benchmarks/Microsoft.ML.Benchmarks.csproj
@@ -12,6 +12,7 @@
   </ItemGroup>
   <ItemGroup>
     <PackageReference Include="BenchmarkDotNet" Version="$(BenchmarkDotNetVersion)" />
+    <PackageReference Include="BenchmarkDotNet.Diagnostics.Windows" Version="$(BenchmarkDotNetVersion)" />
   </ItemGroup>
   <ItemGroup>
     <ProjectReference Include="..\..\src\Microsoft.ML.Core\Microsoft.ML.Core.csproj" />
diff --git a/test/Microsoft.ML.Benchmarks/README.md b/test/Microsoft.ML.Benchmarks/README.md
@@ -4,7 +4,11 @@ This project contains performance benchmarks.
 
 ## Run the Performance Tests
 
-**Pre-requisite:** On a clean repo, `build.cmd` at the root installs the right version of dotnet.exe and builds the solution. You need to build the solution in `Release` with native dependencies. 
+**Pre-requisite:** In order to fetch dependencies which come through Git submodules the following command needs to be run before building:
+
+    git submodule update --init
+
+**Pre-requisite:** On a clean repo with initialized submodules, `build.cmd` at the root installs the right version of dotnet.exe and builds the solution. You need to build the solution in `Release` with native dependencies. 
 
     build.cmd -release -buildNative
     
diff --git a/test/Microsoft.ML.Tests/Transformers/LineParserTests.cs b/test/Microsoft.ML.Tests/Transformers/LineParserTests.cs
@@ -0,0 +1,40 @@
+﻿// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using Microsoft.ML.Runtime.Internal.Utilities;
+using System.Collections.Generic;
+using Xunit;
+
+namespace Microsoft.ML.Tests.Transformers
+{
+    public class LineParserTests
+    {
+        public static IEnumerable<object[]> ValidInputs()
+        {
+            yield return new object[] { "key 0.1 0.2 0.3", "key", new float[] { 0.1f, 0.2f, 0.3f } };
+            yield return new object[] { "key 0.1 0.2 0.3 ", "key", new float[] { 0.1f, 0.2f, 0.3f } };
+            yield return new object[] { "key\t0.1\t0.2\t0.3", "key", new float[] { 0.1f, 0.2f, 0.3f } }; // tab can also be a separator
+            yield return new object[] { "key\t0.1\t0.2\t0.3\t", "key", new float[] { 0.1f, 0.2f, 0.3f } };
+        }
+
+        [Theory]
+        [MemberData(nameof(ValidInputs))]
+        public void WhenProvidedAValidInputParserParsesKeyAndValues(string input, string expectedKey, float[] expectedValues)
+        {
+            var result = LineParser.ParseKeyThenNumbers(input);
+
+            Assert.True(result.isSuccess);
+            Assert.Equal(expectedKey, result.key);
+            Assert.Equal(expectedValues, result.values);
+        }
+
+        [Theory]
+        [InlineData("")]
+        [InlineData("key 0.1 NOT_A_NUMBER")] // invalid number
+        public void WhenProvidedAnInvalidInputParserReturnsFailure(string input)
+        {
+            Assert.False(LineParser.ParseKeyThenNumbers(input).isSuccess);
+        }
+    }
+}

Original file line number	Diff line number	Diff line change
`@@ -27,7 +27,7 @@ namespace Microsoft.ML.Benchmarks.Harness`
`27`	`27`	`/// </summary>`
`28`	`28`	`public class ProjectGenerator : CsProjGenerator`
`29`	`29`	`{`
`30`		`- public ProjectGenerator(string targetFrameworkMoniker) : base(targetFrameworkMoniker, platform => platform.ToConfig(), null)`
	`30`	`+ public ProjectGenerator(string targetFrameworkMoniker) : base(targetFrameworkMoniker, null, null, null)`
`31`	`31`	`{`
`32`	`32`	`}`
`33`	`33`