From 77e0862e6cdb2e6ed70c26a38d7b71c5eff9a87f Mon Sep 17 00:00:00 2001 From: Pratik Gadhiya Date: Sat, 22 Sep 2018 08:13:54 +0530 Subject: [PATCH 1/3] Make bound checking of loops more efficient --- src/Microsoft.ML.CpuMath/AvxIntrinsics.cs | 44 +++++++++++------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs index b31a427139..69adab26e7 100644 --- a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs @@ -420,7 +420,7 @@ public static unsafe void AddScalarU(float scalar, Span dst) Vector256 scalarVector256 = Avx.SetAllVector256(scalar); - while (pDstCurrent + 8 <= pDstEnd) + while (pDstCurrent <= pDstEnd - 8) { Vector256 dstVector = Avx.LoadVector256(pDstCurrent); dstVector = Avx.Add(dstVector, scalarVector256); @@ -460,7 +460,7 @@ public static unsafe void ScaleU(float scale, Span dst) Vector256 scaleVector256 = Avx.SetAllVector256(scale); - while (pDstCurrent + 8 <= pEnd) + while (pDstCurrent <= pEnd - 8) { Vector256 dstVector = Avx.LoadVector256(pDstCurrent); @@ -505,7 +505,7 @@ public static unsafe void ScaleSrcU(float scale, Span src, Span ds Vector256 scaleVector256 = Avx.SetAllVector256(scale); - while (pDstCurrent + 8 <= pDstEnd) + while (pDstCurrent <= pDstEnd - 8) { Vector256 srcVector = Avx.LoadVector256(pSrcCurrent); srcVector = Avx.Multiply(srcVector, scaleVector256); @@ -550,7 +550,7 @@ public static unsafe void ScaleAddU(float a, float b, Span dst) Vector256 a256 = Avx.SetAllVector256(a); Vector256 b256 = Avx.SetAllVector256(b); - while (pDstCurrent + 8 <= pDstEnd) + while (pDstCurrent <= pDstEnd - 8) { Vector256 dstVector = Avx.LoadVector256(pDstCurrent); dstVector = Avx.Add(dstVector, b256); @@ -596,7 +596,7 @@ public static unsafe void AddScaleU(float scale, Span src, Span ds Vector256 scaleVector256 = Avx.SetAllVector256(scale); - while (pDstCurrent + 8 <= pEnd) + while (pDstCurrent <= pEnd - 8) { Vector256 srcVector = Avx.LoadVector256(pSrcCurrent); Vector256 dstVector = Avx.LoadVector256(pDstCurrent); @@ -652,7 +652,7 @@ public static unsafe void AddScaleCopyU(float scale, Span src, Span scaleVector256 = Avx.SetAllVector256(scale); - while (pResCurrent + 8 <= pResEnd) + while (pResCurrent <= pResEnd - 8) { Vector256 srcVector = Avx.LoadVector256(pSrcCurrent); Vector256 dstVector = Avx.LoadVector256(pDstCurrent); @@ -708,7 +708,7 @@ public static unsafe void AddScaleSU(float scale, Span src, Span idx Vector256 scaleVector256 = Avx.SetAllVector256(scale); - while (pIdxCurrent + 8 <= pEnd) + while (pIdxCurrent <= pEnd - 8) { Vector256 srcVector = Avx.LoadVector256(pSrcCurrent); Vector256 dstVector = Load8(pDstCurrent, pIdxCurrent); @@ -755,7 +755,7 @@ public static unsafe void AddU(Span src, Span dst) float* pDstCurrent = pdst; float* pEnd = psrc + src.Length; - while (pSrcCurrent + 8 <= pEnd) + while (pSrcCurrent <= pEnd - 8) { Vector256 srcVector = Avx.LoadVector256(pSrcCurrent); Vector256 dstVector = Avx.LoadVector256(pDstCurrent); @@ -804,7 +804,7 @@ public static unsafe void AddSU(Span src, Span idx, Span dst) float* pDstCurrent = pdst; int* pEnd = pidx + idx.Length; - while (pIdxCurrent + 8 <= pEnd) + while (pIdxCurrent <= pEnd - 8) { Vector256 dstVector = Load8(pDstCurrent, pIdxCurrent); Vector256 srcVector = Avx.LoadVector256(pSrcCurrent); @@ -849,7 +849,7 @@ public static unsafe void MulElementWiseU(Span src1, Span src2, Sp float* pDstCurrent = pdst; float* pEnd = pdst + dst.Length; - while (pDstCurrent + 8 <= pEnd) + while (pDstCurrent <= pEnd - 8) { Vector256 src1Vector = Avx.LoadVector256(pSrc1Current); Vector256 src2Vector = Avx.LoadVector256(pSrc2Current); @@ -896,7 +896,7 @@ public static unsafe float SumU(Span src) Vector256 result256 = Avx.SetZeroVector256(); - while (pSrcCurrent + 8 <= pSrcEnd) + while (pSrcCurrent <= pSrcEnd - 8) { result256 = Avx.Add(result256, Avx.LoadVector256(pSrcCurrent)); pSrcCurrent += 8; @@ -934,7 +934,7 @@ public static unsafe float SumSqU(Span src) Vector256 result256 = Avx.SetZeroVector256(); - while (pSrcCurrent + 8 <= pSrcEnd) + while (pSrcCurrent <= pSrcEnd - 8) { Vector256 srcVector = Avx.LoadVector256(pSrcCurrent); result256 = Avx.Add(result256, Avx.Multiply(srcVector, srcVector)); @@ -979,7 +979,7 @@ public static unsafe float SumSqDiffU(float mean, Span src) Vector256 result256 = Avx.SetZeroVector256(); Vector256 meanVector256 = Avx.SetAllVector256(mean); - while (pSrcCurrent + 8 <= pSrcEnd) + while (pSrcCurrent <= pSrcEnd - 8) { Vector256 srcVector = Avx.LoadVector256(pSrcCurrent); srcVector = Avx.Subtract(srcVector, meanVector256); @@ -1027,7 +1027,7 @@ public static unsafe float SumAbsU(Span src) Vector256 result256 = Avx.SetZeroVector256(); - while (pSrcCurrent + 8 <= pSrcEnd) + while (pSrcCurrent <= pSrcEnd - 8) { Vector256 srcVector = Avx.LoadVector256(pSrcCurrent); result256 = Avx.Add(result256, Avx.And(srcVector, _absMask256)); @@ -1072,7 +1072,7 @@ public static unsafe float SumAbsDiffU(float mean, Span src) Vector256 result256 = Avx.SetZeroVector256(); Vector256 meanVector256 = Avx.SetAllVector256(mean); - while (pSrcCurrent + 8 <= pSrcEnd) + while (pSrcCurrent <= pSrcEnd - 8) { Vector256 srcVector = Avx.LoadVector256(pSrcCurrent); srcVector = Avx.Subtract(srcVector, meanVector256); @@ -1120,7 +1120,7 @@ public static unsafe float MaxAbsU(Span src) Vector256 result256 = Avx.SetZeroVector256(); - while (pSrcCurrent + 8 <= pSrcEnd) + while (pSrcCurrent <= pSrcEnd - 8) { Vector256 srcVector = Avx.LoadVector256(pSrcCurrent); result256 = Avx.Max(result256, Avx.And(srcVector, _absMask256)); @@ -1165,7 +1165,7 @@ public static unsafe float MaxAbsDiffU(float mean, Span src) Vector256 result256 = Avx.SetZeroVector256(); Vector256 meanVector256 = Avx.SetAllVector256(mean); - while (pSrcCurrent + 8 <= pSrcEnd) + while (pSrcCurrent <= pSrcEnd - 8) { Vector256 srcVector = Avx.LoadVector256(pSrcCurrent); srcVector = Avx.Subtract(srcVector, meanVector256); @@ -1215,7 +1215,7 @@ public static unsafe float DotU(Span src, Span dst) Vector256 result256 = Avx.SetZeroVector256(); - while (pSrcCurrent + 8 <= pSrcEnd) + while (pSrcCurrent <= pSrcEnd - 8) { Vector256 srcVector = Avx.LoadVector256(pSrcCurrent); Vector256 dstVector = Avx.LoadVector256(pDstCurrent); @@ -1272,7 +1272,7 @@ public static unsafe float DotSU(Span src, Span dst, Span idx Vector256 result256 = Avx.SetZeroVector256(); - while (pIdxCurrent + 8 <= pIdxEnd) + while (pIdxCurrent <= pIdxEnd - 8) { Vector256 srcVector = Load8(pSrcCurrent, pIdxCurrent); Vector256 dstVector = Avx.LoadVector256(pDstCurrent); @@ -1327,7 +1327,7 @@ public static unsafe float Dist2(Span src, Span dst) Vector256 sqDistanceVector256 = Avx.SetZeroVector256(); - while (pSrcCurrent + 8 <= pSrcEnd) + while (pSrcCurrent <= pSrcEnd - 8) { Vector256 distanceVector = Avx.Subtract(Avx.LoadVector256(pSrcCurrent), Avx.LoadVector256(pDstCurrent)); @@ -1384,7 +1384,7 @@ public static unsafe void SdcaL1UpdateU(float primalUpdate, Span src, flo Vector256 xPrimal256 = Avx.SetAllVector256(primalUpdate); Vector256 xThreshold256 = Avx.SetAllVector256(threshold); - while (pSrcCurrent + 8 <= pSrcEnd) + while (pSrcCurrent <= pSrcEnd - 8) { Vector256 xSrc = Avx.LoadVector256(pSrcCurrent); @@ -1446,7 +1446,7 @@ public static unsafe void SdcaL1UpdateSU(float primalUpdate, Span src, Sp Vector256 xPrimal256 = Avx.SetAllVector256(primalUpdate); Vector256 xThreshold = Avx.SetAllVector256(threshold); - while (pIdxCurrent + 8 <= pIdxEnd) + while (pIdxCurrent <= pIdxEnd - 8) { Vector256 xSrc = Avx.LoadVector256(pSrcCurrent); From e85f1d9a477c849389c63c8dfa1848b9d542c3f7 Mon Sep 17 00:00:00 2001 From: Pratik Gadhiya Date: Sat, 22 Sep 2018 08:23:02 +0530 Subject: [PATCH 2/3] Make bound checking of loops more efficient --- src/Microsoft.ML.CpuMath/SseIntrinsics.cs | 44 +++++++++++------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs index 4c36d0094e..8c5086fbc2 100644 --- a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs @@ -417,7 +417,7 @@ public static unsafe void AddScalarU(float scalar, Span dst) Vector128 scalarVector = Sse.SetAllVector128(scalar); - while (pDstCurrent + 4 <= pDstEnd) + while (pDstCurrent <= pDstEnd - 4) { Vector128 dstVector = Sse.LoadVector128(pDstCurrent); dstVector = Sse.Add(dstVector, scalarVector); @@ -446,7 +446,7 @@ public static unsafe void ScaleU(float scale, Span dst) Vector128 scaleVector = Sse.SetAllVector128(scale); - while (pDstCurrent + 4 <= pEnd) + while (pDstCurrent <= pEnd - 4) { Vector128 dstVector = Sse.LoadVector128(pDstCurrent); @@ -479,7 +479,7 @@ public static unsafe void ScaleSrcU(float scale, Span src, Span ds Vector128 scaleVector = Sse.SetAllVector128(scale); - while (pDstCurrent + 4 <= pDstEnd) + while (pDstCurrent <= pDstEnd - 4) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); srcVector = Sse.Multiply(srcVector, scaleVector); @@ -512,7 +512,7 @@ public static unsafe void ScaleAddU(float a, float b, Span dst) Vector128 aVector = Sse.SetAllVector128(a); Vector128 bVector = Sse.SetAllVector128(b); - while (pDstCurrent + 4 <= pDstEnd) + while (pDstCurrent <= pDstEnd - 4) { Vector128 dstVector = Sse.LoadVector128(pDstCurrent); dstVector = Sse.Add(dstVector, bVector); @@ -545,7 +545,7 @@ public static unsafe void AddScaleU(float scale, Span src, Span ds Vector128 scaleVector = Sse.SetAllVector128(scale); - while (pDstCurrent + 4 <= pEnd) + while (pDstCurrent <= pEnd - 4) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); Vector128 dstVector = Sse.LoadVector128(pDstCurrent); @@ -586,7 +586,7 @@ public static unsafe void AddScaleCopyU(float scale, Span src, Span scaleVector = Sse.SetAllVector128(scale); - while (pResCurrent + 4 <= pResEnd) + while (pResCurrent <= pResEnd - 4) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); Vector128 dstVector = Sse.LoadVector128(pDstCurrent); @@ -627,7 +627,7 @@ public static unsafe void AddScaleSU(float scale, Span src, Span idx Vector128 scaleVector = Sse.SetAllVector128(scale); - while (pIdxCurrent + 4 <= pEnd) + while (pIdxCurrent <= pEnd - 4) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); Vector128 dstVector = Load4(pDstCurrent, pIdxCurrent); @@ -659,7 +659,7 @@ public static unsafe void AddU(Span src, Span dst) float* pDstCurrent = pdst; float* pEnd = psrc + src.Length; - while (pSrcCurrent + 4 <= pEnd) + while (pSrcCurrent <= pEnd - 4) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); Vector128 dstVector = Sse.LoadVector128(pDstCurrent); @@ -696,7 +696,7 @@ public static unsafe void AddSU(Span src, Span idx, Span dst) float* pDstCurrent = pdst; int* pEnd = pidx + idx.Length; - while (pIdxCurrent + 4 <= pEnd) + while (pIdxCurrent <= pEnd - 4) { Vector128 dstVector = Load4(pDstCurrent, pIdxCurrent); Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); @@ -729,7 +729,7 @@ public static unsafe void MulElementWiseU(Span src1, Span src2, Sp float* pDstCurrent = pdst; float* pEnd = pdst + dst.Length; - while (pDstCurrent + 4 <= pEnd) + while (pDstCurrent <= pEnd - 4) { Vector128 src1Vector = Sse.LoadVector128(pSrc1Current); Vector128 src2Vector = Sse.LoadVector128(pSrc2Current); @@ -764,7 +764,7 @@ public static unsafe float SumU(Span src) Vector128 result = Sse.SetZeroVector128(); - while (pSrcCurrent + 4 <= pSrcEnd) + while (pSrcCurrent <= pSrcEnd - 4) { result = Sse.Add(result, Sse.LoadVector128(pSrcCurrent)); pSrcCurrent += 4; @@ -791,7 +791,7 @@ public static unsafe float SumSqU(Span src) Vector128 result = Sse.SetZeroVector128(); - while (pSrcCurrent + 4 <= pSrcEnd) + while (pSrcCurrent <= pSrcEnd - 4) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); result = Sse.Add(result, Sse.Multiply(srcVector, srcVector)); @@ -823,7 +823,7 @@ public static unsafe float SumSqDiffU(float mean, Span src) Vector128 result = Sse.SetZeroVector128(); Vector128 meanVector = Sse.SetAllVector128(mean); - while (pSrcCurrent + 4 <= pSrcEnd) + while (pSrcCurrent <= pSrcEnd - 4) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); srcVector = Sse.Subtract(srcVector, meanVector); @@ -856,7 +856,7 @@ public static unsafe float SumAbsU(Span src) Vector128 result = Sse.SetZeroVector128(); - while (pSrcCurrent + 4 <= pSrcEnd) + while (pSrcCurrent <= pSrcEnd - 4) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); result = Sse.Add(result, Sse.And(srcVector, AbsMask128)); @@ -888,7 +888,7 @@ public static unsafe float SumAbsDiffU(float mean, Span src) Vector128 result = Sse.SetZeroVector128(); Vector128 meanVector = Sse.SetAllVector128(mean); - while (pSrcCurrent + 4 <= pSrcEnd) + while (pSrcCurrent <= pSrcEnd - 4) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); srcVector = Sse.Subtract(srcVector, meanVector); @@ -921,7 +921,7 @@ public static unsafe float MaxAbsU(Span src) Vector128 result = Sse.SetZeroVector128(); - while (pSrcCurrent + 4 <= pSrcEnd) + while (pSrcCurrent <= pSrcEnd - 4) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); result = Sse.Max(result, Sse.And(srcVector, AbsMask128)); @@ -953,7 +953,7 @@ public static unsafe float MaxAbsDiffU(float mean, Span src) Vector128 result = Sse.SetZeroVector128(); Vector128 meanVector = Sse.SetAllVector128(mean); - while (pSrcCurrent + 4 <= pSrcEnd) + while (pSrcCurrent <= pSrcEnd - 4) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); srcVector = Sse.Subtract(srcVector, meanVector); @@ -988,7 +988,7 @@ public static unsafe float DotU(Span src, Span dst) Vector128 result = Sse.SetZeroVector128(); - while (pSrcCurrent + 4 <= pSrcEnd) + while (pSrcCurrent <= pSrcEnd - 4) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); Vector128 dstVector = Sse.LoadVector128(pDstCurrent); @@ -1029,7 +1029,7 @@ public static unsafe float DotSU(Span src, Span dst, Span idx Vector128 result = Sse.SetZeroVector128(); - while (pIdxCurrent + 4 <= pIdxEnd) + while (pIdxCurrent <= pIdxEnd - 4) { Vector128 srcVector = Load4(pSrcCurrent, pIdxCurrent); Vector128 dstVector = Sse.LoadVector128(pDstCurrent); @@ -1068,7 +1068,7 @@ public static unsafe float Dist2(Span src, Span dst) Vector128 sqDistanceVector = Sse.SetZeroVector128(); - while (pSrcCurrent + 4 <= pSrcEnd) + while (pSrcCurrent <= pSrcEnd - 4) { Vector128 distanceVector = Sse.Subtract(Sse.LoadVector128(pSrcCurrent), Sse.LoadVector128(pDstCurrent)); @@ -1111,7 +1111,7 @@ public static unsafe void SdcaL1UpdateU(float primalUpdate, Span src, flo Vector128 signMask = Sse.SetAllVector128(-0.0f); // 0x8000 0000 Vector128 xThreshold = Sse.SetAllVector128(threshold); - while (pSrcCurrent + 4 <= pSrcEnd) + while (pSrcCurrent <= pSrcEnd - 4) { Vector128 xSrc = Sse.LoadVector128(pSrcCurrent); @@ -1156,7 +1156,7 @@ public static unsafe void SdcaL1UpdateSU(float primalUpdate, Span src, Sp Vector128 signMask = Sse.SetAllVector128(-0.0f); // 0x8000 0000 Vector128 xThreshold = Sse.SetAllVector128(threshold); - while (pIdxCurrent + 4 <= pIdxEnd) + while (pIdxCurrent <= pIdxEnd - 4) { Vector128 xSrc = Sse.LoadVector128(pSrcCurrent); From 81401e9ae2de35a5bf140bb1971165f7b787a219 Mon Sep 17 00:00:00 2001 From: Pratik Gadhiya Date: Sat, 6 Oct 2018 15:06:27 +0530 Subject: [PATCH 3/3] Rename LinearClassificationTrainer to StochasticDualCoordinateAscent --- .../Standard/LinearClassificationTrainer.cs | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/Microsoft.ML.StandardLearners/Standard/LinearClassificationTrainer.cs b/src/Microsoft.ML.StandardLearners/Standard/LinearClassificationTrainer.cs index 7bcb8ef04c..5af4eeb69f 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/LinearClassificationTrainer.cs +++ b/src/Microsoft.ML.StandardLearners/Standard/LinearClassificationTrainer.cs @@ -20,10 +20,10 @@ using System.Threading; using System.Threading.Tasks; -[assembly: LoadableClass(typeof(LinearClassificationTrainer), typeof(LinearClassificationTrainer.Arguments), +[assembly: LoadableClass(typeof(StochasticDualCoordinateAscent), typeof(StochasticDualCoordinateAscent.Arguments), new[] { typeof(SignatureBinaryClassifierTrainer), typeof(SignatureTrainer), typeof(SignatureFeatureScorerTrainer) }, - LinearClassificationTrainer.UserNameValue, - LinearClassificationTrainer.LoadNameValue, + StochasticDualCoordinateAscent.UserNameValue, + StochasticDualCoordinateAscent.LoadNameValue, "LinearClassifier", "lc", "sasdca")] @@ -1361,7 +1361,7 @@ public void Add(Double summand) } } - public sealed class LinearClassificationTrainer : SdcaTrainerBase, TScalarPredictor> + public sealed class StochasticDualCoordinateAscent : SdcaTrainerBase, TScalarPredictor> { public const string LoadNameValue = "SDCA"; internal const string UserNameValue = "Fast Linear (SA-SDCA)"; @@ -1401,7 +1401,7 @@ internal override void Check(IHostEnvironment env) public override TrainerInfo Info { get; } - public LinearClassificationTrainer(IHostEnvironment env, Arguments args, + public StochasticDualCoordinateAscent(IHostEnvironment env, Arguments args, string featureColumn, string labelColumn, string weightColumn = null) : base(Contracts.CheckRef(env, nameof(env)).Register(LoadNameValue), args, MakeFeatureColumn(featureColumn), MakeLabelColumn(labelColumn), MakeWeightColumn(weightColumn)) { @@ -1431,7 +1431,7 @@ public LinearClassificationTrainer(IHostEnvironment env, Arguments args, } - public LinearClassificationTrainer(IHostEnvironment env, Arguments args) + public StochasticDualCoordinateAscent(IHostEnvironment env, Arguments args) : this(env, args, args.FeatureColumn, args.LabelColumn) { } @@ -1903,19 +1903,19 @@ public static partial class Sdca { [TlcModule.EntryPoint(Name = "Trainers.StochasticDualCoordinateAscentBinaryClassifier", Desc = "Train an SDCA binary model.", - UserName = LinearClassificationTrainer.UserNameValue, - ShortName = LinearClassificationTrainer.LoadNameValue, + UserName = StochasticDualCoordinateAscent.UserNameValue, + ShortName = StochasticDualCoordinateAscent.LoadNameValue, XmlInclude = new[] { @"", @"" })] - public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironment env, LinearClassificationTrainer.Arguments input) + public static CommonOutputs.BinaryClassificationOutput TrainBinary(IHostEnvironment env, StochasticDualCoordinateAscent.Arguments input) { Contracts.CheckValue(env, nameof(env)); var host = env.Register("TrainSDCA"); host.CheckValue(input, nameof(input)); EntryPointUtils.CheckInputArgs(host, input); - return LearnerEntryPointsUtils.Train(host, input, - () => new LinearClassificationTrainer(host, input), + return LearnerEntryPointsUtils.Train(host, input, + () => new StochasticDualCoordinateAscent(host, input), () => LearnerEntryPointsUtils.FindColumn(host, input.TrainingData.Schema, input.LabelColumn), calibrator: input.Calibrator, maxCalibrationExamples: input.MaxCalibrationExamples); }