diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs index 6c6c1fe6ad..b238d602b0 100644 --- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs +++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs @@ -9,10 +9,182 @@ namespace Microsoft.ML.Runtime.Internal.CpuMath { public static partial class CpuMathUtils { + public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crun) + { + Contracts.Assert(mat.Size == dst.Size * src.Size); + Contracts.Assert(crun >= 0); + + if (Sse.IsSupported) + { + if (!tran) + { + Contracts.Assert(crun <= dst.Size); + SseIntrinsics.MatMulA(add, mat, src, dst, crun, src.Size); + } + else + { + Contracts.Assert(crun <= src.Size); + SseIntrinsics.MatMulTranA(add, mat, src, dst, dst.Size, crun); + } + } + else + { + if (!tran) + { + Contracts.Assert(crun <= dst.Size); + for (int i = 0; i < crun; i++) + { + float dotProduct = 0; + for (int j = 0; j < src.Size; j++) + { + dotProduct += mat[i * src.Size + j] * src[j]; + } + + if (add) + { + dst[i] += dotProduct; + } + else + { + dst[i] = dotProduct; + } + } + } + else + { + Contracts.Assert(crun <= src.Size); + for (int i = 0; i < dst.Size; i++) + { + float dotProduct = 0; + for (int j = 0; j < crun; j++) + { + dotProduct += mat[j * src.Size + i] * src[j]; + } + + if (add) + { + dst[i] += dotProduct; + } + else + { + dst[i] = dotProduct; + } + } + } + } + } + + public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgposSrc, AlignedArray srcValues, + int posMin, int iposMin, int iposLim, AlignedArray dst, int crun) + { + Contracts.AssertValue(rgposSrc); + Contracts.Assert(iposMin >= 0); + Contracts.Assert(iposMin <= iposLim); + Contracts.Assert(iposLim <= rgposSrc.Length); + Contracts.Assert(mat.Size == dst.Size * srcValues.Size); + + if (iposMin >= iposLim) + { + if (!add) + dst.ZeroItems(); + return; + } + + Contracts.AssertNonEmpty(rgposSrc); + 
Contracts.Assert(crun >= 0); + + if (Sse.IsSupported) + { + if (!tran) + { + Contracts.Assert(crun <= dst.Size); + SseIntrinsics.MatMulPA(add, mat, rgposSrc, srcValues, posMin, iposMin, iposLim, dst, crun, srcValues.Size); + } + else + { + Contracts.Assert(crun <= srcValues.Size); + SseIntrinsics.MatMulTranPA(add, mat, rgposSrc, srcValues, posMin, iposMin, iposLim, dst, dst.Size); + } + } + else + { + if (!tran) + { + Contracts.Assert(crun <= dst.Size); + for (int i = 0; i < crun; i++) + { + float dotProduct = 0; + for (int j = iposMin; j < iposLim; j++) + { + int col = rgposSrc[j] - posMin; + dotProduct += mat[i * srcValues.Size + col] * srcValues[col]; + } + + if (add) + { + dst[i] += dotProduct; + } + else + { + dst[i] = dotProduct; + } + } + } + else + { + Contracts.Assert(crun <= srcValues.Size); + for (int i = 0; i < dst.Size; i++) + { + float dotProduct = 0; + for (int j = iposMin; j < iposLim; j++) + { + int col = rgposSrc[j] - posMin; + dotProduct += mat[col * dst.Size + i] * srcValues[col]; + } + + if (add) + { + dst[i] += dotProduct; + } + else + { + dst[i] = dotProduct; + } + } + + } + } + } + + public static void Add(float a, float[] dst, int count) + { + Contracts.AssertNonEmpty(dst); + Contracts.Assert(count > 0); + Contracts.Assert(count <= dst.Length); + + Add(a, new Span(dst, 0, count)); + } + + private static void Add(float a, Span dst) + { + if (Sse.IsSupported) + { + SseIntrinsics.AddScalarU(a, dst); + } + else + { + for (int i = 0; i < dst.Length; i++) + { + dst[i] += a; + } + } + } + public static void Scale(float a, float[] dst, int count) { Contracts.AssertNonEmpty(dst); - Contracts.Assert(0 < count && count <= dst.Length); + Contracts.Assert(count > 0); + Contracts.Assert(count <= dst.Length); Scale(a, new Span(dst, 0, count)); } @@ -20,8 +192,9 @@ public static void Scale(float a, float[] dst, int count) public static void Scale(float a, float[] dst, int offset, int count) { Contracts.AssertNonEmpty(dst); - Contracts.Assert(0 < count); - 
Contracts.Assert(0 <= offset && offset < dst.Length - count); + Contracts.Assert(count > 0); + Contracts.Assert(offset >= 0); + Contracts.Assert(offset < (dst.Length - count)); Scale(a, new Span(dst, offset, count)); } @@ -41,11 +214,64 @@ private static void Scale(float a, Span dst) } } + // dst = a * src + public static void Scale(float a, float[] src, float[] dst, int count) + { + Contracts.AssertNonEmpty(src); + Contracts.AssertNonEmpty(dst); + Contracts.Assert(count > 0); + Contracts.Assert(count <= src.Length); + Contracts.Assert(count <= dst.Length); + + Scale(a, new Span(src, 0, count), new Span(dst, 0, count)); + } + + private static void Scale(float a, Span src, Span dst) + { + if (Sse.IsSupported) + { + SseIntrinsics.ScaleSrcU(a, src, dst); + } + else + { + for (int i = 0; i < dst.Length; i++) + { + dst[i] = a * src[i]; + } + } + } + + // dst[i] = a * (dst[i] + b) + public static void ScaleAdd(float a, float b, float[] dst, int count) + { + Contracts.AssertNonEmpty(dst); + Contracts.Assert(count > 0); + Contracts.Assert(count <= dst.Length); + + ScaleAdd(a, b, new Span(dst, 0, count)); + } + + private static void ScaleAdd(float a, float b, Span dst) + { + if (Sse.IsSupported) + { + SseIntrinsics.ScaleAddU(a, b, dst); + } + else + { + for (int i = 0; i < dst.Length; i++) + { + dst[i] = a * (dst[i] + b); + } + } + } + public static void AddScale(float a, float[] src, float[] dst, int count) { Contracts.AssertNonEmpty(src); - Contracts.Assert(0 < count && count <= src.Length); Contracts.AssertNonEmpty(dst); + Contracts.Assert(count > 0); + Contracts.Assert(count <= src.Length); Contracts.Assert(count <= dst.Length); AddScale(a, new Span(src, 0, count), new Span(dst, 0, count)); @@ -54,10 +280,12 @@ public static void AddScale(float a, float[] src, float[] dst, int count) public static void AddScale(float a, float[] src, float[] dst, int dstOffset, int count) { Contracts.AssertNonEmpty(src); - Contracts.Assert(count <= src.Length); 
Contracts.AssertNonEmpty(dst); - Contracts.Assert(0 <= dstOffset && dstOffset < dst.Length); - Contracts.Assert(0 < count && count <= dst.Length - dstOffset); + Contracts.Assert(dstOffset >= 0); + Contracts.Assert(dstOffset < dst.Length); + Contracts.Assert(count > 0); + Contracts.Assert(count <= src.Length); + Contracts.Assert(count <= (dst.Length - dstOffset)); AddScale(a, new Span(src, 0, count), new Span(dst, dstOffset, count)); } @@ -80,10 +308,11 @@ private static void AddScale(float a, Span src, Span dst) public static void AddScale(float a, float[] src, int[] indices, float[] dst, int count) { Contracts.AssertNonEmpty(src); - Contracts.Assert(0 < count && count <= src.Length); Contracts.AssertNonEmpty(indices); - Contracts.Assert(count <= indices.Length); Contracts.AssertNonEmpty(dst); + Contracts.Assert(count > 0); + Contracts.Assert(count <= src.Length); + Contracts.Assert(count <= indices.Length); Contracts.Assert(count < dst.Length); AddScale(a, new Span(src), new Span(indices, 0, count), new Span(dst)); @@ -92,12 +321,14 @@ public static void AddScale(float a, float[] src, int[] indices, float[] dst, in public static void AddScale(float a, float[] src, int[] indices, float[] dst, int dstOffset, int count) { Contracts.AssertNonEmpty(src); - Contracts.Assert(0 < count && count <= src.Length); Contracts.AssertNonEmpty(indices); - Contracts.Assert(count <= indices.Length); Contracts.AssertNonEmpty(dst); - Contracts.Assert(0 <= dstOffset && dstOffset < dst.Length); - Contracts.Assert(count < dst.Length - dstOffset); + Contracts.Assert(dstOffset >= 0); + Contracts.Assert(dstOffset < dst.Length); + Contracts.Assert(count > 0); + Contracts.Assert(count <= src.Length); + Contracts.Assert(count <= indices.Length); + Contracts.Assert(count < (dst.Length - dstOffset)); AddScale(a, new Span(src), new Span(indices, 0, count), new Span(dst, dstOffset, dst.Length - dstOffset)); @@ -119,11 +350,40 @@ private static void AddScale(float a, Span src, Span indices, Span 
0); + Contracts.Assert(count <= src.Length); + Contracts.Assert(count <= dst.Length); + Contracts.Assert(count <= res.Length); + + AddScaleCopy(a, new Span(src, 0, count), new Span(dst, 0, count), new Span(res, 0, count)); + } + + private static void AddScaleCopy(float a, Span src, Span dst, Span res) + { + if (Sse.IsSupported) + { + SseIntrinsics.AddScaleCopyU(a, src, dst, res); + } + else + { + for (int i = 0; i < res.Length; i++) + { + res[i] = a * src[i] + dst[i]; + } + } + } + public static void Add(float[] src, float[] dst, int count) { Contracts.AssertNonEmpty(src); - Contracts.Assert(0 < count && count <= src.Length); Contracts.AssertNonEmpty(dst); + Contracts.Assert(count > 0); + Contracts.Assert(count <= src.Length); Contracts.Assert(count <= dst.Length); Add(new Span(src, 0, count), new Span(dst, 0, count)); @@ -147,10 +407,11 @@ private static void Add(Span src, Span dst) public static void Add(float[] src, int[] indices, float[] dst, int count) { Contracts.AssertNonEmpty(src); - Contracts.Assert(0 < count && count <= src.Length); Contracts.AssertNonEmpty(indices); - Contracts.Assert(count <= indices.Length); Contracts.AssertNonEmpty(dst); + Contracts.Assert(count > 0); + Contracts.Assert(count <= src.Length); + Contracts.Assert(count <= indices.Length); Contracts.Assert(count < dst.Length); Add(new Span(src), new Span(indices, 0, count), new Span(dst)); @@ -159,12 +420,14 @@ public static void Add(float[] src, int[] indices, float[] dst, int count) public static void Add(float[] src, int[] indices, float[] dst, int dstOffset, int count) { Contracts.AssertNonEmpty(src); - Contracts.Assert(0 < count && count <= src.Length); Contracts.AssertNonEmpty(indices); - Contracts.Assert(count <= indices.Length); Contracts.AssertNonEmpty(dst); - Contracts.Assert(0 <= dstOffset && dstOffset < dst.Length); - Contracts.Assert(count <= dst.Length - dstOffset); + Contracts.Assert(dstOffset >= 0); + Contracts.Assert(dstOffset < dst.Length); + Contracts.Assert(count > 0); 
+ Contracts.Assert(count <= src.Length); + Contracts.Assert(count <= indices.Length); + Contracts.Assert(count <= (dst.Length - dstOffset)); Add(new Span(src), new Span(indices, 0, count), new Span(dst, dstOffset, dst.Length - dstOffset)); @@ -189,10 +452,11 @@ private static void Add(Span src, Span indices, Span dst) public static void MulElementWise(float[] src1, float[] src2, float[] dst, int count) { Contracts.AssertNonEmpty(src1); - Contracts.Assert(0 < count && count <= src1.Length); Contracts.AssertNonEmpty(src2); - Contracts.Assert(0 < count && count <= src2.Length); Contracts.AssertNonEmpty(dst); + Contracts.Assert(count > 0); + Contracts.Assert(count <= src1.Length); + Contracts.Assert(count <= src2.Length); MulElementWise(new Span(src1, 0, count), new Span(src2, 0, count), new Span(dst, 0, count)); @@ -213,10 +477,47 @@ private static void MulElementWise(Span src1, Span src2, Span 0); + Contracts.Assert(count <= src.Length); + + return Sum(new Span(src, 0, count)); + } + + public static float Sum(float[] src, int offset, int count) + { + Contracts.AssertNonEmpty(src); + Contracts.Assert(count > 0); + Contracts.Assert(offset >= 0); + Contracts.Assert(offset <= (src.Length - count)); + + return Sum(new Span(src, offset, count)); + } + + private static float Sum(Span src) + { + if (Sse.IsSupported) + { + return SseIntrinsics.SumU(src); + } + else + { + float sum = 0; + for (int i = 0; i < src.Length; i++) + { + sum += src[i]; + } + return sum; + } + } + public static float SumSq(float[] src, int count) { Contracts.AssertNonEmpty(src); - Contracts.Assert(0 < count && count <= src.Length); + Contracts.Assert(count > 0); + Contracts.Assert(count <= src.Length); return SumSq(new Span(src, 0, count)); } @@ -224,8 +525,9 @@ public static float SumSq(float[] src, int count) public static float SumSq(float[] src, int offset, int count) { Contracts.AssertNonEmpty(src); - Contracts.Assert(0 < count); - Contracts.Assert(0 <= offset && offset <= src.Length - count); + 
Contracts.Assert(count > 0); + Contracts.Assert(offset >= 0); + Contracts.Assert(offset <= (src.Length - count)); return SumSq(new Span(src, offset, count)); } @@ -247,10 +549,38 @@ private static float SumSq(Span src) } } + public static float SumSq(float mean, float[] src, int offset, int count) + { + Contracts.AssertNonEmpty(src); + Contracts.Assert(count > 0); + Contracts.Assert(offset >= 0); + Contracts.Assert(offset <= (src.Length - count)); + + return SumSq(mean, new Span(src, offset, count)); + } + + private static float SumSq(float mean, Span src) + { + if (Sse.IsSupported) + { + return (mean == 0) ? SseIntrinsics.SumSqU(src) : SseIntrinsics.SumSqDiffU(mean, src); + } + else + { + float result = 0; + for (int i = 0; i < src.Length; i++) + { + result += (src[i] - mean) * (src[i] - mean); + } + return result; + } + } + public static float SumAbs(float[] src, int count) { Contracts.AssertNonEmpty(src); - Contracts.Assert(0 < count && count <= src.Length); + Contracts.Assert(count > 0); + Contracts.Assert(count <= src.Length); return SumAbs(new Span(src, 0, count)); } @@ -258,8 +588,9 @@ public static float SumAbs(float[] src, int count) public static float SumAbs(float[] src, int offset, int count) { Contracts.AssertNonEmpty(src); - Contracts.Assert(0 < count); - Contracts.Assert(0 <= offset && offset <= src.Length - count); + Contracts.Assert(count > 0); + Contracts.Assert(offset >= 0); + Contracts.Assert(offset <= (src.Length - count)); return SumAbs(new Span(src, offset, count)); } @@ -281,11 +612,108 @@ private static float SumAbs(Span src) } } + public static float SumAbs(float mean, float[] src, int offset, int count) + { + Contracts.AssertNonEmpty(src); + Contracts.Assert(count > 0); + Contracts.Assert(offset >= 0); + Contracts.Assert(offset <= (src.Length - count)); + + return SumAbs(mean, new Span(src, offset, count)); + } + + private static float SumAbs(float mean, Span src) + { + if (Sse.IsSupported) + { + return (mean == 0) ? 
SseIntrinsics.SumAbsU(src) : SseIntrinsics.SumAbsDiffU(mean, src); + } + else + { + float sum = 0; + for (int i = 0; i < src.Length; i++) + { + sum += Math.Abs(src[i] - mean); + } + return sum; + } + } + + public static float MaxAbs(float[] src, int count) + { + Contracts.AssertNonEmpty(src); + Contracts.Assert(count > 0); + Contracts.Assert(count <= src.Length); + + return MaxAbs(new Span(src, 0, count)); + } + + public static float MaxAbs(float[] src, int offset, int count) + { + Contracts.AssertNonEmpty(src); + Contracts.Assert(count > 0); + Contracts.Assert(offset >= 0); + Contracts.Assert(offset <= (src.Length - count)); + + return MaxAbs(new Span(src, offset, count)); + } + + private static float MaxAbs(Span src) + { + if (Sse.IsSupported) + { + return SseIntrinsics.MaxAbsU(src); + } + else + { + float max = 0; + for (int i = 0; i < src.Length; i++) + { + float abs = Math.Abs(src[i]); + if (abs > max) + { + max = abs; + } + } + return max; + } + } + + public static float MaxAbsDiff(float mean, float[] src, int count) + { + Contracts.AssertNonEmpty(src); + Contracts.Assert(count > 0); + Contracts.Assert(count <= src.Length); + + return MaxAbsDiff(mean, new Span(src, 0, count)); + } + + private static float MaxAbsDiff(float mean, Span src) + { + if (Sse.IsSupported) + { + return SseIntrinsics.MaxAbsDiffU(mean, src); + } + else + { + float max = 0; + for (int i = 0; i < src.Length; i++) + { + float abs = Math.Abs(src[i] - mean); + if (abs > max) + { + max = abs; + } + } + return max; + } + } + public static float DotProductDense(float[] a, float[] b, int count) { Contracts.AssertNonEmpty(a); Contracts.AssertNonEmpty(b); - Contracts.Assert(0 < count); + Contracts.Assert(count > 0); Contracts.Assert(a.Length >= count); Contracts.Assert(b.Length >= count); @@ -295,10 +723,11 @@ public static float DotProductDense(float[] a, float[] b, int count) public static float DotProductDense(float[] a, int offset, float[] b, int count) { Contracts.AssertNonEmpty(a); - 
Contracts.Assert(0 < count); - Contracts.Assert(0 <= offset && offset <= a.Length - count); Contracts.AssertNonEmpty(b); - Contracts.Assert(b.Length >= count); + Contracts.Assert(count > 0); + Contracts.Assert(count <= b.Length); + Contracts.Assert(offset >= 0); + Contracts.Assert(offset <= (a.Length - count)); return DotProductDense(new Span(a, offset, count), new Span(b, 0, count)); } @@ -324,7 +753,8 @@ public static float DotProductSparse(float[] a, float[] b, int[] indices, int co { Contracts.AssertNonEmpty(a); Contracts.AssertNonEmpty(b); - Contracts.Assert(0 < count); + Contracts.AssertNonEmpty(indices); + Contracts.Assert(count > 0); Contracts.Assert(count < a.Length); Contracts.Assert(count <= b.Length); Contracts.Assert(count <= indices.Length); @@ -336,12 +766,14 @@ public static float DotProductSparse(float[] a, float[] b, int[] indices, int co public static float DotProductSparse(float[] a, int offset, float[] b, int[] indices, int count) { Contracts.AssertNonEmpty(a); - Contracts.Assert(0 < count); - Contracts.Assert(0 <= offset && offset < a.Length); - Contracts.Assert(a.Length - offset > count); Contracts.AssertNonEmpty(b); + Contracts.AssertNonEmpty(indices); + Contracts.Assert(count > 0); + Contracts.Assert(count < (a.Length - offset)); Contracts.Assert(count <= b.Length); Contracts.Assert(count <= indices.Length); + Contracts.Assert(offset >= 0); + Contracts.Assert(offset < a.Length); return DotProductSparse(new Span(a, offset, a.Length - offset), new Span(b), new Span(indices, 0, count)); @@ -369,7 +801,8 @@ public static float L2DistSquared(float[] a, float[] b, int count) { Contracts.AssertNonEmpty(a); Contracts.AssertNonEmpty(b); - Contracts.Assert(0 < count && count <= a.Length); + Contracts.Assert(count > 0); + Contracts.Assert(count <= a.Length); Contracts.Assert(count <= b.Length); return L2DistSquared(new Span(a, 0, count), new Span(b, 0, count)); @@ -392,5 +825,135 @@ private static float L2DistSquared(Span a, Span b) return norm; } } + 
+ public static void ZeroMatrixItems(AlignedArray dst, int ccol, int cfltRow, int[] indices) + { + Contracts.Assert(ccol > 0); + Contracts.Assert(ccol <= cfltRow); + + if (ccol == cfltRow) + { + ZeroItemsU(dst, dst.Size, indices, indices.Length); + } + else + { + ZeroMatrixItemsCore(dst, dst.Size, ccol, cfltRow, indices, indices.Length); + } + } + + private static unsafe void ZeroItemsU(AlignedArray dst, int c, int[] indices, int cindices) + { + fixed (float* pdst = &dst.Items[0]) + fixed (int* pidx = &indices[0]) + { + for (int i = 0; i < cindices; ++i) + { + int index = pidx[i]; + Contracts.Assert(index >= 0); + Contracts.Assert(index < c); + pdst[index] = 0; + } + } + } + + private static unsafe void ZeroMatrixItemsCore(AlignedArray dst, int c, int ccol, int cfltRow, int[] indices, int cindices) + { + fixed (float* pdst = &dst.Items[0]) + fixed (int* pidx = &indices[0]) + { + int ivLogMin = 0; + int ivLogLim = ccol; + int ivPhyMin = 0; + + for (int i = 0; i < cindices; ++i) + { + int index = pidx[i]; + Contracts.Assert(index >= 0); + Contracts.Assert(index < c); + + int col = index - ivLogMin; + if ((uint)col >= (uint)ccol) + { + Contracts.Assert(ivLogMin > index || index >= ivLogLim); + + int row = index / ccol; + ivLogMin = row * ccol; + ivLogLim = ivLogMin + ccol; + ivPhyMin = row * cfltRow; + + Contracts.Assert(index >= ivLogMin); + Contracts.Assert(index < ivLogLim); + col = index - ivLogMin; + } + + pdst[ivPhyMin + col] = 0; + } + } + } + + public static void SdcaL1UpdateDense(float primalUpdate, int length, float[] src, float threshold, float[] v, float[] w) + { + Contracts.AssertNonEmpty(src); + Contracts.AssertNonEmpty(v); + Contracts.AssertNonEmpty(w); + Contracts.Assert(length > 0); + Contracts.Assert(length <= src.Length); + Contracts.Assert(length <= v.Length); + Contracts.Assert(length <= w.Length); + + SdcaL1UpdateDense(primalUpdate, new Span(src, 0, length), threshold, new Span(v, 0, length), new Span(w, 0, length)); + } + + private static void 
SdcaL1UpdateDense(float primalUpdate, Span src, float threshold, Span v, Span w) + { + if (Sse.IsSupported) + { + SseIntrinsics.SdcaL1UpdateU(primalUpdate, src, threshold, v, w); + } + else + { + for (int i = 0; i < src.Length; i++) + { + v[i] += src[i] * primalUpdate; + float value = v[i]; + w[i] = Math.Abs(value) > threshold ? (value > 0 ? value - threshold : value + threshold) : 0; + } + } + } + + // REVIEW NEEDED: The second argument "length" is unused even in the existing code. + public static void SdcaL1UpdateSparse(float primalUpdate, int length, float[] src, int[] indices, int count, float threshold, float[] v, float[] w) + { + Contracts.AssertNonEmpty(src); + Contracts.AssertNonEmpty(indices); + Contracts.AssertNonEmpty(v); + Contracts.AssertNonEmpty(w); + Contracts.Assert(count > 0); + Contracts.Assert(count <= src.Length); + Contracts.Assert(count <= indices.Length); + Contracts.Assert(count < length); + Contracts.Assert(length <= v.Length); + Contracts.Assert(length <= w.Length); + + SdcaL1UpdateSparse(primalUpdate, new Span(src, 0, count), new Span(indices, 0, count), threshold, new Span(v), new Span(w)); + } + + private static void SdcaL1UpdateSparse(float primalUpdate, Span src, Span indices, float threshold, Span v, Span w) + { + if (Sse.IsSupported) + { + SseIntrinsics.SdcaL1UpdateSU(primalUpdate, src, indices, threshold, v, w); + } + else + { + for (int i = 0; i < indices.Length; i++) + { + int index = indices[i]; + v[index] += src[i] * primalUpdate; + float value = v[index]; + w[index] = Math.Abs(value) > threshold ? (value > 0 ? 
value - threshold : value + threshold) : 0; + } + } + } } } diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs index 501fc9082e..730fb10be7 100644 --- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs +++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs @@ -6,10 +6,21 @@ namespace Microsoft.ML.Runtime.Internal.CpuMath { public static partial class CpuMathUtils { + public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crun) => SseUtils.MatTimesSrc(tran, add, mat, src, dst, crun); + + public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgposSrc, AlignedArray srcValues, + int posMin, int iposMin, int iposLim, AlignedArray dst, int crun) => SseUtils.MatTimesSrc(tran, add, mat, rgposSrc, srcValues, posMin, iposMin, iposLim, dst, crun); + + public static void Add(float a, float[] dst, int count) => SseUtils.Add(a, dst, count); + public static void Scale(float a, float[] dst, int count) => SseUtils.Scale(a, dst, count); public static void Scale(float a, float[] dst, int offset, int count) => SseUtils.Scale(a, dst, offset, count); + public static void Scale(float a, float[] src, float[] dst, int count) => SseUtils.Scale(a, src, dst, count); + + public static void ScaleAdd(float a, float b, float[] dst, int count) => SseUtils.ScaleAdd(a, b, dst, count); + public static void AddScale(float a, float[] src, float[] dst, int count) => SseUtils.AddScale(a, src, dst, count); public static void AddScale(float a, float[] src, float[] dst, int dstOffset, int count) => SseUtils.AddScale(a, src, dst, dstOffset, count); @@ -18,6 +29,8 @@ public static partial class CpuMathUtils public static void AddScale(float a, float[] src, int[] indices, float[] dst, int dstOffset, int count) => SseUtils.AddScale(a, src, indices, dst, dstOffset, count); + public static void AddScaleCopy(float a, float[] src, float[] dst, float[] res, int 
count) => SseUtils.AddScaleCopy(a, src, dst, res, count); + public static void Add(float[] src, float[] dst, int count) => SseUtils.Add(src, dst, count); public static void Add(float[] src, int[] indices, float[] dst, int count) => SseUtils.Add(src, indices, dst, count); @@ -26,14 +39,28 @@ public static partial class CpuMathUtils public static void MulElementWise(float[] src1, float[] src2, float[] dst, int count) => SseUtils.MulElementWise(src1, src2, dst, count); + public static float Sum(float[] src, int count) => SseUtils.Sum(src, count); + + public static float Sum(float[] src, int offset, int count) => SseUtils.Sum(src, offset, count); + public static float SumSq(float[] src, int count) => SseUtils.SumSq(src, count); public static float SumSq(float[] src, int offset, int count) => SseUtils.SumSq(src, offset, count); + public static float SumSq(float mean, float[] src, int offset, int count) => SseUtils.SumSq(mean, src, offset, count); + public static float SumAbs(float[] src, int count) => SseUtils.SumAbs(src, count); public static float SumAbs(float[] src, int offset, int count) => SseUtils.SumAbs(src, offset, count); + public static float SumAbs(float mean, float[] src, int offset, int count) => SseUtils.SumAbs(mean, src, offset, count); + + public static float MaxAbs(float[] src, int count) => SseUtils.MaxAbs(src, count); + + public static float MaxAbs(float[] src, int offset, int count) => SseUtils.MaxAbs(src, offset, count); + + public static float MaxAbsDiff(float mean, float[] src, int count) => SseUtils.MaxAbsDiff(mean, src, count); + public static float DotProductDense(float[] a, float[] b, int count) => SseUtils.DotProductDense(a, b, count); public static float DotProductDense(float[] a, int offset, float[] b, int count) => SseUtils.DotProductDense(a, offset, b, count); @@ -43,5 +70,13 @@ public static partial class CpuMathUtils public static float DotProductSparse(float[] a, int offset, float[] b, int[] indices, int count) => 
SseUtils.DotProductSparse(a, offset, b, indices, count); public static float L2DistSquared(float[] a, float[] b, int count) => SseUtils.L2DistSquared(a, b, count); + + public static void ZeroMatrixItems(AlignedArray dst, int ccol, int cfltRow, int[] indices) => SseUtils.ZeroMatrixItems(dst, ccol, cfltRow, indices); + + public static void SdcaL1UpdateDense(float primalUpdate, int length, float[] src, float threshold, float[] v, float[] w) + => SseUtils.SdcaL1UpdateDense(primalUpdate, length, src, threshold, v, w); + + public static void SdcaL1UpdateSparse(float primalUpdate, int length, float[] src, int[] indices, int count, float threshold, float[] v, float[] w) + => SseUtils.SdcaL1UpdateSparse(primalUpdate, length, src, indices, count, threshold, v, w); } } diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs index d11676f283..2ac1f56f14 100644 --- a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs +++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs @@ -4,8 +4,14 @@ // The exported function names need to be unique (can't be disambiguated based on signature), hence // we introduce suffix letters to indicate the general patterns used. +// * A suffix means aligned and padded for SSE operations. // * U suffix means unaligned and unpadded. // * S suffix means sparse (unaligned) vector. +// * P suffix means sparse (unaligned) partial vector - the vector is only part of a larger sparse vector. +// * R suffix means sparse matrix. +// * C suffix means convolution matrix. +// * D suffix means convolution matrix, with implicit source padding. +// * Tran means the matrix is transposed. 
// Translates the pinned address of an AlignedArray's first item into the address the SSE
// kernels should start from. The offset comes from a.GetBase(address); the assert below checks
// that the resulting pointer sits on a CbAlign-byte boundary.
// NOTE(review): GetBase is defined outside this view - presumably it returns an offset in
// floats, since it is added directly to a float*; confirm against AlignedArray.
private static unsafe float* Ptr(AlignedArray a, float* p)
{
    Contracts.AssertValue(a);

    var baseOffset = a.GetBase((long)p);
    float* aligned = p + baseOffset;

    // CbAlign is a power of two, so masking with (CbAlign - 1) isolates the misalignment bits.
    Contracts.Assert(((long)aligned & (CbAlign - 1)) == 0);
    return aligned;
}
// Reduces the four lanes of 'vector' to their maximum. Only lane 0 of the returned vector is
// the overall maximum; the remaining lanes carry intermediate pairwise maxima.
[MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
private static Vector128<float> VectorMax(in Vector128<float> vector)
{
    // 0xB1 swaps neighbouring lanes (ABCD -> BADC), so lanes 0/1 become max(A, B) and
    // lanes 2/3 become max(C, D).
    Vector128<float> pairMax = Sse.Max(vector, Sse.Shuffle(vector, vector, 0xB1));

    // 0x02 selects lane 2 (the max of the upper pair) into lane 0.
    Vector128<float> upperPair = Sse.Shuffle(pairMax, pairMax, 0x02);

    // Scalar max only touches lane 0: max(max(A, B), max(C, D)).
    return Sse.MaxScalar(pairMax, upperPair);
}
// Multiply matrix times vector into vector.
// Computes dst = mat * src (or dst += mat * src when 'add' is true) for a row-major matrix with
// 'crow' rows and 'ccol' columns. Four rows are produced per outer iteration and four columns
// are consumed per inner iteration, using aligned loads and stores throughout.
// NOTE(review): both loops step by 4 with no scalar tail, so crow and ccol are presumably
// multiples of 4 (the "A" suffix is documented as "aligned and padded") - confirm with callers.
internal static unsafe void MatMulA(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol)
{
    Contracts.Assert(Compat(mat));
    Contracts.Assert(Compat(src));
    Contracts.Assert(Compat(dst));

    fixed (float* pSrcStart = &src.Items[0])
    fixed (float* pDstStart = &dst.Items[0])
    fixed (float* pMatStart = &mat.Items[0])
    {
        float* psrc = Ptr(src, pSrcStart);
        float* pdst = Ptr(dst, pDstStart);
        float* pmat = Ptr(mat, pMatStart);

        float* pSrcEnd = psrc + ccol;
        float* pDstEnd = pdst + crow;
        float* pDstCurrent = pdst;
        float* pMatCurrent = pmat;

        while (pDstCurrent < pDstEnd)
        {
            // One accumulator per matrix row in the current band of four rows.
            Vector128<float> res0 = Sse.SetZeroVector128();
            Vector128<float> res1 = res0;
            Vector128<float> res2 = res0;
            Vector128<float> res3 = res0;

            float* pSrcCurrent = psrc;

            while (pSrcCurrent < pSrcEnd)
            {
                float* pMatTemp = pMatCurrent;

                // The same four columns taken from four consecutive rows (rows are ccol floats apart).
                Vector128<float> x01 = Sse.LoadAlignedVector128(pMatTemp);
                Vector128<float> x11 = Sse.LoadAlignedVector128(pMatTemp += ccol);
                Vector128<float> x21 = Sse.LoadAlignedVector128(pMatTemp += ccol);
                Vector128<float> x31 = Sse.LoadAlignedVector128(pMatTemp += ccol);
                Vector128<float> x02 = Sse.LoadAlignedVector128(pSrcCurrent);

                res0 = Sse.Add(res0, Sse.Multiply(x01, x02));
                res1 = Sse.Add(res1, Sse.Multiply(x11, x02));
                res2 = Sse.Add(res2, Sse.Multiply(x21, x02));
                res3 = Sse.Add(res3, Sse.Multiply(x31, x02));

                pSrcCurrent += 4;
                pMatCurrent += 4;
            }

            // Add up the entries of each accumulator, leaving the 4 row results in res0:
            // res0 = [sum(res0), sum(res1), sum(res2), sum(res3)].
            if (Sse3.IsSupported)
            {
                res0 = Sse3.HorizontalAdd(res0, res1);
                res2 = Sse3.HorizontalAdd(res2, res3);
                res0 = Sse3.HorizontalAdd(res0, res2);
            }
            else
            {
                // Callers gate this method on Sse.IsSupported only, so SSE3 may be missing.
                // Transpose-and-add with plain SSE instead of HADDPS.
                Vector128<float> lo01 = Sse.UnpackLow(res0, res1);   // a0 b0 a1 b1
                Vector128<float> hi01 = Sse.UnpackHigh(res0, res1);  // a2 b2 a3 b3
                Vector128<float> lo23 = Sse.UnpackLow(res2, res3);   // c0 d0 c1 d1
                Vector128<float> hi23 = Sse.UnpackHigh(res2, res3);  // c2 d2 c3 d3

                Vector128<float> sum01 = Sse.Add(lo01, hi01);        // a0+a2 b0+b2 a1+a3 b1+b3
                Vector128<float> sum23 = Sse.Add(lo23, hi23);        // c0+c2 d0+d2 c1+c3 d1+d3

                // [a0+a2, b0+b2, c0+c2, d0+d2] + [a1+a3, b1+b3, c1+c3, d1+d3]
                res0 = Sse.Add(Sse.MoveLowToHigh(sum01, sum23), Sse.MoveHighToLow(sum23, sum01));
            }

            if (add)
            {
                res0 = Sse.Add(res0, Sse.LoadAlignedVector128(pDstCurrent));
            }
            Sse.StoreAligned(pDstCurrent, res0);

            pDstCurrent += 4;
            // The inner loop already advanced pMatCurrent by one full row (ccol floats);
            // skip the remaining three rows of this band.
            pMatCurrent += 3 * ccol;
        }
    }
}
// Multiplies the transposed matrix by src. Source values are consumed four at a time: each of
// the four values is broadcast across a register and multiplied with the matching matrix row
// (rows are 'crow' floats apart), and the four products are summed into dst.
// When 'add' is false the first group of four source values overwrites dst; all other groups
// (and every group when 'add' is true) accumulate onto what dst already holds.
// NOTE(review): the first group is read unconditionally when 'add' is false and the loops step
// by 4 without a tail, so crow/ccol are presumably positive multiples of 4 - confirm with callers.
internal static unsafe void MatMulTranA(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol)
{
    Contracts.Assert(Compat(mat));
    Contracts.Assert(Compat(src));
    Contracts.Assert(Compat(dst));

    fixed (float* pSrcStart = &src.Items[0])
    fixed (float* pDstStart = &dst.Items[0])
    fixed (float* pMatStart = &mat.Items[0])
    {
        float* srcCursor = Ptr(src, pSrcStart);
        float* dstBase = Ptr(dst, pDstStart);
        float* matCursor = Ptr(mat, pMatStart);

        float* srcLimit = srcCursor + ccol;
        float* dstLimit = dstBase + crow;

        if (!add)
        {
            // Overwrite pass: handles the first four source values without reading dst.
            Vector128<float> quad = Sse.LoadAlignedVector128(srcCursor);
            // Replicate each 32-bit lane of quad (ABCD) into its own register.
            Vector128<float> s0 = Sse.Shuffle(quad, quad, 0x00); // A
            Vector128<float> s1 = Sse.Shuffle(quad, quad, 0x55); // B
            Vector128<float> s2 = Sse.Shuffle(quad, quad, 0xAA); // C
            Vector128<float> s3 = Sse.Shuffle(quad, quad, 0xFF); // D

            srcCursor += 4;

            for (float* dstCursor = dstBase; dstCursor < dstLimit; dstCursor += 4, matCursor += 4)
            {
                float* band = matCursor;
                Vector128<float> m0 = Sse.LoadAlignedVector128(band);
                Vector128<float> m1 = Sse.LoadAlignedVector128(band += crow);
                Vector128<float> m2 = Sse.LoadAlignedVector128(band += crow);
                Vector128<float> m3 = Sse.LoadAlignedVector128(band += crow);

                Vector128<float> p01 = Sse.Add(Sse.Multiply(s0, m0), Sse.Multiply(s1, m1));
                Vector128<float> p23 = Sse.Add(Sse.Multiply(s2, m2), Sse.Multiply(s3, m3));

                Sse.StoreAligned(dstCursor, Sse.Add(p01, p23));
            }

            // The inner loop walked one row (crow floats); skip the other three rows of the band.
            matCursor += 3 * crow;
        }

        // Accumulating passes over the remaining groups of four source values.
        for (; srcCursor < srcLimit; srcCursor += 4)
        {
            Vector128<float> quad = Sse.LoadAlignedVector128(srcCursor);
            // Replicate each 32-bit lane of quad (ABCD) into its own register.
            Vector128<float> s0 = Sse.Shuffle(quad, quad, 0x00); // A
            Vector128<float> s1 = Sse.Shuffle(quad, quad, 0x55); // B
            Vector128<float> s2 = Sse.Shuffle(quad, quad, 0xAA); // C
            Vector128<float> s3 = Sse.Shuffle(quad, quad, 0xFF); // D

            for (float* dstCursor = dstBase; dstCursor < dstLimit; dstCursor += 4, matCursor += 4)
            {
                float* band = matCursor;
                Vector128<float> m0 = Sse.LoadAlignedVector128(band);
                Vector128<float> m1 = Sse.LoadAlignedVector128(band += crow);
                Vector128<float> m2 = Sse.LoadAlignedVector128(band += crow);
                Vector128<float> m3 = Sse.LoadAlignedVector128(band += crow);
                Vector128<float> previous = Sse.LoadAlignedVector128(dstCursor);

                Vector128<float> p01 = Sse.Add(Sse.Multiply(s0, m0), Sse.Multiply(s1, m1));
                Vector128<float> p23 = Sse.Add(Sse.Multiply(s2, m2), Sse.Multiply(s3, m3));

                Sse.StoreAligned(dstCursor, Sse.Add(Sse.Add(p01, p23), previous));
            }

            matCursor += 3 * crow;
        }
    }
}
// Transposed multiply driven by a partial sparse source: for every position in
// rgposSrc[iposMin .. iposEnd), the source value at (position - posMin) is broadcast and
// multiplied with the matrix row starting at (position - posMin) * crow, and the product is
// written into dst. When 'add' is false the first position overwrites dst; every later
// position accumulates.
// NOTE(review): the first position is read unconditionally when 'add' is false; the caller
// visible in this file returns early when iposMin >= iposLim, which is what makes that safe.
internal static unsafe void MatMulTranPA(bool add, AlignedArray mat, int[] rgposSrc, AlignedArray src,
    int posMin, int iposMin, int iposEnd, AlignedArray dst, int crow)
{
    Contracts.Assert(Compat(mat));
    Contracts.Assert(Compat(src));
    Contracts.Assert(Compat(dst));

    fixed (float* pSrcStart = &src.Items[0])
    fixed (float* pDstStart = &dst.Items[0])
    fixed (float* pMatStart = &mat.Items[0])
    fixed (int* pposSrc = &rgposSrc[0])
    {
        float* srcBase = Ptr(src, pSrcStart);
        float* dstBase = Ptr(dst, pDstStart);
        float* matBase = Ptr(mat, pMatStart);

        int* posCursor = pposSrc + iposMin;
        int* posLimit = pposSrc + iposEnd;
        float* dstLimit = dstBase + crow;

        if (!add)
        {
            // Overwrite pass for the first position so dst never has to be zeroed.
            int firstCol = *posCursor - posMin;
            posCursor++;

            Vector128<float> weight = Sse.SetAllVector128(srcBase[firstCol]);
            float* matRow = matBase + firstCol * crow;

            for (float* dstCursor = dstBase; dstCursor < dstLimit; dstCursor += 4, matRow += 4)
            {
                Vector128<float> product = Sse.Multiply(Sse.LoadAlignedVector128(matRow), weight);
                Sse.StoreAligned(dstCursor, product);
            }
        }

        // REVIEW: Should we explore unrolling the outer loop?
        for (; posCursor < posLimit; posCursor++)
        {
            int col = *posCursor - posMin;

            Vector128<float> weight = Sse.SetAllVector128(srcBase[col]);
            float* matRow = matBase + col * crow;

            for (float* dstCursor = dstBase; dstCursor < dstLimit; dstCursor += 4, matRow += 4)
            {
                Vector128<float> rowValues = Sse.LoadAlignedVector128(matRow);
                Vector128<float> previous = Sse.LoadAlignedVector128(dstCursor);
                Vector128<float> product = Sse.Multiply(rowValues, weight);
                Sse.StoreAligned(dstCursor, Sse.Add(previous, product));
            }
        }
    }
}
// result[i] = dst[i] + scale * src[i]
// The number of elements processed is result.Length; src and dst are read at the same offsets
// and are not modified. Full groups of four use unaligned vector loads/stores, the remainder is
// handled one element at a time.
internal static unsafe void AddScaleCopyU(float scale, Span<float> src, Span<float> dst, Span<float> result)
{
    fixed (float* srcBase = src)
    fixed (float* dstBase = dst)
    fixed (float* resBase = result)
    {
        int length = result.Length;
        Vector128<float> scaleVector = Sse.SetAllVector128(scale);

        int i = 0;

        // Vector body: four elements per iteration.
        for (; length - i >= 4; i += 4)
        {
            Vector128<float> scaled = Sse.Multiply(Sse.LoadVector128(srcBase + i), scaleVector);
            Vector128<float> sum = Sse.Add(Sse.LoadVector128(dstBase + i), scaled);
            Sse.Store(resBase + i, sum);
        }

        // Scalar tail: at most three leftover elements.
        for (; i < length; i++)
        {
            Vector128<float> scaled = Sse.MultiplyScalar(Sse.LoadScalarVector128(srcBase + i), scaleVector);
            Vector128<float> sum = Sse.AddScalar(Sse.LoadScalarVector128(dstBase + i), scaled);
            Sse.StoreScalar(resBase + i, sum);
        }
    }
}
// Returns the sum of all elements of src.
// Full groups of four are accumulated lane-wise, the lanes are then collapsed with VectorSum,
// and any remaining (at most three) elements are added one at a time into lane 0.
internal static unsafe float SumU(Span<float> src)
{
    fixed (float* psrc = src)
    {
        float* pSrcEnd = psrc + src.Length;
        float* pSrcCurrent = psrc;

        Vector128<float> result = Sse.SetZeroVector128();

        // "<=" (not "<") so that a final full group of four is still handled by the vector
        // loop rather than falling through to the scalar tail; this matches the other
        // reductions in this class.
        while (pSrcCurrent + 4 <= pSrcEnd)
        {
            result = Sse.Add(result, Sse.LoadVector128(pSrcCurrent));
            pSrcCurrent += 4;
        }

        result = VectorSum(in result);

        while (pSrcCurrent < pSrcEnd)
        {
            result = Sse.AddScalar(result, Sse.LoadScalarVector128(pSrcCurrent));
            pSrcCurrent++;
        }

        return Sse.ConvertToSingle(result);
    }
}
VectorSum(in result); + + while (pSrcCurrent < pSrcEnd) + { + Vector128 x = Sse.LoadScalarVector128(pSrcCurrent); + x = Sse.SubtractScalar(x, meanVector); + result = Sse.AddScalar(result, Sse.MultiplyScalar(x, x)); + + pSrcCurrent++; + } + + return Sse.ConvertToSingle(result); + } + } + internal static unsafe float SumAbsU(Span src) { Vector128 result = Sse.SetZeroVector128(); - Vector128 mask; + Vector128 mask = GetAbsMask(); - if (Sse2.IsSupported) - { - mask = Sse.StaticCast(Sse2.SetAllVector128(0x7FFFFFFF)); - } - else + fixed (float* psrc = src) { - mask = Sse.SetAllVector128(BitConverter.Int32BitsToSingle(0x7FFFFFFF)); + float* pSrcCurrent = psrc; + float* pEnd = psrc + src.Length; + + while (pSrcCurrent + 4 <= pEnd) + { + Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); + result = Sse.Add(result, Sse.And(srcVector, mask)); + + pSrcCurrent += 4; + } + + result = VectorSum(in result); + + while (pSrcCurrent < pEnd) + { + Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); + result = Sse.AddScalar(result, Sse.And(srcVector, mask)); + + pSrcCurrent++; + } } + return Sse.ConvertToSingle(result); + } + + internal static unsafe float SumAbsDiffU(float mean, Span src) + { + Vector128 result = Sse.SetZeroVector128(); + Vector128 meanVector = Sse.SetAllVector128(mean); + Vector128 mask = GetAbsMask(); + fixed (float* psrc = src) { float* pSrcCurrent = psrc; @@ -335,6 +890,7 @@ internal static unsafe float SumAbsU(Span src) while (pSrcCurrent + 4 <= pEnd) { Vector128 srcVector = Sse.LoadVector128(pSrcCurrent); + srcVector = Sse.Subtract(srcVector, meanVector); result = Sse.Add(result, Sse.And(srcVector, mask)); pSrcCurrent += 4; @@ -343,9 +899,77 @@ internal static unsafe float SumAbsU(Span src) result = VectorSum(in result); while (pSrcCurrent < pEnd) + { + Vector128 srcVector = Sse.LoadScalarVector128(pSrcCurrent); + srcVector = Sse.SubtractScalar(srcVector, meanVector); + result = Sse.AddScalar(result, Sse.And(srcVector, mask)); + + pSrcCurrent++; + } + 
// Returns the largest absolute value found in src (0 for an empty span, since the running
// maximum starts at zero). The sign bit is cleared by AND-ing each value with the mask from
// GetAbsMask() before it is compared.
internal static unsafe float MaxAbsU(Span<float> src)
{
    Vector128<float> absMask = GetAbsMask();
    Vector128<float> best = Sse.SetZeroVector128();

    fixed (float* basePtr = src)
    {
        int length = src.Length;
        int i = 0;

        // Vector body: lane-wise running maximum over groups of four.
        for (; length - i >= 4; i += 4)
        {
            Vector128<float> magnitudes = Sse.And(Sse.LoadVector128(basePtr + i), absMask);
            best = Sse.Max(best, magnitudes);
        }

        // Collapse the four lane maxima so that lane 0 holds the maximum so far.
        best = VectorMax(in best);

        // Scalar tail: fold the leftover elements into lane 0.
        for (; i < length; i++)
        {
            Vector128<float> magnitude = Sse.And(Sse.LoadScalarVector128(basePtr + i), absMask);
            best = Sse.MaxScalar(best, magnitude);
        }
    }

    return Sse.ConvertToSingle(best);
}
// Sparse SDCA L1 update. For every k in [0, indices.Length), with i = indices[k]:
//   v[i] += src[k] * primalUpdate
//   w[i]  = v[i] moved towards zero by 'threshold' when |v[i]| > threshold, otherwise 0
// Groups of four indices go through Load4 / GetNewDst / Store4; leftovers are handled in
// scalar code.
internal static unsafe void SdcaL1UpdateSU(float primalUpdate, Span<float> src, Span<int> indices, float threshold, Span<float> v, Span<float> w)
{
    fixed (float* srcBase = src)
    fixed (int* idxBase = indices)
    fixed (float* vBase = v)
    fixed (float* wBase = w)
    {
        int count = indices.Length;

        Vector128<float> primalVector = Sse.SetAllVector128(primalUpdate);
        Vector128<float> signMask = Sse.SetAllVector128(-0.0f); // 0x8000 0000
        Vector128<float> thresholdVector = Sse.SetAllVector128(threshold);

        int k = 0;

        // Vector body: four (index, value) pairs per iteration.
        for (; count - k >= 4; k += 4)
        {
            int* idxGroup = idxBase + k;
            Vector128<float> values = Sse.LoadVector128(srcBase + k);

            Vector128<float> updatedV = Load4(vBase, idxGroup);
            updatedV = Sse.Add(updatedV, Sse.Multiply(values, primalVector));
            Vector128<float> updatedW = GetNewDst(updatedV, signMask, thresholdVector);

            Store4(in updatedV, vBase, idxGroup);
            Store4(in updatedW, wBase, idxGroup);
        }

        // Scalar tail: at most three leftover pairs.
        for (; k < count; k++)
        {
            int index = idxBase[k];
            vBase[index] += srcBase[k] * primalUpdate;
            float current = vBase[index];

            float shrunk;
            if (Math.Abs(current) > threshold)
            {
                shrunk = current > 0 ? current - threshold : current + threshold;
            }
            else
            {
                shrunk = 0;
            }

            wBase[index] = shrunk;
        }
    }
}
+ using System.Runtime.InteropServices; using System.Security; @@ -9,14 +20,26 @@ namespace Microsoft.ML.CpuMath.PerformanceTests { internal static class CpuMathNativeUtils { - [DllImport("CpuMathNative", EntryPoint = "DotU"), SuppressUnmanagedCodeSecurity] - internal static extern unsafe float DotU(/*const*/ float* pa, /*const*/ float* pb, int c); + [DllImport("CpuMathNative", EntryPoint = "AddScalarU"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe float AddScalarU(float a, /*_Inout_*/ float* pd, int c); - [DllImport("CpuMathNative", EntryPoint = "DotSU"), SuppressUnmanagedCodeSecurity] - internal static extern unsafe float DotSU(/*const*/ float* pa, /*const*/ float* pb, /*const*/ int* pi, int c); + [DllImport("CpuMathNative", EntryPoint = "ScaleU"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe void ScaleU(float a, /*_Inout_*/ float* pd, int c); - [DllImport("CpuMathNative", EntryPoint = "SumSqU"), SuppressUnmanagedCodeSecurity] - internal static extern unsafe float SumSqU(/*const*/ float* ps, int c); + [DllImport("CpuMathNative", EntryPoint = "ScaleSrcU"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe void ScaleSrcU(float a, /*_In_ const*/ float* ps, /*_Inout_*/ float* pd, int c); + + [DllImport("CpuMathNative", EntryPoint = "ScaleAddU"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe void ScaleAddU(float a, float b, /*_Inout_*/ float* pd, int c); + + [DllImport("CpuMathNative", EntryPoint = "AddScaleU"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe void AddScaleU(float a, /*_In_ const*/ float* ps, /*_Inout_*/ float* pd, int c); + + [DllImport("CpuMathNative", EntryPoint = "AddScaleSU"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe void AddScaleSU(float a, /*_In_ const*/ float* ps, /*_In_ const*/ int* pi, /*_Inout_*/ float* pd, int c); + + [DllImport("CpuMathNative", EntryPoint = "AddScaleCopyU"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe 
void AddScaleCopyU(float a, /*_In_ const*/ float* ps, /*_In_ const*/ float* pd, /*_Inout_*/ float* pr, int c); [DllImport("CpuMathNative", EntryPoint = "AddU"), SuppressUnmanagedCodeSecurity] internal static extern unsafe void AddU(/*_In_ const*/ float* ps, /*_Inout_*/ float* pd, int c); @@ -24,22 +47,43 @@ internal static class CpuMathNativeUtils [DllImport("CpuMathNative", EntryPoint = "AddSU"), SuppressUnmanagedCodeSecurity] internal static extern unsafe void AddSU(/*_In_ const*/ float* ps, /*_In_ const*/ int* pi, /*_Inout_*/ float* pd, int c); - [DllImport("CpuMathNative", EntryPoint = "AddScaleU"), SuppressUnmanagedCodeSecurity] - internal static extern unsafe void AddScaleU(float a, /*_In_ const*/ float* ps, /*_Inout_*/ float* pd, int c); + [DllImport("CpuMathNative", EntryPoint = "MulElementWiseU"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe void MulElementWiseU(/*_In_ const*/ float* ps1, /*_In_ const*/ float* ps2, /*_Inout_*/ float* pd, int c); - [DllImport("CpuMathNative", EntryPoint = "AddScaleSU"), SuppressUnmanagedCodeSecurity] - internal static extern unsafe void AddScaleSU(float a, /*_In_ const*/ float* ps, /*_In_ const*/ int* pi, /*_Inout_*/ float* pd, int c); + [DllImport("CpuMathNative", EntryPoint = "SumU"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe float SumU(/*const*/ float* ps, int c); - [DllImport("CpuMathNative", EntryPoint = "ScaleU"), SuppressUnmanagedCodeSecurity] - internal static extern unsafe void ScaleU(float a, /*_Inout_*/ float* pd, int c); + [DllImport("CpuMathNative", EntryPoint = "SumSqU"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe float SumSqU(/*const*/ float* ps, int c); - [DllImport("CpuMathNative", EntryPoint = "Dist2"), SuppressUnmanagedCodeSecurity] - internal static extern unsafe float Dist2(/*const*/ float* px, /*const*/ float* py, int c); + [DllImport("CpuMathNative", EntryPoint = "SumSqDiffU"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe 
float SumSqDiffU(float mean, /*const*/ float* ps, int c); [DllImport("CpuMathNative", EntryPoint = "SumAbsU"), SuppressUnmanagedCodeSecurity] internal static extern unsafe float SumAbsU(/*const*/ float* ps, int c); - [DllImport("CpuMathNative", EntryPoint = "MulElementWiseU"), SuppressUnmanagedCodeSecurity] - internal static extern unsafe void MulElementWiseU(/*_In_ const*/ float* ps1, /*_In_ const*/ float* ps2, /*_Inout_*/ float* pd, int c); + [DllImport("CpuMathNative", EntryPoint = "SumAbsDiffU"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe float SumAbsDiffU(float mean, /*const*/ float* ps, int c); + + [DllImport("CpuMathNative", EntryPoint = "MaxAbsU"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe float MaxAbsU(/*const*/ float* ps, int c); + + [DllImport("CpuMathNative", EntryPoint = "MaxAbsDiffU"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe float MaxAbsDiffU(float mean, /*const*/ float* ps, int c); + + [DllImport("CpuMathNative", EntryPoint = "DotU"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe float DotU(/*const*/ float* pa, /*const*/ float* pb, int c); + + [DllImport("CpuMathNative", EntryPoint = "DotSU"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe float DotSU(/*const*/ float* pa, /*const*/ float* pb, /*const*/ int* pi, int c); + + [DllImport("CpuMathNative", EntryPoint = "Dist2"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe float Dist2(/*const*/ float* px, /*const*/ float* py, int c); + + [DllImport("CpuMathNative", EntryPoint = "SdcaL1UpdateU"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe void SdcaL1UpdateU(float primalUpdate, /*_In_ const*/ float* ps, float threshold, /*_Inout_*/ float* pd1, /*_Inout_*/ float* pd2, int c); + + [DllImport("CpuMathNative", EntryPoint = "SdcaL1UpdateSU"), SuppressUnmanagedCodeSecurity] + internal static extern unsafe void SdcaL1UpdateSU(float primalUpdate, /*_In_ const*/ float* ps, /*_In_ 
const*/ int* pi, float threshold, /*_Inout_*/ float* pd1, /*_Inout_*/ float* pd2, int c); } } diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs index 92752a0018..ade2ea6a0e 100644 --- a/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs +++ b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs @@ -19,8 +19,11 @@ public class SsePerformanceTests private const int EXP_RANGE = EXP_MAX / 2; private const int DEFAULT_SEED = 253421; private const float DEFAULT_SCALE = 1.11f; + private const int DEFAULT_CROW = 500; + private const int DEFAULT_CCOL = 2000; + private const bool ADD = true; - private float[] src, dst, original, src1, src2; + private float[] src, dst, original, src1, src2, result; private int[] idx; private int seed = DEFAULT_SEED; @@ -65,6 +68,7 @@ public void Setup() src1 = new float[LEN]; src2 = new float[LEN]; original = new float[LEN]; + result = new float[LEN]; idx = new int[IDXLEN]; seed = GetSeed(); @@ -75,6 +79,7 @@ public void Setup() src[i] = NextFloat(rand, EXP_RANGE); dst[i] = NextFloat(rand, EXP_RANGE); original[i] = dst[i]; + result[i] = dst[i]; src1[i] = NextFloat(rand, EXP_RANGE); src2[i] = NextFloat(rand, EXP_RANGE); } @@ -89,46 +94,98 @@ public void Setup() public void GlobalCleanup() { original.CopyTo(dst, 0); + original.CopyTo(result, 0); } [Benchmark] - public unsafe float NativeDotUPerf() + public unsafe float NativeAddScalarUPerf() + { + fixed (float* pdst = dst) + { + return CpuMathNativeUtils.AddScalarU(DEFAULT_SCALE, pdst, LEN); + } + } + + [Benchmark] + public void ManagedAddScalarUPerf() => CpuMathUtils.Add(DEFAULT_SCALE, dst, LEN); + + [Benchmark] + public unsafe void NativeScaleUPerf() + { + fixed (float* pdst = dst) + { + CpuMathNativeUtils.ScaleU(DEFAULT_SCALE, pdst, LEN); + } + } + + [Benchmark] + public void ManagedScaleUPerf() => CpuMathUtils.Scale(DEFAULT_SCALE, dst, LEN); + + [Benchmark] + public 
unsafe void NativeScaleSrcUPerf() { fixed (float* psrc = src) fixed (float* pdst = dst) { - return CpuMathNativeUtils.DotU(psrc, pdst, LEN); + CpuMathNativeUtils.ScaleSrcU(DEFAULT_SCALE, psrc, pdst, LEN); } } [Benchmark] - public float ManagedDotUPerf() => CpuMathUtils.DotProductDense(src, dst, LEN); + public void ManagedScaleSrcUPerf() => CpuMathUtils.Scale(DEFAULT_SCALE, src, dst, LEN); [Benchmark] - public unsafe float NativeDotSUPerf() + public unsafe void NativeScaleAddUPerf() + { + fixed (float* pdst = dst) + { + CpuMathNativeUtils.ScaleAddU(DEFAULT_SCALE, DEFAULT_SCALE, pdst, LEN); + } + } + + [Benchmark] + public void ManagedScaleAddUPerf() => CpuMathUtils.ScaleAdd(DEFAULT_SCALE, DEFAULT_SCALE, dst, LEN); + + [Benchmark] + public unsafe void NativeAddScaleUPerf() + { + fixed (float* psrc = src) + fixed (float* pdst = dst) + { + CpuMathNativeUtils.AddScaleU(DEFAULT_SCALE, psrc, pdst, LEN); + } + } + + [Benchmark] + public void ManagedAddScaleUPerf() => CpuMathUtils.AddScale(DEFAULT_SCALE, src, dst, LEN); + + [Benchmark] + public unsafe void NativeAddScaleSUPerf() { fixed (float* psrc = src) fixed (float* pdst = dst) fixed (int* pidx = idx) { - return CpuMathNativeUtils.DotSU(psrc, pdst, pidx, IDXLEN); + CpuMathNativeUtils.AddScaleSU(DEFAULT_SCALE, psrc, pidx, pdst, IDXLEN); } } [Benchmark] - public float ManagedDotSUPerf() => CpuMathUtils.DotProductSparse(src, dst, idx, IDXLEN); + public void ManagedAddScaleSUPerf() => CpuMathUtils.AddScale(DEFAULT_SCALE, src, idx, dst, IDXLEN); [Benchmark] - public unsafe float NativeSumSqUPerf() + public unsafe void NativeAddScaleCopyUPerf() { fixed (float* psrc = src) + fixed (float* pdst = dst) + fixed (float* pres = result) { - return CpuMathNativeUtils.SumSqU(psrc, LEN); + CpuMathNativeUtils.AddScaleCopyU(DEFAULT_SCALE, psrc, pdst, pres, LEN); } } [Benchmark] - public float ManagedSumSqUPerf() => CpuMathUtils.SumSq(src, LEN); + public void ManagedAddScaleCopyUPerf() => CpuMathUtils.AddScaleCopy(DEFAULT_SCALE, src, dst, 
result, LEN); [Benchmark] public unsafe void NativeAddUPerf() @@ -157,44 +214,132 @@ public unsafe void NativeAddSUPerf() [Benchmark] public void ManagedAddSUPerf() => CpuMathUtils.Add(src, idx, dst, IDXLEN); + [Benchmark] - public unsafe void NativeAddScaleUPerf() + public unsafe void NativeMulElementWiseUPerf() { - fixed (float* psrc = src) + fixed (float* psrc1 = src1) + fixed (float* psrc2 = src2) fixed (float* pdst = dst) { - CpuMathNativeUtils.AddScaleU(DEFAULT_SCALE, psrc, pdst, LEN); + CpuMathNativeUtils.MulElementWiseU(psrc1, psrc2, pdst, LEN); } } [Benchmark] - public void ManagedAddScaleUPerf() => CpuMathUtils.AddScale(DEFAULT_SCALE, src, dst, LEN); + public void ManagedMulElementWiseUPerf() => CpuMathUtils.MulElementWise(src1, src2, dst, LEN); [Benchmark] - public unsafe void NativeAddScaleSUPerf() + public unsafe float NativeSumUPerf() + { + fixed (float* psrc = src) + { + return CpuMathNativeUtils.SumU(psrc, LEN); + } + } + + [Benchmark] + public float ManagedSumUPerf() => CpuMathUtils.Sum(src, LEN); + + [Benchmark] + public unsafe float NativeSumSqUPerf() + { + fixed (float* psrc = src) + { + return CpuMathNativeUtils.SumSqU(psrc, LEN); + } + } + + [Benchmark] + public float ManagedSumSqUPerf() => CpuMathUtils.SumSq(src, LEN); + + [Benchmark] + public unsafe float NativeSumSqDiffUPerf() + { + fixed (float* psrc = src) + { + return CpuMathNativeUtils.SumSqDiffU(DEFAULT_SCALE, psrc, LEN); + } + } + + [Benchmark] + public float ManagedSumSqDiffUPerf() => CpuMathUtils.SumSq(DEFAULT_SCALE, src, 0, LEN); + + [Benchmark] + public unsafe float NativeSumAbsUPerf() + { + fixed (float* psrc = src) + { + return CpuMathNativeUtils.SumAbsU(psrc, LEN); + } + } + + [Benchmark] + public float ManagedSumAbsUPerf() => CpuMathUtils.SumAbs(src, LEN); + + [Benchmark] + public unsafe float NativeSumAbsDiffUPerf() + { + fixed (float* psrc = src) + { + return CpuMathNativeUtils.SumAbsDiffU(DEFAULT_SCALE, psrc, LEN); + } + } + + [Benchmark] + public float 
ManagedSumAbsDiffUPerf() => CpuMathUtils.SumAbs(DEFAULT_SCALE, src, 0, LEN); + + [Benchmark] + public unsafe float NativeMaxAbsUPerf() + { + fixed (float* psrc = src) + { + return CpuMathNativeUtils.MaxAbsU(psrc, LEN); + } + } + + [Benchmark] + public float ManagedMaxAbsUPerf() => CpuMathUtils.MaxAbs(src, LEN); + + [Benchmark] + public unsafe float NativeMaxAbsDiffUPerf() + { + fixed (float* psrc = src) + { + return CpuMathNativeUtils.MaxAbsDiffU(DEFAULT_SCALE, psrc, LEN); + } + } + + [Benchmark] + public float ManagedMaxAbsDiffUPerf() => CpuMathUtils.MaxAbsDiff(DEFAULT_SCALE, src, LEN); + // TODO: MaxAbsU!!! + + [Benchmark] + public unsafe float NativeDotUPerf() { fixed (float* psrc = src) fixed (float* pdst = dst) - fixed (int* pidx = idx) { - CpuMathNativeUtils.AddScaleSU(DEFAULT_SCALE, psrc, pidx, pdst, IDXLEN); + return CpuMathNativeUtils.DotU(psrc, pdst, LEN); } } [Benchmark] - public void ManagedAddScaleSUPerf() => CpuMathUtils.AddScale(DEFAULT_SCALE, src, idx, dst, IDXLEN); + public float ManagedDotUPerf() => CpuMathUtils.DotProductDense(src, dst, LEN); [Benchmark] - public unsafe void NativeScaleUPerf() + public unsafe float NativeDotSUPerf() { + fixed (float* psrc = src) fixed (float* pdst = dst) + fixed (int* pidx = idx) { - CpuMathNativeUtils.ScaleU(DEFAULT_SCALE, pdst, LEN); + return CpuMathNativeUtils.DotSU(psrc, pdst, pidx, IDXLEN); } } [Benchmark] - public void ManagedScaleUPerf() => CpuMathUtils.Scale(DEFAULT_SCALE, dst, LEN); + public float ManagedDotSUPerf() => CpuMathUtils.DotProductSparse(src, dst, idx, IDXLEN); [Benchmark] public unsafe float NativeDist2Perf() @@ -210,29 +355,32 @@ public unsafe float NativeDist2Perf() public float ManagedDist2Perf() => CpuMathUtils.L2DistSquared(src, dst, LEN); [Benchmark] - public unsafe float NativeSumAbsUPerf() + public unsafe void NativeSdcaL1UpdateUPerf() { fixed (float* psrc = src) + fixed (float* pdst = dst) + fixed (float* pres = result) { - return CpuMathNativeUtils.SumAbsU(psrc, LEN); + 
CpuMathNativeUtils.SdcaL1UpdateU(DEFAULT_SCALE, psrc, DEFAULT_SCALE, pdst, pres, LEN); } } [Benchmark] - public float ManagedSumAbsqUPerf() => CpuMathUtils.SumAbs(src, LEN); + public void ManagedSdcaL1UpdateUPerf() => CpuMathUtils.SdcaL1UpdateDense(DEFAULT_SCALE, LEN, src, DEFAULT_SCALE, dst, result); [Benchmark] - public unsafe void NativeMulElementWiseUPerf() + public unsafe void NativeSdcaL1UpdateSUPerf() { - fixed (float* psrc1 = src1) - fixed (float* psrc2 = src2) + fixed (float* psrc = src) fixed (float* pdst = dst) + fixed (float* pres = result) + fixed (int* pidx = idx) { - CpuMathNativeUtils.MulElementWiseU(psrc1, psrc2, pdst, LEN); + CpuMathNativeUtils.SdcaL1UpdateSU(DEFAULT_SCALE, psrc, pidx, DEFAULT_SCALE, pdst, pres, IDXLEN); } } [Benchmark] - public void ManagedMulElementWiseUPerf() => CpuMathUtils.MulElementWise(src1, src2, dst, LEN); + public void ManagedSdcaL1UpdateSUPerf() => CpuMathUtils.SdcaL1UpdateSparse(DEFAULT_SCALE, LEN, src, idx, IDXLEN, DEFAULT_SCALE, dst, result); } } diff --git a/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs b/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs index 6fc2596ef7..d1d5955a8e 100644 --- a/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs +++ b/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs @@ -13,7 +13,11 @@ public class CpuMathUtilsUnitTests { private readonly float[][] testArrays; private readonly int[] testIndexArray; + private readonly AlignedArray[] testMatrices; + private readonly AlignedArray[] testSrcVectors; + private readonly AlignedArray[] testDstVectors; private const float DEFAULT_SCALE = 1.7f; + private const int SseCbAlign = 16; private FloatEqualityComparer comparer; public CpuMathUtilsUnitTests() @@ -25,75 +29,211 @@ public CpuMathUtilsUnitTests() testArrays = new float[][] { testArray1, testArray2 }; testIndexArray = new int[4] { 0, 2, 5, 6 }; comparer = new FloatEqualityComparer(); + + // Padded matrices whose dimensions are multiples of 4 
+ float[] testMatrix1 = new float[4 * 4] { 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f, + 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f }; + float[] testMatrix2 = new float[4 * 8]; + + for (int i = 0; i < testMatrix2.Length; i++) + { + testMatrix2[i] = i + 1; + } + + AlignedArray testMatrixAligned1 = new AlignedArray(4 * 4, SseCbAlign); + AlignedArray testMatrixAligned2 = new AlignedArray(4 * 8, SseCbAlign); + testMatrixAligned1.CopyFrom(testMatrix1, 0, testMatrix1.Length); + testMatrixAligned2.CopyFrom(testMatrix2, 0, testMatrix2.Length); + + testMatrices = new AlignedArray[] { testMatrixAligned1, testMatrixAligned2 }; + + // Padded source vectors whose dimensions are multiples of 4 + float[] testSrcVector1 = new float[4] { 1f, 2f, 3f, 4f }; + float[] testSrcVector2 = new float[8] { 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f }; + + AlignedArray testSrcVectorAligned1 = new AlignedArray(4, SseCbAlign); + AlignedArray testSrcVectorAligned2 = new AlignedArray(8, SseCbAlign); + testSrcVectorAligned1.CopyFrom(testSrcVector1, 0, testSrcVector1.Length); + testSrcVectorAligned2.CopyFrom(testSrcVector2, 0, testSrcVector2.Length); + + testSrcVectors = new AlignedArray[] { testSrcVectorAligned1, testSrcVectorAligned2 }; + + // Padded destination vectors whose dimensions are multiples of 4 + float[] testDstVector1 = new float[4] { 0f, 1f, 2f, 3f }; + float[] testDstVector2 = new float[8] { 0f, 1f, 2f, 3f, 4f, 5f, 6f, 7f }; + + AlignedArray testDstVectorAligned1 = new AlignedArray(4, SseCbAlign); + AlignedArray testDstVectorAligned2 = new AlignedArray(8, SseCbAlign); + testDstVectorAligned1.CopyFrom(testDstVector1, 0, testDstVector1.Length); + testDstVectorAligned2.CopyFrom(testDstVector2, 0, testDstVector2.Length); + + testDstVectors = new AlignedArray[] { testDstVectorAligned1, testDstVectorAligned2 }; } [Theory] - [InlineData(0, 13306.0376f)] - [InlineData(1, 13291.9235f)] - public void DotUTest(int test, float expected) + [InlineData(0, 0, 0, new 
float[] { 23.28f, -49.72f, 23.28f, -49.72f })] + [InlineData(1, 1, 0, new float[] { 204f, 492f, 780f, 1068f })] + [InlineData(1, 0, 1, new float[] { 30f, 70f, 110f, 150f, 190f, 230f, 270f, 310f })] + public void MatMulATest(int matTest, int srcTest, int dstTest, float[] expected) { - float[] src = (float[]) testArrays[test].Clone(); - float[] dst = (float[]) src.Clone(); - - for (int i = 0; i < dst.Length; i++) - { - dst[i] += 1; - } + AlignedArray mat = testMatrices[matTest]; + AlignedArray src = testSrcVectors[srcTest]; + AlignedArray dst = testDstVectors[dstTest]; - var actual = CpuMathUtils.DotProductDense(src, dst, dst.Length); - Assert.Equal(expected, actual, 2); + CpuMathUtils.MatTimesSrc(false, false, mat, src, dst, dst.Size); + float[] actual = new float[dst.Size]; + dst.CopyTo(actual, 0, dst.Size); + Assert.Equal(expected, actual, comparer); } [Theory] - [InlineData(0, 736.7352f)] - [InlineData(1, 736.7352f)] - public void DotSUTest(int test, float expected) + [InlineData(0, 0, 0, new float[] { 23.28f, -48.72f, 25.28f, -46.72f })] + [InlineData(1, 1, 0, new float[] { 204f, 493f, 782f, 1071f })] + [InlineData(1, 0, 1, new float[] { 30f, 71f, 112f, 153f, 194f, 235f, 276f, 317f })] + public void MatMulAAddTest(int matTest, int srcTest, int dstTest, float[] expected) { - float[] src = (float[])testArrays[test].Clone(); - float[] dst = (float[])src.Clone(); + AlignedArray mat = testMatrices[matTest]; + AlignedArray src = testSrcVectors[srcTest]; + AlignedArray dst = testDstVectors[dstTest]; + + CpuMathUtils.MatTimesSrc(false, true, mat, src, dst, dst.Size); + float[] actual = new float[dst.Size]; + dst.CopyTo(actual, 0, dst.Size); + Assert.Equal(expected, actual, comparer); + } + + [Theory] + [InlineData(0, 0, 0, new float[] { -630.38f, -171.1f, 155.66f, 75.1f })] + [InlineData(1, 0, 1, new float[] { 170f, 180f, 190f, 200f, 210f, 220f, 230f, 240f })] + [InlineData(1, 1, 0, new float[] { 708f, 744f, 780f, 816f })] + public void MatMulTranATest(int matTest, int 
srcTest, int dstTest, float[] expected) + { + AlignedArray mat = testMatrices[matTest]; + AlignedArray src = testSrcVectors[srcTest]; + AlignedArray dst = testDstVectors[dstTest]; + + CpuMathUtils.MatTimesSrc(true, false, mat, src, dst, src.Size); + float[] actual = new float[dst.Size]; + dst.CopyTo(actual, 0, dst.Size); + Assert.Equal(expected, actual, comparer); + } + + [Theory] + [InlineData(0, 0, 0, new float[] { -630.38f, -170.1f, 157.66f, 78.1f })] + [InlineData(1, 0, 1, new float[] { 170f, 181f, 192f, 203f, 214f, 225f, 236f, 247f })] + [InlineData(1, 1, 0, new float[] { 708f, 745f, 782f, 819f })] + public void MatMulTranAAddTest(int matTest, int srcTest, int dstTest, float[] expected) + { + AlignedArray mat = testMatrices[matTest]; + AlignedArray src = testSrcVectors[srcTest]; + AlignedArray dst = testDstVectors[dstTest]; + + CpuMathUtils.MatTimesSrc(true, true, mat, src, dst, src.Size); + float[] actual = new float[dst.Size]; + dst.CopyTo(actual, 0, dst.Size); + Assert.Equal(expected, actual, comparer); + } + + [Theory] + [InlineData(0, 0, 0, new float[] { -27.32f, -9.02f, -27.32f, -9.02f })] + [InlineData(1, 1, 0, new float[] { 95f, 231f, 367f, 503f })] + [InlineData(1, 0, 1, new float[] { 10f, 26f, 42f, 58f, 74f, 90f, 106f, 122f })] + public void MatMulPATest(int matTest, int srcTest, int dstTest, float[] expected) + { + AlignedArray mat = testMatrices[matTest]; + AlignedArray src = testSrcVectors[srcTest]; + AlignedArray dst = testDstVectors[dstTest]; int[] idx = testIndexArray; - // Ensures src and dst are different arrays - for (int i = 0; i < dst.Length; i++) - { - dst[i] += 1; - } + CpuMathUtils.MatTimesSrc(false, false, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, dst.Size); + float[] actual = new float[dst.Size]; + dst.CopyTo(actual, 0, dst.Size); + Assert.Equal(expected, actual, comparer); + } - var actual = CpuMathUtils.DotProductSparse(src, dst, idx, idx.Length); - Assert.Equal(expected, actual, 4); + [Theory] + [InlineData(0, 0, 0, new float[] { 
-27.32f, -8.02f, -25.32f, -6.02f })] + [InlineData(1, 1, 0, new float[] { 95f, 232f, 369f, 506f })] + [InlineData(1, 0, 1, new float[] { 10f, 27f, 44f, 61f, 78f, 95f, 112f, 129f })] + public void MatMulPAAddTest(int matTest, int srcTest, int dstTest, float[] expected) + { + AlignedArray mat = testMatrices[matTest]; + AlignedArray src = testSrcVectors[srcTest]; + AlignedArray dst = testDstVectors[dstTest]; + int[] idx = testIndexArray; + + CpuMathUtils.MatTimesSrc(false, true, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, dst.Size); + float[] actual = new float[dst.Size]; + dst.CopyTo(actual, 0, dst.Size); + Assert.Equal(expected, actual, comparer); } [Theory] - [InlineData(0, 13399.9376f)] - [InlineData(1, 13389.1135f)] - public void SumSqUTest(int test, float expected) + [InlineData(0, 0, 0, new float[] { 7.84f, -9.52f, -39.04f, 55.36f })] + [InlineData(1, 0, 1, new float[] { 52f, 56f, 60f, 64f, 68f, 72f, 76f, 80f })] + [InlineData(1, 1, 0, new float[] { 329f, 346f, 363f, 380f })] + public void MatMulTranPATest(int matTest, int srcTest, int dstTest, float[] expected) { - float[] src = (float[])testArrays[test].Clone(); - var actual = CpuMathUtils.SumSq(src, src.Length); - Assert.Equal(expected, actual, 2); + AlignedArray mat = testMatrices[matTest]; + AlignedArray src = testSrcVectors[srcTest]; + AlignedArray dst = testDstVectors[dstTest]; + int[] idx = testIndexArray; + + CpuMathUtils.MatTimesSrc(true, false, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, src.Size); + float[] actual = new float[dst.Size]; + dst.CopyTo(actual, 0, dst.Size); + Assert.Equal(expected, actual, comparer); + } + + [Theory] + [InlineData(0, 0, 0, new float[] { 7.84f, -8.52f, -37.04f, 58.36f })] + [InlineData(1, 0, 1, new float[] { 52f, 57f, 62f, 67f, 72f, 77f, 82f, 87f })] + [InlineData(1, 1, 0, new float[] { 329f, 347f, 365f, 383f })] + public void MatMulTranPAAddTest(int matTest, int srcTest, int dstTest, float[] expected) + { + AlignedArray mat = testMatrices[matTest]; + AlignedArray src = 
testSrcVectors[srcTest]; + AlignedArray dst = testDstVectors[dstTest]; + int[] idx = testIndexArray; + + CpuMathUtils.MatTimesSrc(true, true, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, src.Size); + float[] actual = new float[dst.Size]; + dst.CopyTo(actual, 0, dst.Size); + Assert.Equal(expected, actual, comparer); } [Theory] [InlineData(0)] [InlineData(1)] - public void AddUTest(int test) + public void AddScalarUTest(int test) { - float[] src = (float[])testArrays[test].Clone(); - float[] dst = (float[])src.Clone(); - float[] expected = (float[])src.Clone(); + float[] dst = (float[])testArrays[test].Clone(); + float[] expected = (float[])dst.Clone(); - // Ensures src and dst are different arrays - for (int i = 0; i < dst.Length; i++) + for (int i = 0; i < expected.Length; i++) { - dst[i] += 1; + expected[i] += DEFAULT_SCALE; } + CpuMathUtils.Add(DEFAULT_SCALE, dst, dst.Length); + var actual = dst; + Assert.Equal(expected, actual, comparer); + } + + [Theory] + [InlineData(0)] + [InlineData(1)] + public void ScaleUTest(int test) + { + float[] dst = (float[])testArrays[test].Clone(); + float[] expected = (float[])dst.Clone(); + for (int i = 0; i < expected.Length; i++) { - expected[i] = 2 * expected[i] + 1; + expected[i] *= DEFAULT_SCALE; } - CpuMathUtils.Add(src, dst, dst.Length); + CpuMathUtils.Scale(DEFAULT_SCALE, dst, dst.Length); var actual = dst; Assert.Equal(expected, actual, comparer); } @@ -101,19 +241,36 @@ public void AddUTest(int test) [Theory] [InlineData(0)] [InlineData(1)] - public void AddSUTest(int test) + public void ScaleSrcUTest(int test) { float[] src = (float[])testArrays[test].Clone(); float[] dst = (float[])src.Clone(); - int[] idx = testIndexArray; float[] expected = (float[])dst.Clone(); - expected[0] = 3.92f; - expected[2] = -12.14f; - expected[5] = -36.69f; - expected[6] = 46.29f; + for (int i = 0; i < expected.Length; i++) + { + expected[i] *= DEFAULT_SCALE; + } - CpuMathUtils.Add(src, idx, dst, idx.Length); + 
CpuMathUtils.Scale(DEFAULT_SCALE, src, dst, dst.Length); + var actual = dst; + Assert.Equal(expected, actual, comparer); + } + + [Theory] + [InlineData(0)] + [InlineData(1)] + public void ScaleAddUTest(int test) + { + float[] dst = (float[])testArrays[test].Clone(); + float[] expected = (float[])dst.Clone(); + + for (int i = 0; i < expected.Length; i++) + { + expected[i] = DEFAULT_SCALE * (dst[i] + DEFAULT_SCALE); + } + + CpuMathUtils.ScaleAdd(DEFAULT_SCALE, DEFAULT_SCALE, dst, dst.Length); var actual = dst; Assert.Equal(expected, actual, comparer); } @@ -160,28 +317,31 @@ public void AddScaleSUTest(int test) [Theory] [InlineData(0)] [InlineData(1)] - public void ScaleUTest(int test) + public void AddScaleCopyUTest(int test) { - float[] dst = (float[])testArrays[test].Clone(); - float[] expectedOutput = (float[])dst.Clone(); + float[] src = (float[])testArrays[test].Clone(); + float[] dst = (float[])src.Clone(); + float[] result = (float[])dst.Clone(); + float[] expected = (float[])dst.Clone(); - for (int i = 0; i < expectedOutput.Length; i++) + for (int i = 0; i < expected.Length; i++) { - expectedOutput[i] *= DEFAULT_SCALE; + expected[i] *= (1 + DEFAULT_SCALE); } - CpuMathUtils.Scale(DEFAULT_SCALE, dst, dst.Length); - var managedOutput = dst; - Assert.Equal(expectedOutput, managedOutput, comparer); + CpuMathUtils.AddScaleCopy(DEFAULT_SCALE, src, dst, result, dst.Length); + var actual = result; + Assert.Equal(expected, actual, comparer); } [Theory] - [InlineData(0, 8.0f)] - [InlineData(1, 7.0f)] - public void Dist2Test(int test, float expected) + [InlineData(0)] + [InlineData(1)] + public void AddUTest(int test) { float[] src = (float[])testArrays[test].Clone(); float[] dst = (float[])src.Clone(); + float[] expected = (float[])src.Clone(); // Ensures src and dst are different arrays for (int i = 0; i < dst.Length; i++) @@ -189,18 +349,34 @@ public void Dist2Test(int test, float expected) dst[i] += 1; } - var actual = CpuMathUtils.L2DistSquared(src, dst, 
dst.Length); - Assert.Equal(expected, actual, 0); + for (int i = 0; i < expected.Length; i++) + { + expected[i] = 2 * expected[i] + 1; + } + + CpuMathUtils.Add(src, dst, dst.Length); + var actual = dst; + Assert.Equal(expected, actual, comparer); } [Theory] - [InlineData(0, 196.98f)] - [InlineData(1, 193.69f)] - public void SumAbsUTest(int test, float expected) + [InlineData(0)] + [InlineData(1)] + public void AddSUTest(int test) { float[] src = (float[])testArrays[test].Clone(); - var actual = CpuMathUtils.SumAbs(src, src.Length); - Assert.Equal(expected, actual, 2); + float[] dst = (float[])src.Clone(); + int[] idx = testIndexArray; + float[] expected = (float[])dst.Clone(); + + expected[0] = 3.92f; + expected[2] = -12.14f; + expected[5] = -36.69f; + expected[6] = 46.29f; + + CpuMathUtils.Add(src, idx, dst, idx.Length); + var actual = dst; + Assert.Equal(expected, actual, comparer); } [Theory] @@ -229,6 +405,202 @@ public void MulElementWiseUTest(int test) var actual = dst; Assert.Equal(expected, actual, comparer); } + + [Theory] + [InlineData(0, -93.9f)] + [InlineData(1, -97.19f)] + public void SumUTest(int test, float expected) + { + float[] src = (float[])testArrays[test].Clone(); + var actual = CpuMathUtils.Sum(src, src.Length); + Assert.Equal(expected, actual, 2); + } + + [Theory] + [InlineData(0, 13399.9376f)] + [InlineData(1, 13389.1135f)] + public void SumSqUTest(int test, float expected) + { + float[] src = (float[])testArrays[test].Clone(); + var actual = CpuMathUtils.SumSq(src, src.Length); + Assert.Equal(expected, actual, 2); + } + + [Theory] + [InlineData(0, 13742.3176f)] + [InlineData(1, 13739.7895f)] + public void SumSqDiffUTest(int test, float expected) + { + float[] src = (float[])testArrays[test].Clone(); + var actual = CpuMathUtils.SumSq(DEFAULT_SCALE, src, 0, src.Length); + Assert.Equal(expected, actual, 2); + } + + [Theory] + [InlineData(0, 196.98f)] + [InlineData(1, 193.69f)] + public void SumAbsUTest(int test, float expected) + { + float[] 
src = (float[])testArrays[test].Clone(); + var actual = CpuMathUtils.SumAbs(src, src.Length); + Assert.Equal(expected, actual, 2); + } + + [Theory] + [InlineData(0, 196.98f)] + [InlineData(1, 195.39f)] + public void SumAbsDiffUTest(int test, float expected) + { + float[] src = (float[])testArrays[test].Clone(); + var actual = CpuMathUtils.SumAbs(DEFAULT_SCALE, src, 0, src.Length); + Assert.Equal(expected, actual, 2); + } + + [Theory] + [InlineData(0, 106.37f)] + [InlineData(1, 106.37f)] + public void MaxAbsUTest(int test, float expected) + { + float[] src = (float[])testArrays[test].Clone(); + var actual = CpuMathUtils.MaxAbs(src, src.Length); + Assert.Equal(expected, actual, 2); + } + + [Theory] + [InlineData(0, 108.07f)] + [InlineData(1, 108.07f)] + public void MaxAbsDiffUTest(int test, float expected) + { + float[] src = (float[])testArrays[test].Clone(); + var actual = CpuMathUtils.MaxAbsDiff(DEFAULT_SCALE, src, src.Length); + Assert.Equal(expected, actual, 2); + } + + [Theory] + [InlineData(0, 13306.0376f)] + [InlineData(1, 13291.9235f)] + public void DotUTest(int test, float expected) + { + float[] src = (float[])testArrays[test].Clone(); + float[] dst = (float[])src.Clone(); + + for (int i = 0; i < dst.Length; i++) + { + dst[i] += 1; + } + + var actual = CpuMathUtils.DotProductDense(src, dst, dst.Length); + Assert.Equal(expected, actual, 2); + } + + [Theory] + [InlineData(0, 736.7352f)] + [InlineData(1, 736.7352f)] + public void DotSUTest(int test, float expected) + { + float[] src = (float[])testArrays[test].Clone(); + float[] dst = (float[])src.Clone(); + int[] idx = testIndexArray; + + // Ensures src and dst are different arrays + for (int i = 0; i < dst.Length; i++) + { + dst[i] += 1; + } + + var actual = CpuMathUtils.DotProductSparse(src, dst, idx, idx.Length); + Assert.Equal(expected, actual, 4); + } + + [Theory] + [InlineData(0, 8.0f)] + [InlineData(1, 7.0f)] + public void Dist2Test(int test, float expected) + { + float[] src = 
(float[])testArrays[test].Clone(); + float[] dst = (float[])src.Clone(); + + // Ensures src and dst are different arrays + for (int i = 0; i < dst.Length; i++) + { + dst[i] += 1; + } + + var actual = CpuMathUtils.L2DistSquared(src, dst, dst.Length); + Assert.Equal(expected, actual, 0); + } + + [Theory] + [InlineData(0, new int[] { 0, 2 }, new float[] { 0f, 2f, 0f, 4f })] + [InlineData(1, new int[] { 0, 2, 5, 6 }, new float[] { 0f, 2f, 0f, 4f, 5f, 0f, 0f, 8f })] + public void ZeroItemsUTest(int test, int[] idx, float[] expected) + { + AlignedArray src = new AlignedArray(4 + 4 * test, SseCbAlign); + src.CopyFrom(testSrcVectors[test]); + + CpuMathUtils.ZeroMatrixItems(src, src.Size, src.Size, idx); + float[] actual = new float[src.Size]; + src.CopyTo(actual, 0, src.Size); + Assert.Equal(expected, actual, comparer); + } + + [Theory] + [InlineData(0, new int[] { 0, 1 }, new float[] { 0f, 2f, 0f, 4f })] + [InlineData(1, new int[] { 0, 2, 4 }, new float[] { 0f, 2f, 0f, 4f, 5f, 0f, 7f, 8f })] + public void ZeroMatrixItemsCoreTest(int test, int[] idx, float[] expected) + { + AlignedArray src = new AlignedArray(4 + 4 * test, SseCbAlign); + src.CopyFrom(testSrcVectors[test]); + + CpuMathUtils.ZeroMatrixItems(src, src.Size / 2 - 1, src.Size / 2, idx); + float[] actual = new float[src.Size]; + src.CopyTo(actual, 0, src.Size); + Assert.Equal(expected, actual, comparer); + } + + [Theory] + [InlineData(0)] + [InlineData(1)] + public void SdcaL1UpdateUTest(int test) + { + float[] src = (float[])testArrays[test].Clone(); + float[] v = (float[])src.Clone(); + float[] w = (float[])src.Clone(); + float[] expected = (float[])w.Clone(); + + for (int i = 0; i < expected.Length; i++) + { + float value = src[i] * (1 + DEFAULT_SCALE); + expected[i] = Math.Abs(value) > DEFAULT_SCALE ? (value > 0 ? 
value - DEFAULT_SCALE : value + DEFAULT_SCALE) : 0; + } + + CpuMathUtils.SdcaL1UpdateDense(DEFAULT_SCALE, src.Length, src, DEFAULT_SCALE, v, w); + var actual = w; + Assert.Equal(expected, actual, comparer); + } + + [Theory] + [InlineData(0)] + [InlineData(1)] + public void SdcaL1UpdateSUTest(int test) + { + float[] src = (float[])testArrays[test].Clone(); + float[] v = (float[])src.Clone(); + float[] w = (float[])src.Clone(); + int[] idx = testIndexArray; + float[] expected = (float[])w.Clone(); + + for (int i = 0; i < idx.Length; i++) + { + int index = idx[i]; + float value = v[index] + src[i] * DEFAULT_SCALE; + expected[index] = Math.Abs(value) > DEFAULT_SCALE ? (value > 0 ? value - DEFAULT_SCALE : value + DEFAULT_SCALE) : 0; + } + + CpuMathUtils.SdcaL1UpdateSparse(DEFAULT_SCALE, src.Length, src, idx, idx.Length, DEFAULT_SCALE, v, w); + var actual = w; + Assert.Equal(expected, actual, comparer); + } } internal class FloatEqualityComparer : IEqualityComparer