diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
index 6c6c1fe6ad..b238d602b0 100644
--- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
+++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netcoreapp.cs
@@ -9,10 +9,182 @@ namespace Microsoft.ML.Runtime.Internal.CpuMath
 {
     public static partial class CpuMathUtils
     {
+        public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crun)
+        {
+            Contracts.Assert(mat.Size == dst.Size * src.Size);
+            Contracts.Assert(crun >= 0);
+            // Dense matrix-vector product: dst (+)= mat * src, or the transposed product when tran is set.
+            if (Sse.IsSupported)
+            {
+                if (!tran)
+                {
+                    Contracts.Assert(crun <= dst.Size);
+                    SseIntrinsics.MatMulA(add, mat, src, dst, crun, src.Size);
+                }
+                else
+                {
+                    Contracts.Assert(crun <= src.Size);
+                    SseIntrinsics.MatMulTranA(add, mat, src, dst, dst.Size, crun);
+                }
+            }
+            else
+            {
+                if (!tran)
+                {
+                    Contracts.Assert(crun <= dst.Size); // crun = number of leading dst entries to produce
+                    for (int i = 0; i < crun; i++)
+                    {
+                        float dotProduct = 0;
+                        for (int j = 0; j < src.Size; j++)
+                        {
+                            dotProduct += mat[i * src.Size + j] * src[j]; // row i holds src.Size consecutive values
+                        }
+                        // Accumulate into dst when add is set, otherwise overwrite.
+                        if (add)
+                        {
+                            dst[i] += dotProduct;
+                        }
+                        else
+                        {
+                            dst[i] = dotProduct;
+                        }
+                    }
+                }
+                else
+                {
+                    Contracts.Assert(crun <= src.Size); // crun = number of leading src entries that contribute
+                    for (int i = 0; i < dst.Size; i++)
+                    {
+                        float dotProduct = 0;
+                        for (int j = 0; j < crun; j++)
+                        {
+                            dotProduct += mat[j * dst.Size + i] * src[j]; // stride is dst.Size, as in the sparse overload's mat[col * dst.Size + i]
+                        }
+                        // Accumulate into dst when add is set, otherwise overwrite.
+                        if (add)
+                        {
+                            dst[i] += dotProduct;
+                        }
+                        else
+                        {
+                            dst[i] = dotProduct;
+                        }
+                    }
+                }
+            }
+        }
+
+        public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgposSrc, AlignedArray srcValues,
+            int posMin, int iposMin, int iposLim, AlignedArray dst, int crun)
+        {
+            Contracts.AssertValue(rgposSrc);
+            Contracts.Assert(iposMin >= 0);
+            Contracts.Assert(iposMin <= iposLim);
+            Contracts.Assert(iposLim <= rgposSrc.Length);
+            Contracts.Assert(mat.Size == dst.Size * srcValues.Size);
+            // Empty position range [iposMin, iposLim): nothing contributes, so dst is cleared unless we are accumulating.
+            if (iposMin >= iposLim)
+            {
+                if (!add)
+                    dst.ZeroItems();
+                return;
+            }
+            // From here on at least one position in rgposSrc is consumed.
+            Contracts.AssertNonEmpty(rgposSrc);
+            Contracts.Assert(crun >= 0);
+            // Sparse matrix-vector product: only the positions rgposSrc[iposMin .. iposLim) take part.
+            if (Sse.IsSupported)
+            {
+                if (!tran)
+                {
+                    Contracts.Assert(crun <= dst.Size);
+                    SseIntrinsics.MatMulPA(add, mat, rgposSrc, srcValues, posMin, iposMin, iposLim, dst, crun, srcValues.Size);
+                }
+                else
+                {
+                    Contracts.Assert(crun <= srcValues.Size); // NOTE(review): crun is asserted but not forwarded on the transposed path
+                    SseIntrinsics.MatMulTranPA(add, mat, rgposSrc, srcValues, posMin, iposMin, iposLim, dst, dst.Size);
+                }
+            }
+            else
+            {
+                if (!tran)
+                {
+                    Contracts.Assert(crun <= dst.Size);
+                    for (int i = 0; i < crun; i++)
+                    {
+                        float dotProduct = 0;
+                        for (int j = iposMin; j < iposLim; j++)
+                        {
+                            int col = rgposSrc[j] - posMin; // positions are rebased by posMin before indexing mat and srcValues
+                            dotProduct += mat[i * srcValues.Size + col] * srcValues[col];
+                        }
+                        // Accumulate into dst when add is set, otherwise overwrite.
+                        if (add)
+                        {
+                            dst[i] += dotProduct;
+                        }
+                        else
+                        {
+                            dst[i] = dotProduct;
+                        }
+                    }
+                }
+                else
+                {
+                    Contracts.Assert(crun <= srcValues.Size); // NOTE(review): the loops below do not use crun
+                    for (int i = 0; i < dst.Size; i++)
+                    {
+                        float dotProduct = 0;
+                        for (int j = iposMin; j < iposLim; j++)
+                        {
+                            int col = rgposSrc[j] - posMin; // positions are rebased by posMin before indexing mat and srcValues
+                            dotProduct += mat[col * dst.Size + i] * srcValues[col];
+                        }
+                        // Accumulate into dst when add is set, otherwise overwrite.
+                        if (add)
+                        {
+                            dst[i] += dotProduct;
+                        }
+                        else
+                        {
+                            dst[i] = dotProduct;
+                        }
+                    }
+
+                }
+            }
+        }
+
+        public static void Add(float a, float[] dst, int count)
+        {
+            Contracts.AssertNonEmpty(dst);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(count <= dst.Length);
+            // Adds the scalar a to the first count elements of dst; only that prefix is handed to the span overload.
+            Add(a, new Span<float>(dst, 0, count));
+        }
+
+        private static void Add(float a, Span<float> dst)
+        {
+            // Use the SSE kernel whenever the hardware supports it.
+            if (Sse.IsSupported)
+            {
+                SseIntrinsics.AddScalarU(a, dst);
+                return;
+            }
+            // Scalar fallback: bump every element of the span by a, in place.
+            for (int k = 0; k < dst.Length; ++k)
+            {
+                dst[k] += a;
+            }
+        }
+
         public static void Scale(float a, float[] dst, int count)
         {
             Contracts.AssertNonEmpty(dst);
-            Contracts.Assert(0 < count && count <= dst.Length);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(count <= dst.Length);
 
             Scale(a, new Span<float>(dst, 0, count));
         }
@@ -20,8 +192,9 @@ public static void Scale(float a, float[] dst, int count)
         public static void Scale(float a, float[] dst, int offset, int count)
         {
             Contracts.AssertNonEmpty(dst);
-            Contracts.Assert(0 < count);
-            Contracts.Assert(0 <= offset && offset < dst.Length - count);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(offset >= 0);
+            Contracts.Assert(offset <= (dst.Length - count)); // offset + count == dst.Length is a valid tail slice for the span below
 
             Scale(a, new Span<float>(dst, offset, count));
         }
@@ -41,11 +214,64 @@ private static void Scale(float a, Span<float> dst)
             }
         }
 
+        // dst[i] = a * src[i] for the first count elements; entries of dst past count are not touched.
+        public static void Scale(float a, float[] src, float[] dst, int count)
+        {
+            Contracts.AssertNonEmpty(src);
+            Contracts.AssertNonEmpty(dst);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(count <= src.Length);
+            Contracts.Assert(count <= dst.Length);
+            // Both spans are trimmed to count so the span overload sees equal lengths.
+            Scale(a, new Span<float>(src, 0, count), new Span<float>(dst, 0, count));
+        }
+
+        private static void Scale(float a, Span<float> src, Span<float> dst)
+        {
+            // Use the SSE kernel whenever the hardware supports it.
+            if (Sse.IsSupported)
+            {
+                SseIntrinsics.ScaleSrcU(a, src, dst);
+                return;
+            }
+            // Scalar fallback: write a * src[k] into dst[k] for every slot of dst.
+            for (int k = 0; k < dst.Length; ++k)
+            {
+                dst[k] = a * src[k];
+            }
+        }
+
+        // dst[i] = a * (dst[i] + b) for the first count elements of dst, updated in place.
+        public static void ScaleAdd(float a, float b, float[] dst, int count)
+        {
+            Contracts.AssertNonEmpty(dst);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(count <= dst.Length);
+            // Only the leading count elements are exposed to the span overload.
+            ScaleAdd(a, b, new Span<float>(dst, 0, count));
+        }
+
+        private static void ScaleAdd(float a, float b, Span<float> dst)
+        {
+            // Use the SSE kernel whenever the hardware supports it.
+            if (Sse.IsSupported)
+            {
+                SseIntrinsics.ScaleAddU(a, b, dst);
+                return;
+            }
+            // Scalar fallback: shift each element by b, then scale by a, in place.
+            for (int k = 0; k < dst.Length; ++k)
+            {
+                dst[k] = a * (dst[k] + b);
+            }
+        }
+
         public static void AddScale(float a, float[] src, float[] dst, int count)
         {
             Contracts.AssertNonEmpty(src);
-            Contracts.Assert(0 < count && count <= src.Length);
             Contracts.AssertNonEmpty(dst);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(count <= src.Length);
             Contracts.Assert(count <= dst.Length);
 
             AddScale(a, new Span<float>(src, 0, count), new Span<float>(dst, 0, count));
@@ -54,10 +280,12 @@ public static void AddScale(float a, float[] src, float[] dst, int count)
         public static void AddScale(float a, float[] src, float[] dst, int dstOffset, int count)
         {
             Contracts.AssertNonEmpty(src);
-            Contracts.Assert(count <= src.Length);
             Contracts.AssertNonEmpty(dst);
-            Contracts.Assert(0 <= dstOffset && dstOffset < dst.Length);
-            Contracts.Assert(0 < count && count <= dst.Length - dstOffset);
+            Contracts.Assert(dstOffset >= 0);
+            Contracts.Assert(dstOffset < dst.Length);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(count <= src.Length);
+            Contracts.Assert(count <= (dst.Length - dstOffset));
 
             AddScale(a, new Span<float>(src, 0, count), new Span<float>(dst, dstOffset, count));
         }
@@ -80,10 +308,11 @@ private static void AddScale(float a, Span<float> src, Span<float> dst)
         public static void AddScale(float a, float[] src, int[] indices, float[] dst, int count)
         {
             Contracts.AssertNonEmpty(src);
-            Contracts.Assert(0 < count && count <= src.Length);
             Contracts.AssertNonEmpty(indices);
-            Contracts.Assert(count <= indices.Length);
             Contracts.AssertNonEmpty(dst);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(count <= src.Length);
+            Contracts.Assert(count <= indices.Length);
             Contracts.Assert(count < dst.Length);
 
             AddScale(a, new Span<float>(src), new Span<int>(indices, 0, count), new Span<float>(dst));
@@ -92,12 +321,14 @@ public static void AddScale(float a, float[] src, int[] indices, float[] dst, in
         public static void AddScale(float a, float[] src, int[] indices, float[] dst, int dstOffset, int count)
         {
             Contracts.AssertNonEmpty(src);
-            Contracts.Assert(0 < count && count <= src.Length);
             Contracts.AssertNonEmpty(indices);
-            Contracts.Assert(count <= indices.Length);
             Contracts.AssertNonEmpty(dst);
-            Contracts.Assert(0 <= dstOffset && dstOffset < dst.Length);
-            Contracts.Assert(count < dst.Length - dstOffset);
+            Contracts.Assert(dstOffset >= 0);
+            Contracts.Assert(dstOffset < dst.Length);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(count <= src.Length);
+            Contracts.Assert(count <= indices.Length);
+            Contracts.Assert(count < (dst.Length - dstOffset));
 
             AddScale(a, new Span<float>(src), new Span<int>(indices, 0, count),
                     new Span<float>(dst, dstOffset, dst.Length - dstOffset));
@@ -119,11 +350,40 @@ private static void AddScale(float a, Span<float> src, Span<int> indices, Span<f
             }
         }
 
+        public static void AddScaleCopy(float a, float[] src, float[] dst, float[] res, int count)
+        {
+            Contracts.AssertNonEmpty(src);
+            Contracts.AssertNonEmpty(dst);
+            Contracts.AssertNonEmpty(res);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(count <= src.Length);
+            Contracts.Assert(count <= dst.Length);
+            Contracts.Assert(count <= res.Length);
+            // res[i] = a * src[i] + dst[i] over the first count elements (per the scalar fallback of the span overload).
+            AddScaleCopy(a, new Span<float>(src, 0, count), new Span<float>(dst, 0, count), new Span<float>(res, 0, count));
+        }
+
+        private static void AddScaleCopy(float a, Span<float> src, Span<float> dst, Span<float> res)
+        {
+            // Use the SSE kernel whenever the hardware supports it.
+            if (Sse.IsSupported)
+            {
+                SseIntrinsics.AddScaleCopyU(a, src, dst, res);
+                return;
+            }
+            // Scalar fallback: res receives a * src + dst element by element; src and dst are only read.
+            for (int k = 0; k < res.Length; ++k)
+            {
+                res[k] = a * src[k] + dst[k];
+            }
+        }
+
         public static void Add(float[] src, float[] dst, int count)
         {
             Contracts.AssertNonEmpty(src);
-            Contracts.Assert(0 < count && count <= src.Length);
             Contracts.AssertNonEmpty(dst);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(count <= src.Length);
             Contracts.Assert(count <= dst.Length);
 
             Add(new Span<float>(src, 0, count), new Span<float>(dst, 0, count));
@@ -147,10 +407,11 @@ private static void Add(Span<float> src, Span<float> dst)
         public static void Add(float[] src, int[] indices, float[] dst, int count)
         {
             Contracts.AssertNonEmpty(src);
-            Contracts.Assert(0 < count && count <= src.Length);
             Contracts.AssertNonEmpty(indices);
-            Contracts.Assert(count <= indices.Length);
             Contracts.AssertNonEmpty(dst);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(count <= src.Length);
+            Contracts.Assert(count <= indices.Length);
             Contracts.Assert(count < dst.Length);
 
             Add(new Span<float>(src), new Span<int>(indices, 0, count), new Span<float>(dst));
@@ -159,12 +420,14 @@ public static void Add(float[] src, int[] indices, float[] dst, int count)
         public static void Add(float[] src, int[] indices, float[] dst, int dstOffset, int count)
         {
             Contracts.AssertNonEmpty(src);
-            Contracts.Assert(0 < count && count <= src.Length);
             Contracts.AssertNonEmpty(indices);
-            Contracts.Assert(count <= indices.Length);
             Contracts.AssertNonEmpty(dst);
-            Contracts.Assert(0 <= dstOffset && dstOffset < dst.Length);
-            Contracts.Assert(count <= dst.Length - dstOffset);
+            Contracts.Assert(dstOffset >= 0);
+            Contracts.Assert(dstOffset < dst.Length);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(count <= src.Length);
+            Contracts.Assert(count <= indices.Length);
+            Contracts.Assert(count <= (dst.Length - dstOffset));
 
             Add(new Span<float>(src), new Span<int>(indices, 0, count),
                 new Span<float>(dst, dstOffset, dst.Length - dstOffset));
@@ -189,10 +452,11 @@ private static void Add(Span<float> src, Span<int> indices, Span<float> dst)
         public static void MulElementWise(float[] src1, float[] src2, float[] dst, int count)
         {
             Contracts.AssertNonEmpty(src1);
-            Contracts.Assert(0 < count && count <= src1.Length);
             Contracts.AssertNonEmpty(src2);
-            Contracts.Assert(0 < count && count <= src2.Length);
             Contracts.AssertNonEmpty(dst);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(count <= src1.Length);
+            Contracts.Assert(count <= src2.Length);
 
             MulElementWise(new Span<float>(src1, 0, count), new Span<float>(src2, 0, count),
                             new Span<float>(dst, 0, count));
@@ -213,10 +477,47 @@ private static void MulElementWise(Span<float> src1, Span<float> src2, Span<floa
             }
         }
 
+        public static float Sum(float[] src, int count)
+        {
+            Contracts.AssertNonEmpty(src);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(count <= src.Length);
+            // Returns the sum of the first count elements of src.
+            return Sum(new Span<float>(src, 0, count));
+        }
+
+        public static float Sum(float[] src, int offset, int count)
+        {
+            Contracts.AssertNonEmpty(src);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(offset >= 0);
+            Contracts.Assert(offset <= (src.Length - count));
+            // Returns the sum of src[offset .. offset + count); the slice may end exactly at src.Length.
+            return Sum(new Span<float>(src, offset, count));
+        }
+
+        private static float Sum(Span<float> src)
+        {
+            // Vectorized path when SSE is available.
+            if (Sse.IsSupported)
+            {
+                return SseIntrinsics.SumU(src);
+            }
+
+            // Scalar fallback: accumulate left to right in single precision.
+            float total = 0;
+            foreach (float value in src)
+            {
+                total += value;
+            }
+            return total;
+        }
+
         public static float SumSq(float[] src, int count)
         {
             Contracts.AssertNonEmpty(src);
-            Contracts.Assert(0 < count && count <= src.Length);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(count <= src.Length);
 
             return SumSq(new Span<float>(src, 0, count));
         }
@@ -224,8 +525,9 @@ public static float SumSq(float[] src, int count)
         public static float SumSq(float[] src, int offset, int count)
         {
             Contracts.AssertNonEmpty(src);
-            Contracts.Assert(0 < count);
-            Contracts.Assert(0 <= offset && offset <= src.Length - count);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(offset >= 0);
+            Contracts.Assert(offset <= (src.Length - count));
 
             return SumSq(new Span<float>(src, offset, count));
         }
@@ -247,10 +549,38 @@ private static float SumSq(Span<float> src)
             }
         }
 
+        public static float SumSq(float mean, float[] src, int offset, int count)
+        {
+            Contracts.AssertNonEmpty(src);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(offset >= 0);
+            Contracts.Assert(offset <= (src.Length - count));
+            // Returns the sum of (x - mean)^2 over src[offset .. offset + count).
+            return SumSq(mean, new Span<float>(src, offset, count));
+        }
+
+        private static float SumSq(float mean, Span<float> src)
+        {
+            if (Sse.IsSupported)
+            {
+                // A zero mean needs no subtraction, so the plain sum-of-squares kernel is used.
+                return (mean == 0) ? SseIntrinsics.SumSqU(src) : SseIntrinsics.SumSqDiffU(mean, src);
+            }
+
+            // Scalar fallback: sum of squared deviations from mean.
+            float total = 0;
+            foreach (float value in src)
+            {
+                total += (value - mean) * (value - mean);
+            }
+            return total;
+        }
+
         public static float SumAbs(float[] src, int count)
         {
             Contracts.AssertNonEmpty(src);
-            Contracts.Assert(0 < count && count <= src.Length);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(count <= src.Length);
 
             return SumAbs(new Span<float>(src, 0, count));
         }
@@ -258,8 +588,9 @@ public static float SumAbs(float[] src, int count)
         public static float SumAbs(float[] src, int offset, int count)
         {
             Contracts.AssertNonEmpty(src);
-            Contracts.Assert(0 < count);
-            Contracts.Assert(0 <= offset && offset <= src.Length - count);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(offset >= 0);
+            Contracts.Assert(offset <= (src.Length - count));
 
             return SumAbs(new Span<float>(src, offset, count));
         }
@@ -281,11 +612,108 @@ private static float SumAbs(Span<float> src)
             }
         }
 
+        public static float SumAbs(float mean, float[] src, int offset, int count)
+        {
+            Contracts.AssertNonEmpty(src);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(offset >= 0);
+            Contracts.Assert(offset <= (src.Length - count));
+            // Returns the sum of |x - mean| over src[offset .. offset + count).
+            return SumAbs(mean, new Span<float>(src, offset, count));
+        }
+
+        private static float SumAbs(float mean, Span<float> src)
+        {
+            if (Sse.IsSupported)
+            {
+                // A zero mean needs no subtraction, so the plain absolute-sum kernel is used.
+                return (mean == 0) ? SseIntrinsics.SumAbsU(src) : SseIntrinsics.SumAbsDiffU(mean, src);
+            }
+
+            // Scalar fallback: sum of absolute deviations from mean.
+            float total = 0;
+            foreach (float value in src)
+            {
+                total += Math.Abs(value - mean);
+            }
+            return total;
+        }
+
+        public static float MaxAbs(float[] src, int count)
+        {
+            Contracts.AssertNonEmpty(src);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(count <= src.Length);
+            // Returns the largest absolute value among the first count elements of src.
+            return MaxAbs(new Span<float>(src, 0, count));
+        }
+
+        public static float MaxAbs(float[] src, int offset, int count)
+        {
+            Contracts.AssertNonEmpty(src);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(offset >= 0);
+            Contracts.Assert(offset <= (src.Length - count));
+            // Returns the largest absolute value in src[offset .. offset + count).
+            return MaxAbs(new Span<float>(src, offset, count));
+        }
+
+        private static float MaxAbs(Span<float> src)
+        {
+            // Vectorized path when SSE is available.
+            if (Sse.IsSupported)
+            {
+                return SseIntrinsics.MaxAbsU(src);
+            }
+
+            // Scalar fallback: track the largest magnitude seen, starting from 0.
+            float largest = 0;
+            foreach (float value in src)
+            {
+                float magnitude = Math.Abs(value);
+                if (magnitude > largest)
+                {
+                    largest = magnitude;
+                }
+            }
+            return largest;
+        }
+
+        public static float MaxAbsDiff(float mean, float[] src, int count)
+        {
+            Contracts.AssertNonEmpty(src);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(count <= src.Length);
+            // Returns the largest |x - mean| among the first count elements of src.
+            return MaxAbsDiff(mean, new Span<float>(src, 0, count));
+        }
+
+        private static float MaxAbsDiff(float mean, Span<float> src)
+        {
+            // Vectorized path when SSE is available.
+            if (Sse.IsSupported)
+            {
+                return SseIntrinsics.MaxAbsDiffU(mean, src);
+            }
+
+            // Scalar fallback: track the largest deviation from mean seen, starting from 0.
+            float largest = 0;
+            foreach (float value in src)
+            {
+                float deviation = Math.Abs(value - mean);
+                if (deviation > largest)
+                {
+                    largest = deviation;
+                }
+            }
+            return largest;
+        }
+
         public static float DotProductDense(float[] a, float[] b, int count)
         {
             Contracts.AssertNonEmpty(a);
             Contracts.AssertNonEmpty(b);
-            Contracts.Assert(0 < count);
+            Contracts.Assert(count > 0);
             Contracts.Assert(a.Length >= count);
             Contracts.Assert(b.Length >= count);
 
@@ -295,10 +723,11 @@ public static float DotProductDense(float[] a, float[] b, int count)
         public static float DotProductDense(float[] a, int offset, float[] b, int count)
         {
             Contracts.AssertNonEmpty(a);
-            Contracts.Assert(0 < count);
-            Contracts.Assert(0 <= offset && offset <= a.Length - count);
             Contracts.AssertNonEmpty(b);
-            Contracts.Assert(b.Length >= count);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(count <= b.Length);
+            Contracts.Assert(offset >= 0);
+            Contracts.Assert(offset <= (a.Length - count));
 
             return DotProductDense(new Span<float>(a, offset, count), new Span<float>(b, 0, count));
         }
@@ -324,7 +753,8 @@ public static float DotProductSparse(float[] a, float[] b, int[] indices, int co
         {
             Contracts.AssertNonEmpty(a);
             Contracts.AssertNonEmpty(b);
-            Contracts.Assert(0 < count);
+            Contracts.AssertNonEmpty(indices);
+            Contracts.Assert(count > 0);
             Contracts.Assert(count < a.Length);
             Contracts.Assert(count <= b.Length);
             Contracts.Assert(count <= indices.Length);
@@ -336,12 +766,14 @@ public static float DotProductSparse(float[] a, float[] b, int[] indices, int co
         public static float DotProductSparse(float[] a, int offset, float[] b, int[] indices, int count)
         {
             Contracts.AssertNonEmpty(a);
-            Contracts.Assert(0 < count);
-            Contracts.Assert(0 <= offset && offset < a.Length);
-            Contracts.Assert(a.Length - offset > count);
             Contracts.AssertNonEmpty(b);
+            Contracts.AssertNonEmpty(indices);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(count < (a.Length - offset));
             Contracts.Assert(count <= b.Length);
             Contracts.Assert(count <= indices.Length);
+            Contracts.Assert(offset >= 0);
+            Contracts.Assert(offset < a.Length);
 
             return DotProductSparse(new Span<float>(a, offset, a.Length - offset),
                                     new Span<float>(b), new Span<int>(indices, 0, count));
@@ -369,7 +801,8 @@ public static float L2DistSquared(float[] a, float[] b, int count)
         {
             Contracts.AssertNonEmpty(a);
             Contracts.AssertNonEmpty(b);
-            Contracts.Assert(0 < count && count <= a.Length);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(count <= a.Length);
             Contracts.Assert(count <= b.Length);
 
             return L2DistSquared(new Span<float>(a, 0, count), new Span<float>(b, 0, count));
@@ -392,5 +825,135 @@ private static float L2DistSquared(Span<float> a, Span<float> b)
                 return norm;
             }
         }
+
+        public static void ZeroMatrixItems(AlignedArray dst, int ccol, int cfltRow, int[] indices)
+        {
+            Contracts.Assert(ccol > 0);
+            Contracts.Assert(ccol <= cfltRow);
+            // Rows without padding: logical and physical indices coincide, so zero the entries directly.
+            if (ccol == cfltRow)
+            {
+                ZeroItemsU(dst, dst.Size, indices, indices.Length);
+                return;
+            }
+
+            // Padded rows: each logical index is remapped to its physical slot by the core routine.
+            ZeroMatrixItemsCore(dst, dst.Size, ccol, cfltRow, indices, indices.Length);
+        }
+
+        private static unsafe void ZeroItemsU(AlignedArray dst, int c, int[] indices, int cindices)
+        {
+            fixed (float* pdst = &dst.Items[0])
+            fixed (int* pidx = indices) // array form gives a null pointer for an empty array instead of throwing on indices[0]
+            {
+                for (int i = 0; i < cindices; ++i) // zeroes dst at each of the first cindices listed positions
+                {
+                    int index = pidx[i];
+                    Contracts.Assert(index >= 0);
+                    Contracts.Assert(index < c); // c is the caller-supplied element count of dst
+                    pdst[index] = 0;
+                }
+            }
+        }
+
+        private static unsafe void ZeroMatrixItemsCore(AlignedArray dst, int c, int ccol, int cfltRow, int[] indices, int cindices)
+        {
+            fixed (float* pdst = &dst.Items[0])
+            fixed (int* pidx = indices) // array form gives a null pointer for an empty array instead of throwing on indices[0]
+            {
+                int ivLogMin = 0;
+                int ivLogLim = ccol;
+                int ivPhyMin = 0;
+                // [ivLogMin, ivLogLim) is the logical index range of the cached row; ivPhyMin is that row's physical start.
+                for (int i = 0; i < cindices; ++i)
+                {
+                    int index = pidx[i];
+                    Contracts.Assert(index >= 0);
+                    Contracts.Assert(index < c);
+                    // The unsigned compare below catches both index < ivLogMin and index >= ivLogLim in one test.
+                    int col = index - ivLogMin;
+                    if ((uint)col >= (uint)ccol)
+                    {
+                        Contracts.Assert(ivLogMin > index || index >= ivLogLim);
+                        // Index left the cached row: recompute the row and its logical/physical bases.
+                        int row = index / ccol;
+                        ivLogMin = row * ccol;
+                        ivLogLim = ivLogMin + ccol;
+                        ivPhyMin = row * cfltRow;
+
+                        Contracts.Assert(index >= ivLogMin);
+                        Contracts.Assert(index < ivLogLim);
+                        col = index - ivLogMin;
+                    }
+                    // Physical slot = row start (rows are cfltRow apart) + column within the row.
+                    pdst[ivPhyMin + col] = 0;
+                }
+            }
+        }
+
+        public static void SdcaL1UpdateDense(float primalUpdate, int length, float[] src, float threshold, float[] v, float[] w)
+        {
+            Contracts.AssertNonEmpty(src);
+            Contracts.AssertNonEmpty(v);
+            Contracts.AssertNonEmpty(w);
+            Contracts.Assert(length > 0);
+            Contracts.Assert(length <= src.Length);
+            Contracts.Assert(length <= v.Length);
+            Contracts.Assert(length <= w.Length);
+            // Updates the first length entries: v += src * primalUpdate, then w gets v shrunk toward zero by threshold.
+            SdcaL1UpdateDense(primalUpdate, new Span<float>(src, 0, length), threshold, new Span<float>(v, 0, length), new Span<float>(w, 0, length));
+        }
+
+        private static void SdcaL1UpdateDense(float primalUpdate, Span<float> src, float threshold, Span<float> v, Span<float> w)
+        {
+            if (Sse.IsSupported)
+            {
+                SseIntrinsics.SdcaL1UpdateU(primalUpdate, src, threshold, v, w);
+                return;
+            }
+            // Scalar fallback: one pass over src, updating v and w at the same position.
+            for (int k = 0; k < src.Length; ++k)
+            {
+                v[k] += src[k] * primalUpdate;
+                float updated = v[k];
+                // Soft threshold: magnitudes at or below threshold become 0, larger ones move toward 0 by threshold.
+                w[k] = Math.Abs(updated) > threshold ? (updated > 0 ? updated - threshold : updated + threshold) : 0;
+            }
+        }
+
+        // REVIEW NEEDED: "length" only feeds the assertions below; v and w are passed to the span overload at full length.
+        public static void SdcaL1UpdateSparse(float primalUpdate, int length, float[] src, int[] indices, int count, float threshold, float[] v, float[] w)
+        {
+            Contracts.AssertNonEmpty(src);
+            Contracts.AssertNonEmpty(indices);
+            Contracts.AssertNonEmpty(v);
+            Contracts.AssertNonEmpty(w);
+            Contracts.Assert(count > 0);
+            Contracts.Assert(count <= src.Length);
+            Contracts.Assert(count <= indices.Length);
+            Contracts.Assert(count < length);
+            Contracts.Assert(length <= v.Length);
+            Contracts.Assert(length <= w.Length);
+            // src and indices are trimmed to count; the entries of v and w that get touched are chosen by indices.
+            SdcaL1UpdateSparse(primalUpdate, new Span<float>(src, 0, count), new Span<int>(indices, 0, count), threshold, new Span<float>(v), new Span<float>(w));
+        }
+
+        private static void SdcaL1UpdateSparse(float primalUpdate, Span<float> src, Span<int> indices, float threshold, Span<float> v, Span<float> w)
+        {
+            if (Sse.IsSupported)
+            {
+                SseIntrinsics.SdcaL1UpdateSU(primalUpdate, src, indices, threshold, v, w);
+                return;
+            }
+            // Scalar fallback: src[k] pairs with indices[k], which selects the slot of v and w to update.
+            for (int k = 0; k < indices.Length; ++k)
+            {
+                int slot = indices[k];
+                v[slot] += src[k] * primalUpdate;
+                float updated = v[slot];
+                // Soft threshold: magnitudes at or below threshold become 0, larger ones move toward 0 by threshold.
+                w[slot] = Math.Abs(updated) > threshold ? (updated > 0 ? updated - threshold : updated + threshold) : 0;
+            }
+        }
     }
 }
diff --git a/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs b/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs
index 501fc9082e..730fb10be7 100644
--- a/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs
+++ b/src/Microsoft.ML.CpuMath/CpuMathUtils.netstandard.cs
@@ -6,10 +6,21 @@ namespace Microsoft.ML.Runtime.Internal.CpuMath
 {
     public static partial class CpuMathUtils
     {
+        // Forwards every argument unchanged to SseUtils.MatTimesSrc (the overload without source positions).
+        public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crun) => SseUtils.MatTimesSrc(tran, add, mat, src, dst, crun);
+
+        // Forwards every argument unchanged to SseUtils.MatTimesSrc (the overload that takes source
+        // positions rgposSrc with the posMin/iposMin/iposLim range).
+        public static void MatTimesSrc(bool tran, bool add, AlignedArray mat, int[] rgposSrc, AlignedArray srcValues,
+            int posMin, int iposMin, int iposLim, AlignedArray dst, int crun) => SseUtils.MatTimesSrc(tran, add, mat, rgposSrc, srcValues, posMin, iposMin, iposLim, dst, crun);
+
+        // Forwards to SseUtils.Add(a, dst, count); presumably adds the scalar a to the first count elements of dst - confirm against SseUtils.
+        public static void Add(float a, float[] dst, int count) => SseUtils.Add(a, dst, count);
+
         public static void Scale(float a, float[] dst, int count) => SseUtils.Scale(a, dst, count);
 
         public static void Scale(float a, float[] dst, int offset, int count) => SseUtils.Scale(a, dst, offset, count);
 
+        // Forwards to SseUtils.Scale(a, src, dst, count): the overload with separate source and destination arrays.
+        public static void Scale(float a, float[] src, float[] dst, int count) => SseUtils.Scale(a, src, dst, count);
+
+        // Forwards to SseUtils.ScaleAdd(a, b, dst, count) with the arguments unchanged.
+        public static void ScaleAdd(float a, float b, float[] dst, int count) => SseUtils.ScaleAdd(a, b, dst, count);
+
         public static void AddScale(float a, float[] src, float[] dst, int count) => SseUtils.AddScale(a, src, dst, count);
 
         public static void AddScale(float a, float[] src, float[] dst, int dstOffset, int count) => SseUtils.AddScale(a, src, dst, dstOffset, count);
@@ -18,6 +29,8 @@ public static partial class CpuMathUtils
 
         public static void AddScale(float a, float[] src, int[] indices, float[] dst, int dstOffset, int count) => SseUtils.AddScale(a, src, indices, dst, dstOffset, count);
 
+        // Forwards to SseUtils.AddScaleCopy(a, src, dst, res, count) with the arguments unchanged.
+        public static void AddScaleCopy(float a, float[] src, float[] dst, float[] res, int count) => SseUtils.AddScaleCopy(a, src, dst, res, count);
+
         public static void Add(float[] src, float[] dst, int count) => SseUtils.Add(src, dst, count);
 
         public static void Add(float[] src, int[] indices, float[] dst, int count) => SseUtils.Add(src, indices, dst, count);
@@ -26,14 +39,28 @@ public static partial class CpuMathUtils
 
         public static void MulElementWise(float[] src1, float[] src2, float[] dst, int count) => SseUtils.MulElementWise(src1, src2, dst, count);
 
+        // Returns the result of SseUtils.Sum(src, count).
+        public static float Sum(float[] src, int count) => SseUtils.Sum(src, count);
+
+        // Returns the result of SseUtils.Sum(src, offset, count): the overload that also takes an offset into src.
+        public static float Sum(float[] src, int offset, int count) => SseUtils.Sum(src, offset, count);
+
         public static float SumSq(float[] src, int count) => SseUtils.SumSq(src, count);
 
         public static float SumSq(float[] src, int offset, int count) => SseUtils.SumSq(src, offset, count);
 
+        // Returns the result of SseUtils.SumSq(mean, src, offset, count): the overload that additionally takes a mean.
+        public static float SumSq(float mean, float[] src, int offset, int count) => SseUtils.SumSq(mean, src, offset, count);
+
         public static float SumAbs(float[] src, int count) => SseUtils.SumAbs(src, count);
 
         public static float SumAbs(float[] src, int offset, int count) => SseUtils.SumAbs(src, offset, count);
 
+        // Returns the result of SseUtils.SumAbs(mean, src, offset, count): the overload that additionally takes a mean.
+        public static float SumAbs(float mean, float[] src, int offset, int count) => SseUtils.SumAbs(mean, src, offset, count);
+
+        // Returns the result of SseUtils.MaxAbs(src, count).
+        public static float MaxAbs(float[] src, int count) => SseUtils.MaxAbs(src, count);
+
+        // Returns the result of SseUtils.MaxAbs(src, offset, count): the overload that also takes an offset into src.
+        public static float MaxAbs(float[] src, int offset, int count) => SseUtils.MaxAbs(src, offset, count);
+
+        // Returns the result of SseUtils.MaxAbsDiff(mean, src, count).
+        public static float MaxAbsDiff(float mean, float[] src, int count) => SseUtils.MaxAbsDiff(mean, src, count);
+
         public static float DotProductDense(float[] a, float[] b, int count) => SseUtils.DotProductDense(a, b, count);
 
         public static float DotProductDense(float[] a, int offset, float[] b, int count) => SseUtils.DotProductDense(a, offset, b, count);
@@ -43,5 +70,13 @@ public static partial class CpuMathUtils
         public static float DotProductSparse(float[] a, int offset, float[] b, int[] indices, int count) => SseUtils.DotProductSparse(a, offset, b, indices, count);
 
         public static float L2DistSquared(float[] a, float[] b, int count) => SseUtils.L2DistSquared(a, b, count);
+
+        // Forwards to SseUtils.ZeroMatrixItems(dst, ccol, cfltRow, indices) with the arguments unchanged.
+        public static void ZeroMatrixItems(AlignedArray dst, int ccol, int cfltRow, int[] indices) => SseUtils.ZeroMatrixItems(dst, ccol, cfltRow, indices);
+
+        // Forwards to SseUtils.SdcaL1UpdateDense with the arguments unchanged.
+        public static void SdcaL1UpdateDense(float primalUpdate, int length, float[] src, float threshold, float[] v, float[] w)
+            => SseUtils.SdcaL1UpdateDense(primalUpdate, length, src, threshold, v, w);
+
+        // Forwards to SseUtils.SdcaL1UpdateSparse with the arguments unchanged.
+        public static void SdcaL1UpdateSparse(float primalUpdate, int length, float[] src, int[] indices, int count, float threshold, float[] v, float[] w)
+            => SseUtils.SdcaL1UpdateSparse(primalUpdate, length, src, indices, count, threshold, v, w);
     }
 }
diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
index d11676f283..2ac1f56f14 100644
--- a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
+++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
@@ -4,8 +4,14 @@
 
 // The exported function names need to be unique (can't be disambiguated based on signature), hence
 // we introduce suffix letters to indicate the general patterns used.
+// * A suffix means aligned and padded for SSE operations.
 // * U suffix means unaligned and unpadded.
 // * S suffix means sparse (unaligned) vector.
+// * P suffix means sparse (unaligned) partial vector - the vector is only part of a larger sparse vector.
+// * R suffix means sparse matrix.
+// * C suffix means convolution matrix.
+// * D suffix means convolution matrix, with implicit source padding.
+// * Tran means the matrix is transposed.
 
 using System;
 using System.Runtime.CompilerServices;
@@ -16,6 +22,23 @@ namespace Microsoft.ML.Runtime.Internal.CpuMath
 {
     internal static class SseIntrinsics
     {
+        private const int CbAlign = 16;
+
+        // Returns true when the array's byte alignment equals the 16-byte alignment (CbAlign) this SSE code expects.
+        // Also asserts that the array is non-null and has a positive Size.
+        private static bool Compat(AlignedArray a)
+        {
+            Contracts.AssertValue(a);
+            Contracts.Assert(a.Size > 0);
+
+            bool hasSseAlignment = a.CbAlign == CbAlign;
+            return hasSseAlignment;
+        }
+
+        // Offsets p by a.GetBase((long)p) floats and returns the resulting pointer,
+        // asserting that it is aligned to CbAlign (16) bytes.
+        private static unsafe float* Ptr(AlignedArray a, float* p)
+        {
+            Contracts.AssertValue(a);
+
+            float* aligned = p + a.GetBase((long)p);
+            Contracts.Assert(((long)aligned & (CbAlign - 1)) == 0);
+
+            return aligned;
+        }
+
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
         private static unsafe Vector128<float> Load1(float* src, int* idx)
         {
@@ -29,29 +52,22 @@ private static unsafe Vector128<float> Load4(float* src, int* idx)
         }
 
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
-        private static Vector128<float> Rotate(Vector128<float> x)
+        private static Vector128<float> Rotate(in Vector128<float> x)
         {
             // The control byte shuffles the four 32-bit floats of x: ABCD -> BCDA.
             return Sse.Shuffle(x, x, 0x39);
         }
 
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
-        private static Vector128<float> RotateReverse(Vector128<float> x)
-        {
-            // The control byte shuffles the four 32-bit floats of x: ABCD -> DABC.
-            return Sse.Shuffle(x, x, 0x93);
-        }
-
-        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
-        private static unsafe void Store4(Vector128<float> x, float* dst, int* idx)
+        private static unsafe void Store4(in Vector128<float> x, float* dst, int* idx)
         {
             Sse.StoreScalar(dst + idx[0], x);
-            x = Rotate(x);
-            Sse.StoreScalar(dst + idx[1], x);
-            x = Rotate(x);
-            Sse.StoreScalar(dst + idx[2], x);
-            x = Rotate(x);
-            Sse.StoreScalar(dst + idx[3], x);
+            Vector128<float> rotated = Rotate(in x);
+            Sse.StoreScalar(dst + idx[1], rotated);
+            rotated = Rotate(in rotated);
+            Sse.StoreScalar(dst + idx[2], rotated);
+            rotated = Rotate(in rotated);
+            Sse.StoreScalar(dst + idx[3], rotated);
         }
 
         [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
@@ -59,15 +75,362 @@ private static Vector128<float> VectorSum(in Vector128<float> vector)
         {
             if (Sse3.IsSupported)
             {
-                Vector128<float> tmp = Sse3.HorizontalAdd(vector, vector);
-                return Sse3.HorizontalAdd(tmp, tmp);
+                Vector128<float> partialSum = Sse3.HorizontalAdd(vector, vector);
+                return Sse3.HorizontalAdd(partialSum, partialSum);
             }
             else
             {
-                // SSE3 is not supported.
-                Vector128<float> tmp = Sse.Add(vector, Sse.MoveHighToLow(vector, vector));
-                // The control byte shuffles the four 32-bit floats of tmp: ABCD -> BADC.
-                return Sse.Add(tmp, Sse.Shuffle(tmp, tmp, 0xb1));
+                Vector128<float> partialSum = Sse.Add(vector, Sse.MoveHighToLow(vector, vector));
+                // The control byte shuffles the four 32-bit floats of partialSum: ABCD -> BADC.
+                return Sse.Add(partialSum, Sse.Shuffle(partialSum, partialSum, 0xB1));
+            }
+        }
+
+        // Reduces the four lanes of "vector" to their maximum, which ends up in the lowest lane of the result.
+        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
+        private static Vector128<float> VectorMax(in Vector128<float> vector)
+        {
+            // Swap neighbouring lanes (ABCD -> BADC) so that each lane can be compared with its partner.
+            Vector128<float> swappedPairs = Sse.Shuffle(vector, vector, 0xB1);
+            Vector128<float> pairMax = Sse.Max(vector, swappedPairs);
+
+            // Bring lane 2 (the maximum of the upper pair) down to lane 0 and compare the two pair maxima there.
+            Vector128<float> upperPairMax = Sse.Shuffle(pairMax, pairMax, 0x02);
+            return Sse.MaxScalar(pairMax, upperPairMax);
+        }
+
+        // Builds a vector whose four lanes all hold the bit pattern 0x7FFFFFFF (every bit except the top one).
+        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
+        private static Vector128<float> GetAbsMask()
+        {
+            if (Sse2.IsSupported)
+            {
+                // Build the pattern as integers and reinterpret the lanes as floats.
+                return Sse.StaticCast<int, float>(Sse2.SetAllVector128(0x7FFFFFFF));
+            }
+
+            // SSE-only path: reinterpret the bit pattern as a float first, then broadcast it.
+            return Sse.SetAllVector128(BitConverter.Int32BitsToSingle(0x7FFFFFFF));
+        }
+
+        // Per lane: returns xDst1 moved toward zero by xThreshold when |xDst1| > xThreshold, and 0 otherwise.
+        // Assumes signMask has only the sign bit set in each lane and xThreshold is non-negative - confirm at call sites.
+        [MethodImplAttribute(MethodImplOptions.AggressiveInlining)]
+        private static Vector128<float> GetNewDst(in Vector128<float> xDst1, in Vector128<float> signMask, in Vector128<float> xThreshold)
+        {
+            // 0x8000 0000 where xDst1 is negative, 0x0000 0000 elsewhere.
+            Vector128<float> signBits = Sse.And(xDst1, signMask);
+            // Clearing the sign bit leaves the magnitude.
+            Vector128<float> magnitude = Sse.Xor(xDst1, signBits);
+            // 0xFFFF FFFF in every lane whose magnitude exceeds the threshold.
+            Vector128<float> exceeds = Sse.CompareGreaterThan(magnitude, xThreshold);
+            // The threshold carrying the sign of xDst1: -xThreshold for negative lanes, +xThreshold otherwise.
+            Vector128<float> signedThreshold = Sse.Xor(signBits, xThreshold);
+            Vector128<float> shifted = Sse.Subtract(xDst1, signedThreshold);
+            // Lanes that did not exceed the threshold are masked to zero.
+            return Sse.And(shifted, exceeds);
+        }
+
+        // Multiply matrix times vector into vector.
+        // Four matrix rows (each ccol floats long) are processed per outer iteration: their dot products with
+        // src are accumulated in four registers, reduced to one vector, and stored to dst (added to the
+        // existing dst values when "add" is true).
+        // All loads/stores are aligned and both loops advance in steps of four, so crow and ccol are
+        // presumably padded to multiples of 4 by the caller - TODO confirm.
+        internal static unsafe void MatMulA(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol)
+        {
+            Contracts.Assert(Compat(mat));
+            Contracts.Assert(Compat(src));
+            Contracts.Assert(Compat(dst));
+
+            fixed (float* pSrcStart = &src.Items[0])
+            fixed (float* pDstStart = &dst.Items[0])
+            fixed (float* pMatStart = &mat.Items[0])
+            {
+                float* psrc = Ptr(src, pSrcStart);
+                float* pdst = Ptr(dst, pDstStart);
+                float* pmat = Ptr(mat, pMatStart);
+
+                float* pSrcEnd = psrc + ccol;
+                float* pDstEnd = pdst + crow;
+                float* pDstCurrent = pdst;
+                float* pMatCurrent = pmat;
+
+                while (pDstCurrent < pDstEnd)
+                {
+                    // One accumulator per matrix row of the current group of four rows.
+                    Vector128<float> res0 = Sse.SetZeroVector128();
+                    Vector128<float> res1 = res0;
+                    Vector128<float> res2 = res0;
+                    Vector128<float> res3 = res0;
+
+                    float* pSrcCurrent = psrc;
+
+                    while (pSrcCurrent < pSrcEnd)
+                    {
+                        float* pMatTemp = pMatCurrent;
+
+                        // Same four columns from four consecutive rows (rows are ccol floats apart).
+                        Vector128<float> x01 = Sse.LoadAlignedVector128(pMatTemp);
+                        Vector128<float> x11 = Sse.LoadAlignedVector128(pMatTemp += ccol);
+                        Vector128<float> x21 = Sse.LoadAlignedVector128(pMatTemp += ccol);
+                        Vector128<float> x31 = Sse.LoadAlignedVector128(pMatTemp += ccol);
+                        Vector128<float> x02 = Sse.LoadAlignedVector128(pSrcCurrent);
+
+                        res0 = Sse.Add(res0, Sse.Multiply(x01, x02));
+                        res1 = Sse.Add(res1, Sse.Multiply(x11, x02));
+                        res2 = Sse.Add(res2, Sse.Multiply(x21, x02));
+                        res3 = Sse.Add(res3, Sse.Multiply(x31, x02));
+
+                        pSrcCurrent += 4;
+                        pMatCurrent += 4;
+                    }
+
+                    // Add up the entries of each, with the 4 results in res0
+                    if (Sse3.IsSupported)
+                    {
+                        res0 = Sse3.HorizontalAdd(res0, res1);
+                        res2 = Sse3.HorizontalAdd(res2, res3);
+                        res0 = Sse3.HorizontalAdd(res0, res2);
+                    }
+                    else
+                    {
+                        // SSE3 is not supported: transpose the 4x4 block of accumulators with SSE-only
+                        // shuffles and add the columns. The pairing (c0 + c1) + (c2 + c3) reproduces the
+                        // summation order of the horizontal-add path.
+                        Vector128<float> lo01 = Sse.UnpackLow(res0, res1);   // res0[0] res1[0] res0[1] res1[1]
+                        Vector128<float> hi01 = Sse.UnpackHigh(res0, res1);  // res0[2] res1[2] res0[3] res1[3]
+                        Vector128<float> lo23 = Sse.UnpackLow(res2, res3);   // res2[0] res3[0] res2[1] res3[1]
+                        Vector128<float> hi23 = Sse.UnpackHigh(res2, res3);  // res2[2] res3[2] res2[3] res3[3]
+
+                        Vector128<float> col0 = Sse.MoveLowToHigh(lo01, lo23); // lane 0 of res0..res3
+                        Vector128<float> col1 = Sse.MoveHighToLow(lo23, lo01); // lane 1 of res0..res3
+                        Vector128<float> col2 = Sse.MoveLowToHigh(hi01, hi23); // lane 2 of res0..res3
+                        Vector128<float> col3 = Sse.MoveHighToLow(hi23, hi01); // lane 3 of res0..res3
+
+                        res0 = Sse.Add(Sse.Add(col0, col1), Sse.Add(col2, col3));
+                    }
+
+                    if (add)
+                    {
+                        res0 = Sse.Add(res0, Sse.LoadAlignedVector128(pDstCurrent));
+                    }
+                    Sse.StoreAligned(pDstCurrent, res0);
+
+                    pDstCurrent += 4;
+                    // The inner loop already advanced pMatCurrent across one row; skip the other three rows of this group.
+                    pMatCurrent += 3 * ccol;
+                }
+            }
+        }
+
+        // Partial sparse source vector.
+        // Four matrix rows (ccol floats apart) are processed per outer iteration. For every position
+        // "col" in rgposSrc[iposMin..iposEnd) the four row entries at that column are multiplied by the
+        // matching src value and accumulated; the four sums are then stored to dst (added to the
+        // existing dst values when "add" is true).
+        // dst is stepped by four with aligned loads/stores, so crow is presumably a multiple of 4 - TODO confirm.
+        internal static unsafe void MatMulPA(bool add, AlignedArray mat, int[] rgposSrc, AlignedArray src,
+                                        int posMin, int iposMin, int iposEnd, AlignedArray dst, int crow, int ccol)
+        {
+            Contracts.Assert(Compat(mat));
+            Contracts.Assert(Compat(src));
+            Contracts.Assert(Compat(dst));
+
+            // REVIEW: For extremely sparse inputs, interchanging the loops would
+            // likely be more efficient.
+            fixed (float* pSrcStart = &src.Items[0])
+            fixed (float* pDstStart = &dst.Items[0])
+            fixed (float* pMatStart = &mat.Items[0])
+            fixed (int* pposSrc = &rgposSrc[0])
+            {
+                float* psrc = Ptr(src, pSrcStart);
+                float* pdst = Ptr(dst, pDstStart);
+                float* pmat = Ptr(mat, pMatStart);
+
+                int* pposMin = pposSrc + iposMin;
+                int* pposEnd = pposSrc + iposEnd;
+                float* pDstEnd = pdst + crow;
+                // Both bases are shifted back by posMin so that indexing with a position "col"
+                // lands on element (col - posMin).
+                float* pm0 = pmat - posMin;
+                float* pSrcCurrent = psrc - posMin;
+                float* pDstCurrent = pdst;
+
+                while (pDstCurrent < pDstEnd)
+                {
+                    // Shifted bases of the three rows that follow pm0.
+                    float* pm1 = pm0 + ccol;
+                    float* pm2 = pm1 + ccol;
+                    float* pm3 = pm2 + ccol;
+                    Vector128<float> result = Sse.SetZeroVector128();
+
+                    int* ppos = pposMin;
+
+                    while (ppos < pposEnd)
+                    {
+                        int col = *ppos;
+                        // Lane k receives the entry of row k at this column (SetVector128 takes the highest lane first).
+                        Vector128<float> x1 = Sse.SetVector128(pm3[col], pm2[col], pm1[col], pm0[col]);
+                        Vector128<float> x2 = Sse.SetAllVector128(pSrcCurrent[col]);
+                        x2 = Sse.Multiply(x2, x1);
+                        result = Sse.Add(result, x2);
+
+                        ppos++;
+                    }
+
+                    if (add)
+                    {
+                        result = Sse.Add(result, Sse.LoadAlignedVector128(pDstCurrent));
+                    }
+                    Sse.StoreAligned(pDstCurrent, result);
+
+                    pDstCurrent += 4;
+                    // Move on to the next group of four rows.
+                    pm0 += 4 * ccol;
+                }
+            }
+        }
+
+        // Transposed matrix times vector: each group of four src values scales four consecutive runs of
+        // mat (the runs are crow floats apart), and the four scaled runs are summed into dst.
+        // When "add" is false the first group of four src values overwrites dst instead of accumulating;
+        // every later group accumulates into dst.
+        // NOTE(review): the !add path loads the first four src values without checking ccol, and both
+        // loops step by four with aligned accesses - presumably crow and ccol are padded multiples of 4
+        // with ccol >= 4; confirm against callers.
+        internal static unsafe void MatMulTranA(bool add, AlignedArray mat, AlignedArray src, AlignedArray dst, int crow, int ccol)
+        {
+            Contracts.Assert(Compat(mat));
+            Contracts.Assert(Compat(src));
+            Contracts.Assert(Compat(dst));
+
+            fixed (float* pSrcStart = &src.Items[0])
+            fixed (float* pDstStart = &dst.Items[0])
+            fixed (float* pMatStart = &mat.Items[0])
+            {
+                float* psrc = Ptr(src, pSrcStart);
+                float* pdst = Ptr(dst, pDstStart);
+                float* pmat = Ptr(mat, pMatStart);
+
+                float* pSrcEnd = psrc + ccol;
+                float* pDstEnd = pdst + crow;
+                float* pSrcCurrent = psrc;
+                float* pMatCurrent = pmat;
+
+                if (!add)
+                {
+                    // First group of four src values: dst is overwritten, so its previous contents are never read.
+                    Vector128<float> x01 = Sse.LoadAlignedVector128(pSrcCurrent);
+                    // Replicate each 32-bit slot of x01 (ABCD) into its own register.
+                    Vector128<float> x11 = Sse.Shuffle(x01, x01, 0x55); // B
+                    Vector128<float> x21 = Sse.Shuffle(x01, x01, 0xAA); // C
+                    Vector128<float> x31 = Sse.Shuffle(x01, x01, 0xFF); // D
+                    x01 = Sse.Shuffle(x01, x01, 0x00); // A
+
+                    pSrcCurrent += 4;
+
+                    float* pDstCurrent = pdst;
+
+                    while (pDstCurrent < pDstEnd)
+                    {
+                        // Four floats from each of the four runs, at the same offset within the run.
+                        float* pMatTemp = pMatCurrent;
+                        Vector128<float> x02 = Sse.LoadAlignedVector128(pMatTemp);
+                        Vector128<float> x12 = Sse.LoadAlignedVector128(pMatTemp += crow);
+                        Vector128<float> x22 = Sse.LoadAlignedVector128(pMatTemp += crow);
+                        Vector128<float> x32 = Sse.LoadAlignedVector128(pMatTemp += crow);
+
+                        x02 = Sse.Multiply(x01, x02);
+                        x12 = Sse.Multiply(x11, x12);
+                        x22 = Sse.Multiply(x21, x22);
+                        x32 = Sse.Multiply(x31, x32);
+
+                        x02 = Sse.Add(x02, x12);
+                        x22 = Sse.Add(x22, x32);
+                        x02 = Sse.Add(x02, x22);
+
+                        Sse.StoreAligned(pDstCurrent, x02);
+
+                        pDstCurrent += 4;
+                        pMatCurrent += 4;
+                    }
+
+                    // The loop above walked across one run; skip the other three runs of this group.
+                    pMatCurrent += 3 * crow;
+                }
+
+                // Remaining groups (all groups when "add" is true): accumulate into dst.
+                while (pSrcCurrent < pSrcEnd)
+                {
+                    Vector128<float> x01 = Sse.LoadAlignedVector128(pSrcCurrent);
+                    // Replicate each 32-bit slot of x01 (ABCD) into its own register.
+                    Vector128<float> x11 = Sse.Shuffle(x01, x01, 0x55); // B
+                    Vector128<float> x21 = Sse.Shuffle(x01, x01, 0xAA); // C
+                    Vector128<float> x31 = Sse.Shuffle(x01, x01, 0xFF); // D
+                    x01 = Sse.Shuffle(x01, x01, 0x00); // A
+
+                    float* pDstCurrent = pdst;
+
+                    while (pDstCurrent < pDstEnd)
+                    {
+                        float* pMatTemp = pMatCurrent;
+
+                        Vector128<float> x02 = Sse.LoadAlignedVector128(pMatTemp);
+                        Vector128<float> x12 = Sse.LoadAlignedVector128(pMatTemp += crow);
+                        Vector128<float> x22 = Sse.LoadAlignedVector128(pMatTemp += crow);
+                        Vector128<float> x32 = Sse.LoadAlignedVector128(pMatTemp += crow);
+                        // Current dst values, to be added to this group's contribution.
+                        Vector128<float> x3 = Sse.LoadAlignedVector128(pDstCurrent);
+
+                        x02 = Sse.Multiply(x01, x02);
+                        x12 = Sse.Multiply(x11, x12);
+                        x22 = Sse.Multiply(x21, x22);
+                        x32 = Sse.Multiply(x31, x32);
+
+                        x02 = Sse.Add(x02, x12);
+                        x22 = Sse.Add(x22, x32);
+                        x02 = Sse.Add(x02, x22);
+                        x3 = Sse.Add(x02, x3);
+
+                        Sse.StoreAligned(pDstCurrent, x3);
+
+                        pDstCurrent += 4;
+                        pMatCurrent += 4;
+                    }
+
+                    // Skip the other three runs of this group, then move to the next four src values.
+                    pMatCurrent += 3 * crow;
+                    pSrcCurrent += 4;
+                }
+            }
+        }
+
+        // Partial sparse source vector.
+        // For each position in rgposSrc[iposMin..iposEnd), col = position - posMin selects src[col] and
+        // the crow-float run of mat that starts at col * crow; that run, scaled by src[col], is
+        // accumulated into dst. When "add" is false the first position overwrites dst instead.
+        // NOTE(review): the !add path dereferences the first position without checking
+        // iposMin < iposEnd - presumably callers guarantee at least one position; confirm.
+        internal static unsafe void MatMulTranPA(bool add, AlignedArray mat, int[] rgposSrc, AlignedArray src,
+                                        int posMin, int iposMin, int iposEnd, AlignedArray dst, int crow)
+        {
+            Contracts.Assert(Compat(mat));
+            Contracts.Assert(Compat(src));
+            Contracts.Assert(Compat(dst));
+
+            fixed (float* pSrcStart = &src.Items[0])
+            fixed (float* pDstStart = &dst.Items[0])
+            fixed (float* pMatStart = &mat.Items[0])
+            fixed (int* pposSrc = &rgposSrc[0])
+            {
+                float* psrc = Ptr(src, pSrcStart);
+                float* pdst = Ptr(dst, pDstStart);
+                float* pmat = Ptr(mat, pMatStart);
+
+                int* ppos = pposSrc + iposMin;
+                int* pposEnd = pposSrc + iposEnd;
+                float* pDstEnd = pdst + crow;
+
+                if (!add)
+                {
+                    // First position: dst is overwritten, so its previous contents are never read.
+                    int col = *ppos - posMin;
+                    ppos++;
+
+                    Vector128<float> x0 = Sse.SetAllVector128(psrc[col]);
+                    float* pDstCurrent = pdst;
+                    float* pMatCurrent = pmat + col * crow;
+
+                    while (pDstCurrent < pDstEnd)
+                    {
+                        Vector128<float> x1 = Sse.LoadAlignedVector128(pMatCurrent);
+                        x1 = Sse.Multiply(x1, x0);
+                        Sse.StoreAligned(pDstCurrent, x1);
+
+                        pDstCurrent += 4;
+                        pMatCurrent += 4;
+                    }
+                }
+
+                // REVIEW: Should we explore unrolling the outer loop?
+                // Remaining positions (all positions when "add" is true): accumulate into dst.
+                while (ppos < pposEnd)
+                {
+                    int col = *ppos - posMin;
+
+                    Vector128<float> x0 = Sse.SetAllVector128(psrc[col]);
+                    float* pDstCurrent = pdst;
+                    float* pMatCurrent = pmat + col * crow;
+
+                    while (pDstCurrent < pDstEnd)
+                    {
+                        Vector128<float> x1 = Sse.LoadAlignedVector128(pMatCurrent);
+                        Vector128<float> x2 = Sse.LoadAlignedVector128(pDstCurrent);
+                        x1 = Sse.Multiply(x1, x0);
+                        x2 = Sse.Add(x2, x1);
+                        Sse.StoreAligned(pDstCurrent, x2);
+
+                        pDstCurrent += 4;
+                        pMatCurrent += 4;
+                    }
+
+                    ppos++;
+                }
+            }
+        }
+
+        // dst[i] += scale
+        // Unaligned kernel: four floats per step while a full vector remains, then one float at a time.
+        internal static unsafe void AddScalarU(float scale, Span<float> dst)
+        {
+            fixed (float* pDstStart = dst)
+            {
+                float* pCursor = pDstStart;
+                float* pLimit = pDstStart + dst.Length;
+                Vector128<float> addend = Sse.SetAllVector128(scale);
+
+                // Vectorized part.
+                for (; pCursor + 4 <= pLimit; pCursor += 4)
+                {
+                    Vector128<float> block = Sse.Add(Sse.LoadVector128(pCursor), addend);
+                    Sse.Store(pCursor, block);
+                }
+
+                // Scalar tail (0 to 3 elements).
+                for (; pCursor < pLimit; pCursor++)
+                {
+                    Vector128<float> single = Sse.AddScalar(Sse.LoadScalarVector128(pCursor), addend);
+                    Sse.StoreScalar(pCursor, single);
+                }
             }
         }
 
@@ -102,6 +465,72 @@ internal static unsafe void ScaleU(float scale, Span<float> dst)
             }
         }
 
+        // dst[i] = src[i] * scale, for every i below dst.Length.
+        // Unaligned kernel: four floats per step while a full vector remains, then one float at a time.
+        internal static unsafe void ScaleSrcU(float scale, Span<float> src, Span<float> dst)
+        {
+            Vector128<float> factor = Sse.SetAllVector128(scale);
+
+            fixed (float* pSrcStart = src)
+            fixed (float* pDstStart = dst)
+            {
+                float* pIn = pSrcStart;
+                float* pOut = pDstStart;
+                float* pOutLimit = pDstStart + dst.Length;
+
+                // Vectorized part.
+                for (; pOut + 4 <= pOutLimit; pIn += 4, pOut += 4)
+                {
+                    Vector128<float> scaled = Sse.Multiply(Sse.LoadVector128(pIn), factor);
+                    Sse.Store(pOut, scaled);
+                }
+
+                // Scalar tail (0 to 3 elements).
+                for (; pOut < pOutLimit; pIn++, pOut++)
+                {
+                    Vector128<float> scaled = Sse.MultiplyScalar(Sse.LoadScalarVector128(pIn), factor);
+                    Sse.StoreScalar(pOut, scaled);
+                }
+            }
+        }
+
+        // dst[i] = a * (dst[i] + b)
+        // Unaligned kernel: four floats per step while a full vector remains, then one float at a time.
+        internal static unsafe void ScaleAddU(float a, float b, Span<float> dst)
+        {
+            Vector128<float> multiplier = Sse.SetAllVector128(a);
+            Vector128<float> offset = Sse.SetAllVector128(b);
+
+            fixed (float* pDstStart = dst)
+            {
+                float* pCursor = pDstStart;
+                float* pLimit = pDstStart + dst.Length;
+
+                // Vectorized part: add first, then multiply.
+                for (; pCursor + 4 <= pLimit; pCursor += 4)
+                {
+                    Vector128<float> shifted = Sse.Add(Sse.LoadVector128(pCursor), offset);
+                    Sse.Store(pCursor, Sse.Multiply(shifted, multiplier));
+                }
+
+                // Scalar tail (0 to 3 elements).
+                for (; pCursor < pLimit; pCursor++)
+                {
+                    Vector128<float> shifted = Sse.AddScalar(Sse.LoadScalarVector128(pCursor), offset);
+                    Sse.StoreScalar(pCursor, Sse.MultiplyScalar(shifted, multiplier));
+                }
+            }
+        }
+
         internal static unsafe void AddScaleU(float scale, Span<float> src, Span<float> dst)
         {
             Vector128<float> scaleVector = Sse.SetAllVector128(scale);
@@ -141,6 +570,47 @@ internal static unsafe void AddScaleU(float scale, Span<float> src, Span<float>
             }
         }
 
+        // result[i] = dst[i] + src[i] * scale, for every i below result.Length; src and dst are only read.
+        // Unaligned kernel: four floats per step while a full vector remains, then one float at a time.
+        internal static unsafe void AddScaleCopyU(float scale, Span<float> src, Span<float> dst, Span<float> result)
+        {
+            fixed (float* pSrcStart = src)
+            fixed (float* pDstStart = dst)
+            fixed (float* pResStart = result)
+            {
+                float* pIn = pSrcStart;
+                float* pBase = pDstStart;
+                float* pOut = pResStart;
+                float* pOutLimit = pResStart + result.Length;
+
+                Vector128<float> factor = Sse.SetAllVector128(scale);
+
+                // Vectorized part.
+                for (; pOut + 4 <= pOutLimit; pIn += 4, pBase += 4, pOut += 4)
+                {
+                    Vector128<float> scaledSrc = Sse.Multiply(Sse.LoadVector128(pIn), factor);
+                    Vector128<float> sum = Sse.Add(Sse.LoadVector128(pBase), scaledSrc);
+                    Sse.Store(pOut, sum);
+                }
+
+                // Scalar tail (0 to 3 elements).
+                for (; pOut < pOutLimit; pIn++, pBase++, pOut++)
+                {
+                    Vector128<float> scaledSrc = Sse.MultiplyScalar(Sse.LoadScalarVector128(pIn), factor);
+                    Vector128<float> sum = Sse.AddScalar(Sse.LoadScalarVector128(pBase), scaledSrc);
+                    Sse.StoreScalar(pOut, sum);
+                }
+            }
+        }
+
         internal static unsafe void AddScaleSU(float scale, Span<float> src, Span<int> idx, Span<float> dst)
         {
             Vector128<float> scaleVector = Sse.SetAllVector128(scale);
@@ -161,7 +631,7 @@ internal static unsafe void AddScaleSU(float scale, Span<float> src, Span<int> i
 
                     srcVector = Sse.Multiply(srcVector, scaleVector);
                     dstVector = Sse.Add(dstVector, srcVector);
-                    Store4(dstVector, pDstCurrent, pIdxCurrent);
+                    Store4(in dstVector, pDstCurrent, pIdxCurrent);
 
                     pIdxCurrent += 4;
                     pSrcCurrent += 4;
@@ -229,7 +699,7 @@ internal static unsafe void AddSU(Span<float> src, Span<int> idx, Span<float> ds
                     Vector128<float> dstVector = Sse.LoadVector128(pSrcCurrent);
 
                     srcVector = Sse.Add(srcVector, dstVector);
-                    Store4(srcVector, pDstCurrent, pIdxCurrent);
+                    Store4(in srcVector, pDstCurrent, pIdxCurrent);
 
                     pIdxCurrent += 4;
                     pSrcCurrent += 4;
@@ -282,6 +752,33 @@ internal static unsafe void MulElementWiseU(Span<float> src1, Span<float> src2,
             }
         }
 
+        // Returns the sum of all elements of src.
+        // Full groups of four floats are accumulated lane-wise, the four lanes are then folded together
+        // by VectorSum, and any remaining elements are added one at a time.
+        internal static unsafe float SumU(Span<float> src)
+        {
+            fixed (float* psrc = src)
+            {
+                float* pSrcEnd = psrc + src.Length;
+                float* pSrcCurrent = psrc;
+
+                Vector128<float> result = Sse.SetZeroVector128();
+
+                // "<=" (not "<") so that a trailing group of exactly four elements still takes the
+                // vector path, matching the loop condition of the other unaligned kernels in this file.
+                while (pSrcCurrent + 4 <= pSrcEnd)
+                {
+                    result = Sse.Add(result, Sse.LoadVector128(pSrcCurrent));
+                    pSrcCurrent += 4;
+                }
+
+                result = VectorSum(in result);
+
+                // Scalar tail (0 to 3 elements), accumulated in the lowest lane.
+                while (pSrcCurrent < pSrcEnd)
+                {
+                    result = Sse.AddScalar(result, Sse.LoadScalarVector128(pSrcCurrent));
+                    pSrcCurrent++;
+                }
+
+                return Sse.ConvertToSingle(result);
+            }
+        }
+
         internal static unsafe float SumSqU(Span<float> src)
         {
             Vector128<float> result = Sse.SetZeroVector128();
@@ -313,20 +810,78 @@ internal static unsafe float SumSqU(Span<float> src)
             return Sse.ConvertToSingle(result);
         }
 
+        // Returns the sum over src of (src[i] - mean)^2, computed with SSE plus a scalar tail.
+        internal static unsafe float SumSqDiffU(float mean, Span<float> src)
+        {
+            fixed (float* psrc = src)
+            {
+                float* cursor = psrc;
+                float* end = psrc + src.Length;
+
+                Vector128<float> acc = Sse.SetZeroVector128();
+                Vector128<float> means = Sse.SetAllVector128(mean);
+
+                // Main loop: four squared deviations accumulated lane-wise per iteration.
+                for (; cursor + 4 <= end; cursor += 4)
+                {
+                    Vector128<float> diff = Sse.LoadVector128(cursor);
+                    diff = Sse.Subtract(diff, means);
+                    acc = Sse.Add(acc, Sse.Multiply(diff, diff));
+                }
+
+                // VectorSum is defined elsewhere; presumably it folds the four lanes into the lowest one.
+                acc = VectorSum(in acc);
+
+                // Tail: the remaining (fewer than four) elements, added into the lowest lane.
+                for (; cursor < end; cursor++)
+                {
+                    Vector128<float> diff = Sse.LoadScalarVector128(cursor);
+                    diff = Sse.SubtractScalar(diff, means);
+                    acc = Sse.AddScalar(acc, Sse.MultiplyScalar(diff, diff));
+                }
+
+                return Sse.ConvertToSingle(acc);
+            }
+        }
+
         internal static unsafe float SumAbsU(Span<float> src)
         {
             Vector128<float> result = Sse.SetZeroVector128();
-            Vector128<float> mask;
+            Vector128<float> mask = GetAbsMask();
 
-            if (Sse2.IsSupported)
-            {
-                mask = Sse.StaticCast<int, float>(Sse2.SetAllVector128(0x7FFFFFFF));
-            }
-            else
+            fixed (float* psrc = src)
             {
-                mask = Sse.SetAllVector128(BitConverter.Int32BitsToSingle(0x7FFFFFFF));
+                float* pSrcCurrent = psrc;
+                float* pEnd = psrc + src.Length;
+
+                while (pSrcCurrent + 4 <= pEnd)
+                {
+                    Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
+                    result = Sse.Add(result, Sse.And(srcVector, mask));
+
+                    pSrcCurrent += 4;
+                }
+
+                result = VectorSum(in result);
+
+                while (pSrcCurrent < pEnd)
+                {
+                    Vector128<float> srcVector = Sse.LoadScalarVector128(pSrcCurrent);
+                    result = Sse.AddScalar(result, Sse.And(srcVector, mask));
+
+                    pSrcCurrent++;
+                }
             }
 
+            return Sse.ConvertToSingle(result);
+        }
+
+        internal static unsafe float SumAbsDiffU(float mean, Span<float> src)
+        {
+            Vector128<float> result = Sse.SetZeroVector128();
+            Vector128<float> meanVector = Sse.SetAllVector128(mean);
+            Vector128<float> mask = GetAbsMask();
+
             fixed (float* psrc = src)
             {
                 float* pSrcCurrent = psrc;
@@ -335,6 +890,7 @@ internal static unsafe float SumAbsU(Span<float> src)
                 while (pSrcCurrent + 4 <= pEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
+                    srcVector = Sse.Subtract(srcVector, meanVector);
                     result = Sse.Add(result, Sse.And(srcVector, mask));
 
                     pSrcCurrent += 4;
@@ -343,9 +899,77 @@ internal static unsafe float SumAbsU(Span<float> src)
                 result = VectorSum(in result);
 
                 while (pSrcCurrent < pEnd)
+                {
+                    Vector128<float> srcVector = Sse.LoadScalarVector128(pSrcCurrent);
+                    srcVector = Sse.SubtractScalar(srcVector, meanVector);
+                    result = Sse.AddScalar(result, Sse.And(srcVector, mask));
+
+                    pSrcCurrent++;
+                }
+            }
+
+            return Sse.ConvertToSingle(result);
+        }
+
+        // Returns max |src[i]| (assuming GetAbsMask() clears the sign bit); 0 for an empty span.
+        internal static unsafe float MaxAbsU(Span<float> src)
+        {
+            Vector128<float> best = Sse.SetZeroVector128();
+            Vector128<float> absMask = GetAbsMask();
+
+            fixed (float* psrc = src)
+            {
+                float* cursor = psrc;
+                float* end = psrc + src.Length;
+
+                // Main loop: lane-wise running max over four masked values per iteration.
+                for (; cursor + 4 <= end; cursor += 4)
+                {
+                    Vector128<float> magnitudes = Sse.And(Sse.LoadVector128(cursor), absMask);
+                    best = Sse.Max(best, magnitudes);
+                }
+
+                // VectorMax is defined elsewhere; presumably it reduces the four lanes into the lowest one.
+                best = VectorMax(in best);
+
+                // Tail: fewer than four elements remain; compare each against the lowest lane.
+                for (; cursor < end; cursor++)
+                {
+                    Vector128<float> magnitude = Sse.And(Sse.LoadScalarVector128(cursor), absMask);
+                    best = Sse.MaxScalar(best, magnitude);
+                }
+            }
+
+            return Sse.ConvertToSingle(best);
+        }
+
+        internal static unsafe float MaxAbsDiffU(float mean, Span<float> src)
+        {
+            Vector128<float> result = Sse.SetZeroVector128();
+            Vector128<float> meanVector = Sse.SetAllVector128(mean);
+            Vector128<float> mask = GetAbsMask();
+
+            fixed (float* psrc = src)
+            {
+                float* pSrcCurrent = psrc;
+                float* pEnd = psrc + src.Length;
+
+                while (pSrcCurrent + 4 <= pEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
-                    result = Sse.Add(result, Sse.And(srcVector, mask));
+                    srcVector = Sse.Subtract(srcVector, meanVector);
+                    result = Sse.Max(result, Sse.And(srcVector, mask));
+
+                    pSrcCurrent += 4;
+                }
+
+                result = VectorMax(in result);
+
+                while (pSrcCurrent < pEnd)
+                {
+                    Vector128<float> srcVector = Sse.LoadScalarVector128(pSrcCurrent);
+                    srcVector = Sse.SubtractScalar(srcVector, meanVector);
+                    result = Sse.MaxScalar(result, Sse.And(srcVector, mask));
 
                     pSrcCurrent++;
                 }
@@ -472,5 +1096,93 @@ internal static unsafe float Dist2(Span<float> src, Span<float> dst)
             }
         }
 
+        // Dense update: v[i] += primalUpdate * src[i] for i < src.Length, then w[i] is derived from v[i] and threshold.
+        internal static unsafe void SdcaL1UpdateU(float primalUpdate, Span<float> src, float threshold, Span<float> v, Span<float> w)
+        {
+            fixed (float* psrc = src)
+            fixed (float* pdst1 = v)
+            fixed (float* pdst2 = w)
+            {
+                float* srcEnd = psrc + src.Length;
+                float* srcPtr = psrc;
+                float* vPtr = pdst1;
+                float* wPtr = pdst2;
+
+                Vector128<float> primalVector = Sse.SetAllVector128(primalUpdate);
+                Vector128<float> signBits = Sse.SetAllVector128(-0.0f); // 0x8000 0000
+                Vector128<float> thresholdVector = Sse.SetAllVector128(threshold);
+
+                // Vector path, four elements at a time. GetNewDst is defined elsewhere; presumably the SIMD form of the tail's thresholding.
+                for (; srcPtr + 4 <= srcEnd; srcPtr += 4, vPtr += 4, wPtr += 4)
+                {
+                    Vector128<float> scaled = Sse.Multiply(Sse.LoadVector128(srcPtr), primalVector);
+                    Vector128<float> updatedV = Sse.Add(Sse.LoadVector128(vPtr), scaled);
+                    Vector128<float> updatedW = GetNewDst(updatedV, signBits, thresholdVector);
+
+                    Sse.Store(vPtr, updatedV);
+                    Sse.Store(wPtr, updatedW);
+                }
+
+                // Scalar tail: w = v moved toward zero by threshold when |v| > threshold, otherwise 0.
+                for (; srcPtr < srcEnd; srcPtr++, vPtr++, wPtr++)
+                {
+                    *vPtr += (*srcPtr) * primalUpdate;
+                    float updated = *vPtr;
+
+                    if (Math.Abs(updated) > threshold)
+                    {
+                        *wPtr = updated > 0 ? updated - threshold : updated + threshold;
+                    }
+                    else
+                    {
+                        *wPtr = 0;
+                    }
+                }
+            }
+        }
+
+        // Sparse update: for each k, i = indices[k]; v[i] += primalUpdate * src[k]; w[i] is derived from v[i] and threshold.
+        internal static unsafe void SdcaL1UpdateSU(float primalUpdate, Span<float> src, Span<int> indices, float threshold, Span<float> v, Span<float> w)
+        {
+            fixed (float* psrc = src)
+            fixed (int* pidx = indices)
+            fixed (float* pdst1 = v)
+            fixed (float* pdst2 = w)
+            {
+                int* idxEnd = pidx + indices.Length;
+                int* idxPtr = pidx;
+                float* srcPtr = psrc;
+
+                Vector128<float> primalVector = Sse.SetAllVector128(primalUpdate);
+                Vector128<float> signBits = Sse.SetAllVector128(-0.0f); // 0x8000 0000
+                Vector128<float> thresholdVector = Sse.SetAllVector128(threshold);
+
+                // Vector path, four indices at a time. Load4/Store4/GetNewDst are defined elsewhere; presumably indexed gather/scatter and SIMD thresholding.
+                for (; idxPtr + 4 <= idxEnd; idxPtr += 4, srcPtr += 4)
+                {
+                    Vector128<float> scaled = Sse.Multiply(Sse.LoadVector128(srcPtr), primalVector);
+                    Vector128<float> updatedV = Sse.Add(Load4(pdst1, idxPtr), scaled);
+                    Vector128<float> updatedW = GetNewDst(updatedV, signBits, thresholdVector);
+                    Store4(in updatedV, pdst1, idxPtr);
+                    Store4(in updatedW, pdst2, idxPtr);
+                }
+
+                // Scalar tail: w = v moved toward zero by threshold when |v| > threshold, otherwise 0.
+                for (; idxPtr < idxEnd; idxPtr++, srcPtr++)
+                {
+                    int target = *idxPtr;
+                    pdst1[target] += (*srcPtr) * primalUpdate;
+                    float updated = pdst1[target];
+
+                    if (Math.Abs(updated) > threshold)
+                    {
+                        pdst2[target] = updated > 0 ? updated - threshold : updated + threshold;
+                    }
+                    else
+                    {
+                        pdst2[target] = 0;
+                    }
+                }
+            }
+        }
     }
 }
diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs
index 90f362de3e..8df3352556 100644
--- a/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs
+++ b/test/Microsoft.ML.CpuMath.PerformanceTests/CpuMathNativeUtils.cs
@@ -2,6 +2,17 @@
 // The .NET Foundation licenses this file to you under the MIT license.
 // See the LICENSE file in the project root for more information.
 
+// The exported function names need to be unique (can't be disambiguated based on signature), hence
+// we introduce suffix letters to indicate the general patterns used.
+// * A suffix means aligned and padded for SSE operations.
+// * U suffix means unaligned and unpadded.
+// * S suffix means sparse (unaligned) vector.
+// * P suffix means sparse (unaligned) partial vector - the vector is only part of a larger sparse vector.
+// * R suffix means sparse matrix.
+// * C suffix means convolution matrix.
+// * D suffix means convolution matrix, with implicit source padding.
+// * Tran means the matrix is transposed.
+
 using System.Runtime.InteropServices;
 using System.Security;
 
@@ -9,14 +20,26 @@ namespace Microsoft.ML.CpuMath.PerformanceTests
 {
     internal static class CpuMathNativeUtils
     {
-        [DllImport("CpuMathNative", EntryPoint = "DotU"), SuppressUnmanagedCodeSecurity]
-        internal static extern unsafe float DotU(/*const*/ float* pa, /*const*/ float* pb, int c);
+        [DllImport("CpuMathNative", EntryPoint = "AddScalarU"), SuppressUnmanagedCodeSecurity] // NOTE(review): declared as returning float, while the other in-place (_Inout_ pd) imports in this class return void; confirm against the native export.
+        internal static extern unsafe float AddScalarU(float a, /*_Inout_*/ float* pd, int c);
 
-        [DllImport("CpuMathNative", EntryPoint = "DotSU"), SuppressUnmanagedCodeSecurity]
-        internal static extern unsafe float DotSU(/*const*/ float* pa, /*const*/ float* pb, /*const*/ int* pi, int c);
+        [DllImport("CpuMathNative", EntryPoint = "ScaleU"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe void ScaleU(float a, /*_Inout_*/ float* pd, int c);
 
-        [DllImport("CpuMathNative", EntryPoint = "SumSqU"), SuppressUnmanagedCodeSecurity]
-        internal static extern unsafe float SumSqU(/*const*/ float* ps, int c);
+        [DllImport("CpuMathNative", EntryPoint = "ScaleSrcU"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe void ScaleSrcU(float a, /*_In_ const*/ float* ps, /*_Inout_*/ float* pd, int c);
+
+        [DllImport("CpuMathNative", EntryPoint = "ScaleAddU"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe void ScaleAddU(float a, float b, /*_Inout_*/ float* pd, int c);
+
+        [DllImport("CpuMathNative", EntryPoint = "AddScaleU"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe void AddScaleU(float a, /*_In_ const*/ float* ps, /*_Inout_*/ float* pd, int c);
+
+        [DllImport("CpuMathNative", EntryPoint = "AddScaleSU"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe void AddScaleSU(float a, /*_In_ const*/ float* ps, /*_In_ const*/ int* pi, /*_Inout_*/ float* pd, int c);
+
+        [DllImport("CpuMathNative", EntryPoint = "AddScaleCopyU"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe void AddScaleCopyU(float a, /*_In_ const*/ float* ps, /*_In_ const*/ float* pd, /*_Inout_*/ float* pr, int c);
 
         [DllImport("CpuMathNative", EntryPoint = "AddU"), SuppressUnmanagedCodeSecurity]
         internal static extern unsafe void AddU(/*_In_ const*/ float* ps, /*_Inout_*/ float* pd, int c);
@@ -24,22 +47,43 @@ internal static class CpuMathNativeUtils
         [DllImport("CpuMathNative", EntryPoint = "AddSU"), SuppressUnmanagedCodeSecurity]
         internal static extern unsafe void AddSU(/*_In_ const*/ float* ps, /*_In_ const*/ int* pi, /*_Inout_*/ float* pd, int c);
 
-        [DllImport("CpuMathNative", EntryPoint = "AddScaleU"), SuppressUnmanagedCodeSecurity]
-        internal static extern unsafe void AddScaleU(float a, /*_In_ const*/ float* ps, /*_Inout_*/ float* pd, int c);
+        [DllImport("CpuMathNative", EntryPoint = "MulElementWiseU"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe void MulElementWiseU(/*_In_ const*/ float* ps1, /*_In_ const*/ float* ps2, /*_Inout_*/ float* pd, int c);
 
-        [DllImport("CpuMathNative", EntryPoint = "AddScaleSU"), SuppressUnmanagedCodeSecurity]
-        internal static extern unsafe void AddScaleSU(float a, /*_In_ const*/ float* ps, /*_In_ const*/ int* pi, /*_Inout_*/ float* pd, int c);
+        [DllImport("CpuMathNative", EntryPoint = "SumU"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe float SumU(/*const*/ float* ps, int c);
 
-        [DllImport("CpuMathNative", EntryPoint = "ScaleU"), SuppressUnmanagedCodeSecurity]
-        internal static extern unsafe void ScaleU(float a, /*_Inout_*/ float* pd, int c);
+        [DllImport("CpuMathNative", EntryPoint = "SumSqU"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe float SumSqU(/*const*/ float* ps, int c);
 
-        [DllImport("CpuMathNative", EntryPoint = "Dist2"), SuppressUnmanagedCodeSecurity]
-        internal static extern unsafe float Dist2(/*const*/ float* px, /*const*/ float* py, int c);
+        [DllImport("CpuMathNative", EntryPoint = "SumSqDiffU"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe float SumSqDiffU(float mean, /*const*/ float* ps, int c);
 
         [DllImport("CpuMathNative", EntryPoint = "SumAbsU"), SuppressUnmanagedCodeSecurity]
         internal static extern unsafe float SumAbsU(/*const*/ float* ps, int c);
 
-        [DllImport("CpuMathNative", EntryPoint = "MulElementWiseU"), SuppressUnmanagedCodeSecurity]
-        internal static extern unsafe void MulElementWiseU(/*_In_ const*/ float* ps1, /*_In_ const*/ float* ps2, /*_Inout_*/ float* pd, int c);
+        [DllImport("CpuMathNative", EntryPoint = "SumAbsDiffU"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe float SumAbsDiffU(float mean, /*const*/ float* ps, int c);
+
+        [DllImport("CpuMathNative", EntryPoint = "MaxAbsU"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe float MaxAbsU(/*const*/ float* ps, int c);
+
+        [DllImport("CpuMathNative", EntryPoint = "MaxAbsDiffU"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe float MaxAbsDiffU(float mean, /*const*/ float* ps, int c);
+
+        [DllImport("CpuMathNative", EntryPoint = "DotU"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe float DotU(/*const*/ float* pa, /*const*/ float* pb, int c);
+
+        [DllImport("CpuMathNative", EntryPoint = "DotSU"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe float DotSU(/*const*/ float* pa, /*const*/ float* pb, /*const*/ int* pi, int c);
+
+        [DllImport("CpuMathNative", EntryPoint = "Dist2"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe float Dist2(/*const*/ float* px, /*const*/ float* py, int c);
+
+        [DllImport("CpuMathNative", EntryPoint = "SdcaL1UpdateU"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe void SdcaL1UpdateU(float primalUpdate, /*_In_ const*/ float* ps, float threshold, /*_Inout_*/ float* pd1, /*_Inout_*/ float* pd2, int c);
+
+        [DllImport("CpuMathNative", EntryPoint = "SdcaL1UpdateSU"), SuppressUnmanagedCodeSecurity]
+        internal static extern unsafe void SdcaL1UpdateSU(float primalUpdate, /*_In_ const*/ float* ps, /*_In_ const*/ int* pi, float threshold, /*_Inout_*/ float* pd1, /*_Inout_*/ float* pd2, int c);
     }
 }
diff --git a/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs
index 92752a0018..ade2ea6a0e 100644
--- a/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs
+++ b/test/Microsoft.ML.CpuMath.PerformanceTests/SsePerformanceTests.cs
@@ -19,8 +19,11 @@ public class SsePerformanceTests
         private const int EXP_RANGE = EXP_MAX / 2;
         private const int DEFAULT_SEED = 253421;
         private const float DEFAULT_SCALE = 1.11f;
+        private const int DEFAULT_CROW = 500;
+        private const int DEFAULT_CCOL = 2000;
+        private const bool ADD = true;
 
-        private float[] src, dst, original, src1, src2;
+        private float[] src, dst, original, src1, src2, result;
         private int[] idx;
         private int seed = DEFAULT_SEED;
 
@@ -65,6 +68,7 @@ public void Setup()
             src1 = new float[LEN];
             src2 = new float[LEN];
             original = new float[LEN];
+            result = new float[LEN];
             idx = new int[IDXLEN];
 
             seed = GetSeed();
@@ -75,6 +79,7 @@ public void Setup()
                 src[i] = NextFloat(rand, EXP_RANGE);
                 dst[i] = NextFloat(rand, EXP_RANGE);
                 original[i] = dst[i];
+                result[i] = dst[i];
                 src1[i] = NextFloat(rand, EXP_RANGE);
                 src2[i] = NextFloat(rand, EXP_RANGE);
             }
@@ -89,46 +94,98 @@ public void Setup()
         public void GlobalCleanup()
         {
             original.CopyTo(dst, 0);
+            original.CopyTo(result, 0);
         }
 
         [Benchmark]
-        public unsafe float NativeDotUPerf()
+        public unsafe float NativeAddScalarUPerf()
+        {
+            fixed (float* pdst = dst)
+            {
+                return CpuMathNativeUtils.AddScalarU(DEFAULT_SCALE, pdst, LEN);
+            }
+        }
+
+        [Benchmark]
+        public void ManagedAddScalarUPerf() => CpuMathUtils.Add(DEFAULT_SCALE, dst, LEN);
+
+        [Benchmark]
+        public unsafe void NativeScaleUPerf()
+        {
+            fixed (float* pdst = dst)
+            {
+                CpuMathNativeUtils.ScaleU(DEFAULT_SCALE, pdst, LEN);
+            }
+        }
+
+        [Benchmark]
+        public void ManagedScaleUPerf() => CpuMathUtils.Scale(DEFAULT_SCALE, dst, LEN);
+
+        [Benchmark]
+        public unsafe void NativeScaleSrcUPerf()
         {
             fixed (float* psrc = src)
             fixed (float* pdst = dst)
             {
-                return CpuMathNativeUtils.DotU(psrc, pdst, LEN);
+                CpuMathNativeUtils.ScaleSrcU(DEFAULT_SCALE, psrc, pdst, LEN);
             }
         }
 
         [Benchmark]
-        public float ManagedDotUPerf() => CpuMathUtils.DotProductDense(src, dst, LEN);
+        public void ManagedScaleSrcUPerf() => CpuMathUtils.Scale(DEFAULT_SCALE, src, dst, LEN);
 
         [Benchmark]
-        public unsafe float NativeDotSUPerf()
+        public unsafe void NativeScaleAddUPerf()
+        {
+            fixed (float* pdst = dst)
+            {
+                CpuMathNativeUtils.ScaleAddU(DEFAULT_SCALE, DEFAULT_SCALE, pdst, LEN);
+            }
+        }
+
+        [Benchmark]
+        public void ManagedScaleAddUPerf() => CpuMathUtils.ScaleAdd(DEFAULT_SCALE, DEFAULT_SCALE, dst, LEN);
+
+        [Benchmark]
+        public unsafe void NativeAddScaleUPerf()
+        {
+            fixed (float* psrc = src)
+            fixed (float* pdst = dst)
+            {
+                CpuMathNativeUtils.AddScaleU(DEFAULT_SCALE, psrc, pdst, LEN);
+            }
+        }
+
+        [Benchmark]
+        public void ManagedAddScaleUPerf() => CpuMathUtils.AddScale(DEFAULT_SCALE, src, dst, LEN);
+
+        [Benchmark]
+        public unsafe void NativeAddScaleSUPerf()
         {
             fixed (float* psrc = src)
             fixed (float* pdst = dst)
             fixed (int* pidx = idx)
             {
-                return CpuMathNativeUtils.DotSU(psrc, pdst, pidx, IDXLEN);
+                CpuMathNativeUtils.AddScaleSU(DEFAULT_SCALE, psrc, pidx, pdst, IDXLEN);
             }
         }
 
         [Benchmark]
-        public float ManagedDotSUPerf() => CpuMathUtils.DotProductSparse(src, dst, idx, IDXLEN);
+        public void ManagedAddScaleSUPerf() => CpuMathUtils.AddScale(DEFAULT_SCALE, src, idx, dst, IDXLEN);
 
         [Benchmark]
-        public unsafe float NativeSumSqUPerf()
+        public unsafe void NativeAddScaleCopyUPerf()
         {
             fixed (float* psrc = src)
+            fixed (float* pdst = dst)
+            fixed (float* pres = result)
             {
-                return CpuMathNativeUtils.SumSqU(psrc, LEN);
+                CpuMathNativeUtils.AddScaleCopyU(DEFAULT_SCALE, psrc, pdst, pres, LEN);
             }
         }
 
         [Benchmark]
-        public float ManagedSumSqUPerf() => CpuMathUtils.SumSq(src, LEN);
+        public void ManagedAddScaleCopyUPerf() => CpuMathUtils.AddScaleCopy(DEFAULT_SCALE, src, dst, result, LEN);
 
         [Benchmark]
         public unsafe void NativeAddUPerf()
@@ -157,44 +214,132 @@ public unsafe void NativeAddSUPerf()
         [Benchmark]
         public void ManagedAddSUPerf() => CpuMathUtils.Add(src, idx, dst, IDXLEN);
 
+
         [Benchmark]
-        public unsafe void NativeAddScaleUPerf()
+        public unsafe void NativeMulElementWiseUPerf()
         {
-            fixed (float* psrc = src)
+            fixed (float* psrc1 = src1)
+            fixed (float* psrc2 = src2)
             fixed (float* pdst = dst)
             {
-                CpuMathNativeUtils.AddScaleU(DEFAULT_SCALE, psrc, pdst, LEN);
+                CpuMathNativeUtils.MulElementWiseU(psrc1, psrc2, pdst, LEN);
             }
         }
 
         [Benchmark]
-        public void ManagedAddScaleUPerf() => CpuMathUtils.AddScale(DEFAULT_SCALE, src, dst, LEN);
+        public void ManagedMulElementWiseUPerf() => CpuMathUtils.MulElementWise(src1, src2, dst, LEN);
 
         [Benchmark]
-        public unsafe void NativeAddScaleSUPerf()
+        public unsafe float NativeSumUPerf()
+        {
+            fixed (float* psrc = src)
+            {
+                return CpuMathNativeUtils.SumU(psrc, LEN);
+            }
+        }
+
+        [Benchmark]
+        public float ManagedSumUPerf() => CpuMathUtils.Sum(src, LEN);
+
+        [Benchmark]
+        public unsafe float NativeSumSqUPerf()
+        {
+            fixed (float* psrc = src)
+            {
+                return CpuMathNativeUtils.SumSqU(psrc, LEN);
+            }
+        }
+
+        [Benchmark]
+        public float ManagedSumSqUPerf() => CpuMathUtils.SumSq(src, LEN);
+
+        [Benchmark]
+        public unsafe float NativeSumSqDiffUPerf()
+        {
+            fixed (float* psrc = src)
+            {
+                return CpuMathNativeUtils.SumSqDiffU(DEFAULT_SCALE, psrc, LEN);
+            }
+        }
+
+        [Benchmark]
+        public float ManagedSumSqDiffUPerf() => CpuMathUtils.SumSq(DEFAULT_SCALE, src, 0, LEN);
+
+        [Benchmark]
+        public unsafe float NativeSumAbsUPerf()
+        {
+            fixed (float* psrc = src)
+            {
+                return CpuMathNativeUtils.SumAbsU(psrc, LEN);
+            }
+        }
+
+        [Benchmark]
+        public float ManagedSumAbsUPerf() => CpuMathUtils.SumAbs(src, LEN);
+
+        [Benchmark]
+        public unsafe float NativeSumAbsDiffUPerf()
+        {
+            fixed (float* psrc = src)
+            {
+                return CpuMathNativeUtils.SumAbsDiffU(DEFAULT_SCALE, psrc, LEN);
+            }
+        }
+
+        [Benchmark]
+        public float ManagedSumAbsDiffUPerf() => CpuMathUtils.SumAbs(DEFAULT_SCALE, src, 0, LEN);
+
+        [Benchmark]
+        public unsafe float NativeMaxAbsUPerf()
+        {
+            fixed (float* psrc = src)
+            {
+                return CpuMathNativeUtils.MaxAbsU(psrc, LEN);
+            }
+        }
+
+        [Benchmark]
+        public float ManagedMaxAbsUPerf() => CpuMathUtils.MaxAbs(src, LEN);
+
+        [Benchmark]
+        public unsafe float NativeMaxAbsDiffUPerf()
+        {
+            fixed (float* psrc = src)
+            {
+                return CpuMathNativeUtils.MaxAbsDiffU(DEFAULT_SCALE, psrc, LEN);
+            }
+        }
+
+        [Benchmark]
+        public float ManagedMaxAbsDiffUPerf() => CpuMathUtils.MaxAbsDiff(DEFAULT_SCALE, src, LEN);
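+        // TODO(review): unclear what is still outstanding for MaxAbsU; NativeMaxAbsUPerf and ManagedMaxAbsUPerf are already defined above.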
+
+        [Benchmark]
+        public unsafe float NativeDotUPerf()
         {
             fixed (float* psrc = src)
             fixed (float* pdst = dst)
-            fixed (int* pidx = idx)
             {
-                CpuMathNativeUtils.AddScaleSU(DEFAULT_SCALE, psrc, pidx, pdst, IDXLEN);
+                return CpuMathNativeUtils.DotU(psrc, pdst, LEN);
             }
         }
 
         [Benchmark]
-        public void ManagedAddScaleSUPerf() => CpuMathUtils.AddScale(DEFAULT_SCALE, src, idx, dst, IDXLEN);
+        public float ManagedDotUPerf() => CpuMathUtils.DotProductDense(src, dst, LEN);
 
         [Benchmark]
-        public unsafe void NativeScaleUPerf()
+        public unsafe float NativeDotSUPerf()
         {
+            fixed (float* psrc = src)
             fixed (float* pdst = dst)
+            fixed (int* pidx = idx)
             {
-                CpuMathNativeUtils.ScaleU(DEFAULT_SCALE, pdst, LEN);
+                return CpuMathNativeUtils.DotSU(psrc, pdst, pidx, IDXLEN);
             }
         }
 
         [Benchmark]
-        public void ManagedScaleUPerf() => CpuMathUtils.Scale(DEFAULT_SCALE, dst, LEN);
+        public float ManagedDotSUPerf() => CpuMathUtils.DotProductSparse(src, dst, idx, IDXLEN);
 
         [Benchmark]
         public unsafe float NativeDist2Perf()
@@ -210,29 +355,32 @@ public unsafe float NativeDist2Perf()
         public float ManagedDist2Perf() => CpuMathUtils.L2DistSquared(src, dst, LEN);
 
         [Benchmark]
-        public unsafe float NativeSumAbsUPerf()
+        public unsafe void NativeSdcaL1UpdateUPerf()
         {
             fixed (float* psrc = src)
+            fixed (float* pdst = dst)
+            fixed (float* pres = result)
             {
-                return CpuMathNativeUtils.SumAbsU(psrc, LEN);
+                CpuMathNativeUtils.SdcaL1UpdateU(DEFAULT_SCALE, psrc, DEFAULT_SCALE, pdst, pres, LEN);
             }
         }
 
         [Benchmark]
-        public float ManagedSumAbsqUPerf() => CpuMathUtils.SumAbs(src, LEN);
+        public void ManagedSdcaL1UpdateUPerf() => CpuMathUtils.SdcaL1UpdateDense(DEFAULT_SCALE, LEN, src, DEFAULT_SCALE, dst, result);
 
         [Benchmark]
-        public unsafe void NativeMulElementWiseUPerf()
+        public unsafe void NativeSdcaL1UpdateSUPerf()
         {
-            fixed (float* psrc1 = src1)
-            fixed (float* psrc2 = src2)
+            fixed (float* psrc = src)
             fixed (float* pdst = dst)
+            fixed (float* pres = result)
+            fixed (int* pidx = idx)
             {
-                CpuMathNativeUtils.MulElementWiseU(psrc1, psrc2, pdst, LEN);
+                CpuMathNativeUtils.SdcaL1UpdateSU(DEFAULT_SCALE, psrc, pidx, DEFAULT_SCALE, pdst, pres, IDXLEN);
             }
         }
 
         [Benchmark]
-        public void ManagedMulElementWiseUPerf() => CpuMathUtils.MulElementWise(src1, src2, dst, LEN);
+        public void ManagedSdcaL1UpdateSUPerf() => CpuMathUtils.SdcaL1UpdateSparse(DEFAULT_SCALE, LEN, src, idx, IDXLEN, DEFAULT_SCALE, dst, result);
     }
 }
diff --git a/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs b/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs
index 6fc2596ef7..d1d5955a8e 100644
--- a/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs
+++ b/test/Microsoft.ML.CpuMath.UnitTests.netstandard/UnitTests.cs
@@ -13,7 +13,11 @@ public class CpuMathUtilsUnitTests
     {
         private readonly float[][] testArrays;
         private readonly int[] testIndexArray;
+        private readonly AlignedArray[] testMatrices;
+        private readonly AlignedArray[] testSrcVectors;
+        private readonly AlignedArray[] testDstVectors;
         private const float DEFAULT_SCALE = 1.7f;
+        private const int SseCbAlign = 16;
         private FloatEqualityComparer comparer;
 
         public CpuMathUtilsUnitTests()
@@ -25,75 +29,211 @@ public CpuMathUtilsUnitTests()
             testArrays = new float[][] { testArray1, testArray2 };
             testIndexArray = new int[4] { 0, 2, 5, 6 };
             comparer = new FloatEqualityComparer();
+
+            // Padded matrices whose dimensions are multiples of 4: 16 floats (4 * 4) and 32 floats (4 * 8).
+            float[] testMatrix1 = new float[4 * 4] { 1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f,
+                1.96f, -2.38f, -9.76f, 13.84f, -106.37f, -26.93f, 32.45f, 3.29f }; // second half repeats the first 8 values
+            float[] testMatrix2 = new float[4 * 8];
+
+            for (int i = 0; i < testMatrix2.Length; i++)
+            {
+                testMatrix2[i] = i + 1; // values 1, 2, ..., 32
+            }
+
+            AlignedArray testMatrixAligned1 = new AlignedArray(4 * 4, SseCbAlign);
+            AlignedArray testMatrixAligned2 = new AlignedArray(4 * 8, SseCbAlign);
+            testMatrixAligned1.CopyFrom(testMatrix1, 0, testMatrix1.Length);
+            testMatrixAligned2.CopyFrom(testMatrix2, 0, testMatrix2.Length);
+
+            testMatrices = new AlignedArray[] { testMatrixAligned1, testMatrixAligned2 }; // index 0: 16 floats, index 1: 32 floats
+
+            // Padded source vectors whose lengths are multiples of 4 (4 and 8 elements).
+            float[] testSrcVector1 = new float[4] { 1f, 2f, 3f, 4f };
+            float[] testSrcVector2 = new float[8] { 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f };
+
+            AlignedArray testSrcVectorAligned1 = new AlignedArray(4, SseCbAlign);
+            AlignedArray testSrcVectorAligned2 = new AlignedArray(8, SseCbAlign);
+            testSrcVectorAligned1.CopyFrom(testSrcVector1, 0, testSrcVector1.Length);
+            testSrcVectorAligned2.CopyFrom(testSrcVector2, 0, testSrcVector2.Length);
+
+            testSrcVectors = new AlignedArray[] { testSrcVectorAligned1, testSrcVectorAligned2 };
+
+            // Padded destination vectors whose lengths are multiples of 4; element i starts out holding the value i.
+            float[] testDstVector1 = new float[4] { 0f, 1f, 2f, 3f };
+            float[] testDstVector2 = new float[8] { 0f, 1f, 2f, 3f, 4f, 5f, 6f, 7f };
+
+            AlignedArray testDstVectorAligned1 = new AlignedArray(4, SseCbAlign);
+            AlignedArray testDstVectorAligned2 = new AlignedArray(8, SseCbAlign);
+            testDstVectorAligned1.CopyFrom(testDstVector1, 0, testDstVector1.Length);
+            testDstVectorAligned2.CopyFrom(testDstVector2, 0, testDstVector2.Length);
+
+            testDstVectors = new AlignedArray[] { testDstVectorAligned1, testDstVectorAligned2 };
         }
 
         [Theory]
-        [InlineData(0, 13306.0376f)]
-        [InlineData(1, 13291.9235f)]
-        public void DotUTest(int test, float expected)
+        [InlineData(0, 0, 0, new float[] { 23.28f, -49.72f, 23.28f, -49.72f })] // 16-float matrix, src of 4, dst of 4
+        [InlineData(1, 1, 0, new float[] { 204f, 492f, 780f, 1068f })] // 32-float matrix, src of 8, dst of 4
+        [InlineData(1, 0, 1, new float[] { 30f, 70f, 110f, 150f, 190f, 230f, 270f, 310f })] // 32-float matrix, src of 4, dst of 8
+        public void MatMulATest(int matTest, int srcTest, int dstTest, float[] expected)
         {
-            float[] src = (float[]) testArrays[test].Clone();
-            float[] dst = (float[]) src.Clone();
-            
-            for (int i = 0; i < dst.Length; i++)
-            {
-                dst[i] += 1;
-            }
+            AlignedArray mat = testMatrices[matTest];
+            AlignedArray src = testSrcVectors[srcTest];
+            AlignedArray dst = testDstVectors[dstTest];
 
-            var actual = CpuMathUtils.DotProductDense(src, dst, dst.Length);
-            Assert.Equal(expected, actual, 2);
+            CpuMathUtils.MatTimesSrc(false, false, mat, src, dst, dst.Size); // tran: false, add: false, crun: dst.Size
+            float[] actual = new float[dst.Size];
+            dst.CopyTo(actual, 0, dst.Size); // copy into a plain array so it can be compared with expected
+            Assert.Equal(expected, actual, comparer);
         }
 
         [Theory]
-        [InlineData(0, 736.7352f)]
-        [InlineData(1, 736.7352f)]
-        public void DotSUTest(int test, float expected)
+        [InlineData(0, 0, 0, new float[] { 23.28f, -48.72f, 25.28f, -46.72f })]
+        [InlineData(1, 1, 0, new float[] { 204f, 493f, 782f, 1071f })]
+        [InlineData(1, 0, 1, new float[] { 30f, 71f, 112f, 153f, 194f, 235f, 276f, 317f })]
+        public void MatMulAAddTest(int matTest, int srcTest, int dstTest, float[] expected)
         {
-            float[] src = (float[])testArrays[test].Clone();
-            float[] dst = (float[])src.Clone();
+            AlignedArray mat = testMatrices[matTest];
+            AlignedArray src = testSrcVectors[srcTest];
+            AlignedArray dst = testDstVectors[dstTest];
+
+            CpuMathUtils.MatTimesSrc(false, true, mat, src, dst, dst.Size); // add: true, so each product is added to dst's initial value i (expected = non-add result + i)
+            float[] actual = new float[dst.Size];
+            dst.CopyTo(actual, 0, dst.Size);
+            Assert.Equal(expected, actual, comparer);
+        }
+
+        [Theory]
+        [InlineData(0, 0, 0, new float[] { -630.38f, -171.1f, 155.66f, 75.1f })]
+        [InlineData(1, 0, 1, new float[] { 170f, 180f, 190f, 200f, 210f, 220f, 230f, 240f })] // e.g. 170 = 1*1 + 9*2 + 17*3 + 25*4 (column 0 of the 32-float matrix taken with stride 8)
+        [InlineData(1, 1, 0, new float[] { 708f, 744f, 780f, 816f })]
+        public void MatMulTranATest(int matTest, int srcTest, int dstTest, float[] expected)
+        {
+            AlignedArray mat = testMatrices[matTest];
+            AlignedArray src = testSrcVectors[srcTest];
+            AlignedArray dst = testDstVectors[dstTest];
+
+            CpuMathUtils.MatTimesSrc(true, false, mat, src, dst, src.Size); // tran: true, add: false, crun: src.Size
+            float[] actual = new float[dst.Size];
+            dst.CopyTo(actual, 0, dst.Size);
+            Assert.Equal(expected, actual, comparer);
+        }
+
+        [Theory]
+        [InlineData(0, 0, 0, new float[] { -630.38f, -170.1f, 157.66f, 78.1f })]
+        [InlineData(1, 0, 1, new float[] { 170f, 181f, 192f, 203f, 214f, 225f, 236f, 247f })]
+        [InlineData(1, 1, 0, new float[] { 708f, 745f, 782f, 819f })]
+        public void MatMulTranAAddTest(int matTest, int srcTest, int dstTest, float[] expected)
+        {
+            AlignedArray mat = testMatrices[matTest];
+            AlignedArray src = testSrcVectors[srcTest];
+            AlignedArray dst = testDstVectors[dstTest];
+
+            CpuMathUtils.MatTimesSrc(true, true, mat, src, dst, src.Size); // tran: true, add: true; expected = MatMulTranATest values + dst's initial value i
+            float[] actual = new float[dst.Size];
+            dst.CopyTo(actual, 0, dst.Size);
+            Assert.Equal(expected, actual, comparer);
+        }
+
+        [Theory]
+        [InlineData(0, 0, 0, new float[] { -27.32f, -9.02f, -27.32f, -9.02f })]
+        [InlineData(1, 1, 0, new float[] { 95f, 231f, 367f, 503f })] // e.g. 95 = 1*1 + 3*3 + 6*6 + 7*7, i.e. only positions 0, 2, 5, 6 contribute
+        [InlineData(1, 0, 1, new float[] { 10f, 26f, 42f, 58f, 74f, 90f, 106f, 122f })] // e.g. 10 = 1*1 + 3*3, i.e. only positions 0 and 2 contribute
+        public void MatMulPATest(int matTest, int srcTest, int dstTest, float[] expected)
+        {
+            AlignedArray mat = testMatrices[matTest];
+            AlignedArray src = testSrcVectors[srcTest];
+            AlignedArray dst = testDstVectors[dstTest];
             int[] idx = testIndexArray;
 
-            // Ensures src and dst are different arrays
-            for (int i = 0; i < dst.Length; i++)
-            {
-                dst[i] += 1;
-            }
+            CpuMathUtils.MatTimesSrc(false, false, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, dst.Size); // index-array overload; 2 + 2 * srcTest is 2 or 4 -- presumably how many idx entries are used, confirm against the overload
+            float[] actual = new float[dst.Size];
+            dst.CopyTo(actual, 0, dst.Size);
+            Assert.Equal(expected, actual, comparer);
+        }
 
-            var actual = CpuMathUtils.DotProductSparse(src, dst, idx, idx.Length);
-            Assert.Equal(expected, actual, 4);
+        [Theory]
+        [InlineData(0, 0, 0, new float[] { -27.32f, -8.02f, -25.32f, -6.02f })]
+        [InlineData(1, 1, 0, new float[] { 95f, 232f, 369f, 506f })]
+        [InlineData(1, 0, 1, new float[] { 10f, 27f, 44f, 61f, 78f, 95f, 112f, 129f })]
+        public void MatMulPAAddTest(int matTest, int srcTest, int dstTest, float[] expected)
+        {
+            AlignedArray mat = testMatrices[matTest];
+            AlignedArray src = testSrcVectors[srcTest];
+            AlignedArray dst = testDstVectors[dstTest];
+            int[] idx = testIndexArray;
+
+            CpuMathUtils.MatTimesSrc(false, true, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, dst.Size); // same call as MatMulPATest with the second flag true; expected = those values + dst's initial value i
+            float[] actual = new float[dst.Size];
+            dst.CopyTo(actual, 0, dst.Size);
+            Assert.Equal(expected, actual, comparer);
         }
 
         [Theory]
-        [InlineData(0, 13399.9376f)]
-        [InlineData(1, 13389.1135f)]
-        public void SumSqUTest(int test, float expected)
+        [InlineData(0, 0, 0, new float[] { 7.84f, -9.52f, -39.04f, 55.36f })]
+        [InlineData(1, 0, 1, new float[] { 52f, 56f, 60f, 64f, 68f, 72f, 76f, 80f })] // e.g. 52 = 1*1 + 17*3 (only src positions 0 and 2 contribute)
+        [InlineData(1, 1, 0, new float[] { 329f, 346f, 363f, 380f })]
+        public void MatMulTranPATest(int matTest, int srcTest, int dstTest, float[] expected)
         {
-            float[] src = (float[])testArrays[test].Clone();
-            var actual = CpuMathUtils.SumSq(src, src.Length);
-            Assert.Equal(expected, actual, 2);
+            AlignedArray mat = testMatrices[matTest];
+            AlignedArray src = testSrcVectors[srcTest];
+            AlignedArray dst = testDstVectors[dstTest];
+            int[] idx = testIndexArray;
+
+            CpuMathUtils.MatTimesSrc(true, false, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, src.Size); // index-array overload with the first flag true; last argument is src.Size rather than dst.Size
+            float[] actual = new float[dst.Size];
+            dst.CopyTo(actual, 0, dst.Size);
+            Assert.Equal(expected, actual, comparer);
+        }
+
+        [Theory]
+        [InlineData(0, 0, 0, new float[] { 7.84f, -8.52f, -37.04f, 58.36f })]
+        [InlineData(1, 0, 1, new float[] { 52f, 57f, 62f, 67f, 72f, 77f, 82f, 87f })]
+        [InlineData(1, 1, 0, new float[] { 329f, 347f, 365f, 383f })]
+        public void MatMulTranPAAddTest(int matTest, int srcTest, int dstTest, float[] expected)
+        {
+            AlignedArray mat = testMatrices[matTest];
+            AlignedArray src = testSrcVectors[srcTest];
+            AlignedArray dst = testDstVectors[dstTest];
+            int[] idx = testIndexArray;
+
+            CpuMathUtils.MatTimesSrc(true, true, mat, idx, src, 0, 0, 2 + 2 * srcTest, dst, src.Size); // both flags true; expected = MatMulTranPATest values + dst's initial value i
+            float[] actual = new float[dst.Size];
+            dst.CopyTo(actual, 0, dst.Size);
+            Assert.Equal(expected, actual, comparer);
         }
 
         [Theory]
         [InlineData(0)]
         [InlineData(1)]
-        public void AddUTest(int test)
+        public void AddScalarUTest(int test)
         {
-            float[] src = (float[])testArrays[test].Clone();
-            float[] dst = (float[])src.Clone();
-            float[] expected = (float[])src.Clone();
+            float[] dst = (float[])testArrays[test].Clone();
+            float[] expected = (float[])dst.Clone();
 
-            // Ensures src and dst are different arrays
-            for (int i = 0; i < dst.Length; i++)
+            for (int i = 0; i < expected.Length; i++)
             {
-                dst[i] += 1;
+                expected[i] += DEFAULT_SCALE; // reference result: every element shifted by DEFAULT_SCALE
             }
 
+            CpuMathUtils.Add(DEFAULT_SCALE, dst, dst.Length); // scalar overload under test; modifies dst in place
+            var actual = dst;
+            Assert.Equal(expected, actual, comparer);
+        }
+
+        [Theory]
+        [InlineData(0)]
+        [InlineData(1)]
+        public void ScaleUTest(int test)
+        {
+            float[] dst = (float[])testArrays[test].Clone();
+            float[] expected = (float[])dst.Clone();
+
             for (int i = 0; i < expected.Length; i++)
             {
-                expected[i] = 2 * expected[i] + 1;
+                expected[i] *= DEFAULT_SCALE; // reference result: every element multiplied by DEFAULT_SCALE
             }
 
-            CpuMathUtils.Add(src, dst, dst.Length);
+            CpuMathUtils.Scale(DEFAULT_SCALE, dst, dst.Length); // in-place overload; the result is read back from dst
             var actual = dst;
             Assert.Equal(expected, actual, comparer);
         }
@@ -101,19 +241,36 @@ public void AddUTest(int test)
         [Theory]
         [InlineData(0)]
         [InlineData(1)]
-        public void AddSUTest(int test)
+        public void ScaleSrcUTest(int test)
         {
             float[] src = (float[])testArrays[test].Clone();
             float[] dst = (float[])src.Clone();
-            int[] idx = testIndexArray;
             float[] expected = (float[])dst.Clone();
 
-            expected[0] = 3.92f;
-            expected[2] = -12.14f;
-            expected[5] = -36.69f;
-            expected[6] = 46.29f;
+            for (int i = 0; i < expected.Length; i++)
+            {
+                expected[i] *= DEFAULT_SCALE;
+            }
 
-            CpuMathUtils.Add(src, idx, dst, idx.Length);
+            CpuMathUtils.Scale(DEFAULT_SCALE, src, dst, dst.Length); // NOTE(review): dst starts as a copy of src, so this cannot tell "scale src into dst" from "scale dst in place"
+            var actual = dst;
+            Assert.Equal(expected, actual, comparer);
+        }
+
+        [Theory]
+        [InlineData(0)]
+        [InlineData(1)]
+        public void ScaleAddUTest(int test)
+        {
+            float[] dst = (float[])testArrays[test].Clone();
+            float[] expected = (float[])dst.Clone();
+
+            for (int i = 0; i < expected.Length; i++)
+            {
+                expected[i] = DEFAULT_SCALE * (dst[i] + DEFAULT_SCALE); // reference result: add first, then scale
+            }
+
+            CpuMathUtils.ScaleAdd(DEFAULT_SCALE, DEFAULT_SCALE, dst, dst.Length); // NOTE(review): both scalars are DEFAULT_SCALE, so a swapped argument order would go unnoticed
             var actual = dst;
             Assert.Equal(expected, actual, comparer);
         }
@@ -160,28 +317,31 @@ public void AddScaleSUTest(int test)
         [Theory]
         [InlineData(0)]
         [InlineData(1)]
-        public void ScaleUTest(int test)
+        public void AddScaleCopyUTest(int test)
         {
-            float[] dst = (float[])testArrays[test].Clone();
-            float[] expectedOutput = (float[])dst.Clone();
+            float[] src = (float[])testArrays[test].Clone();
+            float[] dst = (float[])src.Clone();
+            float[] result = (float[])dst.Clone();
+            float[] expected = (float[])dst.Clone();
 
-            for (int i = 0; i < expectedOutput.Length; i++)
+            for (int i = 0; i < expected.Length; i++)
             {
-                expectedOutput[i] *= DEFAULT_SCALE;
+                expected[i] *= (1 + DEFAULT_SCALE); // src and dst hold the same values, so dst + DEFAULT_SCALE * src collapses to this
             }
 
-            CpuMathUtils.Scale(DEFAULT_SCALE, dst, dst.Length);
-            var managedOutput = dst;
-            Assert.Equal(expectedOutput, managedOutput, comparer);
+            CpuMathUtils.AddScaleCopy(DEFAULT_SCALE, src, dst, result, dst.Length);
+            var actual = result; // the assertion reads the separate result array, not dst
+            Assert.Equal(expected, actual, comparer);
         }
 
         [Theory]
-        [InlineData(0, 8.0f)]
-        [InlineData(1, 7.0f)]
-        public void Dist2Test(int test, float expected)
+        [InlineData(0)]
+        [InlineData(1)]
+        public void AddUTest(int test)
         {
             float[] src = (float[])testArrays[test].Clone();
             float[] dst = (float[])src.Clone();
+            float[] expected = (float[])src.Clone();
 
             // Ensures src and dst are different arrays
             for (int i = 0; i < dst.Length; i++)
@@ -189,18 +349,34 @@ public void Dist2Test(int test, float expected)
                 dst[i] += 1;
             }
 
-            var actual = CpuMathUtils.L2DistSquared(src, dst, dst.Length);
-            Assert.Equal(expected, actual, 0);
+            for (int i = 0; i < expected.Length; i++)
+            {
+                expected[i] = 2 * expected[i] + 1; // src + dst, where dst was set to src + 1 above
+            }
+
+            CpuMathUtils.Add(src, dst, dst.Length); // array overload; the result is read back from dst
+            var actual = dst;
+            Assert.Equal(expected, actual, comparer);
         }
 
         [Theory]
-        [InlineData(0, 196.98f)]
-        [InlineData(1, 193.69f)]
-        public void SumAbsUTest(int test, float expected)
+        [InlineData(0)]
+        [InlineData(1)]
+        public void AddSUTest(int test)
         {
             float[] src = (float[])testArrays[test].Clone();
-            var actual = CpuMathUtils.SumAbs(src, src.Length);
-            Assert.Equal(expected, actual, 2);
+            float[] dst = (float[])src.Clone();
+            int[] idx = testIndexArray;
+            float[] expected = (float[])dst.Clone();
+
+            expected[0] = 3.92f; // the four overwritten positions are exactly the entries of testIndexArray: 0, 2, 5, 6
+            expected[2] = -12.14f;
+            expected[5] = -36.69f;
+            expected[6] = 46.29f;
+
+            CpuMathUtils.Add(src, idx, dst, idx.Length); // index-array overload; the count passed is idx.Length, not dst.Length
+            var actual = dst;
+            Assert.Equal(expected, actual, comparer);
         }
 
         [Theory]
@@ -229,6 +405,202 @@ public void MulElementWiseUTest(int test)
             var actual = dst;
             Assert.Equal(expected, actual, comparer);
         }
+
+        [Theory]
+        [InlineData(0, -93.9f)]
+        [InlineData(1, -97.19f)]
+        public void SumUTest(int test, float expected)
+        {
+            float[] src = (float[])testArrays[test].Clone();
+            var actual = CpuMathUtils.Sum(src, src.Length);
+            Assert.Equal(expected, actual, 2); // third argument is the xUnit precision (decimal places)
+        }
+
+        [Theory]
+        [InlineData(0, 13399.9376f)]
+        [InlineData(1, 13389.1135f)]
+        public void SumSqUTest(int test, float expected)
+        {
+            float[] src = (float[])testArrays[test].Clone();
+            var actual = CpuMathUtils.SumSq(src, src.Length); // two-argument overload (array, count)
+            Assert.Equal(expected, actual, 2); // third argument is the xUnit precision (decimal places)
+        }
+
+        [Theory]
+        [InlineData(0, 13742.3176f)]
+        [InlineData(1, 13739.7895f)]
+        public void SumSqDiffUTest(int test, float expected)
+        {
+            float[] src = (float[])testArrays[test].Clone();
+            var actual = CpuMathUtils.SumSq(DEFAULT_SCALE, src, 0, src.Length); // four-argument overload; presumably (value subtracted from each element, array, start offset, count) -- confirm against the overload
+            Assert.Equal(expected, actual, 2);
+        }
+
+        [Theory]
+        [InlineData(0, 196.98f)]
+        [InlineData(1, 193.69f)]
+        public void SumAbsUTest(int test, float expected)
+        {
+            float[] src = (float[])testArrays[test].Clone();
+            var actual = CpuMathUtils.SumAbs(src, src.Length); // two-argument overload (array, count)
+            Assert.Equal(expected, actual, 2);
+        }
+
+        [Theory]
+        [InlineData(0, 196.98f)] // same expected value as SumAbsUTest case 0
+        [InlineData(1, 195.39f)]
+        public void SumAbsDiffUTest(int test, float expected)
+        {
+            float[] src = (float[])testArrays[test].Clone();
+            var actual = CpuMathUtils.SumAbs(DEFAULT_SCALE, src, 0, src.Length); // four-argument overload; presumably (value subtracted from each element, array, start offset, count) -- confirm against the overload
+            Assert.Equal(expected, actual, 2);
+        }
+
+        [Theory]
+        [InlineData(0, 106.37f)]
+        [InlineData(1, 106.37f)] // both cases expect the same maximum
+        public void MaxAbsUTest(int test, float expected)
+        {
+            float[] src = (float[])testArrays[test].Clone();
+            var actual = CpuMathUtils.MaxAbs(src, src.Length);
+            Assert.Equal(expected, actual, 2);
+        }
+
+        [Theory]
+        [InlineData(0, 108.07f)] // 108.07 = 106.37 + 1.7, i.e. the MaxAbsUTest value plus DEFAULT_SCALE
+        [InlineData(1, 108.07f)]
+        public void MaxAbsDiffUTest(int test, float expected)
+        {
+            float[] src = (float[])testArrays[test].Clone();
+            var actual = CpuMathUtils.MaxAbsDiff(DEFAULT_SCALE, src, src.Length);
+            Assert.Equal(expected, actual, 2);
+        }
+
+        [Theory]
+        [InlineData(0, 13306.0376f)]
+        [InlineData(1, 13291.9235f)]
+        public void DotUTest(int test, float expected)
+        {
+            float[] src = (float[])testArrays[test].Clone();
+            float[] dst = (float[])src.Clone();
+
+            for (int i = 0; i < dst.Length; i++)
+            {
+                dst[i] += 1; // make dst differ from src: dst = src + 1
+            }
+
+            var actual = CpuMathUtils.DotProductDense(src, dst, dst.Length);
+            Assert.Equal(expected, actual, 2);
+        }
+
+        [Theory]
+        [InlineData(0, 736.7352f)]
+        [InlineData(1, 736.7352f)] // both cases expect the same value
+        public void DotSUTest(int test, float expected)
+        {
+            float[] src = (float[])testArrays[test].Clone();
+            float[] dst = (float[])src.Clone();
+            int[] idx = testIndexArray;
+
+            // Ensures src and dst are different arrays
+            for (int i = 0; i < dst.Length; i++)
+            {
+                dst[i] += 1;
+            }
+
+            var actual = CpuMathUtils.DotProductSparse(src, dst, idx, idx.Length); // count passed is idx.Length (4 entries), not the array length
+            Assert.Equal(expected, actual, 4); // checked to 4 decimal places, tighter than the dense tests
+        }
+
+        [Theory]
+        [InlineData(0, 8.0f)]
+        [InlineData(1, 7.0f)]
+        public void Dist2Test(int test, float expected)
+        {
+            float[] src = (float[])testArrays[test].Clone();
+            float[] dst = (float[])src.Clone();
+
+            // Ensures src and dst are different arrays
+            for (int i = 0; i < dst.Length; i++)
+            {
+                dst[i] += 1; // every element now differs from src by exactly 1
+            }
+
+            var actual = CpuMathUtils.L2DistSquared(src, dst, dst.Length);
+            Assert.Equal(expected, actual, 0); // precision 0: compared after rounding to whole numbers
+        }
+
+        [Theory]
+        [InlineData(0, new int[] { 0, 2 }, new float[] { 0f, 2f, 0f, 4f })]
+        [InlineData(1, new int[] { 0, 2, 5, 6 }, new float[] { 0f, 2f, 0f, 4f, 5f, 0f, 0f, 8f })]
+        public void ZeroItemsUTest(int test, int[] idx, float[] expected)
+        {
+            AlignedArray src = new AlignedArray(4 + 4 * test, SseCbAlign); // 4 or 8 elements, matching testSrcVectors[test]
+            src.CopyFrom(testSrcVectors[test]);
+
+            CpuMathUtils.ZeroMatrixItems(src, src.Size, src.Size, idx); // second and third arguments are equal; per expected, each idx entry clears the element at that same position
+            float[] actual = new float[src.Size];
+            src.CopyTo(actual, 0, src.Size);
+            Assert.Equal(expected, actual, comparer);
+        }
+
+        [Theory]
+        [InlineData(0, new int[] { 0, 1 }, new float[] { 0f, 2f, 0f, 4f })] // idx 1 clears element 2
+        [InlineData(1, new int[] { 0, 2, 4 }, new float[] { 0f, 2f, 0f, 4f, 5f, 0f, 7f, 8f })] // idx 4 clears element 5
+        public void ZeroMatrixItemsCoreTest(int test, int[] idx, float[] expected)
+        {
+            AlignedArray src = new AlignedArray(4 + 4 * test, SseCbAlign);
+            src.CopyFrom(testSrcVectors[test]);
+
+            CpuMathUtils.ZeroMatrixItems(src, src.Size / 2 - 1, src.Size / 2, idx); // second argument is one less than the third, so idx entries no longer map 1:1 onto element positions (see expected)
+            float[] actual = new float[src.Size];
+            src.CopyTo(actual, 0, src.Size);
+            Assert.Equal(expected, actual, comparer);
+        }
+
+        [Theory]
+        [InlineData(0)]
+        [InlineData(1)]
+        public void SdcaL1UpdateUTest(int test)
+        {
+            float[] src = (float[])testArrays[test].Clone();
+            float[] v = (float[])src.Clone();
+            float[] w = (float[])src.Clone();
+            float[] expected = (float[])w.Clone();
+
+            for (int i = 0; i < expected.Length; i++)
+            {
+                float value = src[i] * (1 + DEFAULT_SCALE); // models v[i] + DEFAULT_SCALE * src[i], with v initially equal to src
+                expected[i] = Math.Abs(value) > DEFAULT_SCALE ? (value > 0 ? value - DEFAULT_SCALE : value + DEFAULT_SCALE) : 0; // move value toward zero by DEFAULT_SCALE, or use 0 when |value| <= DEFAULT_SCALE
+            }
+
+            CpuMathUtils.SdcaL1UpdateDense(DEFAULT_SCALE, src.Length, src, DEFAULT_SCALE, v, w); // DEFAULT_SCALE is used for both scalar arguments
+            var actual = w; // only w is asserted; v is not checked afterwards
+            Assert.Equal(expected, actual, comparer);
+        }
+
+        [Theory]
+        [InlineData(0)]
+        [InlineData(1)]
+        public void SdcaL1UpdateSUTest(int test)
+        {
+            float[] src = (float[])testArrays[test].Clone();
+            float[] v = (float[])src.Clone();
+            float[] w = (float[])src.Clone();
+            int[] idx = testIndexArray;
+            float[] expected = (float[])w.Clone();
+
+            for (int i = 0; i < idx.Length; i++)
+            {
+                int index = idx[i];
+                float value = v[index] + src[i] * DEFAULT_SCALE; // src is read by position i, while v and expected are addressed by idx[i]
+                expected[index] = Math.Abs(value) > DEFAULT_SCALE ? (value > 0 ? value - DEFAULT_SCALE : value + DEFAULT_SCALE) : 0; // move value toward zero by DEFAULT_SCALE, or use 0 when |value| <= DEFAULT_SCALE
+            }
+
+            CpuMathUtils.SdcaL1UpdateSparse(DEFAULT_SCALE, src.Length, src, idx, idx.Length, DEFAULT_SCALE, v, w); // positions of w not listed in idx are expected to keep their original values
+            var actual = w;
+            Assert.Equal(expected, actual, comparer);
+        }
     }
 
     internal class FloatEqualityComparer : IEqualityComparer<float>