Hoist the 4-float vectorization bound in AddScalarU, ScaleSrcU and ScaleAddU.

In both AvxIntrinsics.cs and SseIntrinsics.cs the 128-bit step used to test
`pDstCurrent + 4 <= pDstEnd`. This patch computes `pDstEnd - 4` once into a
local named `destinationEnd` and compares `pDstCurrent` against it instead.

`destinationEnd` is declared as `float*`, not `int`: `pDstEnd` is a `float*`,
so `pDstEnd - 4` is pointer arithmetic that yields a `float*`, and it is then
compared against the `float*` cursor `pDstCurrent`. Declaring it `int` does not
compile.

NOTE(review): the leading whitespace of the context lines and the `<float>`
generic arguments were reconstructed from a whitespace-collapsed copy of this
patch; confirm them against the target files before applying. The post-image
blob hashes on the `index` lines predate the `float*` correction.

diff --git a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
index b31a427139..54226b5f35 100644
--- a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
+++ b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
@@ -417,6 +417,7 @@ public static unsafe void AddScalarU(float scalar, Span<float> dst)
             {
                 float* pDstEnd = pdst + dst.Length;
                 float* pDstCurrent = pdst;
+                float* destinationEnd = pDstEnd - 4;
 
                 Vector256<float> scalarVector256 = Avx.SetAllVector256(scalar);
 
@@ -431,7 +432,7 @@ public static unsafe void AddScalarU(float scalar, Span<float> dst)
 
                 Vector128<float> scalarVector128 = Sse.SetAllVector128(scalar);
 
-                if (pDstCurrent + 4 <= pDstEnd)
+                if (pDstCurrent <= destinationEnd)
                 {
                     Vector128<float> dstVector = Sse.LoadVector128(pDstCurrent);
                     dstVector = Sse.Add(dstVector, scalarVector128);
@@ -502,6 +503,7 @@ public static unsafe void ScaleSrcU(float scale, Span<float> src, Span<float> ds
                 float* pDstEnd = pdst + dst.Length;
                 float* pSrcCurrent = psrc;
                 float* pDstCurrent = pdst;
+                float* destinationEnd = pDstEnd - 4;
 
                 Vector256<float> scaleVector256 = Avx.SetAllVector256(scale);
 
@@ -517,7 +519,7 @@ public static unsafe void ScaleSrcU(float scale, Span<float> src, Span<float> ds
 
                 Vector128<float> scaleVector128 = Sse.SetAllVector128(scale);
 
-                if (pDstCurrent + 4 <= pDstEnd)
+                if (pDstCurrent <= destinationEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
                     srcVector = Sse.Multiply(srcVector, scaleVector128);
@@ -546,6 +548,7 @@ public static unsafe void ScaleAddU(float a, float b, Span<float> dst)
             {
                 float* pDstEnd = pdst + dst.Length;
                 float* pDstCurrent = pdst;
+                float* destinationEnd = pDstEnd - 4;
 
                 Vector256<float> a256 = Avx.SetAllVector256(a);
                 Vector256<float> b256 = Avx.SetAllVector256(b);
@@ -563,7 +566,7 @@ public static unsafe void ScaleAddU(float a, float b, Span<float> dst)
                 Vector128<float> a128 = Sse.SetAllVector128(a);
                 Vector128<float> b128 = Sse.SetAllVector128(b);
 
-                if (pDstCurrent + 4 <= pDstEnd)
+                if (pDstCurrent <= destinationEnd)
                 {
                     Vector128<float> dstVector = Sse.LoadVector128(pDstCurrent);
                     dstVector = Sse.Add(dstVector, b128);
diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
index 4c36d0094e..4a3dae4d82 100644
--- a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
+++ b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
@@ -414,10 +414,11 @@ public static unsafe void AddScalarU(float scalar, Span<float> dst)
             {
                 float* pDstEnd = pdst + dst.Length;
                 float* pDstCurrent = pdst;
+                float* destinationEnd = pDstEnd - 4;
 
                 Vector128<float> scalarVector = Sse.SetAllVector128(scalar);
 
-                while (pDstCurrent + 4 <= pDstEnd)
+                while (pDstCurrent <= destinationEnd)
                 {
                     Vector128<float> dstVector = Sse.LoadVector128(pDstCurrent);
                     dstVector = Sse.Add(dstVector, scalarVector);
@@ -476,10 +477,11 @@ public static unsafe void ScaleSrcU(float scale, Span<float> src, Span<float> ds
                 float* pDstEnd = pdst + dst.Length;
                 float* pSrcCurrent = psrc;
                 float* pDstCurrent = pdst;
+                float* destinationEnd = pDstEnd - 4;
 
                 Vector128<float> scaleVector = Sse.SetAllVector128(scale);
 
-                while (pDstCurrent + 4 <= pDstEnd)
+                while (pDstCurrent <= destinationEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
                     srcVector = Sse.Multiply(srcVector, scaleVector);
@@ -508,11 +510,12 @@ public static unsafe void ScaleAddU(float a, float b, Span<float> dst)
             {
                 float* pDstEnd = pdst + dst.Length;
                 float* pDstCurrent = pdst;
+                float* destinationEnd = pDstEnd - 4;
 
                 Vector128<float> aVector = Sse.SetAllVector128(a);
                 Vector128<float> bVector = Sse.SetAllVector128(b);
 
-                while (pDstCurrent + 4 <= pDstEnd)
+                while (pDstCurrent <= destinationEnd)
                 {
                     Vector128<float> dstVector = Sse.LoadVector128(pDstCurrent);
                     dstVector = Sse.Add(dstVector, bVector);