[L2Space] Perf improvement for dimensions not a multiple of 4 or 16

2ooom · 2ooom · commit d02078f6b98f · 2020-04-19T09:34:36.000+02:00
Currently SIMD (SSE or AVX) is used only when the dimension is a
multiple of 4 or 16; when the dimension is not a multiple of 4 or 16,
a slower non-vectorized method is used.

To improve performance for these cases, new methods are added:
 `L2SqrSIMD(4|16)ExtResiduals` - each relies on the existing
 `L2SqrSIMD(4|16)Ext` to process the largest leading part of the vector
 whose length is a multiple of 4 or 16, and finishes the residual
 computation with the scalar method `L2Sqr`.

The performance improvement over the baseline is 3-4x, depending on the
dimension. Benchmark results:

Run on (4 X 3300 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x2)
  L1 Instruction 32 KiB (x2)
  L2 Unified 256 KiB (x2)
  L3 Unified 4096 KiB (x1)
Load Average: 2.18, 2.35, 3.88
-----------------------------------------------------------
Benchmark          Time             CPU        Iterations
-----------------------------------------------------------
TstDim65        14.7 ns         14.7 ns     20 * 47128209
RefDim65        50.2 ns         50.1 ns     20 * 10373751
TstDim101       24.7 ns         24.7 ns     20 * 28064436
RefDim101       90.4 ns         90.2 ns     20 *  7592191
TstDim129       31.4 ns         31.3 ns     20 * 22397921
RefDim129        125 ns          124 ns     20 *  5548862
TstDim257       59.3 ns         59.2 ns     20 * 10856753
RefDim257        266 ns          266 ns     20 *  2630926
diff --git a/hnswlib/space_l2.h b/hnswlib/space_l2.h
@@ -4,16 +4,19 @@
 namespace hnswlib {
 
     static float
-    L2Sqr(const void *pVect1, const void *pVect2, const void *qty_ptr) {
-        //return *((float *)pVect2);
+    L2Sqr(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
+        float *pVect1 = (float *) pVect1v;
+        float *pVect2 = (float *) pVect2v;
         size_t qty = *((size_t *) qty_ptr);
+
         float res = 0;
-        for (unsigned i = 0; i < qty; i++) {
-            float t = ((float *) pVect1)[i] - ((float *) pVect2)[i];
+        for (size_t i = 0; i < qty; i++) {
+            float t = *pVect1 - *pVect2;
+            pVect1++;
+            pVect2++;
             res += t * t;
         }
         return (res);
-
     }
 
 #if defined(USE_AVX)
@@ -49,10 +52,8 @@ namespace hnswlib {
         }
 
         _mm256_store_ps(TmpRes, sum);
-        float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7];
-
-        return (res);
-}
+        return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7];
+    }
 
 #elif defined(USE_SSE)
 
@@ -62,12 +63,9 @@ namespace hnswlib {
         float *pVect2 = (float *) pVect2v;
         size_t qty = *((size_t *) qty_ptr);
         float PORTABLE_ALIGN32 TmpRes[8];
-        // size_t qty4 = qty >> 2;
         size_t qty16 = qty >> 4;
 
         const float *pEnd1 = pVect1 + (qty16 << 4);
-        // const float* pEnd2 = pVect1 + (qty4 << 2);
-        // const float* pEnd3 = pVect1 + qty;
 
         __m128 diff, v1, v2;
         __m128 sum = _mm_set1_ps(0);
@@ -102,10 +100,24 @@ namespace hnswlib {
             diff = _mm_sub_ps(v1, v2);
             sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
         }
+
         _mm_store_ps(TmpRes, sum);
-        float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
+        return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
+    }
+#endif
 
-        return (res);
+#if defined(USE_SSE) || defined(USE_AVX)
+    static float
+    L2SqrSIMD16ExtResiduals(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
+        size_t qty = *((size_t *) qty_ptr);
+        size_t qty16 = qty >> 4 << 4;
+        float res = L2SqrSIMD16Ext(pVect1v, pVect2v, &qty16);
+        float *pVect1 = (float *) pVect1v + qty16;
+        float *pVect2 = (float *) pVect2v + qty16;
+
+        size_t qty_left = qty - qty16;
+        float res_tail = L2Sqr(pVect1, pVect2, &qty_left);
+        return (res + res_tail);
     }
 #endif
 
@@ -119,10 +131,9 @@ namespace hnswlib {
         size_t qty = *((size_t *) qty_ptr);
 
 
-        // size_t qty4 = qty >> 2;
-        size_t qty16 = qty >> 2;
+        size_t qty4 = qty >> 2;
 
-        const float *pEnd1 = pVect1 + (qty16 << 2);
+        const float *pEnd1 = pVect1 + (qty4 << 2);
 
         __m128 diff, v1, v2;
         __m128 sum = _mm_set1_ps(0);
@@ -136,9 +147,22 @@ namespace hnswlib {
             sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
         }
         _mm_store_ps(TmpRes, sum);
-        float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
+        return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
+    }
 
-        return (res);
+    static float
+    L2SqrSIMD4ExtResiduals(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
+        size_t qty = *((size_t *) qty_ptr);
+        size_t qty4 = qty >> 2 << 2;
+
+        float res = L2SqrSIMD4Ext(pVect1v, pVect2v, &qty4);
+        size_t qty_left = qty - qty4;
+
+        float *pVect1 = (float *) pVect1v + qty4;
+        float *pVect2 = (float *) pVect2v + qty4;
+        float res_tail = L2Sqr(pVect1, pVect2, &qty_left);
+
+        return (res + res_tail);
     }
 #endif
 
@@ -151,13 +175,14 @@ namespace hnswlib {
         L2Space(size_t dim) {
             fstdistfunc_ = L2Sqr;
         #if defined(USE_SSE) || defined(USE_AVX)
-            if (dim % 4 == 0)
-                fstdistfunc_ = L2SqrSIMD4Ext;
             if (dim % 16 == 0)
                 fstdistfunc_ = L2SqrSIMD16Ext;
-            /*else{
-                throw runtime_error("Data type not supported!");
-            }*/
+            else if (dim % 4 == 0)
+                fstdistfunc_ = L2SqrSIMD4Ext;
+            else if (dim > 16)
+                fstdistfunc_ = L2SqrSIMD16ExtResiduals;
+            else if (dim > 4)
+                fstdistfunc_ = L2SqrSIMD4ExtResiduals;
         #endif
             dim_ = dim;
             data_size_ = dim * sizeof(float);
@@ -185,10 +210,6 @@ namespace hnswlib {
         int res = 0;
         unsigned char *a = (unsigned char *) pVect1;
         unsigned char *b = (unsigned char *) pVect2;
-        /*for (int i = 0; i < qty; i++) {
-            int t = int((a)[i]) - int((b)[i]);
-            res += t*t;
-        }*/
 
         qty = qty >> 2;
         for (size_t i = 0; i < qty; i++) {
@@ -241,4 +262,4 @@ namespace hnswlib {
     };
 
 
-}
+}