[InnerProductSpace] Perf improvement for dimensions that are not a multiple of 4 or 16

2ooom · 2ooom · commit 30ac4c574df9 · 2020-04-19T09:52:10.000+02:00
Currently SIMD (SSE or AVX) is used only when the dimension is a
multiple of 4 or 16; when the dimension is not a multiple of 4 or 16,
a slower non-vectorized method is used.

To improve performance for these cases, new methods are added:
`InnerProductSIMD(4|16)ExtResiduals` - relies on the existing
`InnerProductSIMD(4|16)Ext` to compute up to *4 and *16 dimensions and
finishes the residual computation with the non-vectorized method `InnerProduct`.

Performance improvement compared to the baseline is 3-4x, depending on
the dimension. Benchmark results:

Run on (4 X 3300 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x2)
  L1 Instruction 32 KiB (x2)
  L2 Unified 256 KiB (x2)
  L3 Unified 4096 KiB (x1)
Load Average: 2.10, 2.25, 2.46

----------------------------------------------------------
Benchmark          Time             CPU        Iterations
----------------------------------------------------------
TstDim65        14.0 ns         14.0 ns     20 * 48676012
RefDim65        50.3 ns         50.2 ns     20 * 12907985
TstDim101       23.8 ns         23.8 ns     20 * 27976276
RefDim101       91.4 ns         91.3 ns     20 *  7364003
TstDim129       30.0 ns         30.0 ns     20 * 23413955
RefDim129        123 ns          123 ns     20 *  5656383
TstDim257       57.8 ns         57.7 ns     20 * 11263073
RefDim257        268 ns          267 ns     20 *  2617478
diff --git a/hnswlib/space_ip.h b/hnswlib/space_ip.h
@@ -211,6 +211,36 @@ namespace hnswlib {
 
 #endif
 
+#if defined(USE_SSE) || defined(USE_AVX)
+    static float  // SIMD result over the largest multiple-of-16 prefix, combined with a plain InnerProduct over the rest
+    InnerProductSIMD16ExtResiduals(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
+        size_t qty = *((size_t *) qty_ptr);  // qty_ptr is read as a size_t: the total element count
+        size_t qty16 = qty >> 4 << 4;  // qty rounded down to a multiple of 16
+        float res = InnerProductSIMD16Ext(pVect1v, pVect2v, &qty16);  // result for the first qty16 elements
+        float *pVect1 = (float *) pVect1v + qty16;  // inputs are treated as float arrays; skip the part already processed
+        float *pVect2 = (float *) pVect2v + qty16;
+
+        size_t qty_left = qty - qty16;  // number of trailing elements (0..15)
+        float res_tail = InnerProduct(pVect1, pVect2, &qty_left);  // non-vectorized routine on the tail
+        return res + res_tail - 1.0f;  // NOTE(review): presumably both callees return (1 - dot), so one extra 1 is removed - confirm against their definitions
+    }
+
+    static float  // SIMD result over the largest multiple-of-4 prefix, combined with a plain InnerProduct over the rest
+    InnerProductSIMD4ExtResiduals(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
+        size_t qty = *((size_t *) qty_ptr);  // qty_ptr is read as a size_t: the total element count
+        size_t qty4 = qty >> 2 << 2;  // qty rounded down to a multiple of 4
+
+        float res = InnerProductSIMD4Ext(pVect1v, pVect2v, &qty4);  // result for the first qty4 elements
+        size_t qty_left = qty - qty4;  // number of trailing elements (0..3)
+
+        float *pVect1 = (float *) pVect1v + qty4;  // inputs are treated as float arrays; skip the part already processed
+        float *pVect2 = (float *) pVect2v + qty4;
+        float res_tail = InnerProduct(pVect1, pVect2, &qty_left);  // non-vectorized routine on the tail
+
+        return res + res_tail - 1.0f;  // NOTE(review): presumably both callees return (1 - dot), so one extra 1 is removed - confirm against their definitions
+    }
+#endif
+
     class InnerProductSpace : public SpaceInterface<float> {
 
         DISTFUNC<float> fstdistfunc_;
@@ -220,11 +250,15 @@ namespace hnswlib {
         InnerProductSpace(size_t dim) {
             fstdistfunc_ = InnerProduct;
     #if defined(USE_AVX) || defined(USE_SSE)
-            if (dim % 4 == 0)
-                fstdistfunc_ = InnerProductSIMD4Ext;
             if (dim % 16 == 0)
                 fstdistfunc_ = InnerProductSIMD16Ext;
-#endif
+            else if (dim % 4 == 0)  // multiple of 4 but not of 16
+                fstdistfunc_ = InnerProductSIMD4Ext;
+            else if (dim > 16)  // not a multiple of 4 and larger than 16: 16-wide SIMD plus residual handling
+                fstdistfunc_ = InnerProductSIMD16ExtResiduals;
+            else if (dim > 4)  // 4 < dim < 16 and not a multiple of 4: 4-wide SIMD plus residual handling
+                fstdistfunc_ = InnerProductSIMD4ExtResiduals;  // any other dim keeps the InnerProduct default assigned above
+    #endif
             dim_ = dim;
             data_size_ = dim * sizeof(float);
         }