Skip to content

Commit 30ac4c5

Browse files
committed
[InnerProductSpace] Perf improvement for dimensions that are not a multiple of 4 or 16
Currently SIMD (SSE or AVX) is used only when the dimension is a multiple of 4 or 16; when the dimension is not an exact multiple of 4 or 16, a slower non-vectorized method is used. To improve performance for these cases, new methods are added: `InnerProductSIMD(4|16)ExtResiduals` - these rely on the existing `InnerProductSIMD(4|16)Ext` to compute the largest prefix whose length is a multiple of 4 or 16, and finish the residual computation with the non-vectorized method `InnerProduct`. The performance improvement compared to the baseline is 3-4x, depending on the dimension. Benchmark results: Run on (4 X 3300 MHz CPU s) CPU Caches: L1 Data 32 KiB (x2) L1 Instruction 32 KiB (x2) L2 Unified 256 KiB (x2) L3 Unified 4096 KiB (x1) Load Average: 2.10, 2.25, 2.46 ---------------------------------------------------------- Benchmark Time CPU Iterations ---------------------------------------------------------- TstDim65 14.0 ns 14.0 ns 20 * 48676012 RefDim65 50.3 ns 50.2 ns 20 * 12907985 TstDim101 23.8 ns 23.8 ns 20 * 27976276 RefDim101 91.4 ns 91.3 ns 20 * 7364003 TetDim129 30.0 ns 30.0 ns 20 * 23413955 RefDim129 123 ns 123 ns 20 * 5656383 TstDim257 57.8 ns 57.7 ns 20 * 11263073 RefDim257 268 ns 267 ns 20 * 2617478
1 parent ddea06f commit 30ac4c5

File tree

1 file changed

+37
-3
lines changed

1 file changed

+37
-3
lines changed

hnswlib/space_ip.h

Lines changed: 37 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,36 @@ namespace hnswlib {
211211

212212
#endif
213213

214+
#if defined(USE_SSE) || defined(USE_AVX)
215+
static float
216+
InnerProductSIMD16ExtResiduals(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
217+
size_t qty = *((size_t *) qty_ptr);
218+
size_t qty16 = qty >> 4 << 4;
219+
float res = InnerProductSIMD16Ext(pVect1v, pVect2v, &qty16);
220+
float *pVect1 = (float *) pVect1v + qty16;
221+
float *pVect2 = (float *) pVect2v + qty16;
222+
223+
size_t qty_left = qty - qty16;
224+
float res_tail = InnerProduct(pVect1, pVect2, &qty_left);
225+
return res + res_tail - 1.0f;
226+
}
227+
228+
static float
229+
InnerProductSIMD4ExtResiduals(const void *pVect1v, const void *pVect2v, const void *qty_ptr) {
230+
size_t qty = *((size_t *) qty_ptr);
231+
size_t qty4 = qty >> 2 << 2;
232+
233+
float res = InnerProductSIMD4Ext(pVect1v, pVect2v, &qty4);
234+
size_t qty_left = qty - qty4;
235+
236+
float *pVect1 = (float *) pVect1v + qty4;
237+
float *pVect2 = (float *) pVect2v + qty4;
238+
float res_tail = InnerProduct(pVect1, pVect2, &qty_left);
239+
240+
return res + res_tail - 1.0f;
241+
}
242+
#endif
243+
214244
class InnerProductSpace : public SpaceInterface<float> {
215245

216246
DISTFUNC<float> fstdistfunc_;
@@ -220,11 +250,15 @@ namespace hnswlib {
220250
InnerProductSpace(size_t dim) {
221251
fstdistfunc_ = InnerProduct;
222252
#if defined(USE_AVX) || defined(USE_SSE)
223-
if (dim % 4 == 0)
224-
fstdistfunc_ = InnerProductSIMD4Ext;
225253
if (dim % 16 == 0)
226254
fstdistfunc_ = InnerProductSIMD16Ext;
227-
#endif
255+
else if (dim % 4 == 0)
256+
fstdistfunc_ = InnerProductSIMD4Ext;
257+
else if (dim > 16)
258+
fstdistfunc_ = InnerProductSIMD16ExtResiduals;
259+
else if (dim > 4)
260+
fstdistfunc_ = InnerProductSIMD4ExtResiduals;
261+
#endif
228262
dim_ = dim;
229263
data_size_ = dim * sizeof(float);
230264
}

0 commit comments

Comments
 (0)