From 2bf4d131f0d783a0febfc1df7e65a00e9aea906b Mon Sep 17 00:00:00 2001 From: Dmitry Parfenchik Date: Tue, 30 Jul 2019 15:59:56 +0200 Subject: [PATCH 01/33] [L2 Space] Improving performance when dimension is not a factor of 4 or 16 Processing 8 values at once and finishing computation by non-vectorized instructions in case dim % 8 != 0 --- hnswlib/space_l2.h | 52 +++++++++++++++++++++------------------------- 1 file changed, 24 insertions(+), 28 deletions(-) diff --git a/hnswlib/space_l2.h b/hnswlib/space_l2.h index 4d3ac69a..fda82543 100644 --- a/hnswlib/space_l2.h +++ b/hnswlib/space_l2.h @@ -3,17 +3,19 @@ namespace hnswlib { - static float - L2Sqr(const void *pVect1, const void *pVect2, const void *qty_ptr) { - //return *((float *)pVect2); + static float L2Sqr(const void *pVect1v, const void *pVect2v, const void *qty_ptr) { + float *pVect1 = (float *) pVect1v; + float *pVect2 = (float *) pVect2v; size_t qty = *((size_t *) qty_ptr); + float res = 0; - for (unsigned i = 0; i < qty; i++) { - float t = ((float *) pVect1)[i] - ((float *) pVect2)[i]; + for (size_t i = 0; i < qty; i++) { + float t = *pVect1 - *pVect2; + pVect1++; + pVect2++; res += t * t; } return (res); - } #if defined(USE_AVX) @@ -50,8 +52,9 @@ namespace hnswlib { _mm256_store_ps(TmpRes, sum); float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7]; - - return (res); + size_t qty_left = qty - (qty16 << 4); + float res_tail = L2Sqr(pVect1, pVect2, &qty_left); + return (res + res_tail); } #elif defined(USE_SSE) @@ -62,12 +65,9 @@ namespace hnswlib { float *pVect2 = (float *) pVect2v; size_t qty = *((size_t *) qty_ptr); float PORTABLE_ALIGN32 TmpRes[8]; - // size_t qty4 = qty >> 2; size_t qty16 = qty >> 4; const float *pEnd1 = pVect1 + (qty16 << 4); - // const float* pEnd2 = pVect1 + (qty4 << 2); - // const float* pEnd3 = pVect1 + qty; __m128 diff, v1, v2; __m128 sum = _mm_set1_ps(0); @@ -102,10 +102,12 @@ namespace hnswlib { diff = _mm_sub_ps(v1, v2); 
sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); } + _mm_store_ps(TmpRes, sum); float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3]; - - return (res); + size_t qty_left = qty - (qty16 << 4); + float res_tail = L2Sqr(pVect1, pVect2, &qty_left); + return (res + res_tail); } #endif @@ -119,10 +121,9 @@ namespace hnswlib { size_t qty = *((size_t *) qty_ptr); - // size_t qty4 = qty >> 2; - size_t qty16 = qty >> 2; + size_t qty4 = qty >> 2; - const float *pEnd1 = pVect1 + (qty16 << 2); + const float *pEnd1 = pVect1 + (qty4 << 2); __m128 diff, v1, v2; __m128 sum = _mm_set1_ps(0); @@ -137,8 +138,10 @@ namespace hnswlib { } _mm_store_ps(TmpRes, sum); float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3]; + size_t qty_left = qty - (qty4 << 2); + float res_tail = L2Sqr(pVect1, pVect2, &qty_left); - return (res); + return (res + res_tail); } #endif @@ -151,13 +154,10 @@ namespace hnswlib { L2Space(size_t dim) { fstdistfunc_ = L2Sqr; #if defined(USE_SSE) || defined(USE_AVX) - if (dim % 4 == 0) - fstdistfunc_ = L2SqrSIMD4Ext; - if (dim % 16 == 0) + if (dim >= 16) fstdistfunc_ = L2SqrSIMD16Ext; - /*else{ - throw runtime_error("Data type not supported!"); - }*/ + else if (dim >= 4) + fstdistfunc_ = L2SqrSIMD4Ext; #endif dim_ = dim; data_size_ = dim * sizeof(float); @@ -185,10 +185,6 @@ namespace hnswlib { int res = 0; unsigned char *a = (unsigned char *) pVect1; unsigned char *b = (unsigned char *) pVect2; - /*for (int i = 0; i < qty; i++) { - int t = int((a)[i]) - int((b)[i]); - res += t*t; - }*/ qty = qty >> 2; for (size_t i = 0; i < qty; i++) { @@ -241,4 +237,4 @@ namespace hnswlib { }; -} +} \ No newline at end of file From 1bdbe2dd474d52efe1d1d069df6ad05ff0d69669 Mon Sep 17 00:00:00 2001 From: Paul Brossier Date: Wed, 24 Jul 2019 14:04:38 -0400 Subject: [PATCH 02/33] setExternalLabel returns void --- hnswlib/hnswalg.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h index 41665595..e0da65d9 100644 --- 
a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h @@ -131,7 +131,7 @@ namespace hnswlib { return return_label; } - inline labeltype setExternalLabel(tableint internal_id, labeltype label) const { + inline void setExternalLabel(tableint internal_id, labeltype label) const { memcpy((data_level0_memory_ + internal_id * size_data_per_element_ + label_offset_), &label, sizeof(labeltype)); } From 5f10af8534d06000d401deab205bdac7c866c6c5 Mon Sep 17 00:00:00 2001 From: Paul Brossier Date: Wed, 24 Jul 2019 14:29:50 -0400 Subject: [PATCH 03/33] use size_t counters to avoid size_t to int comparisons --- hnswlib/hnswalg.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h index e0da65d9..e056a902 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h @@ -197,7 +197,7 @@ namespace hnswlib { _mm_prefetch(getDataByInternalId(*(datal + 1)), _MM_HINT_T0); #endif - for (int j = 0; j < size; j++) { + for (size_t j = 0; j < size; j++) { tableint candidate_id = *(datal + j); // if (candidate_id == 0) continue; #ifdef USE_SSE @@ -275,7 +275,7 @@ namespace hnswlib { _mm_prefetch((char *) (data + 2), _MM_HINT_T0); #endif - for (int j = 1; j <= size; j++) { + for (size_t j = 1; j <= size; j++) { int candidate_id = *(data + j); // if (candidate_id == 0) continue; #ifdef USE_SSE From e1803263d7d579a3844994bbd7c21d4d2ceb4412 Mon Sep 17 00:00:00 2001 From: Yury Malkov Date: Wed, 31 Jul 2019 19:09:27 +0300 Subject: [PATCH 04/33] Update README.md --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 8b371449..2490d83f 100644 --- a/README.md +++ b/README.md @@ -188,7 +188,10 @@ https://github.com/dbaranchuk/ivf-hnsw * Java implementation: https://github.com/jelmerk/hnswlib * .Net implementation: https://github.com/microsoft/HNSW.Net +### Contributing to the repository +Contributions are highly welcome! +Please make pull requests against the `develop` branch. 
### 200M SIFT test reproduction To download and extract the bigann dataset: From 34b142d50cb64c260a517f73ebd399080c7d94e1 Mon Sep 17 00:00:00 2001 From: Yury Malkov Date: Thu, 1 Aug 2019 17:06:10 +0300 Subject: [PATCH 05/33] fix bug in sift test --- sift_1b.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sift_1b.cpp b/sift_1b.cpp index 522de5d2..273c9828 100644 --- a/sift_1b.cpp +++ b/sift_1b.cpp @@ -319,6 +319,7 @@ void sift_test1B() { #pragma omp parallel for for (int i = 1; i < vecsize; i++) { unsigned char mass[128]; + int j2=0; #pragma omp critical { @@ -332,6 +333,7 @@ void sift_test1B() { mass[j] = massb[j]; } j1++; + j2=j1; if (j1 % report_every == 0) { cout << j1 / (0.01 * vecsize) << " %, " << report_every / (1000.0 * 1e-6 * stopw.getElapsedTimeMicro()) << " kips " << " Mem: " @@ -339,7 +341,7 @@ void sift_test1B() { stopw.reset(); } } - appr_alg->addPoint((void *) (mass), (size_t) j1); + appr_alg->addPoint((void *) (mass), (size_t) j2); } From ce80e995f2a90819239fdf44d89f7e2f71a66006 Mon Sep 17 00:00:00 2001 From: Yury Malkov Date: Thu, 1 Aug 2019 17:55:28 +0300 Subject: [PATCH 06/33] update bruteforce to support element updates, add locks for multi-threaded --- hnswlib/bruteforce.h | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/hnswlib/bruteforce.h b/hnswlib/bruteforce.h index e8c24d96..7958c94f 100644 --- a/hnswlib/bruteforce.h +++ b/hnswlib/bruteforce.h @@ -1,6 +1,7 @@ #pragma once #include #include +#include namespace hnswlib { template @@ -35,22 +36,37 @@ namespace hnswlib { size_t data_size_; DISTFUNC fstdistfunc_; void *dist_func_param_; + std::mutex index_lock; std::unordered_map dict_external_to_internal; void addPoint(void *datapoint, labeltype label) { - if(dict_external_to_internal.count(label)) - throw std::runtime_error("Ids have to be unique"); + + int idx; + { + std::unique_lock lock(index_lock); + + + + auto search=dict_external_to_internal.find(label); + 
if (search != dict_external_to_internal.end()) { + idx=search->second; + } + else{ + if (cur_element_count >= maxelements_) { + throw std::runtime_error("The number of elements exceeds the specified limit\n"); + } + idx=cur_element_count; + dict_external_to_internal[label] = idx; + cur_element_count++; + } + } + memcpy(data_ + size_per_element_ * idx + data_size_, &label, sizeof(labeltype)); + memcpy(data_ + size_per_element_ * idx, datapoint, data_size_); + - if (cur_element_count >= maxelements_) { - throw std::runtime_error("The number of elements exceeds the specified limit\n"); - }; - memcpy(data_ + size_per_element_ * cur_element_count + data_size_, &label, sizeof(labeltype)); - memcpy(data_ + size_per_element_ * cur_element_count, datapoint, data_size_); - dict_external_to_internal[label]=cur_element_count; - cur_element_count++; }; void removePoint(labeltype cur_external) { @@ -123,7 +139,6 @@ namespace hnswlib { input.close(); - return; } }; From af0007c6f54df02e4a0fd6130b2e55d3b1d9befc Mon Sep 17 00:00:00 2001 From: louisabraham Date: Tue, 20 Aug 2019 17:21:04 +0200 Subject: [PATCH 07/33] pypi package --- python_bindings/Makefile | 10 ++++++++++ python_bindings/bindings.cpp | 2 +- python_bindings/setup.py | 1 + 3 files changed, 12 insertions(+), 1 deletion(-) create mode 100644 python_bindings/Makefile diff --git a/python_bindings/Makefile b/python_bindings/Makefile new file mode 100644 index 00000000..cc77d936 --- /dev/null +++ b/python_bindings/Makefile @@ -0,0 +1,10 @@ +pypi: dist + twine upload dist/* + +dist: + cp -r ../hnswlib . 
+ -rm dist/* + python3 setup.py sdist bdist_wheel + +clean: + rm -rf *.egg-info build dist \ No newline at end of file diff --git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp index 1aa99ac6..04bdbb7b 100644 --- a/python_bindings/bindings.cpp +++ b/python_bindings/bindings.cpp @@ -2,7 +2,7 @@ #include #include #include -#include "../hnswlib/hnswlib.h" +#include "hnswlib/hnswlib.h" #include #include diff --git a/python_bindings/setup.py b/python_bindings/setup.py index 66de1033..fd82e63d 100644 --- a/python_bindings/setup.py +++ b/python_bindings/setup.py @@ -18,6 +18,7 @@ 'hnswlib', source_files, # include_dirs=[os.path.join(libdir, "include")], + include_dirs=["hnswlib"], libraries=libraries, language='c++', extra_objects=extra_objects, From bc91a9390af4f0d542e16f121df3cc68daae6f81 Mon Sep 17 00:00:00 2001 From: louisabraham Date: Tue, 20 Aug 2019 17:32:51 +0200 Subject: [PATCH 08/33] travis installation --- .travis.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.travis.yml b/.travis.yml index 2b92b985..b40e5f44 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,9 +3,11 @@ language: python matrix: include: - python: 3.6 + - python: 3.7 install: - | cd python_bindings + cp -r ../hnswlib . 
pip install -r requirements.txt python setup.py install From 2195112b8afc2e21cfd830701cded709e139aaad Mon Sep 17 00:00:00 2001 From: louisabraham Date: Wed, 21 Aug 2019 20:20:36 +0200 Subject: [PATCH 09/33] include hsnwlib in sdist --- python_bindings/MANIFEST.in | 1 + python_bindings/Makefile | 4 +++- python_bindings/setup.py | 1 - 3 files changed, 4 insertions(+), 2 deletions(-) create mode 100644 python_bindings/MANIFEST.in diff --git a/python_bindings/MANIFEST.in b/python_bindings/MANIFEST.in new file mode 100644 index 00000000..5a480e4f --- /dev/null +++ b/python_bindings/MANIFEST.in @@ -0,0 +1 @@ +include hnswlib/*.h \ No newline at end of file diff --git a/python_bindings/Makefile b/python_bindings/Makefile index cc77d936..9d19ea47 100644 --- a/python_bindings/Makefile +++ b/python_bindings/Makefile @@ -7,4 +7,6 @@ dist: python3 setup.py sdist bdist_wheel clean: - rm -rf *.egg-info build dist \ No newline at end of file + rm -rf *.egg-info build dist + +.PHONY: dist \ No newline at end of file diff --git a/python_bindings/setup.py b/python_bindings/setup.py index fd82e63d..66de1033 100644 --- a/python_bindings/setup.py +++ b/python_bindings/setup.py @@ -18,7 +18,6 @@ 'hnswlib', source_files, # include_dirs=[os.path.join(libdir, "include")], - include_dirs=["hnswlib"], libraries=libraries, language='c++', extra_objects=extra_objects, From d58b9a87c8227a98b82c782f3ffb32c01bc8806e Mon Sep 17 00:00:00 2001 From: louisabraham Date: Fri, 23 Aug 2019 02:48:21 +0200 Subject: [PATCH 10/33] removebdist_wheel from distribution --- python_bindings/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python_bindings/Makefile b/python_bindings/Makefile index 9d19ea47..cfad0fb4 100644 --- a/python_bindings/Makefile +++ b/python_bindings/Makefile @@ -4,7 +4,7 @@ pypi: dist dist: cp -r ../hnswlib . 
-rm dist/* - python3 setup.py sdist bdist_wheel + python3 setup.py sdist clean: rm -rf *.egg-info build dist From a6b87f291258be590534d20cdc85909503719d9d Mon Sep 17 00:00:00 2001 From: louisabraham Date: Sat, 24 Aug 2019 14:06:47 +0200 Subject: [PATCH 11/33] use symlink --- python_bindings/Makefile | 6 ++++-- python_bindings/hnswlib | 1 + 2 files changed, 5 insertions(+), 2 deletions(-) create mode 120000 python_bindings/hnswlib diff --git a/python_bindings/Makefile b/python_bindings/Makefile index cfad0fb4..02ec523b 100644 --- a/python_bindings/Makefile +++ b/python_bindings/Makefile @@ -2,11 +2,13 @@ pypi: dist twine upload dist/* dist: - cp -r ../hnswlib . -rm dist/* python3 setup.py sdist +test: + python3 setup.py test + clean: - rm -rf *.egg-info build dist + rm -rf *.egg-info build dist var first_half.bin tests/__pycache__ hnswlib.cpython-36m-darwin.so .PHONY: dist \ No newline at end of file diff --git a/python_bindings/hnswlib b/python_bindings/hnswlib new file mode 120000 index 00000000..236d6575 --- /dev/null +++ b/python_bindings/hnswlib @@ -0,0 +1 @@ +../hnswlib \ No newline at end of file From 077b041a0c8db9dc20bb0f32101fbedeb9e001d4 Mon Sep 17 00:00:00 2001 From: louisabraham Date: Sat, 24 Aug 2019 14:20:03 +0200 Subject: [PATCH 12/33] remove unuseful cp (and relaunch tests) --- .travis.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index b40e5f44..6b194926 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,11 +7,10 @@ matrix: install: - | cd python_bindings - cp -r ../hnswlib . 
pip install -r requirements.txt python setup.py install script: - | cd python_bindings - python setup.py test \ No newline at end of file + python setup.py test From 86188b19463ccdd09c5da0ef4cb831e153377eed Mon Sep 17 00:00:00 2001 From: Yury Malkov Date: Wed, 28 Aug 2019 12:24:20 +0300 Subject: [PATCH 13/33] Update README.md --- README.md | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 2490d83f..a4ca58e2 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,10 @@ # Hnswlib - fast approximate nearest neighbor search Header-only C++ HNSW implementation with python bindings. Paper code for the HNSW 200M SIFT experiment +**NEWS:** + +**Thanks to Louis Abraham (@louisabraham) hnswlib is now can be installed via pip!** + Highlights: 1) Lightweight, header-only, no dependencies other than C++ 11. 2) Interfaces for C++, python and R (https://github.com/jlmelville/rcpphnsw). @@ -26,7 +30,7 @@ Note that inner product is not an actual metric. An element can be closer to som For other spaces use the nmslib library https://github.com/nmslib/nmslib. -#### short API description +#### Short API description * `hnswlib.Index(space, dim)` creates a non-initialized index an HNSW in space `space` with integer dimension `dim`. Index methods: @@ -45,7 +49,7 @@ Index methods: * `resize_index(new_size)` - changes the maximum capacity of the index. Not thread safe with `add_items` and `knn_query`. * `set_ef(ef)` - sets the query time accuracy/speed trade-off, defined by the `ef` parameter ( -[ALGO_PARAMS.md](ALGO_PARAMS.md)). +[ALGO_PARAMS.md](ALGO_PARAMS.md)). Note that the parameter is currently not saved along with the index, so you need to set it manually after loading. * `knn_query(data, k = 1, num_threads = -1)` make a batch query for `k` closests elements for each element of the * `data` (shape:`N*dim`). Returns a numpy array of (shape:`N*k`). 
@@ -166,6 +170,8 @@ print("Recall for two batches:", np.mean(labels.reshape(-1) == np.arange(len(dat ``` ### Bindings installation + +You can install from sources: ```bash apt-get install -y python-setuptools python-pip pip3 install pybind11 numpy setuptools @@ -173,6 +179,9 @@ cd python_bindings python3 setup.py install ``` +or you can install via pip: +`pip install hnswlib` + ### Other implementations * Non-metric space library (nmslib) - main library(python, C++), supports exotic distances: https://github.com/nmslib/nmslib * Faiss libary by facebook, uses own HNSW implementation for coarse quantization (python, C++): From f47e853f1e7503420c44abc39fd0facc80b05baa Mon Sep 17 00:00:00 2001 From: Yury Malkov Date: Wed, 28 Aug 2019 12:25:32 +0300 Subject: [PATCH 14/33] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a4ca58e2..fed68b67 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ Header-only C++ HNSW implementation with python bindings. Paper code for the HNS **NEWS:** -**Thanks to Louis Abraham (@louisabraham) hnswlib is now can be installed via pip!** +**Thanks to Louis Abraham ([@louisabraham](https://github.com/louisabraham)) hnswlib is now can be installed via pip!** Highlights: 1) Lightweight, header-only, no dependencies other than C++ 11. 
From d6d204f1fa8f7f8e7838b2eb42fa95678001fed3 Mon Sep 17 00:00:00 2001 From: Yury Malkov Date: Wed, 28 Aug 2019 14:24:27 +0300 Subject: [PATCH 15/33] fix/improve tests, #142 --- python_bindings/tests/bindings_test_resize.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python_bindings/tests/bindings_test_resize.py b/python_bindings/tests/bindings_test_resize.py index 1803178d..5e798164 100644 --- a/python_bindings/tests/bindings_test_resize.py +++ b/python_bindings/tests/bindings_test_resize.py @@ -3,11 +3,11 @@ class RandomSelfTestCase(unittest.TestCase): def testRandomSelf(self): - + for idx in range(32): print("\n**** Index resize test ****\n") import hnswlib import numpy as np - + np.random.seed(idx) dim = 16 num_elements = 10000 @@ -29,9 +29,9 @@ def testRandomSelf(self): # Controlling the recall by setting ef: # higher ef leads to better accuracy, but slower search - p.set_ef(100) + p.set_ef(20) - p.set_num_threads(4) # by default using all available cores + p.set_num_threads(idx%8) # by default using all available cores # We split the data in two batches: data1 = data[:num_elements // 2] @@ -43,7 +43,7 @@ def testRandomSelf(self): # Query the elements for themselves and measure recall: labels, distances = p.knn_query(data1, k=1) - items=p.get_items(labels) + items=p.get_items(list(range(len(data1)))) # Check the recall: self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data1))),1.0,3) @@ -62,7 +62,7 @@ def testRandomSelf(self): # Query the elements for themselves and measure recall: labels, distances = p.knn_query(data, k=1) - items=p.get_items(labels) + items=p.get_items(list(range(num_elements))) # Check the recall: self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data))),1.0,3) From 76db8aed9c8556191aa89594d8dffddd0bd653f4 Mon Sep 17 00:00:00 2001 From: Yury Malkov Date: Mon, 16 Sep 2019 23:55:22 +0300 Subject: [PATCH 16/33] Fix load bugs/messages, update test, deprecate old indices 
(#148) * temp debug state * fix bug in loading index with deleted elements * adjust condition in test * add check for file existence * cleanup --- hnswlib/hnswalg.h | 37 ++++++++++++------- python_bindings/tests/bindings_test_labels.py | 20 ++++++++-- python_bindings/tests/bindings_test_resize.py | 2 +- 3 files changed, 41 insertions(+), 18 deletions(-) diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h index e056a902..ba74bb96 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h @@ -595,6 +595,10 @@ namespace hnswlib { std::ifstream input(location, std::ios::binary); + if (!input.is_open()) + throw std::runtime_error("Cannot open file"); + + // get file size: input.seekg(0,input.end); std::streampos total_filesize=input.tellg(); @@ -625,16 +629,15 @@ namespace hnswlib { fstdistfunc_ = s->get_dist_func(); dist_func_param_ = s->get_dist_func_param(); - /// Legacy, check that everything is ok - - bool old_index=false; - auto pos=input.tellg(); + + + /// Optional - check if index is ok: + input.seekg(cur_element_count * size_data_per_element_,input.cur); for (size_t i = 0; i < cur_element_count; i++) { if(input.tellg() < 0 || input.tellg()>=total_filesize){ - old_index = true; - break; + throw std::runtime_error("Index seems to be corrupted or unsupported"); } unsigned int linkListSize; @@ -644,23 +647,21 @@ namespace hnswlib { } } - // check if file is ok, if not this is either corrupted or old index + // throw exception if it either corrupted or old index if(input.tellg()!=total_filesize) - old_index = true; + throw std::runtime_error("Index seems to be corrupted or unsupported"); - if (old_index) { - std::cerr << "Warning: loading of old indexes will be deprecated before 2019.\n" - << "Please resave the index in the new format.\n"; - } input.clear(); + + /// Optional check end + input.seekg(pos,input.beg); data_level0_memory_ = (char *) malloc(max_elements * size_data_per_element_); input.read(data_level0_memory_, cur_element_count * size_data_per_element_); - 
if(old_index) - input.seekg(((max_elements_-cur_element_count) * size_data_per_element_), input.cur); + size_links_per_element_ = maxM_ * sizeof(tableint) + sizeof(linklistsizeint); @@ -691,6 +692,14 @@ namespace hnswlib { input.read(linkLists_[i], linkListSize); } } + + has_deletions_=false; + + for (size_t i = 0; i < cur_element_count; i++) { + if(isMarkedDeleted(i)) + has_deletions_=true; + } + input.close(); return; diff --git a/python_bindings/tests/bindings_test_labels.py b/python_bindings/tests/bindings_test_labels.py index b351935b..f629ab29 100644 --- a/python_bindings/tests/bindings_test_labels.py +++ b/python_bindings/tests/bindings_test_labels.py @@ -3,10 +3,12 @@ class RandomSelfTestCase(unittest.TestCase): def testRandomSelf(self): + for idx in range(16): print("\n**** Index save-load test ****\n") import hnswlib import numpy as np - + + np.random.seed(idx) dim = 16 num_elements = 10000 @@ -95,8 +97,8 @@ def testRandomSelf(self): p.mark_deleted(l[0]) labels2, _ = p.knn_query(data2, k=1) items=p.get_items(labels2) - diff_with_gt_labels=np.max(np.abs(data2-items)) - self.assertAlmostEqual(diff_with_gt_labels, 0, delta = 1e-4) # console + diff_with_gt_labels=np.mean(np.abs(data2-items)) + self.assertAlmostEqual(diff_with_gt_labels, 0, delta = 1e-3) # console labels1_after, _ = p.knn_query(data1, k=1) @@ -106,6 +108,18 @@ def testRandomSelf(self): self.assertTrue(False) print("All the data in data1 are removed") + # checking saving/loading index with elements marked as deleted + p.save_index("with_deleted.bin") + p = hnswlib.Index(space='l2', dim=dim) + p.load_index("with_deleted.bin") + p.set_ef(100) + + labels1_after, _ = p.knn_query(data1, k=1) + for la in labels1_after: + for lb in labels1: + if la[0] == lb[0]: + self.assertTrue(False) + if __name__ == "__main__": diff --git a/python_bindings/tests/bindings_test_resize.py b/python_bindings/tests/bindings_test_resize.py index 5e798164..9411af64 100644 --- a/python_bindings/tests/bindings_test_resize.py 
+++ b/python_bindings/tests/bindings_test_resize.py @@ -3,7 +3,7 @@ class RandomSelfTestCase(unittest.TestCase): def testRandomSelf(self): - for idx in range(32): + for idx in range(16): print("\n**** Index resize test ****\n") import hnswlib import numpy as np From 6c4ab29dce6e06d3e6bba835329d962857011e31 Mon Sep 17 00:00:00 2001 From: uestc-lfs Date: Wed, 25 Sep 2019 15:26:29 +0800 Subject: [PATCH 17/33] The interface addPoint is changed from addPoint(void*, args...) to addPoint(const void*, args...). The changes include the interface in bruteforce.h, and all interfaces related to addPoint in hnswlib. I test the code using 1 million sift data, the result is ok. --- hnswlib/bruteforce.h | 2 +- hnswlib/hnswalg.h | 8 ++++---- hnswlib/hnswlib.h | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/hnswlib/bruteforce.h b/hnswlib/bruteforce.h index 7958c94f..b8183493 100644 --- a/hnswlib/bruteforce.h +++ b/hnswlib/bruteforce.h @@ -40,7 +40,7 @@ namespace hnswlib { std::unordered_map dict_external_to_internal; - void addPoint(void *datapoint, labeltype label) { + void addPoint(const void *datapoint, labeltype label) { int idx; { diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h index ba74bb96..2fcc412a 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h @@ -150,7 +150,7 @@ namespace hnswlib { } std::priority_queue, std::vector>, CompareByFirst> - searchBaseLayer(tableint ep_id, void *data_point, int layer) { + searchBaseLayer(tableint ep_id, const void *data_point, int layer) { VisitedList *vl = visited_list_pool_->getFreeVisitedList(); vl_type *visited_array = vl->mass; vl_type visited_array_tag = vl->curV; @@ -371,7 +371,7 @@ namespace hnswlib { return (linklistsizeint *) (linkLists_[internal_id] + (level - 1) * size_links_per_element_); }; - void mutuallyConnectNewElement(void *data_point, tableint cur_c, + void mutuallyConnectNewElement(const void *data_point, tableint cur_c, std::priority_queue, std::vector>, CompareByFirst> top_candidates, int 
level) { @@ -779,11 +779,11 @@ namespace hnswlib { *((unsigned short int*)(ptr))=*((unsigned short int *)&size); } - void addPoint(void *data_point, labeltype label) { + void addPoint(const void *data_point, labeltype label) { addPoint(data_point, label,-1); } - tableint addPoint(void *data_point, labeltype label, int level) { + tableint addPoint(const void *data_point, labeltype label, int level) { tableint cur_c = 0; { std::unique_lock lock(cur_element_count_guard_); diff --git a/hnswlib/hnswlib.h b/hnswlib/hnswlib.h index 3ea73ef2..f55e0ec6 100644 --- a/hnswlib/hnswlib.h +++ b/hnswlib/hnswlib.h @@ -60,7 +60,7 @@ namespace hnswlib { template class AlgorithmInterface { public: - virtual void addPoint(void *datapoint, labeltype label)=0; + virtual void addPoint(const void *datapoint, labeltype label)=0; virtual std::priority_queue> searchKnn(const void *, size_t) const = 0; virtual void saveIndex(const std::string &location)=0; virtual ~AlgorithmInterface(){ From 0334c8ca78456c9b8610933bfc2e2c0cf5148927 Mon Sep 17 00:00:00 2001 From: uestc-lfs Date: Wed, 25 Sep 2019 16:54:32 +0800 Subject: [PATCH 18/33] Two main changes: 1. searchKnn will fist check if the graph is empty 2. searchKnn will return a min-heap The test code in sift_1b is changed and tested. 
--- hnswlib/bruteforce.h | 12 ++++++++++-- hnswlib/hnswalg.h | 10 ++++++---- hnswlib/hnswlib.h | 19 ++++++++++++++++++- sift_1b.cpp | 16 ++++++++-------- 4 files changed, 42 insertions(+), 15 deletions(-) diff --git a/hnswlib/bruteforce.h b/hnswlib/bruteforce.h index b8183493..f3bc4010 100644 --- a/hnswlib/bruteforce.h +++ b/hnswlib/bruteforce.h @@ -84,7 +84,10 @@ namespace hnswlib { } - std::priority_queue> searchKnn(const void *query_data, size_t k) const { + retType searchKnn(const void *query_data, size_t k) const { + retType result; + if (cur_element_count == 0) return result; + std::priority_queue> topResults; for (int i = 0; i < k; i++) { dist_t dist = fstdistfunc_(query_data, data_ + size_per_element_ * i, dist_func_param_); @@ -103,7 +106,12 @@ namespace hnswlib { } } - return topResults; + while (!topResults.empty()) { + auto each = topResults.top(); + result.push(each); + topResults.pop(); + } + return result; }; void saveIndex(const std::string &location) { diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h index 2fcc412a..d191bfeb 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h @@ -895,7 +895,10 @@ namespace hnswlib { return cur_c; }; - std::priority_queue> searchKnn(const void *query_data, size_t k) const { + retType searchKnn(const void *query_data, size_t k) const { + retType result; + if (cur_element_count == 0) return result; + tableint currObj = enterpoint_node_; dist_t curdist = fstdistfunc_(query_data, getDataByInternalId(enterpoint_node_), dist_func_param_); @@ -934,16 +937,15 @@ namespace hnswlib { currObj, query_data, std::max(ef_, k)); top_candidates.swap(top_candidates1); } - std::priority_queue> results; while (top_candidates.size() > k) { top_candidates.pop(); } while (top_candidates.size() > 0) { std::pair rez = top_candidates.top(); - results.push(std::pair(rez.first, getExternalLabel(rez.second))); + result.push(std::pair(rez.first, getExternalLabel(rez.second))); top_candidates.pop(); } - return results; + return result; }; 
diff --git a/hnswlib/hnswlib.h b/hnswlib/hnswlib.h index f55e0ec6..84e5cffe 100644 --- a/hnswlib/hnswlib.h +++ b/hnswlib/hnswlib.h @@ -24,12 +24,29 @@ #endif #include +#include #include namespace hnswlib { typedef size_t labeltype; + template + class pairGreater { + public: + bool operator()(const T& p1, const T& p2) { + return p1.first > p2.first; + } + }; + + template + using retType = std::priority_queue< + std::pair, + std::vector>, + pairGreater> + >; + + template static void writeBinaryPOD(std::ostream &out, const T &podRef) { out.write((char *) &podRef, sizeof(T)); @@ -61,7 +78,7 @@ namespace hnswlib { class AlgorithmInterface { public: virtual void addPoint(const void *datapoint, labeltype label)=0; - virtual std::priority_queue> searchKnn(const void *, size_t) const = 0; + virtual retType searchKnn(const void *, size_t) const = 0; virtual void saveIndex(const std::string &location)=0; virtual ~AlgorithmInterface(){ } diff --git a/sift_1b.cpp b/sift_1b.cpp index 273c9828..95ec8e66 100644 --- a/sift_1b.cpp +++ b/sift_1b.cpp @@ -147,10 +147,10 @@ static size_t getCurrentRSS() { static void get_gt(unsigned int *massQA, unsigned char *massQ, unsigned char *mass, size_t vecsize, size_t qsize, L2SpaceI &l2space, - size_t vecdim, vector>> &answers, size_t k) { + size_t vecdim, vector> &answers, size_t k) { - (vector>>(qsize)).swap(answers); + (vector>(qsize)).swap(answers); DISTFUNC fstdistfunc_ = l2space.get_dist_func(); cout << qsize << "\n"; for (int i = 0; i < qsize; i++) { @@ -162,15 +162,15 @@ get_gt(unsigned int *massQA, unsigned char *massQ, unsigned char *mass, size_t v static float test_approx(unsigned char *massQ, size_t vecsize, size_t qsize, HierarchicalNSW &appr_alg, size_t vecdim, - vector>> &answers, size_t k) { + vector> &answers, size_t k) { size_t correct = 0; size_t total = 0; //uncomment to test in parallel mode: //#pragma omp parallel for for (int i = 0; i < qsize; i++) { - std::priority_queue> result = appr_alg.searchKnn(massQ + vecdim * i, 
k); - std::priority_queue> gt(answers[i]); + retType result = appr_alg.searchKnn(massQ + vecdim * i, k); + retType gt(answers[i]); unordered_set g; total += gt.size(); @@ -196,7 +196,7 @@ test_approx(unsigned char *massQ, size_t vecsize, size_t qsize, HierarchicalNSW< static void test_vs_recall(unsigned char *massQ, size_t vecsize, size_t qsize, HierarchicalNSW &appr_alg, size_t vecdim, - vector>> &answers, size_t k) { + vector> &answers, size_t k) { vector efs;// = { 10,10,10,10,10 }; for (int i = k; i < 30; i++) { efs.push_back(i); @@ -231,7 +231,7 @@ inline bool exists_test(const std::string &name) { void sift_test1B() { - int subset_size_milllions = 200; + int subset_size_milllions = 1; int efConstruction = 40; int M = 16; @@ -351,7 +351,7 @@ void sift_test1B() { } - vector>> answers; + vector> answers; size_t k = 1; cout << "Parsing gt:\n"; get_gt(massQA, massQ, mass, vecsize, qsize, l2space, vecdim, answers, k); From 16c11759aea87c5badaf4857fa37492fab609a43 Mon Sep 17 00:00:00 2001 From: uestc-lfs Date: Wed, 25 Sep 2019 16:57:38 +0800 Subject: [PATCH 19/33] Remove unneeded header --- sift_1b.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/sift_1b.cpp b/sift_1b.cpp index 95ec8e66..39c7e5a2 100644 --- a/sift_1b.cpp +++ b/sift_1b.cpp @@ -1,6 +1,5 @@ #include #include -#include #include #include "hnswlib/hnswlib.h" From 2b400f3f6e76bbdb5a54294eaaf819ef6a1ba46b Mon Sep 17 00:00:00 2001 From: Peter Whidden <52440435+PWhids@users.noreply.github.com> Date: Wed, 25 Sep 2019 17:29:26 -0700 Subject: [PATCH 20/33] Fix typos --- ALGO_PARAMS.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ALGO_PARAMS.md b/ALGO_PARAMS.md index 14cec786..4585a82c 100644 --- a/ALGO_PARAMS.md +++ b/ALGO_PARAMS.md @@ -20,10 +20,10 @@ The range ```M```=12-48 is ok for the most of the use cases. When ```M``` is cha Nonetheless, ef and ef_construction parameters can be roughly estimated by assuming that ```M```*```ef_{construction}``` is a constant. 
-* ```ef_constrution``` - the parameter has the same meaning as ```ef```, but controls the index_time/index_accuracy. Bigger +* ```ef_construction``` - the parameter has the same meaning as ```ef```, but controls the index_time/index_accuracy. Bigger ef_construction leads to longer construction, but better index quality. At some point, increasing ef_construction does not improve the quality of the index. One way to check if the selection of ef_construction was ok is to measure a recall -for M nearest neighbor search when ```ef``` =```ef_constuction```: if the recall is lower than 0.9, than there is room +for M nearest neighbor search when ```ef``` =```ef_construction```: if the recall is lower than 0.9, than there is room for improvement. * ```num_elements``` - defines the maximum number of elements in the index. The index can be extened by saving/loading(load_index function has a parameter which defines the new maximum number of elements). From eb8dc989b9d2cd6e25df1a6ff28ef4d2c4359a83 Mon Sep 17 00:00:00 2001 From: uestc-lfs Date: Thu, 26 Sep 2019 13:19:18 +0800 Subject: [PATCH 21/33] A new interface taking a template comparator is added, so it can not be virtual. I modified the sift_1b(not commited) to test the new interface, the result is ok. Test result of sift_1b on 1 million data. 
Loading GT: Loading queries: Loading index from sift1b_1m_ef_40_M_16.bin: Actual memory usage: 417 Mb Parsing gt: 10000 Loaded gt 1 0.2371 13.319 us 2 0.3712 15.691 us 3 0.4615 18.5166 us 4 0.5273 20.6371 us 5 0.5758 22.1235 us 6 0.6179 24.4141 us 7 0.6502 25.9906 us 8 0.6796 28.2004 us 9 0.7042 29.8559 us 10 0.7243 31.3286 us 11 0.7432 36.0276 us 12 0.7605 34.9448 us 13 0.7754 36.4176 us 14 0.7874 37.7606 us 15 0.8013 44.6698 us 16 0.8116 47.4424 us 17 0.8239 46.9154 us 18 0.8312 45.9322 us 19 0.8379 49.3406 us 20 0.8442 49.124 us 21 0.8507 52.1223 us 22 0.8566 52.4161 us 23 0.8622 56.9665 us 24 0.8675 71.5782 us 25 0.8731 72.4451 us 26 0.8768 57.0935 us 27 0.8812 58.3525 us 28 0.8845 59.5751 us 29 0.889 61.7516 us 30 0.8935 62.6091 us 40 0.9224 76.8735 us 50 0.9412 92.5431 us 60 0.9541 107.141 us 70 0.9632 121.24 us 80 0.9708 135.862 us 90 0.9756 163.516 us 100 0.9792 180.539 us 140 0.9883 228.747 us 180 0.9921 281.199 us 220 0.9942 338.32 us 260 0.9956 388.501 us 300 0.9962 445.776 us 340 0.9968 477.474 us 380 0.9975 534.054 us 420 0.9982 582.327 us 460 0.9983 625.824 us Actual memory usage: 419 Mb --- hnswlib/bruteforce.h | 35 ++++++++++++++++++++++++++--------- hnswlib/hnswalg.h | 30 ++++++++++++++++++++++++++---- hnswlib/hnswlib.h | 13 ++++--------- sift_1b.cpp | 17 +++++++++-------- 4 files changed, 65 insertions(+), 30 deletions(-) diff --git a/hnswlib/bruteforce.h b/hnswlib/bruteforce.h index f3bc4010..86944ad5 100644 --- a/hnswlib/bruteforce.h +++ b/hnswlib/bruteforce.h @@ -2,6 +2,7 @@ #include #include #include +#include namespace hnswlib { template @@ -84,11 +85,10 @@ namespace hnswlib { } - retType searchKnn(const void *query_data, size_t k) const { - retType result; - if (cur_element_count == 0) return result; - + std::priority_queue> + searchKnn(const void *query_data, size_t k) const { std::priority_queue> topResults; + if (cur_element_count == 0) return topResults; for (int i = 0; i < k; i++) { dist_t dist = fstdistfunc_(query_data, data_ + 
size_per_element_ * i, dist_func_param_); topResults.push(std::pair(dist, *((labeltype *) (data_ + size_per_element_ * i + @@ -106,13 +106,30 @@ namespace hnswlib { } } - while (!topResults.empty()) { - auto each = topResults.top(); - result.push(each); - topResults.pop(); + return topResults; + }; + + template + std::vector> + searchKnn(const void* query_data, size_t k, Comp comp) { + std::vector> result; + if (cur_element_count == 0) return result; + + auto ret = searchKnn(query_data, k); + + while (!ret.empty()) { + result.push_back(ret.top()); + ret.pop(); } + + if (result.size() > 1) { + if (!comp(result.front(), result.back())) { + std::reverse(result.begin(), result.end()); + } + } + return result; - }; + } void saveIndex(const std::string &location) { std::ofstream output(location, std::ios::binary); diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h index d191bfeb..b5ec0547 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h @@ -484,6 +484,8 @@ namespace hnswlib { std::priority_queue> searchKnnInternal(void *query_data, int k) { + std::priority_queue> top_candidates; + if (cur_element_count == 0) return top_candidates; tableint currObj = enterpoint_node_; dist_t curdist = fstdistfunc_(query_data, getDataByInternalId(enterpoint_node_), dist_func_param_); @@ -510,8 +512,6 @@ namespace hnswlib { } } - - std::priority_queue> top_candidates; if (has_deletions_) { std::priority_queue> top_candidates1=searchBaseLayerST(currObj, query_data, ef_); @@ -895,8 +895,9 @@ namespace hnswlib { return cur_c; }; - retType searchKnn(const void *query_data, size_t k) const { - retType result; + std::priority_queue> + searchKnn(const void *query_data, size_t k) const { + std::priority_queue> result; if (cur_element_count == 0) return result; tableint currObj = enterpoint_node_; @@ -948,6 +949,27 @@ namespace hnswlib { return result; }; + template + std::vector> + searchKnn(const void* query_data, size_t k, Comp comp) { + std::vector> result; + if (cur_element_count == 0) 
return result; + + auto ret = searchKnn(query_data, k); + + while (!ret.empty()) { + result.push_back(ret.top()); + ret.pop(); + } + + if (result.size() > 1) { + if (!comp(result.front(), result.back())) { + std::reverse(result.begin(), result.end()); + } + } + + return result; + } }; diff --git a/hnswlib/hnswlib.h b/hnswlib/hnswlib.h index 84e5cffe..dbfb1656 100644 --- a/hnswlib/hnswlib.h +++ b/hnswlib/hnswlib.h @@ -39,14 +39,6 @@ namespace hnswlib { } }; - template - using retType = std::priority_queue< - std::pair, - std::vector>, - pairGreater> - >; - - template static void writeBinaryPOD(std::ostream &out, const T &podRef) { out.write((char *) &podRef, sizeof(T)); @@ -78,7 +70,10 @@ namespace hnswlib { class AlgorithmInterface { public: virtual void addPoint(const void *datapoint, labeltype label)=0; - virtual retType searchKnn(const void *, size_t) const = 0; + virtual std::priority_queue> searchKnn(const void *, size_t) const = 0; + template + std::vector> searchKnn(const void*, size_t, Comp) { + } virtual void saveIndex(const std::string &location)=0; virtual ~AlgorithmInterface(){ } diff --git a/sift_1b.cpp b/sift_1b.cpp index 39c7e5a2..273c9828 100644 --- a/sift_1b.cpp +++ b/sift_1b.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include "hnswlib/hnswlib.h" @@ -146,10 +147,10 @@ static size_t getCurrentRSS() { static void get_gt(unsigned int *massQA, unsigned char *massQ, unsigned char *mass, size_t vecsize, size_t qsize, L2SpaceI &l2space, - size_t vecdim, vector> &answers, size_t k) { + size_t vecdim, vector>> &answers, size_t k) { - (vector>(qsize)).swap(answers); + (vector>>(qsize)).swap(answers); DISTFUNC fstdistfunc_ = l2space.get_dist_func(); cout << qsize << "\n"; for (int i = 0; i < qsize; i++) { @@ -161,15 +162,15 @@ get_gt(unsigned int *massQA, unsigned char *massQ, unsigned char *mass, size_t v static float test_approx(unsigned char *massQ, size_t vecsize, size_t qsize, HierarchicalNSW &appr_alg, size_t vecdim, - vector> &answers, 
size_t k) { + vector>> &answers, size_t k) { size_t correct = 0; size_t total = 0; //uncomment to test in parallel mode: //#pragma omp parallel for for (int i = 0; i < qsize; i++) { - retType result = appr_alg.searchKnn(massQ + vecdim * i, k); - retType gt(answers[i]); + std::priority_queue> result = appr_alg.searchKnn(massQ + vecdim * i, k); + std::priority_queue> gt(answers[i]); unordered_set g; total += gt.size(); @@ -195,7 +196,7 @@ test_approx(unsigned char *massQ, size_t vecsize, size_t qsize, HierarchicalNSW< static void test_vs_recall(unsigned char *massQ, size_t vecsize, size_t qsize, HierarchicalNSW &appr_alg, size_t vecdim, - vector> &answers, size_t k) { + vector>> &answers, size_t k) { vector efs;// = { 10,10,10,10,10 }; for (int i = k; i < 30; i++) { efs.push_back(i); @@ -230,7 +231,7 @@ inline bool exists_test(const std::string &name) { void sift_test1B() { - int subset_size_milllions = 1; + int subset_size_milllions = 200; int efConstruction = 40; int M = 16; @@ -350,7 +351,7 @@ void sift_test1B() { } - vector> answers; + vector>> answers; size_t k = 1; cout << "Parsing gt:\n"; get_gt(massQA, massQ, mass, vecsize, qsize, l2space, vecdim, answers, k); From 5e037e2a05249b565f28648bfe6085c9641771f9 Mon Sep 17 00:00:00 2001 From: uestc-lfs Date: Sat, 28 Sep 2019 17:58:56 +0800 Subject: [PATCH 22/33] Using std::sort to sort the result according to the comparator provided by the user. 
--- hnswlib/bruteforce.h | 8 ++------ hnswlib/hnswalg.h | 6 +----- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/hnswlib/bruteforce.h b/hnswlib/bruteforce.h index 86944ad5..8765726b 100644 --- a/hnswlib/bruteforce.h +++ b/hnswlib/bruteforce.h @@ -121,12 +121,8 @@ namespace hnswlib { result.push_back(ret.top()); ret.pop(); } - - if (result.size() > 1) { - if (!comp(result.front(), result.back())) { - std::reverse(result.begin(), result.end()); - } - } + + std::sort(result.begin(), result.end(), comp); return result; } diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h index b5ec0547..8fcf949a 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h @@ -962,11 +962,7 @@ namespace hnswlib { ret.pop(); } - if (result.size() > 1) { - if (!comp(result.front(), result.back())) { - std::reverse(result.begin(), result.end()); - } - } + std::sort(result.begin(), result.end(), comp); return result; } From dc25836557e40b1aca91112a6f97dbbf230a492a Mon Sep 17 00:00:00 2001 From: Yury Malkov Date: Tue, 5 Nov 2019 15:44:11 +0300 Subject: [PATCH 23/33] fix missing deletion initialization --- hnswlib/hnswalg.h | 1 + 1 file changed, 1 insertion(+) diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h index 8fcf949a..f24c8714 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h @@ -797,6 +797,7 @@ namespace hnswlib { auto search = label_lookup_.find(label); if (search != label_lookup_.end()) { std::unique_lock lock_el(link_list_locks_[search->second]); + has_deletions_ = true; markDeletedInternal(search->second); } label_lookup_[label] = cur_c; From 65f35ac904fa81705c55b0a80a0a1fe7f260919c Mon Sep 17 00:00:00 2001 From: Vimal Mathew Date: Wed, 6 Nov 2019 17:02:41 +0530 Subject: [PATCH 24/33] Expose current-count and max-elements in Python --- python_bindings/bindings.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp index 04bdbb7b..9c90fa45 100644 --- a/python_bindings/bindings.cpp +++ 
b/python_bindings/bindings.cpp @@ -359,6 +359,14 @@ class Index { appr_alg->resizeIndex(new_size); } + size_t getMaxElements() const { + return appr_alg->max_elements_; + } + + size_t getCurrentCount() const { + return appr_alg->cur_element_count; + } + std::string space_name; int dim; @@ -397,6 +405,8 @@ PYBIND11_PLUGIN(hnswlib) { .def("load_index", &Index::loadIndex, py::arg("path_to_index"), py::arg("max_elements")=0) .def("mark_deleted", &Index::markDeleted, py::arg("label")) .def("resize_index", &Index::resizeIndex, py::arg("new_size")) + .def("get_max_elements", &Index::getMaxElements) + .def("get_current_count", &Index::getCurrentCount) .def("__repr__", [](const Index &a) { return ""; From d5f6aad9f8ca2697e340181f9f664e9b3445d1c6 Mon Sep 17 00:00:00 2001 From: Yury Malkov Date: Wed, 6 Nov 2019 18:27:24 +0300 Subject: [PATCH 25/33] fix python tests --- python_bindings/tests/bindings_test_labels.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python_bindings/tests/bindings_test_labels.py b/python_bindings/tests/bindings_test_labels.py index f629ab29..c1887bef 100644 --- a/python_bindings/tests/bindings_test_labels.py +++ b/python_bindings/tests/bindings_test_labels.py @@ -50,7 +50,7 @@ def testRandomSelf(self): self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data1))),1.0,3) # Check that the returned element data is correct: - diff_with_gt_labels=np.max(np.abs(data1-items)) + diff_with_gt_labels=np.mean(np.abs(data1-items)) self.assertAlmostEqual(diff_with_gt_labels, 0, delta = 1e-4) # Serializing and deleting the index. @@ -83,7 +83,7 @@ def testRandomSelf(self): self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data))),1.0,3) # Check that the returned element data is correct: - diff_with_gt_labels=np.max(np.abs(data-items)) + diff_with_gt_labels=np.mean(np.abs(data-items)) self.assertAlmostEqual(diff_with_gt_labels, 0, delta = 1e-4) # deleting index. 
# Checking that all labels are returned correctly: From bf9491556c9783dfb812d8ffd0ac3256ac0edc89 Mon Sep 17 00:00:00 2001 From: Vimal Mathew Date: Mon, 11 Nov 2019 15:47:54 +0530 Subject: [PATCH 26/33] Throw exception on malloc fails --- hnswlib/bruteforce.h | 4 ++++ hnswlib/hnswalg.h | 14 ++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/hnswlib/bruteforce.h b/hnswlib/bruteforce.h index 8765726b..5b1bd655 100644 --- a/hnswlib/bruteforce.h +++ b/hnswlib/bruteforce.h @@ -22,6 +22,8 @@ namespace hnswlib { dist_func_param_ = s->get_dist_func_param(); size_per_element_ = data_size_ + sizeof(labeltype); data_ = (char *) malloc(maxElements * size_per_element_); + if (data_ == nullptr) + std::runtime_error("Not enough memory: BruteforceSearch failed to allocate data"); cur_element_count = 0; } @@ -155,6 +157,8 @@ namespace hnswlib { dist_func_param_ = s->get_dist_func_param(); size_per_element_ = data_size_ + sizeof(labeltype); data_ = (char *) malloc(maxelements_ * size_per_element_); + if (data_ == nullptr) + std::runtime_error("Not enough memory: loadIndex failed to allocate data"); input.read(data_, maxelements_ * size_per_element_); diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h index f24c8714..afc1222d 100644 --- a/hnswlib/hnswalg.h +++ b/hnswlib/hnswalg.h @@ -61,6 +61,8 @@ namespace hnswlib { maxlevel_ = -1; linkLists_ = (char **) malloc(sizeof(void *) * max_elements_); + if (linkLists_ == nullptr) + throw std::runtime_error("Not enough memory: HierarchicalNSW failed to allocate linklists"); size_links_per_element_ = maxM_ * sizeof(tableint) + sizeof(linklistsizeint); mult_ = 1 / log(1.0 * M_); revSize_ = 1.0 / mult_; @@ -546,12 +548,16 @@ namespace hnswlib { // Reallocate base layer char * data_level0_memory_new = (char *) malloc(new_max_elements * size_data_per_element_); + if (data_level0_memory_new == nullptr) + throw std::runtime_error("Not enough memory: resizeIndex failed to allocate base layer"); memcpy(data_level0_memory_new, 
data_level0_memory_,cur_element_count * size_data_per_element_); free(data_level0_memory_); data_level0_memory_=data_level0_memory_new; // Reallocate all other layers char ** linkLists_new = (char **) malloc(sizeof(void *) * new_max_elements); + if (linkLists_new == nullptr) + throw std::runtime_error("Not enough memory: resizeIndex failed to allocate other layers"); memcpy(linkLists_new, linkLists_,cur_element_count * sizeof(void *)); free(linkLists_); linkLists_=linkLists_new; @@ -659,6 +665,8 @@ namespace hnswlib { data_level0_memory_ = (char *) malloc(max_elements * size_data_per_element_); + if (data_level0_memory_ == nullptr) + throw std::runtime_error("Not enough memory: loadIndex failed to allocate level0"); input.read(data_level0_memory_, cur_element_count * size_data_per_element_); @@ -675,6 +683,8 @@ namespace hnswlib { linkLists_ = (char **) malloc(sizeof(void *) * max_elements); + if (linkLists_ == nullptr) + throw std::runtime_error("Not enough memory: loadIndex failed to allocate linklists"); element_levels_ = std::vector(max_elements); revSize_ = 1.0 / mult_; ef_ = 10; @@ -689,6 +699,8 @@ namespace hnswlib { } else { element_levels_[i] = linkListSize / size_links_per_element_; linkLists_[i] = (char *) malloc(linkListSize); + if (linkLists_[i] == nullptr) + throw std::runtime_error("Not enough memory: loadIndex failed to allocate linklist"); input.read(linkLists_[i], linkListSize); } } @@ -828,6 +840,8 @@ namespace hnswlib { if (curlevel) { linkLists_[cur_c] = (char *) malloc(size_links_per_element_ * curlevel + 1); + if (linkLists_[cur_c] == nullptr) + throw std::runtime_error("Not enough memory: addPoint failed to allocate linklist"); memset(linkLists_[cur_c], 0, size_links_per_element_ * curlevel + 1); } From ef1d4e082de9de18da38b7c15e22c283396ac0e7 Mon Sep 17 00:00:00 2001 From: Yury Malkov Date: Mon, 11 Nov 2019 14:40:12 +0300 Subject: [PATCH 27/33] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md 
b/README.md index fed68b67..31c41564 100644 --- a/README.md +++ b/README.md @@ -67,7 +67,9 @@ Index methods: * `get_ids_list()` - returns a list of all elements' ids. +* `get_max_elements()` - returns the current capacity of the index +* `get_current_count()` - returns the current number of elements stored in the index From b75c7133bed240dfd26083c04a8722f526c765d7 Mon Sep 17 00:00:00 2001 From: Muhammad Ashfaq Date: Wed, 20 Nov 2019 12:46:43 +0800 Subject: [PATCH 28/33] updated path from static to dynamic --- examples/example.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/example.py b/examples/example.py index b9d2ec64..a08955a1 100644 --- a/examples/example.py +++ b/examples/example.py @@ -45,7 +45,7 @@ # Serializing and deleting the index: index_path='first_half.bin' print("Saving index to '%s'" % index_path) -p.save_index("first_half.bin") +p.save_index(index_path) del p # Reiniting, loading the index From 83b635dc841bd90cffbe665cfa030f5405d2b6a5 Mon Sep 17 00:00:00 2001 From: Yury Malkov Date: Mon, 16 Dec 2019 15:31:29 -0800 Subject: [PATCH 29/33] bump version --- python_bindings/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python_bindings/setup.py b/python_bindings/setup.py index 66de1033..2e863c87 100644 --- a/python_bindings/setup.py +++ b/python_bindings/setup.py @@ -4,7 +4,7 @@ import sys import setuptools -__version__ = '0.3' +__version__ = '0.3.4' source_files = ['bindings.cpp'] From aae3be9884eb4c5555318e63d235a7bd83ff22a2 Mon Sep 17 00:00:00 2001 From: xiejianqiao Date: Mon, 20 Jan 2020 15:18:10 +0800 Subject: [PATCH 30/33] =?UTF-8?q?fix=EF=BC=8Coverflow=20in=20getIdsList?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- python_bindings/bindings.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp index 9c90fa45..ef1dc1d6 100644 --- a/python_bindings/bindings.cpp +++ 
b/python_bindings/bindings.cpp @@ -244,9 +244,9 @@ class Index { return data; } - std::vector getIdsList() { + std::vector getIdsList() { - std::vector ids; + std::vector ids; for(auto kv : appr_alg->label_lookup_) { ids.push_back(kv.first); From 679903c1edfb4eafd63425dd71cebc9aad12f249 Mon Sep 17 00:00:00 2001 From: Hussama Ismail Date: Wed, 12 Feb 2020 21:17:16 +0100 Subject: [PATCH 31/33] include one more other implementation --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 31c41564..fb3897f8 100644 --- a/README.md +++ b/README.md @@ -197,6 +197,7 @@ https://github.com/dbaranchuk/ivf-hnsw * Go implementation: https://github.com/Bithack/go-hnsw * Python implementation (as a part of the clustering code by by Matteo Dell'Amico): https://github.com/matteodellamico/flexible-clustering * Java implementation: https://github.com/jelmerk/hnswlib +* Java bindings using Java Native Access: https://github.com/stepstone-tech/hnswlib-jna * .Net implementation: https://github.com/microsoft/HNSW.Net ### Contributing to the repository From fd4ebf42c62a33419f885881065b7af980fc283e Mon Sep 17 00:00:00 2001 From: Yury Malkov Date: Tue, 3 Mar 2020 22:17:28 -0800 Subject: [PATCH 32/33] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index fb3897f8..c79e24c1 100644 --- a/README.md +++ b/README.md @@ -63,7 +63,7 @@ Index methods: * `set_num_threads(num_threads)` set the default number of cpu threads used during data insertion/querying. -* `get_items(ids)` - returns a numpy array (shape:`N*dim`) of vectors that have integer identifiers specified in `ids` numpy vector (shape:`N`). +* `get_items(ids)` - returns a numpy array (shape:`N*dim`) of vectors that have integer identifiers specified in `ids` numpy vector (shape:`N`). Note that for cosine similarity it currently returns **normalized** vectors. * `get_ids_list()` - returns a list of all elements' ids. 
From 4bd853c18100caee5726e18f36d54423037c7291 Mon Sep 17 00:00:00 2001 From: Dmitry Parfenchik Date: Sun, 5 Apr 2020 21:08:02 +0200 Subject: [PATCH 33/33] [L2Space] Perf improvement for dimension of factor 4 and 16 L2 SIMD methods are split in 2: 1. `L2SqrSIMD(4|16)Ext` - uses SSE or AVX to compute distance on dimensions that are multiples of 4 and 16 2. `L2SqrSIMD(4|16)ExtResidual` - relies on (1) to compute full multiples of 4 and 16 dimensions and finishes residual computation by relying on non-SIMD method `L2Sqr`. --- hnswlib/space_l2.h | 49 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 37 insertions(+), 12 deletions(-) diff --git a/hnswlib/space_l2.h b/hnswlib/space_l2.h index fda82543..bc00af72 100644 --- a/hnswlib/space_l2.h +++ b/hnswlib/space_l2.h @@ -3,7 +3,8 @@ namespace hnswlib { - static float L2Sqr(const void *pVect1v, const void *pVect2v, const void *qty_ptr) { + static float + L2Sqr(const void *pVect1v, const void *pVect2v, const void *qty_ptr) { float *pVect1 = (float *) pVect1v; float *pVect2 = (float *) pVect2v; size_t qty = *((size_t *) qty_ptr); @@ -51,11 +52,8 @@ namespace hnswlib { } _mm256_store_ps(TmpRes, sum); - float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7]; - size_t qty_left = qty - (qty16 << 4); - float res_tail = L2Sqr(pVect1, pVect2, &qty_left); - return (res + res_tail); -} + return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7]; + } #elif defined(USE_SSE) @@ -104,8 +102,20 @@ namespace hnswlib { } _mm_store_ps(TmpRes, sum); - float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3]; - size_t qty_left = qty - (qty16 << 4); + return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3]; + } +#endif + +#if defined(USE_SSE) || defined(USE_AVX) + static float + L2SqrSIMD16ExtResiduals(const void *pVect1v, const void *pVect2v, const void *qty_ptr) { + size_t qty = *((size_t *) qty_ptr); + size_t qty16 = qty >> 4 << 4; + 
float res = L2SqrSIMD16Ext(pVect1v, pVect2v, &qty16); + float *pVect1 = (float *) pVect1v + qty16; + float *pVect2 = (float *) pVect2v + qty16; + + size_t qty_left = qty - qty16; float res_tail = L2Sqr(pVect1, pVect2, &qty_left); return (res + res_tail); } @@ -137,8 +147,19 @@ namespace hnswlib { sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); } _mm_store_ps(TmpRes, sum); - float res = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3]; - size_t qty_left = qty - (qty4 << 2); + return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3]; + } + + static float + L2SqrSIMD4ExtResiduals(const void *pVect1v, const void *pVect2v, const void *qty_ptr) { + size_t qty = *((size_t *) qty_ptr); + size_t qty4 = qty >> 2 << 2; + + float res = L2SqrSIMD4Ext(pVect1v, pVect2v, &qty4); + size_t qty_left = qty - qty4; + + float *pVect1 = (float *) pVect1v + qty4; + float *pVect2 = (float *) pVect2v + qty4; float res_tail = L2Sqr(pVect1, pVect2, &qty_left); return (res + res_tail); @@ -154,10 +175,14 @@ namespace hnswlib { L2Space(size_t dim) { fstdistfunc_ = L2Sqr; #if defined(USE_SSE) || defined(USE_AVX) - if (dim >= 16) + if (dim % 16 == 0) fstdistfunc_ = L2SqrSIMD16Ext; - else if (dim >= 4) + else if (dim % 4 == 0) fstdistfunc_ = L2SqrSIMD4Ext; + else if (dim > 16) + fstdistfunc_ = L2SqrSIMD16ExtResiduals; + else if (dim > 4) + fstdistfunc_ = L2SqrSIMD4ExtResiduals; #endif dim_ = dim; data_size_ = dim * sizeof(float);