diff --git a/README.md b/README.md
index 89650841..02b9e0c7 100644
--- a/README.md
+++ b/README.md
@@ -6,12 +6,20 @@
NEW: Added support for cosine similarity and inner product distances
Part of the nmslib project https://github.com/nmslib/nmslib
+Offers a smaller memory footprint and faster index builds than the current nmslib version.
+### Python bindings
-Supported distances:
-1) Squared L2 ('l2')
-2) Inner product ('ip',the distance is 1.0 - $inner product$)
-3) Cosine similarity ('cosine', the same as the inner product, but vectors are normalized)
+
+#### Supported distances:
+
+| Distance          | Parameter | Equation                                                |
+| ----------------- | :-------: | -------------------------------------------------------:|
+| Squared L2        | 'l2'      | d = sum((Ai-Bi)^2)                                      |
+| Inner product     | 'ip'      | d = 1.0 - sum(Ai\*Bi)                                   |
+| Cosine similarity | 'cosine'  | d = 1.0 - sum(Ai\*Bi) / sqrt(sum(Ai\*Ai) * sum(Bi\*Bi)) |
+
+Note that the inner product is not a metric: an element can be closer to some other element than to itself.
For other spaces use the main library https://github.com/nmslib/nmslib
@@ -44,13 +52,76 @@
p.set_ef(50) # ef should always be > k
labels, distances = p.knn_query(data, k = 1)
```
-To compile run:
+An example of adding new elements after serialization/deserialization:
+```python
+import hnswlib
+import numpy as np
+
+dim = 16
+num_elements = 10000
+
+# Generating sample data
+data = np.float32(np.random.random((num_elements, dim)))
+
+# Declaring index
+p = hnswlib.Index(space='l2', dim=dim)  # possible options are l2, cosine or ip
+
+# Initializing index
+# max_elements - the maximum number of elements, should be known beforehand
+# (probably will be made optional in the future)
+#
+# ef_construction - controls index search speed/build speed tradeoff
+# M - is tightly connected with the internal dimensionality of the data
+# and strongly affects the memory consumption
+
+p.init_index(max_elements=num_elements, ef_construction=100, M=16)
+
+# Controlling the recall by setting ef:
+# higher ef leads to better accuracy, but slower search
+p.set_ef(10)
+
+p.set_num_threads(4)  # by default using all available cores
+
+# We split the data in two batches:
+data1 = data[:num_elements // 2]
+data2 = data[num_elements // 2:]
+
+print("Adding first batch of %d elements" % (len(data1)))
+p.add_items(data1)
+
+# Query the elements for themselves and measure recall:
+labels, distances = p.knn_query(data1, k=1)
+print("Recall for the first batch:", np.mean(labels.reshape(-1) == np.arange(len(data1))), "\n")
+
+# Serializing and deleting the index:
+index_path = 'first_half.bin'
+print("Saving index to '%s'" % index_path)
+p.save_index(index_path)
+del p
+
+# Re-initializing and loading the index
+p = hnswlib.Index(space='l2', dim=dim)  # you can change the space here
+
+print("\nLoading index from '%s'\n" % index_path)
+p.load_index(index_path)
+
+print("Adding the second batch of %d elements" % (len(data2)))
+p.add_items(data2)
+
+# Query the elements for themselves and measure recall:
+labels, distances = p.knn_query(data, k=1)
+print("Recall for two batches:", np.mean(labels.reshape(-1) == np.arange(len(data))), "\n")
+
+```
+
+#### Bindings installation
```bash
+pip3 install pybind11 numpy setuptools
cd python_bindings
python3 setup.py install
```
-#### 200M SIFT test reproduction steps:
+### 200M SIFT test reproduction
To download and extract the bigann dataset:
```bash
python3 download_bigann.py
```
@@ -71,4 +142,4 @@ The size of the bigann subset (in millions) is controlled by the variable **subs
References:
-Malkov, Yu A., and D. A. Yashunin. "Efficient and robust approximate nearest neighbor search using Hierarchical Navigable Small World graphs." arXiv preprint arXiv:1603.09320 (2016).
+Malkov, Yu A., and D. A. Yashunin. "Efficient and robust approximate nearest neighbor search using Hierarchical Navigable Small World graphs." arXiv preprint arXiv:1603.09320 (2016). https://arxiv.org/abs/1603.09320
diff --git a/examples/example.py b/examples/example.py
new file mode 100644
index 00000000..6654a027
--- /dev/null
+++ b/examples/example.py
@@ -0,0 +1,57 @@
+import hnswlib
+import numpy as np
+
+dim = 16
+num_elements = 10000
+
+# Generating sample data
+data = np.float32(np.random.random((num_elements, dim)))
+
+# Declaring index
+p = hnswlib.Index(space='l2', dim=dim)  # possible options are l2, cosine or ip
+
+# Initializing index
+# max_elements - the maximum number of elements, should be known beforehand
+# (probably will be made optional in the future)
+#
+# ef_construction - controls index search speed/build speed tradeoff
+# M - is tightly connected with the internal dimensionality of the data
+# and strongly affects the memory consumption
+
+p.init_index(max_elements=num_elements, ef_construction=100, M=16)
+
+# Controlling the recall by setting ef:
+# higher ef leads to better accuracy, but slower search
+p.set_ef(10)
+
+p.set_num_threads(4)  # by default using all available cores
+
+# We split the data in two batches:
+data1 = data[:num_elements // 2]
+data2 = data[num_elements // 2:]
+
+print("Adding first batch of %d elements" % (len(data1)))
+p.add_items(data1)
+
+# Query the elements for themselves and measure recall:
+labels, distances = p.knn_query(data1, k=1)
+print("Recall for the first batch:", np.mean(labels.reshape(-1) == np.arange(len(data1))), "\n")
+
+# Serializing and deleting the index:
+index_path = 'first_half.bin'
+print("Saving index to '%s'" % index_path)
+p.save_index(index_path)
+del p
+
+# Re-initializing and loading the index
+p = hnswlib.Index(space='l2', dim=dim)  # you can change the space here
+
+print("\nLoading index from '%s'\n" % index_path)
+p.load_index(index_path)
+
+print("Adding the second batch of %d elements" % (len(data2)))
+p.add_items(data2)
+
+# Query the elements for themselves and measure recall:
+labels, distances = p.knn_query(data, k=1)
+print("Recall for two batches:", np.mean(labels.reshape(-1) == np.arange(len(data))), "\n")
diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h
index 2a303c23..967d1ea8 100644
--- a/hnswlib/hnswalg.h
+++ b/hnswlib/hnswalg.h
@@ -49,8 +49,8 @@ namespace hnswlib {
        M_ = M;
        maxM_ = M_;
        maxM0_ = M_ * 2;
-        ef_construction_ = ef_construction;
-        ef_ = 7;
+        ef_construction_ = std::max(ef_construction, M_);
+        ef_ = 10;
@@ -492,8 +492,6 @@ namespace hnswlib {
    };
    void saveIndex(const string &location) {
-
-        cout << "Saving index to " << location.c_str() << "\n";
        std::ofstream output(location, std::ios::binary);
        streampos position;
@@ -544,7 +542,6 @@ namespace hnswlib {
        readBinaryPOD(input, M_);
        readBinaryPOD(input, mult_);
        readBinaryPOD(input, ef_construction_);
-        cout << ef_construction_ << "\n";
        data_size_ = s->get_data_size();
@@ -557,11 +554,15 @@ namespace hnswlib {
        size_links_per_element_ = maxM_ * sizeof(tableint) + sizeof(linklistsizeint);
+
+        size_links_level0_ = maxM0_ * sizeof(tableint) + sizeof(linklistsizeint);
+        vector<std::mutex>(max_elements_).swap(link_list_locks_);
+
+        visited_list_pool_ = new VisitedListPool(1, max_elements_);
        linkLists_ = (char **) malloc(sizeof(void *) * max_elements_);
-        cout << max_elements_ << "\n";
        element_levels_ = vector<int>(max_elements_);
        revSize_ = 1.0 / mult_;
        ef_ = 10;
@@ -578,12 +579,8 @@ namespace hnswlib {
                input.read(linkLists_[i], linkListSize);
            }
        }
-
-        input.close();
-        size_t predicted_size_per_element = size_data_per_element_ + sizeof(void *) + 8 + 8 + 2 * 8;
-        cout << "Loaded index, predicted size=" << max_elements_ * (predicted_size_per_element) / (1000 * 1000)
-             << "\n";
+        return;
    }
@@ -682,7 +679,7 @@ namespace hnswlib {
        return cur_c;
    };
-    std::priority_queue<std::pair<dist_t, labeltype>> searchKnn(void *query_data, int k) {
+    std::priority_queue<std::pair<dist_t, labeltype>> searchKnn(void *query_data, size_t k) {
        tableint currObj = enterpoint_node_;
        dist_t curdist = fstdistfunc_(query_data, getDataByInternalId(enterpoint_node_), dist_func_param_);
@@ -711,7 +708,7 @@ namespace hnswlib {
        std::priority_queue<std::pair<dist_t, tableint>, vector<std::pair<dist_t, tableint>>, CompareByFirst> top_candidates = searchBaseLayerST(
-            currObj, query_data, ef_);
+            currObj, query_data, std::max(ef_, k));
        std::priority_queue<std::pair<dist_t, labeltype>> results;
        while (top_candidates.size() > k) {
            top_candidates.pop();
diff --git a/hnswlib/hnswlib.h b/hnswlib/hnswlib.h
index 26d0fe35..a97ea545 100644
--- a/hnswlib/hnswlib.h
+++ b/hnswlib/hnswlib.h
@@ -27,7 +27,7 @@ namespace hnswlib {
    class AlgorithmInterface {
    public:
        //virtual void addPoint(void *, labeltype) = 0;
-        virtual std::priority_queue<std::pair<dist_t, labeltype>> searchKnn(void *, int) = 0;
+        virtual std::priority_queue<std::pair<dist_t, labeltype>> searchKnn(void *, size_t) = 0;
    };
    template
diff --git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp
index 578112ff..69f82e31 100644
--- a/python_bindings/bindings.cpp
+++ b/python_bindings/bindings.cpp
@@ -128,36 +128,54 @@ class Index {
        if (num_threads <= 0)
            num_threads = num_threads_default;
-        size_t rows = buffer.shape[0], features = buffer.shape[1];
+        size_t rows, features;
+
+        if (buffer.ndim != 2 && buffer.ndim != 1) throw std::runtime_error("data must be a 1d/2d array");
+        if (buffer.ndim == 2) {
+            rows = buffer.shape[0];
+            features = buffer.shape[1];
+        }
+        else {
+            rows = 1;
+            features = buffer.shape[0];
+        }
+
+        if (features != dim)
+            throw std::runtime_error("wrong dimensionality of the vectors");
+
+        // avoid using threads when the number of searches is small:
+
+        if (rows <= num_threads * 4) {
+            num_threads = 1;
+        }
        std::vector<size_t> ids;
        if (!ids_.is_none()) {
            py::array_t < size_t, py::array::c_style | py::array::forcecast > items(ids_);
            auto ids_numpy = items.request();
-            std::vector<size_t> ids1(ids_numpy.shape[0]);
-            for (size_t i = 0; i < ids1.size(); i++) {
-                ids1[i] = items.data()[i];
+            if (ids_numpy.ndim == 1 && ids_numpy.shape[0] == rows) {
+                std::vector<size_t> ids1(ids_numpy.shape[0]);
+                for (size_t i = 0; i < ids1.size(); i++) {
+                    ids1[i] = items.data()[i];
+                }
+                ids.swap(ids1);
            }
-            ids.swap(ids1);
+            else if (ids_numpy.ndim == 0 && rows == 1) {
+                ids.push_back(*items.data());
+            }
+            else
+                throw std::runtime_error("wrong dimensionality of the labels");
        }
        hnswlib::tableint *data_numpy;
        {
-            py::gil_scoped_release l;
-
-
-            if (buffer.ndim != 2) throw std::runtime_error("data must be a 2d array");
-
-            if (features != dim)
-                throw std::runtime_error("wrong dimensionality of the vectors");
-
            data_numpy = new hnswlib::tableint[rows];
            int start = 0;
            if (!ep_added) {
-                size_t id = ids.size() ? ids.at(0) : (cur_l++);
+                size_t id = ids.size() ? ids.at(0) : (cur_l);
                float *vector_data = (float *) items.data(0);
                if (normalize) {
                    std::vector<float> norm_array(dim);
@@ -169,20 +187,25 @@ class Index {
                start = 1;
                ep_added = true;
            }
+
+            py::gil_scoped_release l;
            if (normalize == false) {
                ParallelFor(start, rows, num_threads, [&](size_t row, size_t threadId) {
-                    size_t id = ids.size() ? ids.at(row) : (cur_l++);
+                    size_t id = ids.size() ? ids.at(row) : (cur_l + row);
                    data_numpy[row] = appr_alg->addPoint((void *) items.data(row), (size_t) id);
                });
            }
            else {
                std::vector<float> norm_array(num_threads * dim);
                ParallelFor(start, rows, num_threads, [&](size_t row, size_t threadId) {
+                    // normalize vector:
                    size_t start_idx = threadId * dim;
                    normalize_vector((float *) items.data(row), (norm_array.data() + start_idx));
-                    size_t id = ids.size() ? ids.at(row) : (cur_l++);
+
+                    size_t id = ids.size() ? ids.at(row) : (cur_l + row);
                    data_numpy[row] = appr_alg->addPoint((void *) (norm_array.data() + start_idx), (size_t) id);
                });
-            }
+            };
+            cur_l += rows;
        }
@@ -213,11 +236,22 @@ class Index {
        {
            py::gil_scoped_release l;
-            if (buffer.ndim != 2) throw std::runtime_error("data must be a 2d array");
+            if (buffer.ndim != 2 && buffer.ndim != 1) throw std::runtime_error("data must be a 1d/2d array");
+            if (buffer.ndim == 2) {
+                rows = buffer.shape[0];
+                features = buffer.shape[1];
+            }
+            else {
+                rows = 1;
+                features = buffer.shape[0];
+            }
-            rows = buffer.shape[0];
-            features = buffer.shape[1];
+            // avoid using threads when the number of searches is small:
+
+            if (rows <= num_threads * 4) {
+                num_threads = 1;
+            }
            data_numpy_l = new hnswlib::labeltype[rows * k];
            data_numpy_d = new dist_t[rows * k];
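
The following is not part of the patch: a small usage sketch of the behaviour that the bindings.cpp changes above enable, namely that `add_items` and `knn_query` now also accept a single 1-d vector, and `add_items` accepts a scalar label (the `ndim == 0` branch). The index parameters below are illustrative only.

```python
# Illustrative sketch; assumes the bindings from this branch are installed as `hnswlib`.
import hnswlib
import numpy as np

dim = 16
p = hnswlib.Index(space='l2', dim=dim)
p.init_index(max_elements=100, ef_construction=100, M=16)

single_vector = np.float32(np.random.random(dim))  # 1-d array, treated as one row
p.add_items(single_vector, 7)                      # scalar id, handled by the new ndim == 0 branch

# A 1-d query is also accepted; labels/distances still come back as 2-d arrays of shape (rows, k).
labels, distances = p.knn_query(single_vector, k=1)
print(labels)  # expected: [[7]]
```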