fix bugs, update readme #5


Merged: 5 commits, Apr 6, 2018
85 changes: 78 additions & 7 deletions README.md
@@ -6,12 +6,20 @@ NEW: Added support for cosine similarity and inner product distances

Part of the nmslib project https://github.com/nmslib/nmslib

Has a smaller memory footprint and faster index builds than the current nmslib version.

### Python bindings


#### Supported distances:

| Distance          | Parameter | Equation                                                  |
|-------------------|:---------:|-----------------------------------------------------------|
| Squared L2        | 'l2'      | d = sum((Ai - Bi)^2)                                       |
| Inner product     | 'ip'      | d = 1.0 - sum(Ai\*Bi)                                      |
| Cosine similarity | 'cosine'  | d = 1.0 - sum(Ai\*Bi) / sqrt(sum(Ai\*Ai) \* sum(Bi\*Bi))   |

Note that inner product is not a metric. An element can be closer to some other element than to itself.

For other spaces use the main library https://github.com/nmslib/nmslib
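
For illustration, here is a small NumPy sketch (not part of the library) that evaluates the three equations from the table directly. It also shows why inner product is not a metric: for A = (2, 0) and B = (3, 0), d(A, B) = 1 - 6 = -5 is smaller than d(A, A) = 1 - 4 = -3, so B is "closer" to A than A is to itself.
```python
import numpy as np

def l2_distance(a, b):
    # d = sum((Ai - Bi)^2)
    return np.sum((a - b) ** 2)

def ip_distance(a, b):
    # d = 1.0 - sum(Ai*Bi)
    return 1.0 - np.dot(a, b)

def cosine_distance(a, b):
    # d = 1.0 - sum(Ai*Bi) / sqrt(sum(Ai*Ai) * sum(Bi*Bi))
    return 1.0 - np.dot(a, b) / np.sqrt(np.dot(a, a) * np.dot(b, b))

a = np.array([2.0, 0.0])
b = np.array([3.0, 0.0])
print(ip_distance(a, a))  # -3.0
print(ip_distance(a, b))  # -5.0 < -3.0: B is "closer" to A than A itself
```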

@@ -44,13 +52,76 @@ p.set_ef(50) # ef should always be > k
labels, distances = p.knn_query(data, k = 1)

```
An example with updates after serialization/deserialization:
```python
import hnswlib
import numpy as np

dim = 16
num_elements = 10000

# Generating sample data
data = np.float32(np.random.random((num_elements, dim)))

# Declaring index
p = hnswlib.Index(space='l2', dim=dim) # possible options are l2, cosine or ip

# Initializing index
# max_elements - the maximum number of elements; should be known beforehand
# (probably will be made optional in the future)
#
# ef_construction - controls the index search speed/build speed tradeoff
# M - is tightly connected with the internal dimensionality of the data;
# strongly affects the memory consumption

p.init_index(max_elements=num_elements, ef_construction=100, M=16)

# Controlling the recall by setting ef:
# higher ef leads to better accuracy, but slower search
p.set_ef(10)

p.set_num_threads(4) # by default, all available cores are used

# We split the data into two batches:
data1 = data[:num_elements // 2]
data2 = data[num_elements // 2:]

print("Adding first batch of %d elements" % (len(data1)))
p.add_items(data1)

# Query the elements for themselves and measure recall:
labels, distances = p.knn_query(data1, k=1)
print("Recall for the first batch:", np.mean(labels.reshape(-1) == np.arange(len(data1))), "\n")

# Serializing and deleting the index:
index_path = 'first_half.bin'
print("Saving index to '%s'" % index_path)
p.save_index(index_path)
del p

# Re-initializing and loading the index
p = hnswlib.Index(space='l2', dim=dim) # re-init with the same space and dim as the saved index

print("\nLoading index from 'first_half.bin'\n")
p.load_index("first_half.bin")

print("Adding the second batch of %d elements" % (len(data2)))
p.add_items(data2)

# Query the elements for themselves and measure recall:
labels, distances = p.knn_query(data, k=1)
print("Recall for two batches:", np.mean(labels.reshape(-1) == np.arange(len(data))), "\n")

```

#### Bindings installation
```bash
pip3 install pybind11 numpy setuptools
cd python_bindings
python3 setup.py install
```
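
As a quick sanity check that the bindings installed correctly, you can build a tiny index (a minimal sketch using the same API as the examples above):
```python
import hnswlib
import numpy as np

# Build a 10-element index and query one element back
data = np.float32(np.random.random((10, 4)))
p = hnswlib.Index(space='l2', dim=4)
p.init_index(max_elements=10, ef_construction=100, M=16)
p.add_items(data)
labels, distances = p.knn_query(data[:1], k=1)
print(labels, distances)  # the first element's nearest neighbor should be itself
```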

### 200M SIFT test reproduction
To download and extract the bigann dataset:
```bash
python3 download_bigann.py
@@ -71,4 +142,4 @@ The size of the bigann subset (in millions) is controlled by the variable **subs


References:
Malkov, Yu A., and D. A. Yashunin. "Efficient and robust approximate nearest neighbor search using Hierarchical Navigable Small World graphs." arXiv preprint arXiv:1603.09320 (2016). https://arxiv.org/abs/1603.09320
57 changes: 57 additions & 0 deletions examples/example.py
@@ -0,0 +1,57 @@
import hnswlib
import numpy as np

dim = 16
num_elements = 10000

# Generating sample data
data = np.float32(np.random.random((num_elements, dim)))

# Declaring index
p = hnswlib.Index(space='l2', dim=dim) # possible options are l2, cosine or ip

# Initializing index
# max_elements - the maximum number of elements; should be known beforehand
# (probably will be made optional in the future)
#
# ef_construction - controls the index search speed/build speed tradeoff
# M - is tightly connected with the internal dimensionality of the data;
# strongly affects the memory consumption

p.init_index(max_elements=num_elements, ef_construction=100, M=16)

# Controlling the recall by setting ef:
# higher ef leads to better accuracy, but slower search
p.set_ef(10)

p.set_num_threads(4) # by default, all available cores are used

# We split the data into two batches:
data1 = data[:num_elements // 2]
data2 = data[num_elements // 2:]

print("Adding first batch of %d elements" % (len(data1)))
p.add_items(data1)

# Query the elements for themselves and measure recall:
labels, distances = p.knn_query(data1, k=1)
print("Recall for the first batch:", np.mean(labels.reshape(-1) == np.arange(len(data1))), "\n")

# Serializing and deleting the index:
index_path = 'first_half.bin'
print("Saving index to '%s'" % index_path)
p.save_index(index_path)
del p

# Re-initializing and loading the index
p = hnswlib.Index(space='l2', dim=dim) # re-init with the same space and dim as the saved index

print("\nLoading index from 'first_half.bin'\n")
p.load_index("first_half.bin")

print("Adding the second batch of %d elements" % (len(data2)))
p.add_items(data2)

# Query the elements for themselves and measure recall:
labels, distances = p.knn_query(data, k=1)
print("Recall for two batches:", np.mean(labels.reshape(-1) == np.arange(len(data))), "\n")
23 changes: 10 additions & 13 deletions hnswlib/hnswalg.h
@@ -49,8 +49,8 @@ namespace hnswlib {
M_ = M;
maxM_ = M_;
maxM0_ = M_ * 2;
ef_construction_ = std::max(ef_construction,M_); // clamp ef_construction to at least M
ef_ = 10;



@@ -492,8 +492,6 @@
};

void saveIndex(const string &location) {

cout << "Saving index to " << location.c_str() << "\n";
std::ofstream output(location, std::ios::binary);
streampos position;

@@ -544,7 +542,6 @@
readBinaryPOD(input, M_);
readBinaryPOD(input, mult_);
readBinaryPOD(input, ef_construction_);


data_size_ = s->get_data_size();
@@ -557,11 +554,15 @@

size_links_per_element_ = maxM_ * sizeof(tableint) + sizeof(linklistsizeint);


size_links_level0_ = maxM0_ * sizeof(tableint) + sizeof(linklistsizeint);
vector<mutex>(max_elements_).swap(link_list_locks_);


visited_list_pool_ = new VisitedListPool(1, max_elements_);


linkLists_ = (char **) malloc(sizeof(void *) * max_elements_);
element_levels_ = vector<int>(max_elements_);
revSize_ = 1.0 / mult_;
ef_ = 10;
@@ -578,12 +579,8 @@
input.read(linkLists_[i], linkListSize);
}
}


input.close();

return;
}

@@ -682,7 +679,7 @@
return cur_c;
};

std::priority_queue<std::pair<dist_t, labeltype >> searchKnn(void *query_data, size_t k) {
tableint currObj = enterpoint_node_;
dist_t curdist = fstdistfunc_(query_data, getDataByInternalId(enterpoint_node_), dist_func_param_);

@@ -711,7 +708,7 @@


std::priority_queue<std::pair<dist_t, tableint>, vector<pair<dist_t, tableint>>, CompareByFirst> top_candidates = searchBaseLayerST(
currObj, query_data, std::max(ef_,k)); // search with at least k candidates so k results can be returned
std::priority_queue<std::pair<dist_t, labeltype >> results;
while (top_candidates.size() > k) {
top_candidates.pop();
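The two hnswalg.h changes above work together: ef_construction is now clamped to at least M, and searchKnn searches with max(ef_, k) candidates, so requesting more neighbors than the current ef no longer truncates results. A minimal sketch of the effect from the Python side (assuming the bindings API from the README example; exact recall depends on the random data):
```python
import hnswlib
import numpy as np

data = np.float32(np.random.random((1000, 8)))
p = hnswlib.Index(space='l2', dim=8)
p.init_index(max_elements=1000, ef_construction=100, M=16)
p.add_items(data)

p.set_ef(10)
# k = 20 exceeds ef = 10; searchKnn now searches with max(ef_, k) candidates,
# so the query can still return 20 neighbors per point
labels, distances = p.knn_query(data[:5], k=20)
print(labels.shape)  # (5, 20)
```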
2 changes: 1 addition & 1 deletion hnswlib/hnswlib.h
@@ -27,7 +27,7 @@ namespace hnswlib {
class AlgorithmInterface {
public:
//virtual void addPoint(void *, labeltype) = 0;
virtual std::priority_queue<std::pair<dist_t, labeltype >> searchKnn(void *, size_t) = 0;
};

template<typename MTYPE>
74 changes: 54 additions & 20 deletions python_bindings/bindings.cpp
@@ -128,36 +128,54 @@ class Index {
if (num_threads <= 0)
num_threads = num_threads_default;

size_t rows, features;

if (buffer.ndim != 2 && buffer.ndim != 1) throw std::runtime_error("data must be a 1d/2d array");
if (buffer.ndim == 2) {
rows = buffer.shape[0];
features = buffer.shape[1];
}
else{
rows = 1;
features = buffer.shape[0];
}

if (features != dim)
throw std::runtime_error("wrong dimensionality of the vectors");

// avoid spawning threads when the number of added elements is small:

if(rows<=num_threads*4){
num_threads=1;
}

std::vector<size_t> ids;

if (!ids_.is_none()) {
py::array_t < size_t, py::array::c_style | py::array::forcecast > items(ids_);
auto ids_numpy = items.request();
// labels may be passed as a 1-d array (one id per row) or as a scalar for a single row:
if(ids_numpy.ndim==1 && ids_numpy.shape[0]==rows) {
std::vector<size_t> ids1(ids_numpy.shape[0]);
for (size_t i = 0; i < ids1.size(); i++) {
ids1[i] = items.data()[i];
}
ids.swap(ids1);
}
else if(ids_numpy.ndim==0 && rows==1) {
ids.push_back(*items.data());
}
else
throw std::runtime_error("wrong dimensionality of the labels");
}

hnswlib::tableint *data_numpy;

{


data_numpy = new hnswlib::tableint[rows];
int start = 0;
if (!ep_added) {
size_t id = ids.size() ? ids.at(0) : (cur_l);
float *vector_data=(float *) items.data(0);
if(normalize){
std::vector<float> norm_array(dim);
@@ -169,20 +187,25 @@
start = 1;
ep_added = true;
}

py::gil_scoped_release l;
if(normalize==false) {
ParallelFor(start, rows, num_threads, [&](size_t row, size_t threadId) {
size_t id = ids.size() ? ids.at(row) : (cur_l+row);
data_numpy[row] = appr_alg->addPoint((void *) items.data(row), (size_t) id);
});
} else{
std::vector<float> norm_array(num_threads * dim);
ParallelFor(start, rows, num_threads, [&](size_t row, size_t threadId) {
// normalize vector:
size_t start_idx = threadId * dim;
normalize_vector((float *) items.data(row), (norm_array.data()+start_idx));
size_t id = ids.size() ? ids.at(row) : (cur_l+row);
data_numpy[row] = appr_alg->addPoint((void *) (norm_array.data()+start_idx), (size_t) id);
});
}
}
cur_l+=rows;


}
@@ -213,11 +236,22 @@
{
py::gil_scoped_release l;

if (buffer.ndim != 2 && buffer.ndim != 1) throw std::runtime_error("data must be a 1d/2d array");
if (buffer.ndim == 2) {
rows = buffer.shape[0];
features = buffer.shape[1];
}
else{
rows = 1;
features = buffer.shape[0];
}


// avoid using threads when the number of searches is small:

if(rows<=num_threads*4){
num_threads=1;
}

data_numpy_l = new hnswlib::labeltype[rows * k];
data_numpy_d = new dist_t[rows * k];
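
With the shape handling above, add_items and knn_query now accept a single vector as a 1-d array (rows becomes 1 and the array length is taken as the dimensionality), and a scalar label is accepted when a single element is added. A minimal sketch of these calling conventions (passing the label positionally is an assumption based on the ids_ parameter above):
```python
import hnswlib
import numpy as np

dim = 8
p = hnswlib.Index(space='l2', dim=dim)
p.init_index(max_elements=100, ef_construction=100, M=16)

single = np.float32(np.random.random(dim))    # 1-d array of shape (dim,)
p.add_items(single, 0)                        # scalar label: the ndim==0 branch above
labels, distances = p.knn_query(single, k=1)  # 1-d query treated as a single row
print(labels, distances)  # nearest neighbor of the only element is itself
```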