Skip to content

Commit aed9f3a

Browse files
authored
Merge pull request #5 from nmslib/update_readme
fix bugs, update readme
2 parents a253314 + a2c2c9b commit aed9f3a

File tree

5 files changed

+200
-41
lines changed

5 files changed

+200
-41
lines changed

README.md

Lines changed: 78 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,20 @@ NEW: Added support for cosine similarity and inner product distances
66

77
Part of the nmslib project https://github.com/nmslib/nmslib
88

9+
Offers a smaller memory footprint and faster index builds compared to the current nmslib implementation.
910

11+
### Python bindings
1012

11-
Supported distances:
12-
1) Squared L2 ('l2')
13-
2) Inner product ('ip',the distance is 1.0 - $inner product$)
14-
3) Cosine similarity ('cosine', the same as the inner product, but vectors are normalized)
13+
14+
#### Supported distances:
15+
16+
| Distance | parameter | Equation |
17+
| ------------- |:---------------:| -----------------------:|
18+
|Squared L2 |'l2' | d = sum((Ai-Bi)^2) |
19+
|Inner product |'ip' | d = 1.0 - sum(Ai\*Bi) |
20+
|Cosine similarity |'cosine' | d = 1.0 - sum(Ai\*Bi) / sqrt(sum(Ai\*Ai) * sum(Bi\*Bi))|
21+
22+
Note that inner product is not a metric. An element can be closer to some other element than to itself.
1523

1624
For other spaces use the main library https://github.com/nmslib/nmslib
1725

@@ -44,13 +52,76 @@ p.set_ef(50) # ef should always be > k
4452
labels, distances = p.knn_query(data, k = 1)
4553

4654
```
47-
To compile run:
55+
An example with updates after serialization/deserialization:
56+
```python
57+
import hnswlib
58+
import numpy as np
59+
60+
dim = 16
61+
num_elements = 10000
62+
63+
# Generating sample data
64+
data = np.float32(np.random.random((num_elements, dim)))
65+
66+
# Declaring index
67+
p = hnswlib.Index(space='l2', dim=dim) # possible options are l2, cosine or ip
68+
69+
# Initializing index
70+
# max_elements - the maximum number of elements, should be known beforehand
71+
# (probably will be made optional in the future)
72+
#
73+
# ef_construction - controls index search speed/build speed tradeoff
74+
# M - is tightly connected with internal dimensionality of the data
75+
# strongly affects the memory consumption
76+
77+
p.init_index(max_elements=num_elements, ef_construction=100, M=16)
78+
79+
# Controlling the recall by setting ef:
80+
# higher ef leads to better accuracy, but slower search
81+
p.set_ef(10)
82+
83+
p.set_num_threads(4) # by default using all available cores
84+
85+
# We split the data in two batches:
86+
data1 = data[:num_elements // 2]
87+
data2 = data[num_elements // 2:]
88+
89+
print("Adding first batch of %d elements" % (len(data1)))
90+
p.add_items(data1)
91+
92+
# Query the elements for themselves and measure recall:
93+
labels, distances = p.knn_query(data1, k=1)
94+
print("Recall for the first batch:", np.mean(labels.reshape(-1) == np.arange(len(data1))), "\n")
95+
96+
# Serializing and deleting the index:
97+
index_path='first_half.bin'
98+
print("Saving index to '%s'" % index_path)
99+
p.save_index("first_half.bin")
100+
del p
101+
102+
# Reinitializing and loading the index
103+
p = hnswlib.Index(space='l2', dim=dim) # you can change the space
104+
105+
print("\nLoading index from 'first_half.bin'\n")
106+
p.load_index("first_half.bin")
107+
108+
print("Adding the second batch of %d elements" % (len(data2)))
109+
p.add_items(data2)
110+
111+
# Query the elements for themselves and measure recall:
112+
labels, distances = p.knn_query(data, k=1)
113+
print("Recall for two batches:", np.mean(labels.reshape(-1) == np.arange(len(data))), "\n")
114+
115+
```
116+
117+
#### Bindings installation
48118
```bash
119+
pip3 install pybind11 numpy setuptools
49120
cd python_bindings
50121
python3 setup.py install
51122
```
52123

53-
#### 200M SIFT test reproduction steps:
124+
### 200M SIFT test reproduction
54125
To download and extract the bigann dataset:
55126
```bash
56127
python3 download_bigann.py
@@ -71,4 +142,4 @@ The size of the bigann subset (in millions) is controlled by the variable **subs
71142

72143

73144
References:
74-
Malkov, Yu A., and D. A. Yashunin. "Efficient and robust approximate nearest neighbor search using Hierarchical Navigable Small World graphs." arXiv preprint arXiv:1603.09320 (2016).
145+
Malkov, Yu A., and D. A. Yashunin. "Efficient and robust approximate nearest neighbor search using Hierarchical Navigable Small World graphs." arXiv preprint arXiv:1603.09320 (2016). https://arxiv.org/abs/1603.09320

examples/example.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
import hnswlib
2+
import numpy as np
3+
4+
dim = 16
5+
num_elements = 10000
6+
7+
# Generating sample data
8+
data = np.float32(np.random.random((num_elements, dim)))
9+
10+
# Declaring index
11+
p = hnswlib.Index(space='l2', dim=dim) # possible options are l2, cosine or ip
12+
13+
# Initializing index
14+
# max_elements - the maximum number of elements, should be known beforehand
15+
# (probably will be made optional in the future)
16+
#
17+
# ef_construction - controls index search speed/build speed tradeoff
18+
# M - is tightly connected with internal dimensionality of the data
19+
# strongly affects the memory consumption
20+
21+
p.init_index(max_elements=num_elements, ef_construction=100, M=16)
22+
23+
# Controlling the recall by setting ef:
24+
# higher ef leads to better accuracy, but slower search
25+
p.set_ef(10)
26+
27+
p.set_num_threads(4) # by default using all available cores
28+
29+
# We split the data in two batches:
30+
data1 = data[:num_elements // 2]
31+
data2 = data[num_elements // 2:]
32+
33+
print("Adding first batch of %d elements" % (len(data1)))
34+
p.add_items(data1)
35+
36+
# Query the elements for themselves and measure recall:
37+
labels, distances = p.knn_query(data1, k=1)
38+
print("Recall for the first batch:", np.mean(labels.reshape(-1) == np.arange(len(data1))), "\n")
39+
40+
# Serializing and deleting the index:
41+
index_path='first_half.bin'
42+
print("Saving index to '%s'" % index_path)
43+
p.save_index("first_half.bin")
44+
del p
45+
46+
# Reinitializing and loading the index
47+
p = hnswlib.Index(space='l2', dim=dim) # you can change the space
48+
49+
print("\nLoading index from 'first_half.bin'\n")
50+
p.load_index("first_half.bin")
51+
52+
print("Adding the second batch of %d elements" % (len(data2)))
53+
p.add_items(data2)
54+
55+
# Query the elements for themselves and measure recall:
56+
labels, distances = p.knn_query(data, k=1)
57+
print("Recall for two batches:", np.mean(labels.reshape(-1) == np.arange(len(data))), "\n")

hnswlib/hnswalg.h

Lines changed: 10 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,8 @@ namespace hnswlib {
4949
M_ = M;
5050
maxM_ = M_;
5151
maxM0_ = M_ * 2;
52-
ef_construction_ = ef_construction;
53-
ef_ = 7;
52+
ef_construction_ = std::max(ef_construction,M_);
53+
ef_ = 10;
5454

5555

5656

@@ -492,8 +492,6 @@ namespace hnswlib {
492492
};
493493

494494
void saveIndex(const string &location) {
495-
496-
cout << "Saving index to " << location.c_str() << "\n";
497495
std::ofstream output(location, std::ios::binary);
498496
streampos position;
499497

@@ -544,7 +542,6 @@ namespace hnswlib {
544542
readBinaryPOD(input, M_);
545543
readBinaryPOD(input, mult_);
546544
readBinaryPOD(input, ef_construction_);
547-
cout << ef_construction_ << "\n";
548545

549546

550547
data_size_ = s->get_data_size();
@@ -557,11 +554,15 @@ namespace hnswlib {
557554

558555
size_links_per_element_ = maxM_ * sizeof(tableint) + sizeof(linklistsizeint);
559556

557+
558+
size_links_level0_ = maxM0_ * sizeof(tableint) + sizeof(linklistsizeint);
559+
vector<mutex>(max_elements_).swap(link_list_locks_);
560+
561+
560562
visited_list_pool_ = new VisitedListPool(1, max_elements_);
561563

562564

563565
linkLists_ = (char **) malloc(sizeof(void *) * max_elements_);
564-
cout << max_elements_ << "\n";
565566
element_levels_ = vector<int>(max_elements_);
566567
revSize_ = 1.0 / mult_;
567568
ef_ = 10;
@@ -578,12 +579,8 @@ namespace hnswlib {
578579
input.read(linkLists_[i], linkListSize);
579580
}
580581
}
581-
582-
583582
input.close();
584-
size_t predicted_size_per_element = size_data_per_element_ + sizeof(void *) + 8 + 8 + 2 * 8;
585-
cout << "Loaded index, predicted size=" << max_elements_ * (predicted_size_per_element) / (1000 * 1000)
586-
<< "\n";
583+
587584
return;
588585
}
589586

@@ -682,7 +679,7 @@ namespace hnswlib {
682679
return cur_c;
683680
};
684681

685-
std::priority_queue<std::pair<dist_t, labeltype >> searchKnn(void *query_data, int k) {
682+
std::priority_queue<std::pair<dist_t, labeltype >> searchKnn(void *query_data, size_t k) {
686683
tableint currObj = enterpoint_node_;
687684
dist_t curdist = fstdistfunc_(query_data, getDataByInternalId(enterpoint_node_), dist_func_param_);
688685

@@ -711,7 +708,7 @@ namespace hnswlib {
711708

712709

713710
std::priority_queue<std::pair<dist_t, tableint>, vector<pair<dist_t, tableint>>, CompareByFirst> top_candidates = searchBaseLayerST(
714-
currObj, query_data, ef_);
711+
currObj, query_data, std::max(ef_,k));
715712
std::priority_queue<std::pair<dist_t, labeltype >> results;
716713
while (top_candidates.size() > k) {
717714
top_candidates.pop();

hnswlib/hnswlib.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ namespace hnswlib {
2727
class AlgorithmInterface {
2828
public:
2929
//virtual void addPoint(void *, labeltype) = 0;
30-
virtual std::priority_queue<std::pair<dist_t, labeltype >> searchKnn(void *, int) = 0;
30+
virtual std::priority_queue<std::pair<dist_t, labeltype >> searchKnn(void *, size_t) = 0;
3131
};
3232

3333
template<typename MTYPE>

python_bindings/bindings.cpp

Lines changed: 54 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -128,36 +128,54 @@ class Index {
128128
if (num_threads <= 0)
129129
num_threads = num_threads_default;
130130

131-
size_t rows = buffer.shape[0], features = buffer.shape[1];
131+
size_t rows, features;
132+
133+
if (buffer.ndim != 2 && buffer.ndim != 1) throw std::runtime_error("data must be a 1d/2d array");
134+
if (buffer.ndim == 2) {
135+
rows = buffer.shape[0];
136+
features = buffer.shape[1];
137+
}
138+
else{
139+
rows = 1;
140+
features = buffer.shape[0];
141+
}
142+
143+
if (features != dim)
144+
throw std::runtime_error("wrong dimensionality of the vectors");
145+
146+
// avoid using threads when the number of searches is small:
147+
148+
if(rows<=num_threads*4){
149+
num_threads=1;
150+
}
132151

133152
std::vector<size_t> ids;
134153

135154
if (!ids_.is_none()) {
136155
py::array_t < size_t, py::array::c_style | py::array::forcecast > items(ids_);
137156
auto ids_numpy = items.request();
138-
std::vector<size_t> ids1(ids_numpy.shape[0]);
139-
for (size_t i = 0; i < ids1.size(); i++) {
140-
ids1[i] = items.data()[i];
157+
if(ids_numpy.ndim==1 && ids_numpy.shape[0]==rows) {
158+
std::vector<size_t> ids1(ids_numpy.shape[0]);
159+
for (size_t i = 0; i < ids1.size(); i++) {
160+
ids1[i] = items.data()[i];
161+
}
162+
ids.swap(ids1);
141163
}
142-
ids.swap(ids1);
164+
else if(ids_numpy.ndim==0 && rows==1) {
165+
ids.push_back(*items.data());
166+
}
167+
else
168+
throw std::runtime_error("wrong dimensionality of the labels");
143169
}
144170

145171
hnswlib::tableint *data_numpy;
146172

147173
{
148174

149-
py::gil_scoped_release l;
150-
151-
152-
if (buffer.ndim != 2) throw std::runtime_error("data must be a 2d array");
153-
154-
if (features != dim)
155-
throw std::runtime_error("wrong dimensionality of the vectors");
156-
157175
data_numpy = new hnswlib::tableint[rows];
158176
int start = 0;
159177
if (!ep_added) {
160-
size_t id = ids.size() ? ids.at(0) : (cur_l++);
178+
size_t id = ids.size() ? ids.at(0) : (cur_l);
161179
float *vector_data=(float *) items.data(0);
162180
if(normalize){
163181
std::vector<float> norm_array(dim);
@@ -169,20 +187,25 @@ class Index {
169187
start = 1;
170188
ep_added = true;
171189
}
190+
191+
py::gil_scoped_release l;
172192
if(normalize==false) {
173193
ParallelFor(start, rows, num_threads, [&](size_t row, size_t threadId) {
174-
size_t id = ids.size() ? ids.at(row) : (cur_l++);
194+
size_t id = ids.size() ? ids.at(row) : (cur_l+row);
175195
data_numpy[row] = appr_alg->addPoint((void *) items.data(row), (size_t) id);
176196
});
177197
} else{
178198
std::vector<float> norm_array(num_threads * dim);
179199
ParallelFor(start, rows, num_threads, [&](size_t row, size_t threadId) {
200+
// normalize vector:
180201
size_t start_idx = threadId * dim;
181202
normalize_vector((float *) items.data(row), (norm_array.data()+start_idx));
182-
size_t id = ids.size() ? ids.at(row) : (cur_l++);
203+
204+
size_t id = ids.size() ? ids.at(row) : (cur_l+row);
183205
data_numpy[row] = appr_alg->addPoint((void *) (norm_array.data()+start_idx), (size_t) id);
184206
});
185-
}
207+
};
208+
cur_l+=rows;
186209

187210

188211
}
@@ -213,11 +236,22 @@ class Index {
213236
{
214237
py::gil_scoped_release l;
215238

216-
if (buffer.ndim != 2) throw std::runtime_error("data must be a 2d array");
239+
if (buffer.ndim != 2 && buffer.ndim != 1) throw std::runtime_error("data must be a 1d/2d array");
240+
if (buffer.ndim == 2) {
241+
rows = buffer.shape[0];
242+
features = buffer.shape[1];
243+
}
244+
else{
245+
rows = 1;
246+
features = buffer.shape[0];
247+
}
217248

218-
rows = buffer.shape[0];
219-
features = buffer.shape[1];
220249

250+
// avoid using threads when the number of searches is small:
251+
252+
if(rows<=num_threads*4){
253+
num_threads=1;
254+
}
221255

222256
data_numpy_l = new hnswlib::labeltype[rows * k];
223257
data_numpy_d = new dist_t[rows * k];

0 commit comments

Comments
 (0)