Skip to content

Feature/pep #284

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Feb 12, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions examples/pyw_hnswlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ def __init__(self, space, dim):
self.dict_labels = {}
self.cur_ind = 0

def init_index(self, max_elements, ef_construction = 200, M = 16):
self.index.init_index(max_elements = max_elements, ef_construction = ef_construction, M = M)
def init_index(self, max_elements, ef_construction=200, M=16):
self.index.init_index(max_elements=max_elements, ef_construction=ef_construction, M=M)

def add_items(self, data, ids=None):
if ids is not None:
Expand Down
8 changes: 4 additions & 4 deletions python_bindings/tests/bindings_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,15 @@ def testRandomSelf(self):
# Declaring index
p = hnswlib.Index(space='l2', dim=dim) # possible options are l2, cosine or ip

# Initing index
# Initiating index
# max_elements - the maximum number of elements, should be known beforehand
# (probably will be made optional in the future)
#
# ef_construction - controls index search speed/build speed tradeoff
# M - is tightly connected with internal dimensionality of the data
# stronlgy affects the memory consumption
# strongly affects the memory consumption

p.init_index(max_elements = num_elements, ef_construction = 100, M = 16)
p.init_index(max_elements=num_elements, ef_construction=100, M=16)

# Controlling the recall by setting ef:
# higher ef leads to better accuracy, but slower search
Expand All @@ -51,7 +51,7 @@ def testRandomSelf(self):
p.save_index(index_path)
del p

# Reiniting, loading the index
# Re-initiating, loading the index
p = hnswlib.Index(space='l2', dim=dim) # you can change the sa

print("\nLoading index from '%s'\n" % index_path)
Expand Down
4 changes: 2 additions & 2 deletions python_bindings/tests/bindings_test_getdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,13 @@ def testGettingItems(self):
# Declaring index
p = hnswlib.Index(space='l2', dim=dim) # possible options are l2, cosine or ip

# Initing index
# Initiating index
# max_elements - the maximum number of elements, should be known beforehand
# (probably will be made optional in the future)
#
# ef_construction - controls index search speed/build speed tradeoff
# M - is tightly connected with internal dimensionality of the data
# stronlgy affects the memory consumption
# strongly affects the memory consumption

p.init_index(max_elements=num_elements, ef_construction=100, M=16)

Expand Down
16 changes: 8 additions & 8 deletions python_bindings/tests/bindings_test_labels.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,13 @@ def testRandomSelf(self):
# Declaring index
p = hnswlib.Index(space='l2', dim=dim) # possible options are l2, cosine or ip

# Initing index
# Initiating index
# max_elements - the maximum number of elements, should be known beforehand
# (probably will be made optional in the future)
#
# ef_construction - controls index search speed/build speed tradeoff
# M - is tightly connected with internal dimensionality of the data
# stronlgy affects the memory consumption
# strongly affects the memory consumption

p.init_index(max_elements=num_elements, ef_construction=100, M=16)

Expand All @@ -47,7 +47,7 @@ def testRandomSelf(self):
# Query the elements for themselves and measure recall:
labels, distances = p.knn_query(data1, k=1)

items=p.get_items(labels)
items = p.get_items(labels)

# Check the recall:
self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data1))), 1.0, 3)
Expand All @@ -67,8 +67,8 @@ def testRandomSelf(self):
print("Deleted")

print("\n**** Mark delete test ****\n")
# Reiniting, loading the index
print("Reiniting")
# Re-initiating, loading the index
print("Re-initiating")
p = hnswlib.Index(space='l2', dim=dim)

print("\nLoading index from '%s'\n" % index_path)
Expand All @@ -80,17 +80,17 @@ def testRandomSelf(self):

# Query the elements for themselves and measure recall:
labels, distances = p.knn_query(data, k=1)
items=p.get_items(labels)
items = p.get_items(labels)

# Check the recall:
self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data))), 1.0, 3)

# Check that the returned element data is correct:
diff_with_gt_labels=np.mean(np.abs(data-items))
diff_with_gt_labels = np.mean(np.abs(data-items))
self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-4) # deleting index.

# Checking that all labels are returned correctly:
sorted_labels=sorted(p.get_ids_list())
sorted_labels = sorted(p.get_ids_list())
self.assertEqual(np.sum(~np.asarray(sorted_labels) == np.asarray(range(num_elements))), 0)

# Delete data1
Expand Down
36 changes: 18 additions & 18 deletions python_bindings/tests/bindings_test_pickle.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,38 +60,38 @@ def test_space_main(self, space, dim):

p.num_threads = self.num_threads # by default using all available cores

p0 = pickle.loads(pickle.dumps(p)) ### pickle un-initialized Index
p0 = pickle.loads(pickle.dumps(p)) # pickle un-initialized Index
p.init_index(max_elements=self.num_elements, ef_construction=self.ef_construction, M=self.M)
p0.init_index(max_elements=self.num_elements, ef_construction=self.ef_construction, M=self.M)

p.ef = self.ef
p0.ef = self.ef

p1 = pickle.loads(pickle.dumps(p)) ### pickle Index before adding items
p1 = pickle.loads(pickle.dumps(p)) # pickle Index before adding items

### add items to ann index p,p0,p1
# add items to ann index p,p0,p1
p.add_items(data)
p1.add_items(data)
p0.add_items(data)

p2=pickle.loads(pickle.dumps(p)) ### pickle Index before adding items
p2=pickle.loads(pickle.dumps(p)) # pickle Index before adding items

self.assertTrue(np.allclose(p.get_items(), p0.get_items()), "items for p and p0 must be same")
self.assertTrue(np.allclose(p0.get_items(), p1.get_items()), "items for p0 and p1 must be same")
self.assertTrue(np.allclose(p1.get_items(), p2.get_items()), "items for p1 and p2 must be same")

### Test if returned distances are same
# Test if returned distances are same
l, d = p.knn_query(test_data, k=self.k)
l0, d0 = p0.knn_query(test_data, k=self.k)
l1, d1 = p1.knn_query(test_data, k=self.k)
l2, d2 = p2.knn_query(test_data, k=self.k)

self.assertLessEqual(np.sum(((d-d0)**2.)>1e-3), self.dists_err_thresh, msg=f"knn distances returned by p and p0 must match")
self.assertLessEqual(np.sum(((d0-d1)**2.)>1e-3), self.dists_err_thresh, msg=f"knn distances returned by p0 and p1 must match")
self.assertLessEqual(np.sum(((d1-d2)**2.)>1e-3), self.dists_err_thresh, msg=f"knn distances returned by p1 and p2 must match")
self.assertLessEqual(np.sum(((d-d0)**2.) > 1e-3), self.dists_err_thresh, msg=f"knn distances returned by p and p0 must match")
self.assertLessEqual(np.sum(((d0-d1)**2.) > 1e-3), self.dists_err_thresh, msg=f"knn distances returned by p0 and p1 must match")
self.assertLessEqual(np.sum(((d1-d2)**2.) > 1e-3), self.dists_err_thresh, msg=f"knn distances returned by p1 and p2 must match")

### check if ann results match brute-force search
### allow for 2 labels to be missing from ann results
# check if ann results match brute-force search
# allow for 2 labels to be missing from ann results
check_ann_results(self, space, data, test_data, self.k, l, d,
err_thresh=self.label_err_thresh,
total_thresh=self.item_err_thresh,
Expand All @@ -102,19 +102,19 @@ def test_space_main(self, space, dim):
total_thresh=self.item_err_thresh,
dists_thresh=self.dists_err_thresh)

### Check ef parameter value
# Check ef parameter value
self.assertEqual(p.ef, self.ef, "incorrect value of p.ef")
self.assertEqual(p0.ef, self.ef, "incorrect value of p0.ef")
self.assertEqual(p2.ef, self.ef, "incorrect value of p2.ef")
self.assertEqual(p1.ef, self.ef, "incorrect value of p1.ef")

### Check M parameter value
# Check M parameter value
self.assertEqual(p.M, self.M, "incorrect value of p.M")
self.assertEqual(p0.M, self.M, "incorrect value of p0.M")
self.assertEqual(p1.M, self.M, "incorrect value of p1.M")
self.assertEqual(p2.M, self.M, "incorrect value of p2.M")

### Check ef_construction parameter value
# Check ef_construction parameter value
self.assertEqual(p.ef_construction, self.ef_construction, "incorrect value of p.ef_construction")
self.assertEqual(p0.ef_construction, self.ef_construction, "incorrect value of p0.ef_construction")
self.assertEqual(p1.ef_construction, self.ef_construction, "incorrect value of p1.ef_construction")
Expand All @@ -135,12 +135,12 @@ def setUp(self):
self.num_threads = 4
self.k = 25

self.label_err_thresh = 5 ### max number of missing labels allowed per test item
self.item_err_thresh = 5 ### max number of items allowed with incorrect labels
self.label_err_thresh = 5 # max number of missing labels allowed per test item
self.item_err_thresh = 5 # max number of items allowed with incorrect labels

self.dists_err_thresh = 50 ### for two matrices, d1 and d2, dists_err_thresh controls max
### number of value pairs that are allowed to be different in d1 and d2
### i.e., number of values that are (d1-d2)**2>1e-3
self.dists_err_thresh = 50 # for two matrices, d1 and d2, dists_err_thresh controls max
# number of value pairs that are allowed to be different in d1 and d2
# i.e., number of values that are (d1-d2)**2>1e-3

def test_inner_product_space(self):
test_space_main(self, 'ip', 48)
Expand Down
98 changes: 49 additions & 49 deletions python_bindings/tests/bindings_test_resize.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,71 +7,71 @@

class RandomSelfTestCase(unittest.TestCase):
def testRandomSelf(self):
for idx in range(16):
print("\n**** Index resize test ****\n")
for idx in range(16):
print("\n**** Index resize test ****\n")

np.random.seed(idx)
dim = 16
num_elements = 10000
np.random.seed(idx)
dim = 16
num_elements = 10000

# Generating sample data
data = np.float32(np.random.random((num_elements, dim)))
# Generating sample data
data = np.float32(np.random.random((num_elements, dim)))

# Declaring index
p = hnswlib.Index(space='l2', dim=dim) # possible options are l2, cosine or ip
# Declaring index
p = hnswlib.Index(space='l2', dim=dim) # possible options are l2, cosine or ip

# Initing index
# max_elements - the maximum number of elements, should be known beforehand
# (probably will be made optional in the future)
#
# ef_construction - controls index search speed/build speed tradeoff
# M - is tightly connected with internal dimensionality of the data
# stronlgy affects the memory consumption
# Initiating index
# max_elements - the maximum number of elements, should be known beforehand
# (probably will be made optional in the future)
#
# ef_construction - controls index search speed/build speed tradeoff
# M - is tightly connected with internal dimensionality of the data
# strongly affects the memory consumption
Comment on lines +23 to +29
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The same comments are in bindings_test_labels.py, bindings_test_getdata.py, bindings_test.py. Could you please correct these files too?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh sorry, I missed it. Fixed in 0e3845f.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you very much!


p.init_index(max_elements=num_elements//2, ef_construction=100, M=16)
p.init_index(max_elements=num_elements//2, ef_construction=100, M=16)

# Controlling the recall by setting ef:
# higher ef leads to better accuracy, but slower search
p.set_ef(20)
# Controlling the recall by setting ef:
# higher ef leads to better accuracy, but slower search
p.set_ef(20)

p.set_num_threads(idx%8) # by default using all available cores
p.set_num_threads(idx % 8) # by default using all available cores

# We split the data in two batches:
data1 = data[:num_elements // 2]
data2 = data[num_elements // 2:]
# We split the data in two batches:
data1 = data[:num_elements // 2]
data2 = data[num_elements // 2:]

print("Adding first batch of %d elements" % (len(data1)))
p.add_items(data1)
print("Adding first batch of %d elements" % (len(data1)))
p.add_items(data1)

# Query the elements for themselves and measure recall:
labels, distances = p.knn_query(data1, k=1)
# Query the elements for themselves and measure recall:
labels, distances = p.knn_query(data1, k=1)

items = p.get_items(list(range(len(data1))))
items = p.get_items(list(range(len(data1))))

# Check the recall:
self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data1))), 1.0, 3)
# Check the recall:
self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data1))), 1.0, 3)

# Check that the returned element data is correct:
diff_with_gt_labels = np.max(np.abs(data1-items))
self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-4)
# Check that the returned element data is correct:
diff_with_gt_labels = np.max(np.abs(data1-items))
self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-4)

print("Resizing the index")
p.resize_index(num_elements)
print("Resizing the index")
p.resize_index(num_elements)

print("Adding the second batch of %d elements" % (len(data2)))
p.add_items(data2)
print("Adding the second batch of %d elements" % (len(data2)))
p.add_items(data2)

# Query the elements for themselves and measure recall:
labels, distances = p.knn_query(data, k=1)
items=p.get_items(list(range(num_elements)))
# Query the elements for themselves and measure recall:
labels, distances = p.knn_query(data, k=1)
items=p.get_items(list(range(num_elements)))

# Check the recall:
self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data))), 1.0, 3)
# Check the recall:
self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data))), 1.0, 3)

# Check that the returned element data is correct:
diff_with_gt_labels=np.max(np.abs(data-items))
self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-4)
# Check that the returned element data is correct:
diff_with_gt_labels = np.max(np.abs(data-items))
self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-4)

# Checking that all labels are returned correcly:
sorted_labels=sorted(p.get_ids_list())
self.assertEqual(np.sum(~np.asarray(sorted_labels) == np.asarray(range(num_elements))), 0)
# Checking that all labels are returned correctly:
sorted_labels = sorted(p.get_ids_list())
self.assertEqual(np.sum(~np.asarray(sorted_labels) == np.asarray(range(num_elements))), 0)