diff --git a/README.md b/README.md
index 89650841..02b9e0c7 100644
--- a/README.md
+++ b/README.md
@@ -6,12 +6,20 @@
NEW: Added support for cosine similarity and inner product distances
Part of the nmslib project https://github.com/nmslib/nmslib
+Offers a smaller memory footprint and faster index builds than the current nmslib version.
+### Python bindings
-Supported distances:
-1) Squared L2 ('l2')
-2) Inner product ('ip',the distance is 1.0 - $inner product$)
-3) Cosine similarity ('cosine', the same as the inner product, but vectors are normalized)
+
+#### Supported distances:
+
+| Distance          | Parameter | Equation                                                |
+| ----------------- | :-------: | -------------------------------------------------------:|
+| Squared L2        | 'l2'      | d = sum((Ai-Bi)^2)                                      |
+| Inner product     | 'ip'      | d = 1.0 - sum(Ai\*Bi)                                   |
+| Cosine similarity | 'cosine'  | d = 1.0 - sum(Ai\*Bi) / sqrt(sum(Ai\*Ai) * sum(Bi\*Bi)) |
+
+Note that the inner product is not a metric: an element can be closer to some other element than to itself.
For other spaces use the main library https://github.com/nmslib/nmslib
@@ -44,13 +52,76 @@
p.set_ef(50) # ef should always be > k
labels, distances = p.knn_query(data, k = 1)
```
-To compile run:
+An example of adding new elements after serialization/deserialization:
+```python
+import hnswlib
+import numpy as np
+
+dim = 16
+num_elements = 10000
+
+# Generating sample data
+data = np.float32(np.random.random((num_elements, dim)))
+
+# Declaring index
+p = hnswlib.Index(space='l2', dim=dim)  # possible options are l2, cosine or ip
+
+# Initializing index
+# max_elements - the maximum number of elements, should be known beforehand
+# (probably will be made optional in the future)
+#
+# ef_construction - controls index search speed/build speed tradeoff
+# M - is tightly connected with the internal dimensionality of the data
+# and strongly affects the memory consumption
+
+p.init_index(max_elements=num_elements, ef_construction=100, M=16)
+
+# Controlling the recall by setting ef:
+# higher ef leads to better accuracy, but slower search
+p.set_ef(10)
+
+p.set_num_threads(4)  # by default using all available cores
+
+# We split the data in two batches:
+data1 = data[:num_elements // 2]
+data2 = data[num_elements // 2:]
+
+print("Adding first batch of %d elements" % (len(data1)))
+p.add_items(data1)
+
+# Query the elements for themselves and measure recall:
+labels, distances = p.knn_query(data1, k=1)
+print("Recall for the first batch:", np.mean(labels.reshape(-1) == np.arange(len(data1))), "\n")
+
+# Serializing and deleting the index:
+index_path = 'first_half.bin'
+print("Saving index to '%s'" % index_path)
+p.save_index(index_path)
+del p
+
+# Re-initializing and loading the index
+p = hnswlib.Index(space='l2', dim=dim)  # you can change the space here
+
+print("\nLoading index from '%s'\n" % index_path)
+p.load_index(index_path)
+
+print("Adding the second batch of %d elements" % (len(data2)))
+p.add_items(data2)
+
+# Query the elements for themselves and measure recall:
+labels, distances = p.knn_query(data, k=1)
+print("Recall for two batches:", np.mean(labels.reshape(-1) == np.arange(len(data))), "\n")
+
+```
+
+#### Bindings installation
```bash
+pip3 install pybind11 numpy setuptools
cd python_bindings
python3 setup.py install
```
-#### 200M SIFT test reproduction steps:
+### 200M SIFT test reproduction
To download and extract the bigann dataset:
```bash
python3 download_bigann.py
```
@@ -71,4 +142,4 @@ The size of the bigann subset (in millions) is controlled by the variable **subs
References:
-Malkov, Yu A., and D. A. Yashunin. "Efficient and robust approximate nearest neighbor search using Hierarchical Navigable Small World graphs." arXiv preprint arXiv:1603.09320 (2016).
+Malkov, Yu A., and D. A. Yashunin. "Efficient and robust approximate nearest neighbor search using Hierarchical Navigable Small World graphs." arXiv preprint arXiv:1603.09320 (2016). https://arxiv.org/abs/1603.09320
diff --git a/examples/example.py b/examples/example.py
new file mode 100644
index 00000000..6654a027
--- /dev/null
+++ b/examples/example.py
@@ -0,0 +1,57 @@
+import hnswlib
+import numpy as np
+
+dim = 16
+num_elements = 10000
+
+# Generating sample data
+data = np.float32(np.random.random((num_elements, dim)))
+
+# Declaring index
+p = hnswlib.Index(space='l2', dim=dim)  # possible options are l2, cosine or ip
+
+# Initializing index
+# max_elements - the maximum number of elements, should be known beforehand
+# (probably will be made optional in the future)
+#
+# ef_construction - controls index search speed/build speed tradeoff
+# M - is tightly connected with the internal dimensionality of the data
+# and strongly affects the memory consumption
+
+p.init_index(max_elements=num_elements, ef_construction=100, M=16)
+
+# Controlling the recall by setting ef:
+# higher ef leads to better accuracy, but slower search
+p.set_ef(10)
+
+p.set_num_threads(4)  # by default using all available cores
+
+# We split the data in two batches:
+data1 = data[:num_elements // 2]
+data2 = data[num_elements // 2:]
+
+print("Adding first batch of %d elements" % (len(data1)))
+p.add_items(data1)
+
+# Query the elements for themselves and measure recall:
+labels, distances = p.knn_query(data1, k=1)
+print("Recall for the first batch:", np.mean(labels.reshape(-1) == np.arange(len(data1))), "\n")
+
+# Serializing and deleting the index:
+index_path = 'first_half.bin'
+print("Saving index to '%s'" % index_path)
+p.save_index(index_path)
+del p
+
+# Re-initializing and loading the index
+p = hnswlib.Index(space='l2', dim=dim)  # you can change the space here
+
+print("\nLoading index from '%s'\n" % index_path)
+p.load_index(index_path)
+
+print("Adding the second batch of %d elements" % (len(data2)))
+p.add_items(data2)
+
+# Query the elements for themselves and measure recall:
+labels, distances = p.knn_query(data, k=1)
+print("Recall for two batches:", np.mean(labels.reshape(-1) == np.arange(len(data))), "\n")
diff --git a/hnswlib/hnswalg.h b/hnswlib/hnswalg.h
index 2a303c23..967d1ea8 100644
--- a/hnswlib/hnswalg.h
+++ b/hnswlib/hnswalg.h
@@ -49,8 +49,8 @@ namespace hnswlib {
        M_ = M;
        maxM_ = M_;
        maxM0_ = M_ * 2;
-        ef_construction_ = ef_construction;
-        ef_ = 7;
+        ef_construction_ = std::max(ef_construction, M_);
+        ef_ = 10;
@@ -492,8 +492,6 @@ namespace hnswlib {
    };
    void saveIndex(const string &location) {
-
-        cout << "Saving index to " << location.c_str() << "\n";
        std::ofstream output(location, std::ios::binary);
        streampos position;
@@ -544,7 +542,6 @@ namespace hnswlib {
        readBinaryPOD(input, M_);
        readBinaryPOD(input, mult_);
        readBinaryPOD(input, ef_construction_);
-        cout << ef_construction_ << "\n";
        data_size_ = s->get_data_size();
@@ -557,11 +554,15 @@ namespace hnswlib {
        size_links_per_element_ = maxM_ * sizeof(tableint) + sizeof(linklistsizeint);
+
+        size_links_level0_ = maxM0_ * sizeof(tableint) + sizeof(linklistsizeint);
+        vector<std::mutex>(max_elements_).swap(link_list_locks_);
+
+        visited_list_pool_ = new VisitedListPool(1, max_elements_);
        linkLists_ = (char **) malloc(sizeof(void *) * max_elements_);
-        cout << max_elements_ << "\n";
        element_levels_ = vector<int>(max_elements_);
        revSize_ = 1.0 / mult_;
        ef_ = 10;
@@ -578,12 +579,8 @@ namespace hnswlib {
                input.read(linkLists_[i], linkListSize);
            }
        }
-
-        input.close();
-        size_t predicted_size_per_element = size_data_per_element_ + sizeof(void *) + 8 + 8 + 2 * 8;
-        cout << "Loaded index, predicted size=" << max_elements_ * (predicted_size_per_element) / (1000 * 1000)
-             << "\n";
+        return;
    }
@@ -682,7 +679,7 @@ namespace hnswlib {
        return cur_c;
    };
-    std::priority_queue<std::pair<dist_t, labeltype>> searchKnn(void *query_data, int k) {
+    std::priority_queue<std::pair<dist_t, labeltype>> searchKnn(void *query_data, size_t k) {
        tableint currObj = enterpoint_node_;
        dist_t curdist = fstdistfunc_(query_data, getDataByInternalId(enterpoint_node_), dist_func_param_);
@@ -711,7 +708,7 @@ namespace hnswlib {
        std::priority_queue<std::pair<dist_t, tableint>, vector<std::pair<dist_t, tableint>>, CompareByFirst> top_candidates = searchBaseLayerST(
-            currObj, query_data, ef_);
+            currObj, query_data, std::max(ef_, k));
        std::priority_queue<std::pair<dist_t, labeltype>> results;
        while (top_candidates.size() > k) {
            top_candidates.pop();
diff --git a/hnswlib/hnswlib.h b/hnswlib/hnswlib.h
index 26d0fe35..a97ea545 100644
--- a/hnswlib/hnswlib.h
+++ b/hnswlib/hnswlib.h
@@ -27,7 +27,7 @@ namespace hnswlib {
    class AlgorithmInterface {
    public:
        //virtual void addPoint(void *, labeltype) = 0;
-        virtual std::priority_queue<std::pair<dist_t, labeltype>> searchKnn(void *, int) = 0;
+        virtual std::priority_queue<std::pair<dist_t, labeltype>> searchKnn(void *, size_t) = 0;
    };
    template
diff --git a/python_bindings/bindings.cpp b/python_bindings/bindings.cpp
index 578112ff..69f82e31 100644
--- a/python_bindings/bindings.cpp
+++ b/python_bindings/bindings.cpp
@@ -128,36 +128,54 @@ class Index {
        if (num_threads <= 0)
            num_threads = num_threads_default;
-        size_t rows = buffer.shape[0], features = buffer.shape[1];
+        size_t rows, features;
+
+        if (buffer.ndim != 2 && buffer.ndim != 1) throw std::runtime_error("data must be a 1d/2d array");
+        if (buffer.ndim == 2) {
+            rows = buffer.shape[0];
+            features = buffer.shape[1];
+        }
+        else {
+            rows = 1;
+            features = buffer.shape[0];
+        }
+
+        if (features != dim)
+            throw std::runtime_error("wrong dimensionality of the vectors");
+
+        // avoid using threads when the number of searches is small:
+
+        if (rows <= num_threads * 4) {
+            num_threads = 1;
+        }
        std::vector<size_t> ids;
        if (!ids_.is_none()) {
            py::array_t < size_t, py::array::c_style | py::array::forcecast > items(ids_);
            auto ids_numpy = items.request();
-            std::vector<size_t> ids1(ids_numpy.shape[0]);
-            for (size_t i = 0; i < ids1.size(); i++) {
-                ids1[i] = items.data()[i];
+            if (ids_numpy.ndim == 1 && ids_numpy.shape[0] == rows) {
+                std::vector<size_t> ids1(ids_numpy.shape[0]);
+                for (size_t i = 0; i < ids1.size(); i++) {
+                    ids1[i] = items.data()[i];
+                }
+                ids.swap(ids1);
            }
-            ids.swap(ids1);
+            else if (ids_numpy.ndim == 0 && rows == 1) {
+                ids.push_back(*items.data());
+            }
+            else
+                throw std::runtime_error("wrong dimensionality of the labels");
        }
        hnswlib::tableint *data_numpy;
        {
-            py::gil_scoped_release l;
-
-
-            if (buffer.ndim != 2) throw std::runtime_error("data must be a 2d array");
-
-            if (features != dim)
-                throw std::runtime_error("wrong dimensionality of the vectors");
-
            data_numpy = new hnswlib::tableint[rows];
            int start = 0;
            if (!ep_added) {
-                size_t id = ids.size() ? ids.at(0) : (cur_l++);
+                size_t id = ids.size() ? ids.at(0) : (cur_l);
                float *vector_data = (float *) items.data(0);
                if (normalize) {
                    std::vector<float> norm_array(dim);
@@ -169,20 +187,25 @@ class Index {
                start = 1;
                ep_added = true;
            }
+
+            py::gil_scoped_release l;
            if (normalize == false) {
                ParallelFor(start, rows, num_threads, [&](size_t row, size_t threadId) {
-                    size_t id = ids.size() ? ids.at(row) : (cur_l++);
+                    size_t id = ids.size() ? ids.at(row) : (cur_l + row);
                    data_numpy[row] = appr_alg->addPoint((void *) items.data(row), (size_t) id);
                });
            }
            else {
                std::vector<float> norm_array(num_threads * dim);
                ParallelFor(start, rows, num_threads, [&](size_t row, size_t threadId) {
+                    // normalize vector:
                    size_t start_idx = threadId * dim;
                    normalize_vector((float *) items.data(row), (norm_array.data() + start_idx));
-                    size_t id = ids.size() ? ids.at(row) : (cur_l++);
+
+                    size_t id = ids.size() ? ids.at(row) : (cur_l + row);
                    data_numpy[row] = appr_alg->addPoint((void *) (norm_array.data() + start_idx), (size_t) id);
                });
-            }
+            };
+            cur_l += rows;
        }
@@ -213,11 +236,22 @@ class Index {
        {
            py::gil_scoped_release l;
-            if (buffer.ndim != 2) throw std::runtime_error("data must be a 2d array");
+            if (buffer.ndim != 2 && buffer.ndim != 1) throw std::runtime_error("data must be a 1d/2d array");
+            if (buffer.ndim == 2) {
+                rows = buffer.shape[0];
+                features = buffer.shape[1];
+            }
+            else {
+                rows = 1;
+                features = buffer.shape[0];
+            }
-            rows = buffer.shape[0];
-            features = buffer.shape[1];
+            // avoid using threads when the number of searches is small:
+
+            if (rows <= num_threads * 4) {
+                num_threads = 1;
+            }
            data_numpy_l = new hnswlib::labeltype[rows * k];
            data_numpy_d = new dist_t[rows * k];
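
The following is not part of the patch: a small usage sketch of the behaviour that the bindings.cpp changes above enable, namely that `add_items` and `knn_query` now also accept a single 1-d vector, and `add_items` accepts a scalar label (the `ndim == 0` branch). The index parameters below are illustrative only.

```python
# Illustrative sketch; assumes the bindings from this branch are installed as `hnswlib`.
import hnswlib
import numpy as np

dim = 16
p = hnswlib.Index(space='l2', dim=dim)
p.init_index(max_elements=100, ef_construction=100, M=16)

single_vector = np.float32(np.random.random(dim))  # 1-d array, treated as one row
p.add_items(single_vector, 7)                      # scalar id, handled by the new ndim == 0 branch

# A 1-d query is also accepted; labels/distances still come back as 2-d arrays of shape (rows, k).
labels, distances = p.knn_query(single_vector, k=1)
print(labels)  # expected: [[7]]
```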