From b6e5aa9ae669b3a008632d5cfa2ac0dff0410df2 Mon Sep 17 00:00:00 2001
From: Maciej Buszka <maciej@buszka.eu>
Date: Fri, 8 Jun 2018 11:34:22 +0200
Subject: [PATCH] FIX #3: Update Annoy version

- Update Annoy library to current master
- Add Manhattan metric option to wrapper
- getDistance now returns the actual distance, not the squared distance
- Fix tests; add a test for the Manhattan distance
---
 Makefile                     |   1 +
 annoyindexwrapper.cc         |   8 +-
 annoylib.h                   | 474 ++++++++++++++++++++++++++++-------
 kissrandom.h                 | 106 ++++++++
 tests/basic-config.js        |   2 +-
 tests/basictests.js          |   2 +-
 tests/smalltest-manhattan.js |  81 ++++++
 tests/smalltest.js           |   2 +-
 tests/very-big-config.js     |   2 +-
 9 files changed, 580 insertions(+), 98 deletions(-)
 create mode 100644 kissrandom.h
 create mode 100644 tests/smalltest-manhattan.js
diff --git a/Makefile b/Makefile
index 8671ce1..66e1c8f 100644
--- a/Makefile
+++ b/Makefile
@@ -8,6 +8,7 @@ build-wrapper:
 
 test: tests/data/text8-vector.json
 	node tests/smalltest.js
+	node tests/smalltest-manhattan.js
 	node tests/basictests.js basic-config.js
 
 big-test: tests/data/GoogleNews-vectors-negative300.json
diff --git a/annoyindexwrapper.cc b/annoyindexwrapper.cc
index 687e922..ecd62c5 100644
--- a/annoyindexwrapper.cc
+++ b/annoyindexwrapper.cc
@@ -1,4 +1,5 @@
 #include "annoyindexwrapper.h"
+#include "kissrandom.h"
 #include <vector>
 
 using namespace v8;
@@ -10,10 +11,13 @@ AnnoyIndexWrapper::AnnoyIndexWrapper(int dimensions, const char *metricString) :
   annoyDimensions(dimensions) {
 
   if (strcmp(metricString, "Angular") == 0) {
-    annoyIndex = new AnnoyIndex<int, float, Angular, RandRandom>(dimensions);
+    annoyIndex = new AnnoyIndex<int, float, Angular, Kiss64Random>(dimensions);
+  }
+  else if (strcmp(metricString, "Manhattan") == 0) {
+    annoyIndex = new AnnoyIndex<int, float, Manhattan, Kiss64Random>(dimensions);
   }
   else {
-    annoyIndex = new AnnoyIndex<int, float, Euclidean, RandRandom>(dimensions);    
+    annoyIndex = new AnnoyIndex<int, float, Euclidean, Kiss64Random>(dimensions);    
   }
 }
 
diff --git a/annoylib.h b/annoylib.h
index d8d4b88..14e35ab 100644
--- a/annoylib.h
+++ b/annoylib.h
@@ -12,21 +12,30 @@
 // License for the specific language governing permissions and limitations under
 // the License.
 
+
 #ifndef ANNOYLIB_H
 #define ANNOYLIB_H
 
 #include <stdio.h>
 #include <string>
 #include <sys/stat.h>
+#ifndef _MSC_VER
 #include <unistd.h>
+#endif
 #include <stdio.h>
 #include <stdlib.h>
 #include <sys/types.h>
 #include <fcntl.h>
 #include <stddef.h>
+#if defined(_MSC_VER) && _MSC_VER == 1500
+typedef unsigned char     uint8_t;
+typedef signed __int32    int32_t;
+#else
 #include <stdint.h>
+#endif
 
-#ifdef __MINGW32__
+#ifdef _MSC_VER
+#define NOMINMAX
 #include "mman.h"
 #include <windows.h>
 #else
@@ -40,6 +49,11 @@
 #include <queue>
 #include <limits>
 
+#ifdef _MSC_VER
+// Needed for Visual Studio to disable runtime checks for memcpy
+#pragma runtime_checks("s", off)
+#endif
+
 // This allows others to supply their own logger / error printer without
 // requiring Annoy to import their headers. See RcppAnnoy for a use case.
 #ifndef __ERROR_PRINTER_OVERRIDE__
@@ -48,9 +62,34 @@
   #define showUpdate(...) { __ERROR_PRINTER_OVERRIDE__( __VA_ARGS__ ); }
 #endif
 
+
+#ifndef _MSC_VER
+#define popcount __builtin_popcountll
+#else
+#define popcount __popcnt64
+#endif
+
+#ifndef NO_MANUAL_VECTORIZATION
+#if defined(__AVX__) && defined (__SSE__) && defined(__SSE2__) && defined(__SSE3__)
+#define USE_AVX
+#endif
+#endif
+
+#ifdef USE_AVX
+#if defined(_MSC_VER)
+#include <intrin.h>
+#elif defined(__GNUC__)
+#include <x86intrin.h>
+#endif
+#endif
+
 #ifndef ANNOY_NODE_ATTRIBUTE
-  #define ANNOY_NODE_ATTRIBUTE __attribute__((__packed__))
-  // TODO: this is turned on by default, but may not work for all architectures! Need to investigate.
+    #ifndef _MSC_VER
+        #define ANNOY_NODE_ATTRIBUTE __attribute__((__packed__))
+        // TODO: this is turned on by default, but may not work for all architectures! Need to investigate.
+    #else
+        #define ANNOY_NODE_ATTRIBUTE
+    #endif
 #endif
 
 
@@ -60,36 +99,103 @@ using std::pair;
 using std::numeric_limits;
 using std::make_pair;
 
-struct RandRandom {
-  // Default implementation of annoy-specific random number generator that uses rand() from standard library.
-  // Owned by the AnnoyIndex, passed around to the distance metrics
-  inline int flip() {
-    // Draw random 0 or 1
-    return rand() & 1;
+namespace {
+
+template<typename T>
+inline T dot(const T* x, const T* y, int f) {
+  T s = 0;
+  for (int z = 0; z < f; z++) {
+    s += (*x) * (*y);
+    x++;
+    y++;
+  }
+  return s;
+}
+
+template<typename T>
+inline T manhattan_distance(const T* x, const T* y, int f) {
+  T d = 0.0;
+  for (int i = 0; i < f; i++)
+    d += fabs(x[i] - y[i]);
+  return d;
+}
+
+#ifdef USE_AVX
+// Horizontal single sum of 256bit vector.
+inline float hsum256_ps_avx(__m256 v) {
+  const __m128 x128 = _mm_add_ps(_mm256_extractf128_ps(v, 1), _mm256_castps256_ps128(v));
+  const __m128 x64 = _mm_add_ps(x128, _mm_movehl_ps(x128, x128));
+  const __m128 x32 = _mm_add_ss(x64, _mm_shuffle_ps(x64, x64, 0x55));
+  return _mm_cvtss_f32(x32);
+}
+
+template<>
+inline float dot<float>(const float* x, const float *y, int f) {
+  float result = 0;
+  if (f > 7) {
+    __m256 d = _mm256_setzero_ps();
+    for (; f > 7; f -= 8) {
+      d = _mm256_add_ps(d, _mm256_mul_ps(_mm256_loadu_ps(x), _mm256_loadu_ps(y)));
+      x += 8;
+      y += 8;
+    }
+    // Sum all floats in dot register.
+    result += hsum256_ps_avx(d);
   }
-  inline size_t index(size_t n) {
-    // Draw random integer between 0 and n-1 where n is at most the number of data points you have
-    return rand() % n;
+  // Don't forget the remaining values.
+  for (; f > 0; f--) {
+    result += *x * *y;
+    x++;
+    y++;
   }
-};
+  return result;
+}
+
+template<>
+inline float manhattan_distance<float>(const float* x, const float* y, int f) {
+  float result = 0;
+  int i = f;
+  if (f > 7) {
+    __m256 manhattan = _mm256_setzero_ps();
+    __m256 minus_zero = _mm256_set1_ps(-0.0f);
+    for (; i > 7; i -= 8) {
+      const __m256 x_minus_y = _mm256_sub_ps(_mm256_loadu_ps(x), _mm256_loadu_ps(y));
+      const __m256 distance = _mm256_andnot_ps(minus_zero, x_minus_y); // Absolute value of x_minus_y (forces sign bit to zero)
+      manhattan = _mm256_add_ps(manhattan, distance);
+      x += 8;
+      y += 8;
+    }
+    // Sum all floats in manhattan register.
+    result = hsum256_ps_avx(manhattan);
+  }
+  // Don't forget the remaining values.
+  for (; i > 0; i--) {
+    result += fabsf(*x - *y);
+    x++;
+    y++;
+  }
+  return result;
+}
 
+#endif
+
+ 
 template<typename T>
 inline T get_norm(T* v, int f) {
-  T sq_norm = 0;
-  for (int z = 0; z < f; z++)
-    sq_norm += v[z] * v[z];
-  return sqrt(sq_norm);
+  return sqrt(dot(v, v, f));
 }
 
 template<typename T>
 inline void normalize(T* v, int f) {
   T norm = get_norm(v, f);
-  for (int z = 0; z < f; z++)
-    v[z] /= norm;
+  if (norm > 0) {
+    for (int z = 0; z < f; z++)
+      v[z] /= norm;
+  }
 }
 
 template<typename T, typename Random, typename Distance, typename Node>
-inline void two_means(const vector<Node*>& nodes, int f, Random& random, bool cosine, T* iv, T* jv) {
+inline void two_means(const vector<Node*>& nodes, int f, Random& random, bool cosine, Node* p, Node* q) {
   /*
     This algorithm is a huge heuristic. Empirically it works really well, but I
     can't motivate it well. The basic idea is to keep two centroids and assign
@@ -102,28 +208,36 @@ inline void two_means(const vector<Node*>& nodes, int f, Random& random, bool co
   size_t i = random.index(count);
   size_t j = random.index(count-1);
   j += (j >= i); // ensure that i != j
-  std::copy(&nodes[i]->v[0], &nodes[i]->v[f], &iv[0]);
-  std::copy(&nodes[j]->v[0], &nodes[j]->v[f], &jv[0]);
-  if (cosine) { normalize(&iv[0], f); normalize(&jv[0], f); }
+  memcpy(p->v, nodes[i]->v, f * sizeof(T));
+  memcpy(q->v, nodes[j]->v, f * sizeof(T));
+  if (cosine) { normalize(p->v, f); normalize(q->v, f); }
+  Distance::init_node(p, f);
+  Distance::init_node(q, f);
 
   int ic = 1, jc = 1;
   for (int l = 0; l < iteration_steps; l++) {
     size_t k = random.index(count);
-    T di = ic * Distance::distance(&iv[0], nodes[k]->v, f),
-      dj = jc * Distance::distance(&jv[0], nodes[k]->v, f);
+    T di = ic * Distance::distance(p, nodes[k], f),
+      dj = jc * Distance::distance(q, nodes[k], f);
     T norm = cosine ? get_norm(nodes[k]->v, f) : 1.0;
+    if (!(norm > T(0))) {
+      continue;
+    }
     if (di < dj) {
       for (int z = 0; z < f; z++)
-	iv[z] = (iv[z] * ic + nodes[k]->v[z] / norm) / (ic + 1);
+	p->v[z] = (p->v[z] * ic + nodes[k]->v[z] / norm) / (ic + 1);
+      Distance::init_node(p, f);
       ic++;
     } else if (dj < di) {
       for (int z = 0; z < f; z++)
-	jv[z] = (jv[z] * jc + nodes[k]->v[z] / norm) / (jc + 1);
+	q->v[z] = (q->v[z] * jc + nodes[k]->v[z] / norm) / (jc + 1);
+      Distance::init_node(q, f);
       jc++;
     }
   }
 }
 
+} // namespace
 
 struct Angular {
   template<typename S, typename T>
@@ -143,30 +257,27 @@ struct Angular {
      * more memory to be able to fit the vector outside
      */
     S n_descendants;
-    S children[2]; // Will possibly store more than 2
+    union {
+      S children[2]; // Will possibly store more than 2
+      T norm;
+    };
     T v[1]; // We let this one overflow intentionally. Need to allocate at least 1 to make GCC happy
   };
-  template<typename T>
-  static inline T distance(const T* x, const T* y, int f) {
+  template<typename S, typename T>
+  static inline T distance(const Node<S, T>* x, const Node<S, T>* y, int f) {
     // want to calculate (a/|a| - b/|b|)^2
     // = a^2 / a^2 + b^2 / b^2 - 2ab/|a||b|
     // = 2 - 2cos
-    T pp = 0, qq = 0, pq = 0;
-    for (int z = 0; z < f; z++, x++, y++) {
-      pp += (*x) * (*x);
-      qq += (*y) * (*y);
-      pq += (*x) * (*y);
-    }
+    T pp = x->norm ? x->norm : dot(x->v, x->v, f); // For backwards compatibility reasons, we need to fall back and compute the norm here
+    T qq = y->norm ? y->norm : dot(y->v, y->v, f);
+    T pq = dot(x->v, y->v, f);
     T ppqq = pp * qq;
     if (ppqq > 0) return 2.0 - 2.0 * pq / sqrt(ppqq);
     else return 2.0; // cos is 0
   }
   template<typename S, typename T>
   static inline T margin(const Node<S, T>* n, const T* y, int f) {
-    T dot = 0;
-    for (int z = 0; z < f; z++)
-      dot += n->v[z] * y[z];
-    return dot;
+    return dot(n->v, y, f);
   }
   template<typename S, typename T, typename Random>
   static inline bool side(const Node<S, T>* n, const T* y, int f, Random& random) {
@@ -177,12 +288,15 @@ struct Angular {
       return random.flip();
   }
   template<typename S, typename T, typename Random>
-  static inline void create_split(const vector<Node<S, T>*>& nodes, int f, Random& random, Node<S, T>* n) {
-    vector<T> best_iv(f, 0), best_jv(f, 0); // TODO: avoid allocation
-    two_means<T, Random, Angular, Node<S, T> >(nodes, f, random, true, &best_iv[0], &best_jv[0]);
+  static inline void create_split(const vector<Node<S, T>*>& nodes, int f, size_t s, Random& random, Node<S, T>* n) {
+    Node<S, T>* p = (Node<S, T>*)malloc(s); // TODO: avoid
+    Node<S, T>* q = (Node<S, T>*)malloc(s); // TODO: avoid
+    two_means<T, Random, Angular, Node<S, T> >(nodes, f, random, true, p, q);
     for (int z = 0; z < f; z++)
-      n->v[z] = best_iv[z] - best_jv[z];
+      n->v[z] = p->v[z] - q->v[z];
     normalize(n->v, f);
+    free(p);
+    free(q);
   }
   template<typename T>
   static inline T normalized_distance(T distance) {
@@ -191,32 +305,123 @@ struct Angular {
     // so we have to make sure it's a positive number.
     return sqrt(std::max(distance, T(0)));
   }
+  template<typename T>
+  static inline T pq_distance(T distance, T margin, int child_nr) {
+    if (child_nr == 0)
+      margin = -margin;
+    return std::min(distance, margin);
+  }
+  template<typename T>
+  static inline T pq_initial_value() {
+    return numeric_limits<T>::infinity();
+  }
+  template<typename S, typename T>
+  static inline void init_node(Node<S, T>* n, int f) {
+    n->norm = dot(n->v, n->v, f);
+  }
   static const char* name() {
     return "angular";
   }
 };
 
-struct Euclidean {
+struct Hamming {
   template<typename S, typename T>
   struct ANNOY_NODE_ATTRIBUTE Node {
     S n_descendants;
-    T a; // need an extra constant term to determine the offset of the plane
     S children[2];
     T v[1];
   };
+
+  static const size_t max_iterations = 20;
+
+  template<typename T>
+  static inline T pq_distance(T distance, T margin, int child_nr) {
+    return distance - (margin != (unsigned int) child_nr);
+  }
+
   template<typename T>
-  static inline T distance(const T* x, const T* y, int f) {
-    T d = 0.0;
-    for (int i = 0; i < f; i++, x++, y++)
-      d += ((*x) - (*y)) * ((*x) - (*y));
-    return d;
+  static inline T pq_initial_value() {
+    return numeric_limits<T>::max();
+  }
+  template<typename S, typename T>
+  static inline T distance(const Node<S, T>* x, const Node<S, T>* y, int f) {
+    size_t dist = 0;
+    for (int i = 0; i < f; i++) {
+      dist += popcount(x->v[i] ^ y->v[i]);
+    }
+    return dist;
   }
   template<typename S, typename T>
+  static inline bool margin(const Node<S, T>* n, const T* y, int f) {
+    static const size_t n_bits = sizeof(T) * 8;
+    T chunk = n->v[0] / n_bits;
+    return (y[chunk] & (static_cast<T>(1) << (n_bits - 1 - (n->v[0] % n_bits)))) != 0;
+  }
+  template<typename S, typename T, typename Random>
+  static inline bool side(const Node<S, T>* n, const T* y, int f, Random& random) {
+    return margin(n, y, f);
+  }
+  template<typename S, typename T, typename Random>
+  static inline void create_split(const vector<Node<S, T>*>& nodes, int f, size_t s, Random& random, Node<S, T>* n) {
+    size_t cur_size = 0;
+    size_t i = 0;
+    int dim = f * 8 * sizeof(T);
+    for (; i < max_iterations; i++) {
+      // choose random position to split at
+      n->v[0] = random.index(dim);
+      cur_size = 0;
+      for (typename vector<Node<S, T>*>::const_iterator it = nodes.begin(); it != nodes.end(); ++it) {
+        if (margin(n, (*it)->v, f)) {
+          cur_size++;
+        }
+      }
+      if (cur_size > 0 && cur_size < nodes.size()) {
+        break;
+      }
+    }
+    // brute-force search for splitting coordinate
+    if (i == max_iterations) {
+      int j = 0;
+      for (; j < dim; j++) {
+        n->v[0] = j;
+        cur_size = 0;
+	for (typename vector<Node<S, T>*>::const_iterator it = nodes.begin(); it != nodes.end(); ++it) {
+          if (margin(n, (*it)->v, f)) {
+            cur_size++;
+          }
+        }
+        if (cur_size > 0 && cur_size < nodes.size()) {
+          break;
+        }
+      }
+    }
+  }
+  template<typename T>
+  static inline T normalized_distance(T distance) {
+    return distance;
+  }
+  template<typename S, typename T>
+  static inline void init_node(Node<S, T>* n, int f) {
+  }
+  static const char* name() {
+    return "hamming";
+  }
+};
+
+struct Minkowski {
+  template<typename S, typename T>
+  struct ANNOY_NODE_ATTRIBUTE Node {
+    S n_descendants;
+    T a; // need an extra constant term to determine the offset of the plane
+    union {
+      S children[2];
+      T norm;
+    };
+    T v[1];
+  };
+  template<typename S, typename T>
   static inline T margin(const Node<S, T>* n, const T* y, int f) {
-    T dot = n->a;
-    for (int z = 0; z < f; z++)
-      dot += n->v[z] * y[z];
-    return dot;
+    return n->a + dot(n->v, y, f);
   }
   template<typename S, typename T, typename Random>
   static inline bool side(const Node<S, T>* n, const T* y, int f, Random& random) {
@@ -226,33 +431,94 @@ struct Euclidean {
     else
       return random.flip();
   }
+  template<typename T>
+  static inline T pq_distance(T distance, T margin, int child_nr) {
+    if (child_nr == 0)
+      margin = -margin;
+    return std::min(distance, margin);
+  }
+  template<typename T>
+  static inline T pq_initial_value() {
+    return numeric_limits<T>::infinity();
+  }
+};
+
+
+struct Euclidean : Minkowski{
+  template<typename S, typename T>
+  static inline T distance(const Node<S, T>* x, const Node<S, T>* y, int f) {
+    T pp = x->norm ? x->norm : dot(x->v, x->v, f); // For backwards compatibility reasons, we need to fall back and compute the norm here
+    T qq = y->norm ? y->norm : dot(y->v, y->v, f);
+    T pq = dot(x->v, y->v, f);
+    return pp + qq - 2*pq;
+  }
   template<typename S, typename T, typename Random>
-  static inline void create_split(const vector<Node<S, T>*>& nodes, int f, Random& random, Node<S, T>* n) {
-    vector<T> best_iv(f, 0), best_jv(f, 0);
-    two_means<T, Random, Euclidean, Node<S, T> >(nodes, f, random, false, &best_iv[0], &best_jv[0]);
+  static inline void create_split(const vector<Node<S, T>*>& nodes, int f, size_t s, Random& random, Node<S, T>* n) {
+    Node<S, T>* p = (Node<S, T>*)malloc(s); // TODO: avoid
+    Node<S, T>* q = (Node<S, T>*)malloc(s); // TODO: avoid
+    two_means<T, Random, Euclidean, Node<S, T> >(nodes, f, random, false, p, q);
 
     for (int z = 0; z < f; z++)
-      n->v[z] = best_iv[z] - best_jv[z];
+      n->v[z] = p->v[z] - q->v[z];
     normalize(n->v, f);
     n->a = 0.0;
     for (int z = 0; z < f; z++)
-      n->a += -n->v[z] * (best_iv[z] + best_jv[z]) / 2;
+      n->a += -n->v[z] * (p->v[z] + q->v[z]) / 2;
+    free(p);
+    free(q);
   }
   template<typename T>
   static inline T normalized_distance(T distance) {
     return sqrt(std::max(distance, T(0)));
   }
+  template<typename S, typename T>
+  static inline void init_node(Node<S, T>* n, int f) {
+    n->norm = dot(n->v, n->v, f);
+  }
   static const char* name() {
     return "euclidean";
   }
 };
 
+struct Manhattan : Minkowski{
+  template<typename S, typename T>
+  static inline T distance(const Node<S, T>* x, const Node<S, T>* y, int f) {
+    return manhattan_distance(x->v, y->v, f);
+  }
+  template<typename S, typename T, typename Random>
+  static inline void create_split(const vector<Node<S, T>*>& nodes, int f, size_t s, Random& random, Node<S, T>* n) {
+    Node<S, T>* p = (Node<S, T>*)malloc(s); // TODO: avoid
+    Node<S, T>* q = (Node<S, T>*)malloc(s); // TODO: avoid
+    two_means<T, Random, Manhattan, Node<S, T> >(nodes, f, random, false, p, q);
+
+    for (int z = 0; z < f; z++)
+      n->v[z] = p->v[z] - q->v[z];
+    normalize(n->v, f);
+    n->a = 0.0;
+    for (int z = 0; z < f; z++)
+      n->a += -n->v[z] * (p->v[z] + q->v[z]) / 2;
+    free(p);
+    free(q);
+  }
+  template<typename T>
+  static inline T normalized_distance(T distance) {
+    return std::max(distance, T(0));
+  }
+  template<typename S, typename T>
+  static inline void init_node(Node<S, T>* n, int f) {
+  }
+  static const char* name() {
+    return "manhattan";
+  }
+};
+
 template<typename S, typename T>
 class AnnoyIndexInterface {
  public:
   virtual ~AnnoyIndexInterface() {};
   virtual void add_item(S item, const T* w) = 0;
   virtual void build(int q) = 0;
+  virtual void unbuild() = 0;
   virtual bool save(const char* filename) = 0;
   virtual void unload() = 0;
   virtual bool load(const char* filename) = 0;
@@ -262,6 +528,7 @@ class AnnoyIndexInterface {
   virtual S get_n_items() = 0;
   virtual void verbose(bool v) = 0;
   virtual void get_item(S item, T* v) = 0;
+  virtual void set_seed(int q) = 0;
 };
 
 template<typename S, typename T, typename Distance, typename Random>
@@ -319,10 +586,9 @@ template<typename S, typename T, typename Distance, typename Random>
     n->children[1] = 0;
     n->n_descendants = 1;
 
-    for (int z = 0; z < _f; z++) {
+    for (int z = 0; z < _f; z++)
       n->v[z] = w[z];
-      // printf("adding: %f", w[z]);
-    }
+    D::init_node(n, _f);
 
     if (item >= _n_items)
       _n_items = item + 1;
@@ -343,10 +609,12 @@ template<typename S, typename T, typename Distance, typename Random>
       if (_verbose) showUpdate("pass %zd...\n", _roots.size());
 
       vector<S> indices;
-      for (S i = 0; i < _n_items; i++)
-        indices.push_back(i);
+      for (S i = 0; i < _n_items; i++) {
+	if (_get(i)->n_descendants >= 1) // Issue #223
+          indices.push_back(i);
+      }
 
-      _roots.push_back(_make_tree(indices));
+      _roots.push_back(_make_tree(indices, true));
     }
     // Also, copy the roots into the last segment of the array
     // This way we can load them faster without reading the whole file
@@ -357,6 +625,16 @@ template<typename S, typename T, typename Distance, typename Random>
 
     if (_verbose) showUpdate("has %d nodes\n", _n_nodes);
   }
+  
+  void unbuild() {
+    if (_loaded) {
+      showUpdate("You can't unbuild a loaded index\n");
+      return;
+    }
+
+    _roots.clear();
+    _n_nodes = _n_items;
+  }
 
   bool save(const char* filename) {
     FILE *f = fopen(filename, "wb");
@@ -395,9 +673,11 @@ template<typename S, typename T, typename Distance, typename Random>
   }
 
   bool load(const char* filename) {
-    _fd = open(filename, O_RDONLY, (mode_t)0400);
-    if (_fd == -1)
+    _fd = open(filename, O_RDONLY, (int)0400);
+    if (_fd == -1) {
+      _fd = 0;
       return false;
+    }
     off_t size = lseek(_fd, 0, SEEK_END);
 #ifdef MAP_POPULATE
     _nodes = (Node*)mmap(
@@ -410,6 +690,7 @@ template<typename S, typename T, typename Distance, typename Random>
     _n_nodes = (S)(size / _s);
 
     // Find the roots by scanning the end of the file and taking the nodes with most descendants
+    _roots.clear();
     S m = -1;
     for (S i = _n_nodes - 1; i >= 0; i--) {
       S k = _get(i)->n_descendants;
@@ -430,9 +711,7 @@ template<typename S, typename T, typename Distance, typename Random>
   }
 
   T get_distance(S i, S j) {
-    const T* x = _get(i)->v;
-    const T* y = _get(j)->v;
-    return D::distance(x, y, _f);
+    return D::normalized_distance(D::distance(_get(i), _get(j), _f));
   }
 
   void get_nns_by_item(S item, size_t n, size_t search_k, vector<S>* result, vector<T>* distances) {
@@ -452,12 +731,11 @@ template<typename S, typename T, typename Distance, typename Random>
 
   void get_item(S item, T* v) {
     Node* m = _get(item);
-      // printf("%s\n", "get_item!");
-    // printf("Copying item!");
-    // for (int i = 0; i < _f; ++i) {
-    //   printf("%f, ", m->v[i]);
-    // }
-    std::copy(&m->v[0], &m->v[_f], v);
+    memcpy(v, m->v, _f * sizeof(T));
+  }
+
+  void set_seed(int seed) {
+    _random.set_seed(seed);
   }
 
 protected:
@@ -477,19 +755,25 @@ template<typename S, typename T, typename Distance, typename Random>
     return (Node*)((uint8_t *)_nodes + (_s * i));
   }
 
-  S _make_tree(const vector<S >& indices) {
-    if (indices.size() == 1)
+  S _make_tree(const vector<S >& indices, bool is_root) {
+    // The basic rule is that if we have <= _K items, then it's a leaf node, otherwise it's a split node.
+    // There's some regrettable complications caused by the problem that root nodes have to be "special":
+    // 1. We identify root nodes by the arguable logic that _n_items == n->n_descendants, regardless of how many descendants they actually have
+    // 2. Root nodes with only 1 child need to be a "dummy" parent
+    // 3. Due to the _n_items "hack", we need to be careful with the cases where _n_items <= _K or _n_items > _K
+    if (indices.size() == 1 && !is_root)
       return indices[0];
 
-    if (indices.size() <= (size_t)_K) {
+    if (indices.size() <= (size_t)_K && (!is_root || _n_items <= (size_t)_K || indices.size() == 1)) {
       _allocate_size(_n_nodes + 1);
       S item = _n_nodes++;
       Node* m = _get(item);
-      m->n_descendants = (S)indices.size();
+      m->n_descendants = is_root ? _n_items : (S)indices.size();
 
       // Using std::copy instead of a loop seems to resolve issues #3 and #13,
       // probably because gcc 4.8 goes overboard with optimizations.
-      std::copy(indices.begin(), indices.end(), m->children);
+      // Using memcpy instead of std::copy for MSVC compatibility. #235
+      memcpy(m->children, &indices[0], indices.size() * sizeof(S));
       return item;
     }
 
@@ -503,7 +787,7 @@ template<typename S, typename T, typename Distance, typename Random>
 
     vector<S> children_indices[2];
     Node* m = (Node*)malloc(_s); // TODO: avoid
-    D::create_split(children, _f, _random, m);
+    D::create_split(children, _f, _s, _random, m);
 
     for (size_t i = 0; i < indices.size(); i++) {
       S j = indices[i];
@@ -535,10 +819,10 @@ template<typename S, typename T, typename Distance, typename Random>
 
     int flip = (children_indices[0].size() > children_indices[1].size());
 
-    m->n_descendants = (S)indices.size();
+    m->n_descendants = is_root ? _n_items : (S)indices.size();
     for (int side = 0; side < 2; side++)
       // run _make_tree for the smallest child first (for cache locality)
-      m->children[side^flip] = _make_tree(children_indices[side^flip]);
+      m->children[side^flip] = _make_tree(children_indices[side^flip], false);
 
     _allocate_size(_n_nodes + 1);
     S item = _n_nodes++;
@@ -549,31 +833,35 @@ template<typename S, typename T, typename Distance, typename Random>
   }
 
   void _get_all_nns(const T* v, size_t n, size_t search_k, vector<S>* result, vector<T>* distances) {
+    Node* v_node = (Node *)malloc(_s); // TODO: avoid
+    memcpy(v_node->v, v, sizeof(T)*_f);
+    D::init_node(v_node, _f);
+
     std::priority_queue<pair<T, S> > q;
 
     if (search_k == (size_t)-1)
       search_k = n * _roots.size(); // slightly arbitrary default value
 
     for (size_t i = 0; i < _roots.size(); i++) {
-      q.push(make_pair(numeric_limits<T>::infinity(), _roots[i]));
+      q.push(make_pair(Distance::template pq_initial_value<T>(), _roots[i]));
     }
 
-    vector<S> nns;
+    std::vector<S> nns;
     while (nns.size() < search_k && !q.empty()) {
       const pair<T, S>& top = q.top();
       T d = top.first;
       S i = top.second;
       Node* nd = _get(i);
       q.pop();
-      if (nd->n_descendants == 1) {
+      if (nd->n_descendants == 1 && i < _n_items) {
         nns.push_back(i);
       } else if (nd->n_descendants <= _K) {
         const S* dst = nd->children;
         nns.insert(nns.end(), dst, &dst[nd->n_descendants]);
       } else {
         T margin = D::margin(nd, v, _f);
-        q.push(make_pair(std::min(d, +margin), nd->children[1]));
-        q.push(make_pair(std::min(d, -margin), nd->children[0]));
+        q.push(make_pair(D::pq_distance(d, margin, 1), nd->children[1]));
+        q.push(make_pair(D::pq_distance(d, margin, 0), nd->children[0]));
       }
     }
 
@@ -587,17 +875,19 @@ template<typename S, typename T, typename Distance, typename Random>
       if (j == last)
         continue;
       last = j;
-      nns_dist.push_back(make_pair(D::distance(v, _get(j)->v, _f), j));
+      if (_get(j)->n_descendants == 1)  // This is only to guard a really obscure case, #284
+	nns_dist.push_back(make_pair(D::distance(v_node, _get(j), _f), j));
     }
 
     size_t m = nns_dist.size();
     size_t p = n < m ? n : m; // Return this many items
-    std::partial_sort(&nns_dist[0], &nns_dist[p], &nns_dist[m]);
+    std::partial_sort(nns_dist.begin(), nns_dist.begin() + p, nns_dist.end());
     for (size_t i = 0; i < p; i++) {
       if (distances)
         distances->push_back(D::normalized_distance(nns_dist[i].first));
       result->push_back(nns_dist[i].second);
     }
+    free(v_node);
   }
 };
 
diff --git a/kissrandom.h b/kissrandom.h
new file mode 100644
index 0000000..c423a7c
--- /dev/null
+++ b/kissrandom.h
@@ -0,0 +1,106 @@
+#ifndef KISSRANDOM_H
+#define KISSRANDOM_H
+
+#if defined(_MSC_VER) && _MSC_VER == 1500 // this compiler version gets hand-written typedefs instead of <stdint.h>
+typedef unsigned __int32    uint32_t;
+typedef unsigned __int64    uint64_t; // must be 64 bits wide: Kiss32Random shifts it by 32, Kiss64Random keeps 64-bit state in it
+#else
+#include <stdint.h>
+#endif
+
+// KISS = "keep it simple, stupid", but high quality random number generator
+// http://www0.cs.ucl.ac.uk/staff/d.jones/GoodPracticeRNG.pdf -> "Use a good RNG and build it into your code"
+// http://mathforum.org/kb/message.jspa?messageID=6627731
+// https://de.wikipedia.org/wiki/KISS_(Zufallszahlengenerator)
+
+// 32 bit KISS
+struct Kiss32Random {
+  uint32_t x;
+  uint32_t y;
+  uint32_t z;
+  uint32_t c;
+
+  // seed must be != 0
+  Kiss32Random(uint32_t seed = 123456789) {
+    x = seed;
+    y = 362436000;
+    z = 521288629;
+    c = 7654321;
+  }
+
+  uint32_t kiss() {
+    // Linear congruence generator
+    x = 69069 * x + 12345;
+
+    // Xor shift
+    y ^= y << 13;
+    y ^= y >> 17;
+    y ^= y << 5;
+
+    // Multiply-with-carry
+    uint64_t t = 698769069ULL * z + c;
+    c = t >> 32;
+    z = (uint32_t) t;
+
+    return x + y + z;
+  }
+  inline int flip() {
+    // Draw random 0 or 1
+    return kiss() & 1;
+  }
+  inline size_t index(size_t n) {
+    // Draw random integer between 0 and n-1 where n is at most the number of data points you have
+    return kiss() % n;
+  }
+  inline void set_seed(uint32_t seed) {
+    x = seed;
+  }
+};
+
+// 64 bit KISS. Use this if you have more than about 2^24 data points ("big data" ;) )
+struct Kiss64Random {
+  uint64_t x; // multiply-with-carry state; the only member touched by set_seed()
+  uint64_t y; // xor-shift state
+  uint64_t z; // linear congruential state
+  uint64_t c; // carry of the multiply-with-carry step
+
+  // seed must be != 0
+  Kiss64Random(uint64_t seed = 1234567890987654321ULL) {
+    x = seed;
+    y = 362436362436362436ULL;
+    z = 1066149217761810ULL;
+    c = 123456123456123456ULL;
+  }
+
+  uint64_t kiss() {
+    // Linear congruence generator
+    z = 6906969069LL*z+1234567;
+
+    // Xor shift
+    y ^= (y<<13);
+    y ^= (y>>17);
+    y ^= (y<<43);
+
+    // Multiply-with-carry (uint128_t t = (2^58 + 1) * x + c; c = t >> 64; x = (uint64_t) t)
+    uint64_t t = (x<<58)+c;
+    c = (x>>6);
+    x += t;
+    c += (x<t); // unsigned wrap-around of x += t means a carry out of bit 63
+
+    return x + y + z;
+  }
+  inline int flip() {
+    // Draw random 0 or 1
+    return kiss() & 1;
+  }
+  inline size_t index(size_t n) {
+    // Draw random integer between 0 and n-1 where n is at most the number of data points you have
+    return kiss() % n;
+  }
+  inline void set_seed(uint64_t seed) { // 64-bit like the constructor's seed; y, z and c keep their current values
+    x = seed;
+  }
+};
+
+#endif
+// vim: tabstop=2 shiftwidth=2
diff --git a/tests/basic-config.js b/tests/basic-config.js
index 5a92de9..2c41448 100644
--- a/tests/basic-config.js
+++ b/tests/basic-config.js
@@ -4,6 +4,6 @@ module.exports = {
   dimensions: 200,
   lookupWord1: 'big',
   lookupWord2: 'dog',
-  distanceBetweenWord1And2: 9.2809362,
+  distanceBetweenWord1And2: 3.046463,
   indexLookupWord: 'fire'
 };
diff --git a/tests/basictests.js b/tests/basictests.js
index 5138434..8eaaadf 100644
--- a/tests/basictests.js
+++ b/tests/basictests.js
@@ -64,7 +64,7 @@ function usingTest(t) {
     annoyIndex.getDistance(
       indexesForWords[config.lookupWord1], indexesForWords[config.lookupWord2]
     )
-    .toPrecision(8),
+    .toPrecision(7),
     config.distanceBetweenWord1And2.toString(),
     'getDistance calculates correct distance between items for ' +
       config.lookupWord1 + ' and ' + config.lookupWord2
diff --git a/tests/smalltest-manhattan.js b/tests/smalltest-manhattan.js
new file mode 100644
index 0000000..a3d8206
--- /dev/null
+++ b/tests/smalltest-manhattan.js
@@ -0,0 +1,81 @@
+/* global __dirname */
+
+var test = require('tape');
+var Annoy = require('../index');
+
+var annoyPath = __dirname + '/data/test.annoy';
+
+var items = // three 10-dimensional fixture vectors; declared with var so no implicit global is created
+  [ [-5.0, -4.5, -3.2, -2.8, -2.1, -1.5, -0.34, 0, 3.7, 6]
+  , [5.0, 4.5, 3.2, 2.8, 2.1, 1.5, 0.34, 0, -3.7, -6]
+  , [0, 0, 0, 0, 0, -1, -1, -0.2, 0.1, 0.8]
+  ]
+
+test('Add test', addTest);
+test('Load test', loadTest);
+
+
+function addTest(t) {
+  var obj = new Annoy(10, 'Manhattan');
+
+  obj.addItem(0, items[0]);
+  obj.addItem(1, items[1]);
+  obj.addItem(2, items[2]);
+  
+  t.equal(obj.getNItems(), 3, 'Index has all the added items.');
+
+  obj.build();
+  t.ok(obj.save(annoyPath), 'Saved successfully.');
+  obj.unload();
+  t.end();
+}
+
+function loadTest(t) {
+  var obj2 = new Annoy(10, 'Manhattan');
+  var loadResult = obj2.load(annoyPath);
+  t.ok(loadResult, 'Loads successfully.');
+
+  if (loadResult) {
+    t.equal(obj2.getNItems(), 3, 'Number of items in index is correct.');
+    
+    var dist = 0;
+    for (var i = 0; i < items[0].length; i ++) {
+      dist += Math.abs(items[0][i] - items[1][i])
+    }
+
+    t.equal(
+      obj2.getDistance(0, 1).toPrecision(2),
+      dist.toPrecision(2),
+      'getDistance calculates correct distance between items 0 and 1.'
+    );
+
+    var v1 = obj2.getItem(0);
+    var v2 = obj2.getItem(1);
+
+    var sum = [];
+    for (var i = 0; i < v1.length; ++i) {
+      sum.push(v1[i] + v2[i]);
+    }
+    // console.log('Sum:', sum);
+    var neighbors = obj2.getNNsByVector(sum, 10, -1, false);
+    t.ok(Array.isArray(neighbors), 'getNNsByVector result is an array.');
+    // console.log('Nearest neighbors to sum', neighbors);
+
+    var nnResult = obj2.getNNsByVector(sum, 10, -1, true);
+    checkNeighborsAndDistancesResult(nnResult);
+
+    var neighborsByItem = obj2.getNNsByItem(1, 10, -1, false);
+    t.ok(Array.isArray(neighborsByItem), 'NN by item result is an array.');
+    var nnResultByItem = obj2.getNNsByItem(1, 10, -1, true);
+    checkNeighborsAndDistancesResult(nnResultByItem);
+  }
+
+  t.end();
+
+  function checkNeighborsAndDistancesResult(result) {
+    t.equal(typeof result, 'object', 'NN result is an object.');
+    t.ok(Array.isArray(result.neighbors), 'NN result has a neighbors array.');
+    t.ok(Array.isArray(result.distances), 'NN result has a distances array.');
+    // console.log('Nearest neighbors to sum with distances', result);
+  }
+}
diff --git a/tests/smalltest.js b/tests/smalltest.js
index deeca4f..8b64168 100644
--- a/tests/smalltest.js
+++ b/tests/smalltest.js
@@ -33,7 +33,7 @@ function loadTest(t) {
 
     t.equal(
       obj2.getDistance(0, 1),
-      4.0,
+      2.0,
       'getDistance calculates correct distance between items 0 and 1.'
     );
 
diff --git a/tests/very-big-config.js b/tests/very-big-config.js
index 4b01539..6fa15ec 100644
--- a/tests/very-big-config.js
+++ b/tests/very-big-config.js
@@ -4,6 +4,6 @@ module.exports = {
   dimensions: 300,
   lookupWord1: 'king',
   lookupWord2: 'woman',
-  distanceBetweenWord1And2: 13.494979,
+  distanceBetweenWord1And2: 3.673551,
   indexLookupWord: 'perpendicular'
 };